Merge remote-tracking branch 'origin/x86/boot' into x86/mm2
Coming patches to x86/mm2 require the changes and advanced baseline in x86/boot.

Resolved Conflicts:
    arch/x86/kernel/setup.c
    mm/nobootmem.c

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
mm/Kconfig: 34 changed lines
@@ -143,6 +143,25 @@ config NO_BOOTMEM

config MEMORY_ISOLATION
	boolean

config MOVABLE_NODE
	boolean "Enable to assign a node which has only movable memory"
	depends on HAVE_MEMBLOCK
	depends on NO_BOOTMEM
	depends on X86_64
	depends on NUMA
	default n
	help
	  Allow a node to have only movable memory. Pages used by the kernel,
	  such as direct mapping pages cannot be migrated. So the corresponding
	  memory device cannot be hotplugged. This option allows users to
	  online all the memory of a node as movable memory so that the whole
	  node can be hotplugged. Users who don't use the memory hotplug
	  feature are fine with this option on since they don't online memory
	  as movable.

	  Say Y here if you want to hotplug a whole node.
	  Say N here if you want kernel to use memory on all nodes evenly.

# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
	bool "Allow for memory hot-add"

@@ -187,6 +206,21 @@ config SPLIT_PTLOCK_CPUS
	default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
	default "4"

#
# support for memory balloon compaction
config BALLOON_COMPACTION
	bool "Allow for balloon memory compaction/migration"
	def_bool y
	depends on COMPACTION && VIRTIO_BALLOON
	help
	  Memory fragmentation introduced by ballooning might reduce
	  significantly the number of 2MB contiguous memory blocks that can be
	  used within a guest, thus imposing performance penalties associated
	  with the reduced number of transparent huge pages that could be used
	  by the guest workload. Allowing the compaction & migration for memory
	  pages enlisted as being part of memory balloon devices avoids the
	  scenario aforementioned and helps improving memory defragmentation.

#
# support for memory compaction
config COMPACTION

mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
			   readahead.o swap.o truncate.o vmscan.o shmem.o \
			   util.o mmzone.o vmstat.o backing-dev.o \
			   mm_init.o mmu_context.o percpu.o slab_common.o \
			   compaction.o interval_tree.o $(mmu-y)
			   compaction.o balloon_compaction.o \
			   interval_tree.o $(mmu-y)

obj-y += init-mm.o

mm/balloon_compaction.c: new file, 302 lines
@@ -0,0 +1,302 @@
|
||||
/*
|
||||
* mm/balloon_compaction.c
|
||||
*
|
||||
* Common interface for making balloon pages movable by compaction.
|
||||
*
|
||||
* Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
|
||||
*/
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/balloon_compaction.h>
|
||||
|
||||
/*
|
||||
* balloon_devinfo_alloc - allocates a balloon device information descriptor.
|
||||
* @balloon_dev_descriptor: pointer to reference the balloon device which
|
||||
* this struct balloon_dev_info will be servicing.
|
||||
*
|
||||
* Driver must call it to properly allocate and initialize an instance of
|
||||
* struct balloon_dev_info which will be used to reference a balloon device
|
||||
* as well as to keep track of the balloon device page list.
|
||||
*/
|
||||
struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
|
||||
{
|
||||
struct balloon_dev_info *b_dev_info;
|
||||
b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
|
||||
if (!b_dev_info)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
b_dev_info->balloon_device = balloon_dev_descriptor;
|
||||
b_dev_info->mapping = NULL;
|
||||
b_dev_info->isolated_pages = 0;
|
||||
spin_lock_init(&b_dev_info->pages_lock);
|
||||
INIT_LIST_HEAD(&b_dev_info->pages);
|
||||
|
||||
return b_dev_info;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
|
||||
|
||||
/*
|
||||
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
|
||||
* page list.
|
||||
* @b_dev_info: balloon device descriptor where we will insert a new page to
|
||||
*
|
||||
* Driver must call it to properly allocate a new enlisted balloon page
|
||||
* before definitively removing it from the guest system.
|
||||
* This function returns the page address for the recently enqueued page or
|
||||
* NULL in the case we fail to allocate a new page this turn.
|
||||
*/
|
||||
struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
|
||||
__GFP_NOMEMALLOC | __GFP_NORETRY);
|
||||
if (!page)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Block others from accessing the 'page' when we get around to
|
||||
* establishing additional references. We should be the only one
|
||||
* holding a reference to the 'page' at this point.
|
||||
*/
|
||||
BUG_ON(!trylock_page(page));
|
||||
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
|
||||
balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
|
||||
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
|
||||
unlock_page(page);
|
||||
return page;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(balloon_page_enqueue);
|
||||
|
||||
/*
|
||||
* balloon_page_dequeue - removes a page from balloon's page list and returns
|
||||
* its address to allow the driver to release the page.
|
||||
* @b_dev_info: balloon device descriptor where we will grab a page from.
|
||||
*
|
||||
* Driver must call it to properly de-allocate a previous enlisted balloon page
|
||||
* before definitively releasing it back to the guest system.
|
||||
* This function returns the page address for the recently dequeued page or
|
||||
* NULL in the case we find balloon's page list temporarily empty due to
|
||||
* compaction isolated pages.
|
||||
*/
|
||||
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
|
||||
{
|
||||
struct page *page, *tmp;
|
||||
unsigned long flags;
|
||||
bool dequeued_page;
|
||||
|
||||
dequeued_page = false;
|
||||
list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
|
||||
/*
|
||||
* Block others from accessing the 'page' while we get around
|
||||
* establishing additional references and preparing the 'page'
|
||||
* to be released by the balloon driver.
|
||||
*/
|
||||
if (trylock_page(page)) {
|
||||
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
|
||||
/*
|
||||
* Raise the page refcount here to prevent any wrong
|
||||
* attempt to isolate this page, in case of colliding
|
||||
* with balloon_page_isolate() just after we release
|
||||
* the page lock.
|
||||
*
|
||||
* balloon_page_free() will take care of dropping
|
||||
* this extra refcount later.
|
||||
*/
|
||||
get_page(page);
|
||||
balloon_page_delete(page);
|
||||
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
|
||||
unlock_page(page);
|
||||
dequeued_page = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!dequeued_page) {
|
||||
/*
|
||||
* If we are unable to dequeue a balloon page because the page
|
||||
* list is empty and there are no isolated pages, then something
|
||||
* went out of track and some balloon pages are lost.
|
||||
* BUG() here, otherwise the balloon driver may get stuck into
|
||||
* an infinite loop while attempting to release all its pages.
|
||||
*/
|
||||
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
|
||||
if (unlikely(list_empty(&b_dev_info->pages) &&
|
||||
!b_dev_info->isolated_pages))
|
||||
BUG();
|
||||
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
|
||||
page = NULL;
|
||||
}
|
||||
return page;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
|
||||
|
||||
#ifdef CONFIG_BALLOON_COMPACTION
|
||||
/*
|
||||
* balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
|
||||
* @b_dev_info: holds the balloon device information descriptor.
|
||||
* @a_ops: balloon_mapping address_space_operations descriptor.
|
||||
*
|
||||
* Driver must call it to properly allocate and initialize an instance of
|
||||
* struct address_space which will be used as the special page->mapping for
|
||||
* balloon device enlisted page instances.
|
||||
*/
|
||||
struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
|
||||
const struct address_space_operations *a_ops)
|
||||
{
|
||||
struct address_space *mapping;
|
||||
|
||||
mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
|
||||
if (!mapping)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
/*
|
||||
* Give a clean 'zeroed' status to all elements of this special
|
||||
* balloon page->mapping struct address_space instance.
|
||||
*/
|
||||
address_space_init_once(mapping);
|
||||
|
||||
/*
|
||||
* Set mapping->flags appropriately, to allow balloon pages
|
||||
* ->mapping identification.
|
||||
*/
|
||||
mapping_set_balloon(mapping);
|
||||
mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
|
||||
|
||||
/* balloon's page->mapping->a_ops callback descriptor */
|
||||
mapping->a_ops = a_ops;
|
||||
|
||||
/*
|
||||
* Establish a pointer reference back to the balloon device descriptor
|
||||
* this particular page->mapping will be servicing.
|
||||
* This is used by compaction / migration procedures to identify and
|
||||
* access the balloon device pageset while isolating / migrating pages.
|
||||
*
|
||||
* As some balloon drivers can register multiple balloon devices
|
||||
* for a single guest, this also helps compaction / migration to
|
||||
* properly deal with multiple balloon pagesets, when required.
|
||||
*/
|
||||
mapping->private_data = b_dev_info;
|
||||
b_dev_info->mapping = mapping;
|
||||
|
||||
return mapping;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
|
||||
|
||||
static inline void __isolate_balloon_page(struct page *page)
|
||||
{
|
||||
struct balloon_dev_info *b_dev_info = page->mapping->private_data;
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
|
||||
list_del(&page->lru);
|
||||
b_dev_info->isolated_pages++;
|
||||
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
|
||||
}
|
||||
|
||||
static inline void __putback_balloon_page(struct page *page)
|
||||
{
|
||||
struct balloon_dev_info *b_dev_info = page->mapping->private_data;
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
|
||||
list_add(&page->lru, &b_dev_info->pages);
|
||||
b_dev_info->isolated_pages--;
|
||||
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
|
||||
}
|
||||
|
||||
static inline int __migrate_balloon_page(struct address_space *mapping,
|
||||
struct page *newpage, struct page *page, enum migrate_mode mode)
|
||||
{
|
||||
return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
|
||||
}
|
||||
|
||||
/* __isolate_lru_page() counterpart for a ballooned page */
|
||||
bool balloon_page_isolate(struct page *page)
|
||||
{
|
||||
/*
|
||||
* Avoid burning cycles with pages that are yet under __free_pages(),
|
||||
* or just got freed under us.
|
||||
*
|
||||
* In case we 'win' a race for a balloon page being freed under us and
|
||||
* raise its refcount preventing __free_pages() from doing its job
|
||||
* the put_page() at the end of this block will take care of
|
||||
* release this page, thus avoiding a nasty leakage.
|
||||
*/
|
||||
if (likely(get_page_unless_zero(page))) {
|
||||
/*
|
||||
* As balloon pages are not isolated from LRU lists, concurrent
|
||||
* compaction threads can race against page migration functions
|
||||
* as well as race against the balloon driver releasing a page.
|
||||
*
|
||||
* In order to avoid having an already isolated balloon page
|
||||
* being (wrongly) re-isolated while it is under migration,
|
||||
* or to avoid attempting to isolate pages being released by
|
||||
* the balloon driver, lets be sure we have the page lock
|
||||
* before proceeding with the balloon page isolation steps.
|
||||
*/
|
||||
if (likely(trylock_page(page))) {
|
||||
/*
|
||||
* A ballooned page, by default, has just one refcount.
|
||||
* Prevent concurrent compaction threads from isolating
|
||||
* an already isolated balloon page by refcount check.
|
||||
*/
|
||||
if (__is_movable_balloon_page(page) &&
|
||||
page_count(page) == 2) {
|
||||
__isolate_balloon_page(page);
|
||||
unlock_page(page);
|
||||
return true;
|
||||
}
|
||||
unlock_page(page);
|
||||
}
|
||||
put_page(page);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* putback_lru_page() counterpart for a ballooned page */
|
||||
void balloon_page_putback(struct page *page)
|
||||
{
|
||||
/*
|
||||
* 'lock_page()' stabilizes the page and prevents races against
|
||||
* concurrent isolation threads attempting to re-isolate it.
|
||||
*/
|
||||
lock_page(page);
|
||||
|
||||
if (__is_movable_balloon_page(page)) {
|
||||
__putback_balloon_page(page);
|
||||
/* drop the extra ref count taken for page isolation */
|
||||
put_page(page);
|
||||
} else {
|
||||
WARN_ON(1);
|
||||
dump_page(page);
|
||||
}
|
||||
unlock_page(page);
|
||||
}
|
||||
|
||||
/* move_to_new_page() counterpart for a ballooned page */
|
||||
int balloon_page_migrate(struct page *newpage,
|
||||
struct page *page, enum migrate_mode mode)
|
||||
{
|
||||
struct address_space *mapping;
|
||||
int rc = -EAGAIN;
|
||||
|
||||
/*
|
||||
* Block others from accessing the 'newpage' when we get around to
|
||||
* establishing additional references. We should be the only one
|
||||
* holding a reference to the 'newpage' at this point.
|
||||
*/
|
||||
BUG_ON(!trylock_page(newpage));
|
||||
|
||||
if (WARN_ON(!__is_movable_balloon_page(page))) {
|
||||
dump_page(page);
|
||||
unlock_page(newpage);
|
||||
return rc;
|
||||
}
|
||||
|
||||
mapping = page->mapping;
|
||||
if (mapping)
|
||||
rc = __migrate_balloon_page(mapping, newpage, page, mode);
|
||||
|
||||
unlock_page(newpage);
|
||||
return rc;
|
||||
}
|
||||
#endif /* CONFIG_BALLOON_COMPACTION */
|
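The new file above exports a small API that a balloon driver is expected to consume: balloon_devinfo_alloc(), balloon_mapping_alloc(), balloon_page_enqueue() and balloon_page_dequeue(). The sketch below is an illustration added for this write-up, not part of the commit. The names my_balloon_setup(), my_balloon_inflate_one(), my_balloon_deflate_one(), my_balloon_aops and dev_cookie are hypothetical, and balloon_page_free() is assumed to come from the matching linux/balloon_compaction.h header (it is referenced by the dequeue comment above). Kernel-context code, not buildable stand-alone.

/*
 * Hedged sketch of a driver-side consumer of the API added above.
 * Only the balloon_*() calls come from mm/balloon_compaction.c; the
 * rest is placeholder glue with error handling reduced to a minimum.
 */
#include <linux/balloon_compaction.h>
#include <linux/err.h>
#include <linux/slab.h>

/* would carry a .migratepage callback in a real driver */
static const struct address_space_operations my_balloon_aops;

static struct balloon_dev_info *my_balloon_setup(void *dev_cookie)
{
	struct balloon_dev_info *vb_dev_info;
	struct address_space *mapping;

	vb_dev_info = balloon_devinfo_alloc(dev_cookie);
	if (IS_ERR(vb_dev_info))
		return vb_dev_info;

	/* special ->mapping so compaction can recognize balloon pages */
	mapping = balloon_mapping_alloc(vb_dev_info, &my_balloon_aops);
	if (IS_ERR(mapping)) {
		kfree(vb_dev_info);
		return ERR_CAST(mapping);
	}
	return vb_dev_info;
}

/* inflate: take one page away from the guest and track it on the balloon list */
static int my_balloon_inflate_one(struct balloon_dev_info *vb_dev_info)
{
	struct page *page = balloon_page_enqueue(vb_dev_info);

	return page ? 0 : -ENOMEM;	/* NULL means allocation failed this turn */
}

/* deflate: hand one page back; NULL means every page is isolated right now */
static int my_balloon_deflate_one(struct balloon_dev_info *vb_dev_info)
{
	struct page *page = balloon_page_dequeue(vb_dev_info);

	if (!page)
		return -EAGAIN;
	balloon_page_free(page);	/* drops the extra ref taken at dequeue */
	return 0;
}

The VIRTIO_BALLOON dependency in the Kconfig hunk above suggests the virtio balloon driver is the intended consumer; the empty my_balloon_aops descriptor only marks where that driver would hook its migratepage callback.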
mm/bootmem.c: 103 changed lines
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
|
||||
|
||||
/*
|
||||
* free_bootmem_late - free bootmem pages directly to page allocator
|
||||
* @addr: starting address of the range
|
||||
* @addr: starting physical address of the range
|
||||
* @size: size of the range in bytes
|
||||
*
|
||||
* This is only useful when the bootmem allocator has already been torn
|
||||
* down, but we are still initializing the system. Pages are given directly
|
||||
* to the page allocator, no bootmem metadata is updated because it is gone.
|
||||
*/
|
||||
void __init free_bootmem_late(unsigned long addr, unsigned long size)
|
||||
void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
|
||||
{
|
||||
unsigned long cursor, end;
|
||||
|
||||
kmemleak_free_part(__va(addr), size);
|
||||
kmemleak_free_part(__va(physaddr), size);
|
||||
|
||||
cursor = PFN_UP(addr);
|
||||
end = PFN_DOWN(addr + size);
|
||||
cursor = PFN_UP(physaddr);
|
||||
end = PFN_DOWN(physaddr + size);
|
||||
|
||||
for (; cursor < end; cursor++) {
|
||||
__free_pages_bootmem(pfn_to_page(cursor), 0);
|
||||
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
|
||||
|
||||
while (start < end) {
|
||||
unsigned long *map, idx, vec;
|
||||
unsigned shift;
|
||||
|
||||
map = bdata->node_bootmem_map;
|
||||
idx = start - bdata->node_min_pfn;
|
||||
shift = idx & (BITS_PER_LONG - 1);
|
||||
/*
|
||||
* vec holds at most BITS_PER_LONG map bits,
|
||||
* bit 0 corresponds to start.
|
||||
*/
|
||||
vec = ~map[idx / BITS_PER_LONG];
|
||||
|
||||
if (shift) {
|
||||
vec >>= shift;
|
||||
if (end - start >= BITS_PER_LONG)
|
||||
vec |= ~map[idx / BITS_PER_LONG + 1] <<
|
||||
(BITS_PER_LONG - shift);
|
||||
}
|
||||
/*
|
||||
* If we have a properly aligned and fully unreserved
|
||||
* BITS_PER_LONG block of pages in front of us, free
|
||||
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
|
||||
count += BITS_PER_LONG;
|
||||
start += BITS_PER_LONG;
|
||||
} else {
|
||||
unsigned long off = 0;
|
||||
unsigned long cur = start;
|
||||
|
||||
vec >>= start & (BITS_PER_LONG - 1);
|
||||
while (vec) {
|
||||
start = ALIGN(start + 1, BITS_PER_LONG);
|
||||
while (vec && cur != start) {
|
||||
if (vec & 1) {
|
||||
page = pfn_to_page(start + off);
|
||||
page = pfn_to_page(cur);
|
||||
__free_pages_bootmem(page, 0);
|
||||
count++;
|
||||
}
|
||||
vec >>= 1;
|
||||
off++;
|
||||
++cur;
|
||||
}
|
||||
start = ALIGN(start + 1, BITS_PER_LONG);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -229,6 +241,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
|
||||
return count;
|
||||
}
|
||||
|
||||
static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
|
||||
{
|
||||
struct zone *z;
|
||||
|
||||
/*
|
||||
* In free_area_init_core(), highmem zone's managed_pages is set to
|
||||
* present_pages, and bootmem allocator doesn't allocate from highmem
|
||||
* zones. So there's no need to recalculate managed_pages because all
|
||||
* highmem pages will be managed by the buddy system. Here highmem
|
||||
* zone also includes highmem movable zone.
|
||||
*/
|
||||
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
|
||||
if (!is_highmem(z))
|
||||
z->managed_pages = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* free_all_bootmem_node - release a node's free pages to the buddy allocator
|
||||
* @pgdat: node to be released
|
||||
@@ -238,6 +266,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
|
||||
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
|
||||
{
|
||||
register_page_bootmem_info_node(pgdat);
|
||||
reset_node_lowmem_managed_pages(pgdat);
|
||||
return free_all_bootmem_core(pgdat->bdata);
|
||||
}
|
||||
|
||||
@@ -250,6 +279,10 @@ unsigned long __init free_all_bootmem(void)
|
||||
{
|
||||
unsigned long total_pages = 0;
|
||||
bootmem_data_t *bdata;
|
||||
struct pglist_data *pgdat;
|
||||
|
||||
for_each_online_pgdat(pgdat)
|
||||
reset_node_lowmem_managed_pages(pgdat);
|
||||
|
||||
list_for_each_entry(bdata, &bdata_list, list)
|
||||
total_pages += free_all_bootmem_core(bdata);
|
||||
@@ -377,21 +410,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
|
||||
|
||||
/**
|
||||
* free_bootmem - mark a page range as usable
|
||||
* @addr: starting address of the range
|
||||
* @addr: starting physical address of the range
|
||||
* @size: size of the range in bytes
|
||||
*
|
||||
* Partial pages will be considered reserved and left as they are.
|
||||
*
|
||||
* The range must be contiguous but may span node boundaries.
|
||||
*/
|
||||
void __init free_bootmem(unsigned long addr, unsigned long size)
|
||||
void __init free_bootmem(unsigned long physaddr, unsigned long size)
|
||||
{
|
||||
unsigned long start, end;
|
||||
|
||||
kmemleak_free_part(__va(addr), size);
|
||||
kmemleak_free_part(__va(physaddr), size);
|
||||
|
||||
start = PFN_UP(addr);
|
||||
end = PFN_DOWN(addr + size);
|
||||
start = PFN_UP(physaddr);
|
||||
end = PFN_DOWN(physaddr + size);
|
||||
|
||||
mark_bootmem(start, end, 0, 0);
|
||||
}
|
||||
@@ -439,12 +472,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
|
||||
return mark_bootmem(start, end, 1, flags);
|
||||
}
|
||||
|
||||
int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
|
||||
int flags)
|
||||
{
|
||||
return reserve_bootmem(phys, len, flags);
|
||||
}
|
||||
|
||||
static unsigned long __init align_idx(struct bootmem_data *bdata,
|
||||
unsigned long idx, unsigned long step)
|
||||
{
|
||||
@@ -575,27 +602,6 @@ find_block:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
|
||||
unsigned long size, unsigned long align,
|
||||
unsigned long goal, unsigned long limit)
|
||||
{
|
||||
if (WARN_ON_ONCE(slab_is_available()))
|
||||
return kzalloc(size, GFP_NOWAIT);
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_BOOTMEM
|
||||
{
|
||||
bootmem_data_t *p_bdata;
|
||||
|
||||
p_bdata = bootmem_arch_preferred_node(bdata, size, align,
|
||||
goal, limit);
|
||||
if (p_bdata)
|
||||
return alloc_bootmem_bdata(p_bdata, size, align,
|
||||
goal, limit);
|
||||
}
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void * __init alloc_bootmem_core(unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal,
|
||||
@@ -604,9 +610,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
|
||||
bootmem_data_t *bdata;
|
||||
void *region;
|
||||
|
||||
region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
|
||||
if (region)
|
||||
return region;
|
||||
if (WARN_ON_ONCE(slab_is_available()))
|
||||
return kzalloc(size, GFP_NOWAIT);
|
||||
|
||||
list_for_each_entry(bdata, &bdata_list, list) {
|
||||
if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
|
||||
@@ -704,11 +709,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
if (WARN_ON_ONCE(slab_is_available()))
|
||||
return kzalloc(size, GFP_NOWAIT);
|
||||
again:
|
||||
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
|
||||
align, goal, limit);
|
||||
if (ptr)
|
||||
return ptr;
|
||||
|
||||
/* do not panic in alloc_bootmem_bdata() */
|
||||
if (limit && goal + size > limit)
|
||||
|
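The free_all_bootmem_core() hunk above replaces the start+off indexing with a cur cursor and bounds the bit scan at the aligned block boundary, so bits merged in from the next map word cannot free page frames outside the current BITS_PER_LONG block. Below is a minimal user-space rendering of that reworked loop, added here for illustration only: free_one(), free_block() and the bitmap value are made up.

/*
 * Minimal, user-space rendering of the reworked bit-scan loop in
 * free_all_bootmem_core(): walk one BITS_PER_LONG block, "freeing"
 * pfn cur for every set bit, and stop at the aligned block boundary
 * even if vec still carries bits merged in from the next map word.
 */
#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static void free_one(unsigned long pfn)
{
	printf("freeing pfn %lu\n", pfn);
}

static unsigned long free_block(unsigned long start, unsigned long vec)
{
	unsigned long count = 0;
	unsigned long cur = start;

	start = ALIGN(start + 1, BITS_PER_LONG);	/* end of this block */
	while (vec && cur != start) {
		if (vec & 1) {
			free_one(cur);
			count++;
		}
		vec >>= 1;
		++cur;
	}
	return count;
}

int main(void)
{
	/* bit i set => pfn (start + i) is free; start is not block-aligned */
	unsigned long freed = free_block(5, 0x2dUL);	/* bits 0, 2, 3, 5 */

	printf("freed %lu pages\n", freed);
	return 0;
}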
mm/compaction.c: 166 changed lines
@@ -14,8 +14,24 @@
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/balloon_compaction.h>
|
||||
#include "internal.h"
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
static inline void count_compact_event(enum vm_event_item item)
|
||||
{
|
||||
count_vm_event(item);
|
||||
}
|
||||
|
||||
static inline void count_compact_events(enum vm_event_item item, long delta)
|
||||
{
|
||||
count_vm_events(item, delta);
|
||||
}
|
||||
#else
|
||||
#define count_compact_event(item) do { } while (0)
|
||||
#define count_compact_events(item, delta) do { } while (0)
|
||||
#endif
|
||||
|
||||
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
@@ -214,60 +230,6 @@ static bool suitable_migration_target(struct page *page)
|
||||
return false;
|
||||
}
|
||||
|
||||
static void compact_capture_page(struct compact_control *cc)
|
||||
{
|
||||
unsigned long flags;
|
||||
int mtype, mtype_low, mtype_high;
|
||||
|
||||
if (!cc->page || *cc->page)
|
||||
return;
|
||||
|
||||
/*
|
||||
* For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
|
||||
* regardless of the migratetype of the freelist is is captured from.
|
||||
* This is fine because the order for a high-order MIGRATE_MOVABLE
|
||||
* allocation is typically at least a pageblock size and overall
|
||||
* fragmentation is not impaired. Other allocation types must
|
||||
* capture pages from their own migratelist because otherwise they
|
||||
* could pollute other pageblocks like MIGRATE_MOVABLE with
|
||||
* difficult to move pages and making fragmentation worse overall.
|
||||
*/
|
||||
if (cc->migratetype == MIGRATE_MOVABLE) {
|
||||
mtype_low = 0;
|
||||
mtype_high = MIGRATE_PCPTYPES;
|
||||
} else {
|
||||
mtype_low = cc->migratetype;
|
||||
mtype_high = cc->migratetype + 1;
|
||||
}
|
||||
|
||||
/* Speculatively examine the free lists without zone lock */
|
||||
for (mtype = mtype_low; mtype < mtype_high; mtype++) {
|
||||
int order;
|
||||
for (order = cc->order; order < MAX_ORDER; order++) {
|
||||
struct page *page;
|
||||
struct free_area *area;
|
||||
area = &(cc->zone->free_area[order]);
|
||||
if (list_empty(&area->free_list[mtype]))
|
||||
continue;
|
||||
|
||||
/* Take the lock and attempt capture of the page */
|
||||
if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
|
||||
return;
|
||||
if (!list_empty(&area->free_list[mtype])) {
|
||||
page = list_entry(area->free_list[mtype].next,
|
||||
struct page, lru);
|
||||
if (capture_free_page(page, cc->order, mtype)) {
|
||||
spin_unlock_irqrestore(&cc->zone->lock,
|
||||
flags);
|
||||
*cc->page = page;
|
||||
return;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&cc->zone->lock, flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
|
||||
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
|
||||
@@ -356,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
|
||||
if (blockpfn == end_pfn)
|
||||
update_pageblock_skip(cc, valid_page, total_isolated, false);
|
||||
|
||||
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
|
||||
if (total_isolated)
|
||||
count_compact_events(COMPACTISOLATED, total_isolated);
|
||||
return total_isolated;
|
||||
}
|
||||
|
||||
@@ -565,9 +530,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
|
||||
goto next_pageblock;
|
||||
}
|
||||
|
||||
/* Check may be lockless but that's ok as we recheck later */
|
||||
if (!PageLRU(page))
|
||||
/*
|
||||
* Check may be lockless but that's ok as we recheck later.
|
||||
* It's possible to migrate LRU pages and balloon pages
|
||||
* Skip any other type of page
|
||||
*/
|
||||
if (!PageLRU(page)) {
|
||||
if (unlikely(balloon_page_movable(page))) {
|
||||
if (locked && balloon_page_isolate(page)) {
|
||||
/* Successfully isolated */
|
||||
cc->finished_update_migrate = true;
|
||||
list_add(&page->lru, migratelist);
|
||||
cc->nr_migratepages++;
|
||||
nr_isolated++;
|
||||
goto check_compact_cluster;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* PageLRU is set. lru_lock normally excludes isolation
|
||||
@@ -621,6 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
|
||||
cc->nr_migratepages++;
|
||||
nr_isolated++;
|
||||
|
||||
check_compact_cluster:
|
||||
/* Avoid isolating too much */
|
||||
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
|
||||
++low_pfn;
|
||||
@@ -646,6 +627,10 @@ next_pageblock:
|
||||
|
||||
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
|
||||
|
||||
count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
|
||||
if (nr_isolated)
|
||||
count_compact_events(COMPACTISOLATED, nr_isolated);
|
||||
|
||||
return low_pfn;
|
||||
}
|
||||
|
||||
@@ -713,7 +698,15 @@ static void isolate_freepages(struct zone *zone,
|
||||
|
||||
/* Found a block suitable for isolating free pages from */
|
||||
isolated = 0;
|
||||
end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
|
||||
|
||||
/*
|
||||
* As pfn may not start aligned, pfn+pageblock_nr_page
|
||||
* may cross a MAX_ORDER_NR_PAGES boundary and miss
|
||||
* a pfn_valid check. Ensure isolate_freepages_block()
|
||||
* only scans within a pageblock
|
||||
*/
|
||||
end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
|
||||
end_pfn = min(end_pfn, zone_end_pfn);
|
||||
isolated = isolate_freepages_block(cc, pfn, end_pfn,
|
||||
freelist, false);
|
||||
nr_freepages += isolated;
|
||||
@@ -823,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
static int compact_finished(struct zone *zone,
|
||||
struct compact_control *cc)
|
||||
{
|
||||
unsigned int order;
|
||||
unsigned long watermark;
|
||||
|
||||
if (fatal_signal_pending(current))
|
||||
@@ -857,22 +851,16 @@ static int compact_finished(struct zone *zone,
|
||||
return COMPACT_CONTINUE;
|
||||
|
||||
/* Direct compactor: Is a suitable page free? */
|
||||
if (cc->page) {
|
||||
/* Was a suitable page captured? */
|
||||
if (*cc->page)
|
||||
return COMPACT_PARTIAL;
|
||||
} else {
|
||||
unsigned int order;
|
||||
for (order = cc->order; order < MAX_ORDER; order++) {
|
||||
struct free_area *area = &zone->free_area[cc->order];
|
||||
/* Job done if page is free of the right migratetype */
|
||||
if (!list_empty(&area->free_list[cc->migratetype]))
|
||||
return COMPACT_PARTIAL;
|
||||
for (order = cc->order; order < MAX_ORDER; order++) {
|
||||
struct free_area *area = &zone->free_area[order];
|
||||
|
||||
/* Job done if allocation would set block type */
|
||||
if (cc->order >= pageblock_order && area->nr_free)
|
||||
return COMPACT_PARTIAL;
|
||||
}
|
||||
/* Job done if page is free of the right migratetype */
|
||||
if (!list_empty(&area->free_list[cc->migratetype]))
|
||||
return COMPACT_PARTIAL;
|
||||
|
||||
/* Job done if allocation would set block type */
|
||||
if (cc->order >= pageblock_order && area->nr_free)
|
||||
return COMPACT_PARTIAL;
|
||||
}
|
||||
|
||||
return COMPACT_CONTINUE;
|
||||
@@ -978,7 +966,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
switch (isolate_migratepages(zone, cc)) {
|
||||
case ISOLATE_ABORT:
|
||||
ret = COMPACT_PARTIAL;
|
||||
putback_lru_pages(&cc->migratepages);
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
cc->nr_migratepages = 0;
|
||||
goto out;
|
||||
case ISOLATE_NONE:
|
||||
@@ -990,29 +978,23 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
nr_migrate = cc->nr_migratepages;
|
||||
err = migrate_pages(&cc->migratepages, compaction_alloc,
|
||||
(unsigned long)cc, false,
|
||||
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
|
||||
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
|
||||
MR_COMPACTION);
|
||||
update_nr_listpages(cc);
|
||||
nr_remaining = cc->nr_migratepages;
|
||||
|
||||
count_vm_event(COMPACTBLOCKS);
|
||||
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
|
||||
if (nr_remaining)
|
||||
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
|
||||
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
|
||||
nr_remaining);
|
||||
|
||||
/* Release LRU pages not migrated */
|
||||
/* Release isolated pages not migrated */
|
||||
if (err) {
|
||||
putback_lru_pages(&cc->migratepages);
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
cc->nr_migratepages = 0;
|
||||
if (err == -ENOMEM) {
|
||||
ret = COMPACT_PARTIAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Capture a page now if it is a suitable size */
|
||||
compact_capture_page(cc);
|
||||
}
|
||||
|
||||
out:
|
||||
@@ -1025,8 +1007,7 @@ out:
|
||||
|
||||
static unsigned long compact_zone_order(struct zone *zone,
|
||||
int order, gfp_t gfp_mask,
|
||||
bool sync, bool *contended,
|
||||
struct page **page)
|
||||
bool sync, bool *contended)
|
||||
{
|
||||
unsigned long ret;
|
||||
struct compact_control cc = {
|
||||
@@ -1036,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
|
||||
.migratetype = allocflags_to_migratetype(gfp_mask),
|
||||
.zone = zone,
|
||||
.sync = sync,
|
||||
.page = page,
|
||||
};
|
||||
INIT_LIST_HEAD(&cc.freepages);
|
||||
INIT_LIST_HEAD(&cc.migratepages);
|
||||
@@ -1066,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
|
||||
*/
|
||||
unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
int order, gfp_t gfp_mask, nodemask_t *nodemask,
|
||||
bool sync, bool *contended, struct page **page)
|
||||
bool sync, bool *contended)
|
||||
{
|
||||
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
|
||||
int may_enter_fs = gfp_mask & __GFP_FS;
|
||||
@@ -1080,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
if (!order || !may_enter_fs || !may_perform_io)
|
||||
return rc;
|
||||
|
||||
count_vm_event(COMPACTSTALL);
|
||||
count_compact_event(COMPACTSTALL);
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
|
||||
@@ -1092,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
int status;
|
||||
|
||||
status = compact_zone_order(zone, order, gfp_mask, sync,
|
||||
contended, page);
|
||||
contended);
|
||||
rc = max(status, rc);
|
||||
|
||||
/* If a normal allocation would succeed, stop compacting */
|
||||
@@ -1148,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
|
||||
struct compact_control cc = {
|
||||
.order = order,
|
||||
.sync = false,
|
||||
.page = NULL,
|
||||
};
|
||||
|
||||
return __compact_pgdat(pgdat, &cc);
|
||||
@@ -1159,14 +1138,13 @@ static int compact_node(int nid)
|
||||
struct compact_control cc = {
|
||||
.order = -1,
|
||||
.sync = true,
|
||||
.page = NULL,
|
||||
};
|
||||
|
||||
return __compact_pgdat(NODE_DATA(nid), &cc);
|
||||
}
|
||||
|
||||
/* Compact all nodes in the system */
|
||||
static int compact_nodes(void)
|
||||
static void compact_nodes(void)
|
||||
{
|
||||
int nid;
|
||||
|
||||
@@ -1175,8 +1153,6 @@ static int compact_nodes(void)
|
||||
|
||||
for_each_online_node(nid)
|
||||
compact_node(nid);
|
||||
|
||||
return COMPACT_COMPLETE;
|
||||
}
|
||||
|
||||
/* The written value is actually unused, all memory is compacted */
|
||||
@@ -1187,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *length, loff_t *ppos)
|
||||
{
|
||||
if (write)
|
||||
return compact_nodes();
|
||||
compact_nodes();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
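The first mm/compaction.c hunk above wraps count_vm_event()/count_vm_events() behind count_compact_event()/count_compact_events(), which collapse to no-ops when CONFIG_COMPACTION is off so call sites never need their own #ifdefs. The stand-alone sketch below illustrates that same idiom; the event names kept, the plain CONFIG_COMPACTION macro and the vm_events array are stand-ins for the kernel's vm_event machinery, not part of the commit.

/*
 * Stand-alone illustration of the wrapper idiom used above: typed
 * inline helpers when the feature is compiled in, do-nothing macros
 * otherwise. The counter array is a stand-in for count_vm_event().
 */
#include <stdio.h>

enum vm_event_item { COMPACTSTALL, COMPACTISOLATED, NR_EVENTS };

#define CONFIG_COMPACTION 1	/* flip to 0 to exercise the macro fallback */

#if CONFIG_COMPACTION
static unsigned long vm_events[NR_EVENTS];

static inline void count_compact_event(enum vm_event_item item)
{
	vm_events[item]++;
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	vm_events[item] += delta;
}
#else
#define count_compact_event(item)		do { } while (0)
#define count_compact_events(item, delta)	do { } while (0)
#endif

int main(void)
{
	count_compact_event(COMPACTSTALL);
	count_compact_events(COMPACTISOLATED, 32);
#if CONFIG_COMPACTION
	printf("stalls=%lu isolated=%lu\n",
	       vm_events[COMPACTSTALL], vm_events[COMPACTISOLATED]);
#endif
	return 0;
}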
mm/dmapool.c: 55 changed lines
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
|
||||
size_t allocation;
|
||||
size_t boundary;
|
||||
char name[32];
|
||||
wait_queue_head_t waitq;
|
||||
struct list_head pools;
|
||||
};
|
||||
|
||||
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
|
||||
unsigned int offset;
|
||||
};
|
||||
|
||||
#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
|
||||
|
||||
static DEFINE_MUTEX(pools_lock);
|
||||
|
||||
static ssize_t
|
||||
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
|
||||
retval->size = size;
|
||||
retval->boundary = boundary;
|
||||
retval->allocation = allocation;
|
||||
init_waitqueue_head(&retval->waitq);
|
||||
|
||||
if (dev) {
|
||||
int ret;
|
||||
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
|
||||
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
|
||||
#endif
|
||||
pool_initialise_page(pool, page);
|
||||
list_add(&page->page_list, &pool->page_list);
|
||||
page->in_use = 0;
|
||||
page->offset = 0;
|
||||
} else {
|
||||
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
might_sleep_if(mem_flags & __GFP_WAIT);
|
||||
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
restart:
|
||||
list_for_each_entry(page, &pool->page_list, page_list) {
|
||||
if (page->offset < pool->allocation)
|
||||
goto ready;
|
||||
}
|
||||
page = pool_alloc_page(pool, GFP_ATOMIC);
|
||||
if (!page) {
|
||||
if (mem_flags & __GFP_WAIT) {
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
|
||||
__set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
__add_wait_queue(&pool->waitq, &wait);
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
|
||||
schedule_timeout(POOL_TIMEOUT_JIFFIES);
|
||||
page = pool_alloc_page(pool, mem_flags);
|
||||
if (!page)
|
||||
return NULL;
|
||||
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
__remove_wait_queue(&pool->waitq, &wait);
|
||||
goto restart;
|
||||
}
|
||||
retval = NULL;
|
||||
goto done;
|
||||
}
|
||||
spin_lock_irqsave(&pool->lock, flags);
|
||||
|
||||
list_add(&page->page_list, &pool->page_list);
|
||||
ready:
|
||||
page->in_use++;
|
||||
offset = page->offset;
|
||||
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
retval = offset + page->vaddr;
|
||||
*handle = offset + page->dma;
|
||||
#ifdef DMAPOOL_DEBUG
|
||||
{
|
||||
int i;
|
||||
u8 *data = retval;
|
||||
/* page->offset is stored in first 4 bytes */
|
||||
for (i = sizeof(page->offset); i < pool->size; i++) {
|
||||
if (data[i] == POOL_POISON_FREED)
|
||||
continue;
|
||||
if (pool->dev)
|
||||
dev_err(pool->dev,
|
||||
"dma_pool_alloc %s, %p (corruped)\n",
|
||||
pool->name, retval);
|
||||
else
|
||||
pr_err("dma_pool_alloc %s, %p (corruped)\n",
|
||||
pool->name, retval);
|
||||
|
||||
/*
|
||||
* Dump the first 4 bytes even if they are not
|
||||
* POOL_POISON_FREED
|
||||
*/
|
||||
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
|
||||
data, pool->size, 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
memset(retval, POOL_POISON_ALLOCATED, pool->size);
|
||||
#endif
|
||||
done:
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
return retval;
|
||||
}
|
||||
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
|
||||
page->in_use--;
|
||||
*(int *)vaddr = page->offset;
|
||||
page->offset = offset;
|
||||
if (waitqueue_active(&pool->waitq))
|
||||
wake_up_locked(&pool->waitq);
|
||||
/*
|
||||
* Resist a temptation to do
|
||||
* if (!is_page_busy(page)) pool_free_page(pool, page);
|
||||
|
mm/highmem.c: 30 changed lines
@@ -99,12 +99,13 @@ struct page *kmap_to_page(void *vaddr)
|
||||
unsigned long addr = (unsigned long)vaddr;
|
||||
|
||||
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
|
||||
int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
|
||||
int i = PKMAP_NR(addr);
|
||||
return pte_page(pkmap_page_table[i]);
|
||||
}
|
||||
|
||||
return virt_to_page(addr);
|
||||
}
|
||||
EXPORT_SYMBOL(kmap_to_page);
|
||||
|
||||
static void flush_all_zero_pkmaps(void)
|
||||
{
|
||||
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void)
|
||||
* So no dangers, even with speculative execution.
|
||||
*/
|
||||
page = pte_page(pkmap_page_table[i]);
|
||||
pte_clear(&init_mm, (unsigned long)page_address(page),
|
||||
&pkmap_page_table[i]);
|
||||
pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
|
||||
|
||||
set_page_address(page, NULL);
|
||||
need_flush = 1;
|
||||
@@ -324,11 +324,7 @@ struct page_address_map {
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
/*
|
||||
* page_address_map freelist, allocated from page_address_maps.
|
||||
*/
|
||||
static struct list_head page_address_pool; /* freelist */
|
||||
static spinlock_t pool_lock; /* protects page_address_pool */
|
||||
static struct page_address_map page_address_maps[LAST_PKMAP];
|
||||
|
||||
/*
|
||||
* Hash table bucket
|
||||
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
|
||||
|
||||
pas = page_slot(page);
|
||||
if (virtual) { /* Add */
|
||||
BUG_ON(list_empty(&page_address_pool));
|
||||
|
||||
spin_lock_irqsave(&pool_lock, flags);
|
||||
pam = list_entry(page_address_pool.next,
|
||||
struct page_address_map, list);
|
||||
list_del(&pam->list);
|
||||
spin_unlock_irqrestore(&pool_lock, flags);
|
||||
|
||||
pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
|
||||
pam->page = page;
|
||||
pam->virtual = virtual;
|
||||
|
||||
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
|
||||
if (pam->page == page) {
|
||||
list_del(&pam->list);
|
||||
spin_unlock_irqrestore(&pas->lock, flags);
|
||||
spin_lock_irqsave(&pool_lock, flags);
|
||||
list_add_tail(&pam->list, &page_address_pool);
|
||||
spin_unlock_irqrestore(&pool_lock, flags);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
@@ -425,20 +411,14 @@ done:
|
||||
return;
|
||||
}
|
||||
|
||||
static struct page_address_map page_address_maps[LAST_PKMAP];
|
||||
|
||||
void __init page_address_init(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
INIT_LIST_HEAD(&page_address_pool);
|
||||
for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
|
||||
list_add(&page_address_maps[i].list, &page_address_pool);
|
||||
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
|
||||
INIT_LIST_HEAD(&page_address_htable[i].lh);
|
||||
spin_lock_init(&page_address_htable[i].lock);
|
||||
}
|
||||
spin_lock_init(&pool_lock);
|
||||
}
|
||||
|
||||
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
|
||||
|
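The kmap_to_page() hunk above replaces the open-coded (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT with PKMAP_NR(addr), and flush_all_zero_pkmaps() now passes PKMAP_ADDR(i) instead of recomputing the address from the page. The tiny sketch below only illustrates that the two macros are inverses; the PKMAP_BASE, PAGE_SHIFT and LAST_PKMAP values here are arbitrary stand-ins for the real arch definitions and the snippet is not part of the commit.

/*
 * Illustrates the PKMAP_ADDR()/PKMAP_NR() relationship relied on by the
 * mm/highmem.c hunks: PKMAP_NR(PKMAP_ADDR(i)) == i, so either the slot
 * index or the virtual address can be recovered from the other.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PKMAP_BASE	0xffc00000UL	/* made-up fixmap-style base */
#define LAST_PKMAP	512

#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))
#define PKMAP_NR(virt)	(((virt) - PKMAP_BASE) >> PAGE_SHIFT)

int main(void)
{
	for (unsigned long i = 0; i < LAST_PKMAP; i++)
		assert(PKMAP_NR(PKMAP_ADDR(i)) == i);

	printf("pkmap slot 7 lives at %#lx\n", PKMAP_ADDR(7));
	return 0;
}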
mm/huge_memory.c: 662 changed lines (diff not shown: file too large)
mm/hugetlb.c: 63 changed lines
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Generic hugetlb support.
|
||||
* (C) William Irwin, April 2004
|
||||
* (C) Nadia Yvette Chambers, April 2004
|
||||
*/
|
||||
#include <linux/list.h>
|
||||
#include <linux/init.h>
|
||||
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
|
||||
* on-line nodes with memory and will handle the hstate accounting.
|
||||
*/
|
||||
while (nr_pages--) {
|
||||
if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
|
||||
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
int __weak alloc_bootmem_huge_page(struct hstate *h)
|
||||
{
|
||||
struct huge_bootmem_page *m;
|
||||
int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
|
||||
int nr_nodes = nodes_weight(node_states[N_MEMORY]);
|
||||
|
||||
while (nr_nodes) {
|
||||
void *addr;
|
||||
|
||||
addr = __alloc_bootmem_node_nopanic(
|
||||
NODE_DATA(hstate_next_node_to_alloc(h,
|
||||
&node_states[N_HIGH_MEMORY])),
|
||||
&node_states[N_MEMORY])),
|
||||
huge_page_size(h), huge_page_size(h), 0);
|
||||
|
||||
if (addr) {
|
||||
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
|
||||
if (!alloc_bootmem_huge_page(h))
|
||||
break;
|
||||
} else if (!alloc_fresh_huge_page(h,
|
||||
&node_states[N_HIGH_MEMORY]))
|
||||
&node_states[N_MEMORY]))
|
||||
break;
|
||||
}
|
||||
h->max_huge_pages = i;
|
||||
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
|
||||
if (!(obey_mempolicy &&
|
||||
init_nodemask_of_mempolicy(nodes_allowed))) {
|
||||
NODEMASK_FREE(nodes_allowed);
|
||||
nodes_allowed = &node_states[N_HIGH_MEMORY];
|
||||
nodes_allowed = &node_states[N_MEMORY];
|
||||
}
|
||||
} else if (nodes_allowed) {
|
||||
/*
|
||||
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
|
||||
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
|
||||
init_nodemask_of_node(nodes_allowed, nid);
|
||||
} else
|
||||
nodes_allowed = &node_states[N_HIGH_MEMORY];
|
||||
nodes_allowed = &node_states[N_MEMORY];
|
||||
|
||||
h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
|
||||
|
||||
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
|
||||
if (nodes_allowed != &node_states[N_MEMORY])
|
||||
NODEMASK_FREE(nodes_allowed);
|
||||
|
||||
return len;
|
||||
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
|
||||
* remove hstate attributes from any nodes that have them.
|
||||
*/
|
||||
for (nid = 0; nid < nr_node_ids; nid++)
|
||||
hugetlb_unregister_node(&node_devices[nid]);
|
||||
hugetlb_unregister_node(node_devices[nid]);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
|
||||
{
|
||||
int nid;
|
||||
|
||||
for_each_node_state(nid, N_HIGH_MEMORY) {
|
||||
struct node *node = &node_devices[nid];
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
struct node *node = node_devices[nid];
|
||||
if (node->dev.id == nid)
|
||||
hugetlb_register_node(node);
|
||||
}
|
||||
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
|
||||
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
|
||||
|
||||
hugetlb_init_hstates();
|
||||
|
||||
gather_bootmem_prealloc();
|
||||
|
||||
report_hugepages();
|
||||
|
||||
hugetlb_sysfs_init();
|
||||
|
||||
hugetlb_register_all_nodes();
|
||||
hugetlb_cgroup_file_init();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
|
||||
for (i = 0; i < MAX_NUMNODES; ++i)
|
||||
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
|
||||
INIT_LIST_HEAD(&h->hugepage_activelist);
|
||||
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
|
||||
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
|
||||
h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
|
||||
h->next_nid_to_free = first_node(node_states[N_MEMORY]);
|
||||
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
|
||||
huge_page_size(h)/1024);
|
||||
/*
|
||||
* Add cgroup control files only if the huge page consists
|
||||
* of more than two normal pages. This is because we use
|
||||
* page[2].lru.next for storing cgroup details.
|
||||
*/
|
||||
if (order >= HUGETLB_CGROUP_MIN_ORDER)
|
||||
hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
|
||||
|
||||
parsed_hstate = h;
|
||||
}
|
||||
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
|
||||
if (!(obey_mempolicy &&
|
||||
init_nodemask_of_mempolicy(nodes_allowed))) {
|
||||
NODEMASK_FREE(nodes_allowed);
|
||||
nodes_allowed = &node_states[N_HIGH_MEMORY];
|
||||
nodes_allowed = &node_states[N_MEMORY];
|
||||
}
|
||||
h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
|
||||
|
||||
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
|
||||
if (nodes_allowed != &node_states[N_MEMORY])
|
||||
NODEMASK_FREE(nodes_allowed);
|
||||
}
|
||||
out:
|
||||
@@ -2386,8 +2377,10 @@ again:
|
||||
/*
|
||||
* HWPoisoned hugepage is already unmapped and dropped reference
|
||||
*/
|
||||
if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
|
||||
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
|
||||
pte_clear(mm, address, ptep);
|
||||
continue;
|
||||
}
|
||||
|
||||
page = pte_page(pte);
|
||||
/*
|
||||
@@ -3014,7 +3007,7 @@ same_page:
|
||||
return i ? i : -EFAULT;
|
||||
}
|
||||
|
||||
void hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned long end, pgprot_t newprot)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
pte_t *ptep;
|
||||
pte_t pte;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
unsigned long pages = 0;
|
||||
|
||||
BUG_ON(address >= end);
|
||||
flush_cache_range(vma, address, end);
|
||||
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
ptep = huge_pte_offset(mm, address);
|
||||
if (!ptep)
|
||||
continue;
|
||||
if (huge_pmd_unshare(mm, &address, ptep))
|
||||
if (huge_pmd_unshare(mm, &address, ptep)) {
|
||||
pages++;
|
||||
continue;
|
||||
}
|
||||
if (!huge_pte_none(huge_ptep_get(ptep))) {
|
||||
pte = huge_ptep_get_and_clear(mm, address, ptep);
|
||||
pte = pte_mkhuge(pte_modify(pte, newprot));
|
||||
set_huge_pte_at(mm, address, ptep, pte);
|
||||
pages++;
|
||||
}
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
*/
|
||||
flush_tlb_range(vma, start, end);
|
||||
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
|
||||
return pages << h->order;
|
||||
}
|
||||
|
||||
int hugetlb_reserve_pages(struct inode *inode,
|
||||
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (is_hugepage_on_freelist(hpage)) {
|
||||
list_del(&hpage->lru);
|
||||
/*
|
||||
* Hwpoisoned hugepage isn't linked to activelist or freelist,
|
||||
* but dangling hpage->lru can trigger list-debug warnings
|
||||
* (this happens when we call unpoison_memory() on it),
|
||||
* so let it point to itself with list_del_init().
|
||||
*/
|
||||
list_del_init(&hpage->lru);
|
||||
set_page_refcounted(hpage);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
|
mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
|
||||
return false;
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
|
||||
static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
|
||||
{
|
||||
int idx;
|
||||
struct cgroup *parent_cgroup;
|
||||
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
|
||||
return &h_cgroup->css;
|
||||
}
|
||||
|
||||
static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
|
||||
static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
|
||||
{
|
||||
struct hugetlb_cgroup *h_cgroup;
|
||||
|
||||
@@ -155,18 +155,13 @@ out:
|
||||
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
|
||||
* the parent cgroup.
|
||||
*/
|
||||
static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
|
||||
static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
|
||||
{
|
||||
struct hstate *h;
|
||||
struct page *page;
|
||||
int ret = 0, idx = 0;
|
||||
int idx = 0;
|
||||
|
||||
do {
|
||||
if (cgroup_task_count(cgroup) ||
|
||||
!list_empty(&cgroup->children)) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
for_each_hstate(h) {
|
||||
spin_lock(&hugetlb_lock);
|
||||
list_for_each_entry(page, &h->hugepage_activelist, lru)
|
||||
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
|
||||
}
|
||||
cond_resched();
|
||||
} while (hugetlb_cgroup_have_usage(cgroup));
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
|
||||
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
|
||||
return buf;
|
||||
}
|
||||
|
||||
int __init hugetlb_cgroup_file_init(int idx)
|
||||
static void __init __hugetlb_cgroup_file_init(int idx)
|
||||
{
|
||||
char buf[32];
|
||||
struct cftype *cft;
|
||||
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
|
||||
|
||||
WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
|
||||
|
||||
return 0;
|
||||
return;
|
||||
}
|
||||
|
||||
void __init hugetlb_cgroup_file_init(void)
|
||||
{
|
||||
struct hstate *h;
|
||||
|
||||
for_each_hstate(h) {
|
||||
/*
|
||||
* Add cgroup control files only if the huge page consists
|
||||
* of more than two normal pages. This is because we use
|
||||
* page[2].lru.next for storing cgroup details.
|
||||
*/
|
||||
if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
|
||||
__hugetlb_cgroup_file_init(hstate_index(h));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
|
||||
|
||||
struct cgroup_subsys hugetlb_subsys = {
|
||||
.name = "hugetlb",
|
||||
.create = hugetlb_cgroup_create,
|
||||
.pre_destroy = hugetlb_cgroup_pre_destroy,
|
||||
.destroy = hugetlb_cgroup_destroy,
|
||||
.subsys_id = hugetlb_subsys_id,
|
||||
.css_alloc = hugetlb_cgroup_css_alloc,
|
||||
.css_offline = hugetlb_cgroup_css_offline,
|
||||
.css_free = hugetlb_cgroup_css_free,
|
||||
.subsys_id = hugetlb_subsys_id,
|
||||
};
|
||||
|
mm/internal.h
@@ -91,6 +91,11 @@ extern unsigned long highest_memmap_pfn;
|
||||
extern int isolate_lru_page(struct page *page);
|
||||
extern void putback_lru_page(struct page *page);
|
||||
|
||||
/*
|
||||
* in mm/rmap.c:
|
||||
*/
|
||||
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
|
||||
|
||||
/*
|
||||
* in mm/page_alloc.c
|
||||
*/
|
||||
@@ -130,7 +135,6 @@ struct compact_control {
|
||||
int migratetype; /* MOVABLE, RECLAIMABLE etc */
|
||||
struct zone *zone;
|
||||
bool contended; /* True if a lock was contended */
|
||||
struct page **page; /* Page captured of requested size */
|
||||
};
|
||||
|
||||
unsigned long
|
||||
@@ -212,15 +216,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
|
||||
{
|
||||
if (TestClearPageMlocked(page)) {
|
||||
unsigned long flags;
|
||||
int nr_pages = hpage_nr_pages(page);
|
||||
|
||||
local_irq_save(flags);
|
||||
__dec_zone_page_state(page, NR_MLOCK);
|
||||
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
SetPageMlocked(newpage);
|
||||
__inc_zone_page_state(newpage, NR_MLOCK);
|
||||
__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
}
|
||||
|
||||
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
extern unsigned long vma_address(struct page *page,
|
||||
struct vm_area_struct *vma);
|
||||
|
mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
|
||||
struct kmemleak_object *object;
|
||||
unsigned long addr;
|
||||
|
||||
addr= simple_strtoul(str, NULL, 0);
|
||||
if (kstrtoul(str, 0, &addr))
|
||||
return -EINVAL;
|
||||
object = find_and_get_object(addr, 0);
|
||||
if (!object) {
|
||||
pr_info("Unknown object at 0x%08lx\n", addr);
|
||||
|
mm/ksm.c: 37 changed lines
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
|
||||
struct page *kpage, pte_t orig_pte)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *ptep;
|
||||
spinlock_t *ptl;
|
||||
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
|
||||
if (addr == -EFAULT)
|
||||
goto out;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (!pgd_present(*pgd))
|
||||
pmd = mm_find_pmd(mm, addr);
|
||||
if (!pmd)
|
||||
goto out;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (!pud_present(*pud))
|
||||
goto out;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
BUG_ON(pmd_trans_huge(*pmd));
|
||||
if (!pmd_present(*pmd))
|
||||
goto out;
|
||||
|
||||
mmun_start = addr;
|
||||
mmun_end = addr + PAGE_SIZE;
|
||||
@@ -1634,7 +1624,7 @@ again:
|
||||
struct anon_vma_chain *vmac;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_read(anon_vma);
|
||||
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
|
||||
0, ULONG_MAX) {
|
||||
vma = vmac->vma;
|
||||
@@ -1658,7 +1648,7 @@ again:
|
||||
if (!search_new_forks || !mapcount)
|
||||
break;
|
||||
}
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
if (!mapcount)
|
||||
goto out;
|
||||
}
|
||||
@@ -1688,7 +1678,7 @@ again:
|
||||
struct anon_vma_chain *vmac;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_read(anon_vma);
|
||||
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
|
||||
0, ULONG_MAX) {
|
||||
vma = vmac->vma;
|
||||
@@ -1707,11 +1697,11 @@ again:
|
||||
ret = try_to_unmap_one(page, vma,
|
||||
rmap_item->address, flags);
|
||||
if (ret != SWAP_AGAIN || !page_mapped(page)) {
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
}
|
||||
if (!search_new_forks++)
|
||||
goto again;
|
||||
@@ -1741,7 +1731,7 @@ again:
|
||||
struct anon_vma_chain *vmac;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_read(anon_vma);
|
||||
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
|
||||
0, ULONG_MAX) {
|
||||
vma = vmac->vma;
|
||||
@@ -1759,11 +1749,11 @@ again:
|
||||
|
||||
ret = rmap_one(page, vma, rmap_item->address, arg);
|
||||
if (ret != SWAP_AGAIN) {
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
}
|
||||
if (!search_new_forks++)
|
||||
goto again;
|
||||
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
if (ksm_run != flags) {
|
||||
ksm_run = flags;
|
||||
if (flags & KSM_RUN_UNMERGE) {
|
||||
int oom_score_adj;
|
||||
|
||||
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
|
||||
set_current_oom_origin();
|
||||
err = unmerge_and_remove_all_rmap_items();
|
||||
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
|
||||
oom_score_adj);
|
||||
clear_current_oom_origin();
|
||||
if (err) {
|
||||
ksm_run = KSM_RUN_STOP;
|
||||
count = err;
|
||||
|
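The ksm.c hunk above, together with the mm_find_pmd() declaration added to mm/internal.h earlier in this diff, drops an open-coded pgd -> pud -> pmd walk from replace_page(). The sketch below condenses what that helper factors out, following the removed lines; it is kernel-context code, not buildable stand-alone, the name sketch_find_pmd() is hypothetical, and the huge-pmd handling is simplified relative to the real helper.

/*
 * Sketch of the walk that mm_find_pmd() now centralizes (kernel context
 * only; mirrors the lines removed from replace_page() above, giving up
 * if any level of the page-table hierarchy is missing).
 */
static pmd_t *sketch_find_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return NULL;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	/* callers such as replace_page() must not see a huge pmd here */
	if (pmd_trans_huge(*pmd) || !pmd_present(*pmd))
		return NULL;

	return pmd;
}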
mm/memblock.c
@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
|
||||
}
|
||||
|
||||
this->size += next->size;
|
||||
memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
|
||||
/* move forward from next + 1, index of which is i + 2 */
|
||||
memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
|
||||
type->cnt--;
|
||||
}
|
||||
}
|
||||
|
mm/memcontrol.c: 1497 changed lines (diff not shown: file too large)
mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
|
||||
struct anon_vma *av;
|
||||
pgoff_t pgoff;
|
||||
|
||||
av = page_lock_anon_vma(page);
|
||||
av = page_lock_anon_vma_read(page);
|
||||
if (av == NULL) /* Not actually mapped anymore */
|
||||
return;
|
||||
|
||||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
|
||||
}
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
page_unlock_anon_vma(av);
|
||||
page_unlock_anon_vma_read(av);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -781,16 +781,16 @@ static struct page_state {
|
||||
{ compound, compound, "huge", me_huge_page },
|
||||
#endif
|
||||
|
||||
{ sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
|
||||
{ sc|dirty, sc, "swapcache", me_swapcache_clean },
|
||||
{ sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
|
||||
{ sc|dirty, sc, "clean swapcache", me_swapcache_clean },
|
||||
|
||||
{ unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
|
||||
{ unevict, unevict, "unevictable LRU", me_pagecache_clean},
|
||||
{ unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
|
||||
{ unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
|
||||
|
||||
{ mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
|
||||
{ mlock, mlock, "mlocked LRU", me_pagecache_clean },
|
||||
{ mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
|
||||
{ mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
|
||||
|
||||
{ lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
|
||||
{ lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
|
||||
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
|
||||
|
||||
/*
|
||||
@@ -812,14 +812,14 @@ static struct page_state {
|
||||
#undef slab
|
||||
#undef reserved
|
||||
|
||||
/*
|
||||
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
|
||||
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
|
||||
*/
|
||||
static void action_result(unsigned long pfn, char *msg, int result)
|
||||
{
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
|
||||
pfn,
|
||||
PageDirty(page) ? "dirty " : "",
|
||||
msg, action_name[result]);
|
||||
pr_err("MCE %#lx: %s page recovery: %s\n",
|
||||
pfn, msg, action_name[result]);
|
||||
}
|
||||
|
||||
static int page_action(struct page_state *ps, struct page *p,
|
||||
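
Note on the page-state table above: the table is scanned top to bottom, the first mask/res match decides the recovery action, and a catch-all sits at the end; this hunk only renames the human-readable labels. A simplified standalone sketch of that first-match lookup (flag values and names are invented for illustration):

/* Standalone sketch (simplified, invented flag values): first-match lookup
 * in a mask/res table like the page-state table above; the final entry acts
 * as a catch-all. */
#include <stdio.h>

#define F_LRU   0x1
#define F_DIRTY 0x2
#define F_SWAP  0x4

struct state { unsigned long mask, res; const char *name; };

static const struct state table[] = {
    { F_SWAP | F_DIRTY, F_SWAP | F_DIRTY, "dirty swapcache" },
    { F_SWAP | F_DIRTY, F_SWAP,           "clean swapcache" },
    { F_LRU  | F_DIRTY, F_LRU  | F_DIRTY, "dirty LRU" },
    { F_LRU  | F_DIRTY, F_LRU,            "clean LRU" },
    { 0,                0,                "unknown" },    /* catch-all */
};

static const char *classify(unsigned long flags)
{
    const struct state *s;

    for (s = table; (flags & s->mask) != s->res; s++)
        ;
    return s->name;
}

int main(void)
{
    printf("%s\n", classify(F_SWAP | F_DIRTY));    /* dirty swapcache */
    printf("%s\n", classify(F_LRU));               /* clean LRU */
    return 0;
}
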
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* Isolate the page, so that it doesn't get reallocated if it
* was free.
*/
set_migratetype_isolate(p);
set_migratetype_isolate(p, true);
/*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_trans_head(page);

if (PageHuge(page))
return soft_offline_huge_page(page, flags);
if (PageTransHuge(hpage)) {
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
pr_info("soft offline: %#lx: failed to split THP\n",
pfn);
return -EBUSY;
}
}

ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1558,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
false, MIGRATE_SYNC);
false, MIGRATE_SYNC,
MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
251
mm/memory.c
@@ -57,6 +57,8 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
return 1;
}

if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return 0;

batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;

tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
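
Note on the tlb_next_batch() hunk above: the new batch_count check caps how many extra gather batches a single unmap operation may allocate. A rough userspace sketch of the same bounded-growth pattern (plain malloc instead of __get_free_pages, invented limits):

/* Standalone sketch: grow a chained batch list but refuse to grow past a
 * fixed cap, as the MAX_GATHER_BATCH_COUNT check above does. */
#include <stdlib.h>
#include <stdio.h>

#define MAX_BATCH_COUNT 4

struct batch { struct batch *next; int nr, max; };

static int next_batch(struct batch **active, int *batch_count)
{
    struct batch *b;

    if (*batch_count == MAX_BATCH_COUNT)
        return 0;                 /* caller falls back to flushing */
    b = calloc(1, sizeof(*b));
    if (!b)
        return 0;
    (*batch_count)++;
    b->max = 64;
    (*active)->next = b;
    *active = b;
    return 1;
}

int main(void)
{
    struct batch local = { 0 }, *active = &local;
    int count = 0, grown = 0;

    while (next_batch(&active, &count))
        grown++;
    printf("grew %d extra batches\n", grown);    /* capped at 4 */
    for (struct batch *b = local.next, *n; b; b = n) {
        n = b->next;
        free(b);
    }
    return 0;
}
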
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
|
||||
tlb->local.nr = 0;
|
||||
tlb->local.max = ARRAY_SIZE(tlb->__pages);
|
||||
tlb->active = &tlb->local;
|
||||
tlb->batch_count = 0;
|
||||
|
||||
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
|
||||
tlb->batch = NULL;
|
||||
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
|
||||
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
|
||||
}
|
||||
|
||||
#ifndef is_zero_pfn
|
||||
static inline int is_zero_pfn(unsigned long pfn)
|
||||
{
|
||||
return pfn == zero_pfn;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef my_zero_pfn
|
||||
static inline unsigned long my_zero_pfn(unsigned long addr)
|
||||
{
|
||||
return zero_pfn;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* vm_normal_page -- This function gets the "struct page" associated with a pte.
|
||||
*
|
||||
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||||
BUG();
|
||||
}
|
||||
#endif
|
||||
split_huge_page_pmd(vma->vm_mm, pmd);
|
||||
split_huge_page_pmd(vma, addr, pmd);
|
||||
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
|
||||
goto next;
|
||||
/* fall through */
|
||||
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
||||
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
|
||||
goto out;
|
||||
}
|
||||
if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
|
||||
goto no_page_table;
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
if (flags & FOLL_SPLIT) {
|
||||
split_huge_page_pmd(mm, pmd);
|
||||
split_huge_page_pmd(vma, address, pmd);
|
||||
goto split_fallthrough;
|
||||
}
|
||||
spin_lock(&mm->page_table_lock);
|
||||
@@ -1546,6 +1541,8 @@ split_fallthrough:
|
||||
pte = *ptep;
|
||||
if (!pte_present(pte))
|
||||
goto no_page;
|
||||
if ((flags & FOLL_NUMA) && pte_numa(pte))
|
||||
goto no_page;
|
||||
if ((flags & FOLL_WRITE) && !pte_write(pte))
|
||||
goto unlock;
|
||||
|
||||
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
|
||||
vm_flags &= (gup_flags & FOLL_FORCE) ?
|
||||
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
|
||||
|
||||
/*
|
||||
* If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
|
||||
* would be called on PROT_NONE ranges. We must never invoke
|
||||
* handle_mm_fault on PROT_NONE ranges or the NUMA hinting
|
||||
* page faults would unprotect the PROT_NONE ranges if
|
||||
* _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
|
||||
* bitflag. So to avoid that, don't set FOLL_NUMA if
|
||||
* FOLL_FORCE is set.
|
||||
*/
|
||||
if (!(gup_flags & FOLL_FORCE))
|
||||
gup_flags |= FOLL_NUMA;
|
||||
|
||||
i = 0;
|
||||
|
||||
do {
|
||||
@@ -2794,13 +2804,8 @@ unlock:
|
||||
oom_free_new:
|
||||
page_cache_release(new_page);
|
||||
oom:
|
||||
if (old_page) {
|
||||
if (page_mkwrite) {
|
||||
unlock_page(old_page);
|
||||
page_cache_release(old_page);
|
||||
}
|
||||
if (old_page)
|
||||
page_cache_release(old_page);
|
||||
}
|
||||
return VM_FAULT_OOM;
|
||||
|
||||
unwritable_page:
|
||||
@@ -3431,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
|
||||
}
|
||||
|
||||
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
|
||||
unsigned long addr, int current_nid)
|
||||
{
|
||||
get_page(page);
|
||||
|
||||
count_vm_numa_event(NUMA_HINT_FAULTS);
|
||||
if (current_nid == numa_node_id())
|
||||
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
|
||||
|
||||
return mpol_misplaced(page, vma, addr);
|
||||
}
|
||||
|
||||
int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
|
||||
{
|
||||
struct page *page = NULL;
|
||||
spinlock_t *ptl;
|
||||
int current_nid = -1;
|
||||
int target_nid;
|
||||
bool migrated = false;
|
||||
|
||||
/*
|
||||
* The "pte" at this point cannot be used safely without
|
||||
* validation through pte_unmap_same(). It's of NUMA type but
|
||||
* the pfn may be screwed if the read is non atomic.
|
||||
*
|
||||
* ptep_modify_prot_start is not called as this is clearing
|
||||
* the _PAGE_NUMA bit and it is not really expected that there
|
||||
* would be concurrent hardware modifications to the PTE.
|
||||
*/
|
||||
ptl = pte_lockptr(mm, pmd);
|
||||
spin_lock(ptl);
|
||||
if (unlikely(!pte_same(*ptep, pte))) {
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
goto out;
|
||||
}
|
||||
|
||||
pte = pte_mknonnuma(pte);
|
||||
set_pte_at(mm, addr, ptep, pte);
|
||||
update_mmu_cache(vma, addr, ptep);
|
||||
|
||||
page = vm_normal_page(vma, addr, pte);
|
||||
if (!page) {
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
current_nid = page_to_nid(page);
|
||||
target_nid = numa_migrate_prep(page, vma, addr, current_nid);
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
if (target_nid == -1) {
|
||||
/*
|
||||
* Account for the fault against the current node if it not
|
||||
* being replaced regardless of where the page is located.
|
||||
*/
|
||||
current_nid = numa_node_id();
|
||||
put_page(page);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Migrate to the requested node */
|
||||
migrated = migrate_misplaced_page(page, target_nid);
|
||||
if (migrated)
|
||||
current_nid = target_nid;
|
||||
|
||||
out:
|
||||
if (current_nid != -1)
|
||||
task_numa_fault(current_nid, 1, migrated);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* NUMA hinting page fault entry point for regular pmds */
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, pmd_t *pmdp)
|
||||
{
|
||||
pmd_t pmd;
|
||||
pte_t *pte, *orig_pte;
|
||||
unsigned long _addr = addr & PMD_MASK;
|
||||
unsigned long offset;
|
||||
spinlock_t *ptl;
|
||||
bool numa = false;
|
||||
int local_nid = numa_node_id();
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
pmd = *pmdp;
|
||||
if (pmd_numa(pmd)) {
|
||||
set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
|
||||
numa = true;
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
|
||||
if (!numa)
|
||||
return 0;
|
||||
|
||||
/* we're in a page fault so some vma must be in the range */
|
||||
BUG_ON(!vma);
|
||||
BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
|
||||
offset = max(_addr, vma->vm_start) & ~PMD_MASK;
|
||||
VM_BUG_ON(offset >= PMD_SIZE);
|
||||
orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
|
||||
pte += offset >> PAGE_SHIFT;
|
||||
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
|
||||
pte_t pteval = *pte;
|
||||
struct page *page;
|
||||
int curr_nid = local_nid;
|
||||
int target_nid;
|
||||
bool migrated;
|
||||
if (!pte_present(pteval))
|
||||
continue;
|
||||
if (!pte_numa(pteval))
|
||||
continue;
|
||||
if (addr >= vma->vm_end) {
|
||||
vma = find_vma(mm, addr);
|
||||
/* there's a pte present so there must be a vma */
|
||||
BUG_ON(!vma);
|
||||
BUG_ON(addr < vma->vm_start);
|
||||
}
|
||||
if (pte_numa(pteval)) {
|
||||
pteval = pte_mknonnuma(pteval);
|
||||
set_pte_at(mm, addr, pte, pteval);
|
||||
}
|
||||
page = vm_normal_page(vma, addr, pteval);
|
||||
if (unlikely(!page))
|
||||
continue;
|
||||
/* only check non-shared pages */
|
||||
if (unlikely(page_mapcount(page) != 1))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Note that the NUMA fault is later accounted to either
|
||||
* the node that is currently running or where the page is
|
||||
* migrated to.
|
||||
*/
|
||||
curr_nid = local_nid;
|
||||
target_nid = numa_migrate_prep(page, vma, addr,
|
||||
page_to_nid(page));
|
||||
if (target_nid == -1) {
|
||||
put_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Migrate to the requested node */
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
migrated = migrate_misplaced_page(page, target_nid);
|
||||
if (migrated)
|
||||
curr_nid = target_nid;
|
||||
task_numa_fault(curr_nid, 1, migrated);
|
||||
|
||||
pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
|
||||
}
|
||||
pte_unmap_unlock(orig_pte, ptl);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, pmd_t *pmdp)
|
||||
{
|
||||
BUG();
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
/*
|
||||
* These routines also need to handle stuff like marking pages dirty
|
||||
* and/or accessed for architectures that don't do it in hardware (most
|
||||
@@ -3469,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
|
||||
pte, pmd, flags, entry);
|
||||
}
|
||||
|
||||
if (pte_numa(entry))
|
||||
return do_numa_page(mm, vma, address, entry, pte, pmd);
|
||||
|
||||
ptl = pte_lockptr(mm, pmd);
|
||||
spin_lock(ptl);
|
||||
if (unlikely(!pte_same(*pte, entry)))
|
||||
@@ -3537,9 +3709,21 @@ retry:
|
||||
|
||||
barrier();
|
||||
if (pmd_trans_huge(orig_pmd)) {
|
||||
if (flags & FAULT_FLAG_WRITE &&
|
||||
!pmd_write(orig_pmd) &&
|
||||
!pmd_trans_splitting(orig_pmd)) {
|
||||
unsigned int dirty = flags & FAULT_FLAG_WRITE;
|
||||
|
||||
/*
|
||||
* If the pmd is splitting, return and retry the
|
||||
* the fault. Alternative: wait until the split
|
||||
* is done, and goto retry.
|
||||
*/
|
||||
if (pmd_trans_splitting(orig_pmd))
|
||||
return 0;
|
||||
|
||||
if (pmd_numa(orig_pmd))
|
||||
return do_huge_pmd_numa_page(mm, vma, address,
|
||||
orig_pmd, pmd);
|
||||
|
||||
if (dirty && !pmd_write(orig_pmd)) {
|
||||
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
|
||||
orig_pmd);
|
||||
/*
|
||||
@@ -3550,17 +3734,25 @@ retry:
|
||||
if (unlikely(ret & VM_FAULT_OOM))
|
||||
goto retry;
|
||||
return ret;
|
||||
} else {
|
||||
huge_pmd_set_accessed(mm, vma, address, pmd,
|
||||
orig_pmd, dirty);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (pmd_numa(*pmd))
|
||||
return do_pmd_numa_page(mm, vma, address, pmd);
|
||||
|
||||
/*
|
||||
* Use __pte_alloc instead of pte_alloc_map, because we can't
|
||||
* run pte_offset_map on the pmd, if an huge pmd could
|
||||
* materialize from under us from a different thread.
|
||||
*/
|
||||
if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
|
||||
if (unlikely(pmd_none(*pmd)) &&
|
||||
unlikely(__pte_alloc(mm, vma, pmd, address)))
|
||||
return VM_FAULT_OOM;
|
||||
/* if an huge pmd materialized from under us just retry later */
|
||||
if (unlikely(pmd_trans_huge(*pmd)))
|
||||
@@ -3940,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
char *p, *s;
char *p;

p = d_path(&f->f_path, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
s = strrchr(p, '/');
if (s)
p = s+1;
printk("%s%s[%lx+%lx]", prefix, p,
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
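
Note on the print_vma_addr() hunk above: the open-coded strrchr()+1 dance is replaced by kbasename(). A userspace stand-in showing what that helper reduces to:

/* Standalone sketch: what the kbasename() call above boils down to
 * (the real helper lives in the kernel; this is a userspace stand-in). */
#include <stdio.h>
#include <string.h>

static const char *kbasename_sketch(const char *path)
{
    const char *tail = strrchr(path, '/');
    return tail ? tail + 1 : path;
}

int main(void)
{
    printf("%s\n", kbasename_sketch("/usr/bin/cat"));    /* cat */
    printf("%s\n", kbasename_sketch("cat"));             /* cat */
    return 0;
}
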
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
void __ref put_page_bootmem(struct page *page)
{
unsigned long type;
static DEFINE_MUTEX(ppb_lock);

type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);

/*
* Please refer to comment for __free_pages_bootmem()
* for why we serialize here.
*/
mutex_lock(&ppb_lock);
__free_pages_bootmem(page, 0);
mutex_unlock(&ppb_lock);
}

}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
|
||||
zone_span_writelock(zone);
|
||||
|
||||
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
if (start_pfn < zone->zone_start_pfn)
|
||||
if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
|
||||
zone->zone_start_pfn = start_pfn;
|
||||
|
||||
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
|
||||
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
|
||||
zone_span_writeunlock(zone);
|
||||
}
|
||||
|
||||
static void resize_zone(struct zone *zone, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
zone_span_writelock(zone);
|
||||
|
||||
if (end_pfn - start_pfn) {
|
||||
zone->zone_start_pfn = start_pfn;
|
||||
zone->spanned_pages = end_pfn - start_pfn;
|
||||
} else {
|
||||
/*
|
||||
* make it consist as free_area_init_core(),
|
||||
* if spanned_pages = 0, then keep start_pfn = 0
|
||||
*/
|
||||
zone->zone_start_pfn = 0;
|
||||
zone->spanned_pages = 0;
|
||||
}
|
||||
|
||||
zone_span_writeunlock(zone);
|
||||
}
|
||||
|
||||
static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
enum zone_type zid = zone_idx(zone);
|
||||
int nid = zone->zone_pgdat->node_id;
|
||||
unsigned long pfn;
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn++)
|
||||
set_page_links(pfn_to_page(pfn), zid, nid, pfn);
|
||||
}
|
||||
|
||||
static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
|
||||
unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
int ret;
|
||||
unsigned long flags;
|
||||
unsigned long z1_start_pfn;
|
||||
|
||||
if (!z1->wait_table) {
|
||||
ret = init_currently_empty_zone(z1, start_pfn,
|
||||
end_pfn - start_pfn, MEMMAP_HOTPLUG);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
pgdat_resize_lock(z1->zone_pgdat, &flags);
|
||||
|
||||
/* can't move pfns which are higher than @z2 */
|
||||
if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
|
||||
goto out_fail;
|
||||
/* the move out part mast at the left most of @z2 */
|
||||
if (start_pfn > z2->zone_start_pfn)
|
||||
goto out_fail;
|
||||
/* must included/overlap */
|
||||
if (end_pfn <= z2->zone_start_pfn)
|
||||
goto out_fail;
|
||||
|
||||
/* use start_pfn for z1's start_pfn if z1 is empty */
|
||||
if (z1->spanned_pages)
|
||||
z1_start_pfn = z1->zone_start_pfn;
|
||||
else
|
||||
z1_start_pfn = start_pfn;
|
||||
|
||||
resize_zone(z1, z1_start_pfn, end_pfn);
|
||||
resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
|
||||
|
||||
pgdat_resize_unlock(z1->zone_pgdat, &flags);
|
||||
|
||||
fix_zone_id(z1, start_pfn, end_pfn);
|
||||
|
||||
return 0;
|
||||
out_fail:
|
||||
pgdat_resize_unlock(z1->zone_pgdat, &flags);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
|
||||
unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
int ret;
|
||||
unsigned long flags;
|
||||
unsigned long z2_end_pfn;
|
||||
|
||||
if (!z2->wait_table) {
|
||||
ret = init_currently_empty_zone(z2, start_pfn,
|
||||
end_pfn - start_pfn, MEMMAP_HOTPLUG);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
pgdat_resize_lock(z1->zone_pgdat, &flags);
|
||||
|
||||
/* can't move pfns which are lower than @z1 */
|
||||
if (z1->zone_start_pfn > start_pfn)
|
||||
goto out_fail;
|
||||
/* the move out part mast at the right most of @z1 */
|
||||
if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
|
||||
goto out_fail;
|
||||
/* must included/overlap */
|
||||
if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
|
||||
goto out_fail;
|
||||
|
||||
/* use end_pfn for z2's end_pfn if z2 is empty */
|
||||
if (z2->spanned_pages)
|
||||
z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
|
||||
else
|
||||
z2_end_pfn = end_pfn;
|
||||
|
||||
resize_zone(z1, z1->zone_start_pfn, start_pfn);
|
||||
resize_zone(z2, start_pfn, z2_end_pfn);
|
||||
|
||||
pgdat_resize_unlock(z1->zone_pgdat, &flags);
|
||||
|
||||
fix_zone_id(z2, start_pfn, end_pfn);
|
||||
|
||||
return 0;
|
||||
out_fail:
|
||||
pgdat_resize_unlock(z1->zone_pgdat, &flags);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
unsigned long old_pgdat_end_pfn =
|
||||
pgdat->node_start_pfn + pgdat->node_spanned_pages;
|
||||
|
||||
if (start_pfn < pgdat->node_start_pfn)
|
||||
if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
|
||||
pgdat->node_start_pfn = start_pfn;
|
||||
|
||||
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
|
||||
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MOVABLE_NODE
|
||||
/*
|
||||
* When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
|
||||
* normal memory.
|
||||
*/
|
||||
static bool can_online_high_movable(struct zone *zone)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#else /* CONFIG_MOVABLE_NODE */
|
||||
/* ensure every online node has NORMAL memory */
|
||||
static bool can_online_high_movable(struct zone *zone)
|
||||
{
|
||||
return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
|
||||
}
|
||||
#endif /* CONFIG_MOVABLE_NODE */
|
||||
|
||||
int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
|
||||
/* check which state of node_states will be changed when online memory */
|
||||
static void node_states_check_changes_online(unsigned long nr_pages,
|
||||
struct zone *zone, struct memory_notify *arg)
|
||||
{
|
||||
int nid = zone_to_nid(zone);
|
||||
enum zone_type zone_last = ZONE_NORMAL;
|
||||
|
||||
/*
|
||||
* If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
|
||||
* contains nodes which have zones of 0...ZONE_NORMAL,
|
||||
* set zone_last to ZONE_NORMAL.
|
||||
*
|
||||
* If we don't have HIGHMEM nor movable node,
|
||||
* node_states[N_NORMAL_MEMORY] contains nodes which have zones of
|
||||
* 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
|
||||
*/
|
||||
if (N_MEMORY == N_NORMAL_MEMORY)
|
||||
zone_last = ZONE_MOVABLE;
|
||||
|
||||
/*
|
||||
* if the memory to be online is in a zone of 0...zone_last, and
|
||||
* the zones of 0...zone_last don't have memory before online, we will
|
||||
* need to set the node to node_states[N_NORMAL_MEMORY] after
|
||||
* the memory is online.
|
||||
*/
|
||||
if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
|
||||
arg->status_change_nid_normal = nid;
|
||||
else
|
||||
arg->status_change_nid_normal = -1;
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
/*
|
||||
* If we have movable node, node_states[N_HIGH_MEMORY]
|
||||
* contains nodes which have zones of 0...ZONE_HIGHMEM,
|
||||
* set zone_last to ZONE_HIGHMEM.
|
||||
*
|
||||
* If we don't have movable node, node_states[N_NORMAL_MEMORY]
|
||||
* contains nodes which have zones of 0...ZONE_MOVABLE,
|
||||
* set zone_last to ZONE_MOVABLE.
|
||||
*/
|
||||
zone_last = ZONE_HIGHMEM;
|
||||
if (N_MEMORY == N_HIGH_MEMORY)
|
||||
zone_last = ZONE_MOVABLE;
|
||||
|
||||
if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
|
||||
arg->status_change_nid_high = nid;
|
||||
else
|
||||
arg->status_change_nid_high = -1;
|
||||
#else
|
||||
arg->status_change_nid_high = arg->status_change_nid_normal;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* if the node don't have memory befor online, we will need to
|
||||
* set the node to node_states[N_MEMORY] after the memory
|
||||
* is online.
|
||||
*/
|
||||
if (!node_state(nid, N_MEMORY))
|
||||
arg->status_change_nid = nid;
|
||||
else
|
||||
arg->status_change_nid = -1;
|
||||
}
|
||||
|
||||
static void node_states_set_node(int node, struct memory_notify *arg)
|
||||
{
|
||||
if (arg->status_change_nid_normal >= 0)
|
||||
node_set_state(node, N_NORMAL_MEMORY);
|
||||
|
||||
if (arg->status_change_nid_high >= 0)
|
||||
node_set_state(node, N_HIGH_MEMORY);
|
||||
|
||||
node_set_state(node, N_MEMORY);
|
||||
}
|
||||
|
||||
|
||||
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
|
||||
{
|
||||
unsigned long onlined_pages = 0;
|
||||
struct zone *zone;
|
||||
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
|
||||
struct memory_notify arg;
|
||||
|
||||
lock_memory_hotplug();
|
||||
/*
|
||||
* This doesn't need a lock to do pfn_to_page().
|
||||
* The section can't be removed here because of the
|
||||
* memory_block->state_mutex.
|
||||
*/
|
||||
zone = page_zone(pfn_to_page(pfn));
|
||||
|
||||
if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
|
||||
!can_online_high_movable(zone)) {
|
||||
unlock_memory_hotplug();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
|
||||
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
|
||||
unlock_memory_hotplug();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
|
||||
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
|
||||
unlock_memory_hotplug();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Previous code may changed the zone of the pfn range */
|
||||
zone = page_zone(pfn_to_page(pfn));
|
||||
|
||||
arg.start_pfn = pfn;
|
||||
arg.nr_pages = nr_pages;
|
||||
arg.status_change_nid = -1;
|
||||
node_states_check_changes_online(nr_pages, zone, &arg);
|
||||
|
||||
nid = page_to_nid(pfn_to_page(pfn));
|
||||
if (node_present_pages(nid) == 0)
|
||||
arg.status_change_nid = nid;
|
||||
|
||||
ret = memory_notify(MEM_GOING_ONLINE, &arg);
|
||||
ret = notifier_to_errno(ret);
|
||||
@@ -486,24 +733,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
|
||||
unlock_memory_hotplug();
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* This doesn't need a lock to do pfn_to_page().
|
||||
* The section can't be removed here because of the
|
||||
* memory_block->state_mutex.
|
||||
*/
|
||||
zone = page_zone(pfn_to_page(pfn));
|
||||
/*
|
||||
* If this zone is not populated, then it is not in zonelist.
|
||||
* This means the page allocator ignores this zone.
|
||||
* So, zonelist must be updated after online.
|
||||
*/
|
||||
mutex_lock(&zonelists_mutex);
|
||||
if (!populated_zone(zone))
|
||||
if (!populated_zone(zone)) {
|
||||
need_zonelists_rebuild = 1;
|
||||
build_all_zonelists(NULL, zone);
|
||||
}
|
||||
|
||||
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
|
||||
online_pages_range);
|
||||
if (ret) {
|
||||
if (need_zonelists_rebuild)
|
||||
zone_pcp_reset(zone);
|
||||
mutex_unlock(&zonelists_mutex);
|
||||
printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
|
||||
(unsigned long long) pfn << PAGE_SHIFT,
|
||||
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
|
||||
return ret;
|
||||
}
|
||||
|
||||
zone->managed_pages += onlined_pages;
|
||||
zone->present_pages += onlined_pages;
|
||||
zone->zone_pgdat->node_present_pages += onlined_pages;
|
||||
if (onlined_pages) {
|
||||
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
|
||||
node_states_set_node(zone_to_nid(zone), &arg);
|
||||
if (need_zonelists_rebuild)
|
||||
build_all_zonelists(NULL, zone);
|
||||
build_all_zonelists(NULL, NULL);
|
||||
else
|
||||
zone_pcp_update(zone);
|
||||
}
|
||||
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
|
||||
* migrate_pages returns # of failed pages.
|
||||
*/
|
||||
ret = migrate_pages(&source, alloc_migrate_target, 0,
|
||||
true, MIGRATE_SYNC);
|
||||
true, MIGRATE_SYNC,
|
||||
MR_MEMORY_HOTPLUG);
|
||||
if (ret)
|
||||
putback_lru_pages(&source);
|
||||
}
|
||||
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
|
||||
{
|
||||
int ret;
|
||||
long offlined = *(long *)data;
|
||||
ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
|
||||
ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
|
||||
offlined = nr_pages;
|
||||
if (!ret)
|
||||
*(long *)data += offlined;
|
||||
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
|
||||
return offlined;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MOVABLE_NODE
|
||||
/*
|
||||
* When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
|
||||
* normal memory.
|
||||
*/
|
||||
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#else /* CONFIG_MOVABLE_NODE */
|
||||
/* ensure the node has NORMAL memory if it is still online */
|
||||
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
|
||||
{
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
unsigned long present_pages = 0;
|
||||
enum zone_type zt;
|
||||
|
||||
for (zt = 0; zt <= ZONE_NORMAL; zt++)
|
||||
present_pages += pgdat->node_zones[zt].present_pages;
|
||||
|
||||
if (present_pages > nr_pages)
|
||||
return true;
|
||||
|
||||
present_pages = 0;
|
||||
for (; zt <= ZONE_MOVABLE; zt++)
|
||||
present_pages += pgdat->node_zones[zt].present_pages;
|
||||
|
||||
/*
|
||||
* we can't offline the last normal memory until all
|
||||
* higher memory is offlined.
|
||||
*/
|
||||
return present_pages == 0;
|
||||
}
|
||||
#endif /* CONFIG_MOVABLE_NODE */
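
Note on can_offline_normal() above: without CONFIG_MOVABLE_NODE a node has to keep some memory in ZONE_NORMAL or below while any higher zone is still populated. A standalone sketch of that accounting with hypothetical zone sizes:

/* Standalone sketch (hypothetical zone layout and sizes): the
 * can_offline_normal() rule above. The last NORMAL pages may go away only
 * when everything above ZONE_NORMAL is already empty. */
#include <stdio.h>
#include <stdbool.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE, NR_ZONES };

static bool can_offline_normal(const unsigned long present[NR_ZONES],
                               unsigned long nr_pages)
{
    unsigned long low = 0, high = 0;
    int zt;

    for (zt = 0; zt <= ZONE_NORMAL; zt++)
        low += present[zt];
    if (low > nr_pages)
        return true;    /* some normal memory survives the offline */

    for (; zt < NR_ZONES; zt++)
        high += present[zt];
    /* can't drop the last normal memory while higher zones are populated */
    return high == 0;
}

int main(void)
{
    unsigned long node[NR_ZONES] = { 1024, 4096, 8192 };

    printf("%d\n", can_offline_normal(node, 4096));    /* 1: DMA pages remain */
    printf("%d\n", can_offline_normal(node, 8192));    /* 0: movable still present */
    return 0;
}
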
|
||||
|
||||
/* check which state of node_states will be changed when offline memory */
|
||||
static void node_states_check_changes_offline(unsigned long nr_pages,
|
||||
struct zone *zone, struct memory_notify *arg)
|
||||
{
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
unsigned long present_pages = 0;
|
||||
enum zone_type zt, zone_last = ZONE_NORMAL;
|
||||
|
||||
/*
|
||||
* If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
|
||||
* contains nodes which have zones of 0...ZONE_NORMAL,
|
||||
* set zone_last to ZONE_NORMAL.
|
||||
*
|
||||
* If we don't have HIGHMEM nor movable node,
|
||||
* node_states[N_NORMAL_MEMORY] contains nodes which have zones of
|
||||
* 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
|
||||
*/
|
||||
if (N_MEMORY == N_NORMAL_MEMORY)
|
||||
zone_last = ZONE_MOVABLE;
|
||||
|
||||
/*
|
||||
* check whether node_states[N_NORMAL_MEMORY] will be changed.
|
||||
* If the memory to be offline is in a zone of 0...zone_last,
|
||||
* and it is the last present memory, 0...zone_last will
|
||||
* become empty after offline , thus we can determind we will
|
||||
* need to clear the node from node_states[N_NORMAL_MEMORY].
|
||||
*/
|
||||
for (zt = 0; zt <= zone_last; zt++)
|
||||
present_pages += pgdat->node_zones[zt].present_pages;
|
||||
if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
|
||||
arg->status_change_nid_normal = zone_to_nid(zone);
|
||||
else
|
||||
arg->status_change_nid_normal = -1;
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
/*
|
||||
* If we have movable node, node_states[N_HIGH_MEMORY]
|
||||
* contains nodes which have zones of 0...ZONE_HIGHMEM,
|
||||
* set zone_last to ZONE_HIGHMEM.
|
||||
*
|
||||
* If we don't have movable node, node_states[N_NORMAL_MEMORY]
|
||||
* contains nodes which have zones of 0...ZONE_MOVABLE,
|
||||
* set zone_last to ZONE_MOVABLE.
|
||||
*/
|
||||
zone_last = ZONE_HIGHMEM;
|
||||
if (N_MEMORY == N_HIGH_MEMORY)
|
||||
zone_last = ZONE_MOVABLE;
|
||||
|
||||
for (; zt <= zone_last; zt++)
|
||||
present_pages += pgdat->node_zones[zt].present_pages;
|
||||
if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
|
||||
arg->status_change_nid_high = zone_to_nid(zone);
|
||||
else
|
||||
arg->status_change_nid_high = -1;
|
||||
#else
|
||||
arg->status_change_nid_high = arg->status_change_nid_normal;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
|
||||
*/
|
||||
zone_last = ZONE_MOVABLE;
|
||||
|
||||
/*
|
||||
* check whether node_states[N_HIGH_MEMORY] will be changed
|
||||
* If we try to offline the last present @nr_pages from the node,
|
||||
* we can determind we will need to clear the node from
|
||||
* node_states[N_HIGH_MEMORY].
|
||||
*/
|
||||
for (; zt <= zone_last; zt++)
|
||||
present_pages += pgdat->node_zones[zt].present_pages;
|
||||
if (nr_pages >= present_pages)
|
||||
arg->status_change_nid = zone_to_nid(zone);
|
||||
else
|
||||
arg->status_change_nid = -1;
|
||||
}
|
||||
|
||||
static void node_states_clear_node(int node, struct memory_notify *arg)
|
||||
{
|
||||
if (arg->status_change_nid_normal >= 0)
|
||||
node_clear_state(node, N_NORMAL_MEMORY);
|
||||
|
||||
if ((N_MEMORY != N_NORMAL_MEMORY) &&
|
||||
(arg->status_change_nid_high >= 0))
|
||||
node_clear_state(node, N_HIGH_MEMORY);
|
||||
|
||||
if ((N_MEMORY != N_HIGH_MEMORY) &&
|
||||
(arg->status_change_nid >= 0))
|
||||
node_clear_state(node, N_MEMORY);
|
||||
}
|
||||
|
||||
static int __ref __offline_pages(unsigned long start_pfn,
|
||||
unsigned long end_pfn, unsigned long timeout)
|
||||
{
|
||||
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
|
||||
node = zone_to_nid(zone);
|
||||
nr_pages = end_pfn - start_pfn;
|
||||
|
||||
ret = -EINVAL;
|
||||
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
|
||||
goto out;
|
||||
|
||||
/* set above range as isolated */
|
||||
ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
|
||||
ret = start_isolate_page_range(start_pfn, end_pfn,
|
||||
MIGRATE_MOVABLE, true);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
arg.start_pfn = start_pfn;
|
||||
arg.nr_pages = nr_pages;
|
||||
arg.status_change_nid = -1;
|
||||
if (nr_pages >= node_present_pages(node))
|
||||
arg.status_change_nid = node;
|
||||
node_states_check_changes_offline(nr_pages, zone, &arg);
|
||||
|
||||
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
|
||||
ret = notifier_to_errno(ret);
|
||||
@@ -943,10 +1319,10 @@ repeat:
|
||||
goto repeat;
|
||||
}
|
||||
}
|
||||
/* drain all zone's lru pagevec, this is asyncronous... */
|
||||
/* drain all zone's lru pagevec, this is asynchronous... */
|
||||
lru_add_drain_all();
|
||||
yield();
|
||||
/* drain pcp pages , this is synchrouns. */
|
||||
/* drain pcp pages, this is synchronous. */
|
||||
drain_all_pages();
|
||||
/* check again */
|
||||
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
|
||||
@@ -955,12 +1331,13 @@ repeat:
|
||||
goto failed_removal;
|
||||
}
|
||||
printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
|
||||
/* Ok, all of our target is islaoted.
|
||||
/* Ok, all of our target is isolated.
|
||||
We cannot do rollback at this point. */
|
||||
offline_isolated_pages(start_pfn, end_pfn);
|
||||
/* reset pagetype flags and makes migrate type to be MOVABLE */
|
||||
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
|
||||
/* removal success */
|
||||
zone->managed_pages -= offlined_pages;
|
||||
zone->present_pages -= offlined_pages;
|
||||
zone->zone_pgdat->node_present_pages -= offlined_pages;
|
||||
totalram_pages -= offlined_pages;
|
||||
@@ -975,10 +1352,9 @@ repeat:
|
||||
} else
|
||||
zone_pcp_update(zone);
|
||||
|
||||
if (!node_present_pages(node)) {
|
||||
node_clear_state(node, N_HIGH_MEMORY);
|
||||
node_states_clear_node(node, &arg);
|
||||
if (arg.status_change_nid >= 0)
|
||||
kswapd_stop(node);
|
||||
}
|
||||
|
||||
vm_total_pages = nr_free_pagecache_pages();
|
||||
writeback_set_ratelimit();
|
||||
|
472
mm/mempolicy.c
@@ -90,6 +90,7 @@
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/uaccess.h>
|
||||
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
|
||||
.flags = MPOL_F_LOCAL,
|
||||
};
|
||||
|
||||
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
|
||||
|
||||
static struct mempolicy *get_task_policy(struct task_struct *p)
|
||||
{
|
||||
struct mempolicy *pol = p->mempolicy;
|
||||
int node;
|
||||
|
||||
if (!pol) {
|
||||
node = numa_node_id();
|
||||
if (node != -1)
|
||||
pol = &preferred_node_policy[node];
|
||||
|
||||
/* preferred_node_policy is not initialised early in boot */
|
||||
if (!pol->mode)
|
||||
pol = NULL;
|
||||
}
|
||||
|
||||
return pol;
|
||||
}
|
||||
|
||||
static const struct mempolicy_operations {
|
||||
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
|
||||
/*
|
||||
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
|
||||
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
|
||||
if (pol == NULL)
|
||||
return 0;
|
||||
/* Check N_HIGH_MEMORY */
|
||||
/* Check N_MEMORY */
|
||||
nodes_and(nsc->mask1,
|
||||
cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
|
||||
cpuset_current_mems_allowed, node_states[N_MEMORY]);
|
||||
|
||||
VM_BUG_ON(!nodes);
|
||||
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
|
||||
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
|
||||
if (mode == MPOL_DEFAULT) {
|
||||
if (nodes && !nodes_empty(*nodes))
|
||||
return ERR_PTR(-EINVAL);
|
||||
return NULL; /* simply delete any existing policy */
|
||||
return NULL;
|
||||
}
|
||||
VM_BUG_ON(!nodes);
|
||||
|
||||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
|
||||
(flags & MPOL_F_RELATIVE_NODES)))
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
} else if (mode == MPOL_LOCAL) {
|
||||
if (!nodes_empty(*nodes))
|
||||
return ERR_PTR(-EINVAL);
|
||||
mode = MPOL_PREFERRED;
|
||||
} else if (nodes_empty(*nodes))
|
||||
return ERR_PTR(-EINVAL);
|
||||
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
|
||||
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
split_huge_page_pmd(vma->vm_mm, pmd);
|
||||
split_huge_page_pmd(vma, addr, pmd);
|
||||
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||||
continue;
|
||||
if (check_pte_range(vma, pmd, addr, next, nodes,
|
||||
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
|
||||
/*
|
||||
* This is used to mark a range of virtual addresses to be inaccessible.
|
||||
* These are later cleared by a NUMA hinting fault. Depending on these
|
||||
* faults, pages may be migrated for better NUMA placement.
|
||||
*
|
||||
* This is assuming that NUMA faults are handled using PROT_NONE. If
|
||||
* an architecture makes a different choice, it will need further
|
||||
* changes to the core.
|
||||
*/
|
||||
unsigned long change_prot_numa(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
int nr_updated;
|
||||
BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
|
||||
|
||||
nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
|
||||
if (nr_updated)
|
||||
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
|
||||
|
||||
return nr_updated;
|
||||
}
|
||||
#else
|
||||
static unsigned long change_prot_numa(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
|
||||
|
||||
/*
|
||||
* Check if all pages in a range are on a set of nodes.
|
||||
* If pagelist != NULL then isolate pages from the LRU and
|
||||
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
|
||||
return ERR_PTR(-EFAULT);
|
||||
prev = NULL;
|
||||
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
|
||||
unsigned long endvma = vma->vm_end;
|
||||
|
||||
if (endvma > end)
|
||||
endvma = end;
|
||||
if (vma->vm_start > start)
|
||||
start = vma->vm_start;
|
||||
|
||||
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
|
||||
if (!vma->vm_next && vma->vm_end < end)
|
||||
return ERR_PTR(-EFAULT);
|
||||
if (prev && prev->vm_end < vma->vm_start)
|
||||
return ERR_PTR(-EFAULT);
|
||||
}
|
||||
if (!is_vm_hugetlb_page(vma) &&
|
||||
((flags & MPOL_MF_STRICT) ||
|
||||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
|
||||
vma_migratable(vma)))) {
|
||||
unsigned long endvma = vma->vm_end;
|
||||
|
||||
if (endvma > end)
|
||||
endvma = end;
|
||||
if (vma->vm_start > start)
|
||||
start = vma->vm_start;
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
goto next;
|
||||
|
||||
if (flags & MPOL_MF_LAZY) {
|
||||
change_prot_numa(vma, start, endvma);
|
||||
goto next;
|
||||
}
|
||||
|
||||
if ((flags & MPOL_MF_STRICT) ||
|
||||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
|
||||
vma_migratable(vma))) {
|
||||
|
||||
err = check_pgd_range(vma, start, endvma, nodes,
|
||||
flags, private);
|
||||
if (err) {
|
||||
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
|
||||
break;
|
||||
}
|
||||
}
|
||||
next:
|
||||
prev = vma;
|
||||
}
|
||||
return first;
|
||||
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
|
||||
|
||||
if (!list_empty(&pagelist)) {
|
||||
err = migrate_pages(&pagelist, new_node_page, dest,
|
||||
false, MIGRATE_SYNC);
|
||||
false, MIGRATE_SYNC,
|
||||
MR_SYSCALL);
|
||||
if (err)
|
||||
putback_lru_pages(&pagelist);
|
||||
}
|
||||
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
|
||||
int err;
|
||||
LIST_HEAD(pagelist);
|
||||
|
||||
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
|
||||
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
|
||||
if (flags & ~(unsigned long)MPOL_MF_VALID)
|
||||
return -EINVAL;
|
||||
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
|
||||
return -EPERM;
|
||||
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
|
||||
if (IS_ERR(new))
|
||||
return PTR_ERR(new);
|
||||
|
||||
if (flags & MPOL_MF_LAZY)
|
||||
new->flags |= MPOL_F_MOF;
|
||||
|
||||
/*
|
||||
* If we are using the default policy then operation
|
||||
* on discontinuous address spaces is okay after all
|
||||
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
|
||||
vma = check_range(mm, start, end, nmask,
|
||||
flags | MPOL_MF_INVERT, &pagelist);
|
||||
|
||||
err = PTR_ERR(vma);
|
||||
if (!IS_ERR(vma)) {
|
||||
int nr_failed = 0;
|
||||
|
||||
err = PTR_ERR(vma); /* maybe ... */
|
||||
if (!IS_ERR(vma))
|
||||
err = mbind_range(mm, start, end, new);
|
||||
|
||||
if (!err) {
|
||||
int nr_failed = 0;
|
||||
|
||||
if (!list_empty(&pagelist)) {
|
||||
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
|
||||
nr_failed = migrate_pages(&pagelist, new_vma_page,
|
||||
(unsigned long)vma,
|
||||
false, MIGRATE_SYNC);
|
||||
false, MIGRATE_SYNC,
|
||||
MR_MEMPOLICY_MBIND);
|
||||
if (nr_failed)
|
||||
putback_lru_pages(&pagelist);
|
||||
}
|
||||
|
||||
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
|
||||
if (nr_failed && (flags & MPOL_MF_STRICT))
|
||||
err = -EIO;
|
||||
} else
|
||||
putback_lru_pages(&pagelist);
|
||||
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
|
||||
goto out_put;
|
||||
}
|
||||
|
||||
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
|
||||
if (!nodes_subset(*new, node_states[N_MEMORY])) {
|
||||
err = -EINVAL;
|
||||
goto out_put;
|
||||
}
|
||||
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
|
||||
struct mempolicy *get_vma_policy(struct task_struct *task,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
struct mempolicy *pol = task->mempolicy;
|
||||
struct mempolicy *pol = get_task_policy(task);
|
||||
|
||||
if (vma) {
|
||||
if (vma->vm_ops && vma->vm_ops->get_policy) {
|
||||
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
unsigned long addr, int node)
|
||||
{
|
||||
struct mempolicy *pol;
|
||||
struct zonelist *zl;
|
||||
struct page *page;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
|
||||
@@ -1926,23 +1997,11 @@ retry_cpuset:
|
||||
|
||||
return page;
|
||||
}
|
||||
zl = policy_zonelist(gfp, pol, node);
|
||||
if (unlikely(mpol_needs_cond_ref(pol))) {
|
||||
/*
|
||||
* slow path: ref counted shared policy
|
||||
*/
|
||||
struct page *page = __alloc_pages_nodemask(gfp, order,
|
||||
zl, policy_nodemask(gfp, pol));
|
||||
__mpol_put(pol);
|
||||
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
|
||||
goto retry_cpuset;
|
||||
return page;
|
||||
}
|
||||
/*
|
||||
* fast path: default or task policy
|
||||
*/
|
||||
page = __alloc_pages_nodemask(gfp, order, zl,
|
||||
page = __alloc_pages_nodemask(gfp, order,
|
||||
policy_zonelist(gfp, pol, node),
|
||||
policy_nodemask(gfp, pol));
|
||||
if (unlikely(mpol_needs_cond_ref(pol)))
|
||||
__mpol_put(pol);
|
||||
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
|
||||
goto retry_cpuset;
|
||||
return page;
|
||||
@@ -1969,7 +2028,7 @@ retry_cpuset:
|
||||
*/
|
||||
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
|
||||
{
|
||||
struct mempolicy *pol = current->mempolicy;
|
||||
struct mempolicy *pol = get_task_policy(current);
|
||||
struct page *page;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
|
||||
@@ -2037,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
|
||||
return new;
|
||||
}
|
||||
|
||||
/*
|
||||
* If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
|
||||
* eliminate the * MPOL_F_* flags that require conditional ref and
|
||||
* [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
|
||||
* after return. Use the returned value.
|
||||
*
|
||||
* Allows use of a mempolicy for, e.g., multiple allocations with a single
|
||||
* policy lookup, even if the policy needs/has extra ref on lookup.
|
||||
* shmem_readahead needs this.
|
||||
*/
|
||||
struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
|
||||
struct mempolicy *frompol)
|
||||
{
|
||||
if (!mpol_needs_cond_ref(frompol))
|
||||
return frompol;
|
||||
|
||||
*tompol = *frompol;
|
||||
tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
|
||||
__mpol_put(frompol);
|
||||
return tompol;
|
||||
}
|
||||
|
||||
/* Slow path of a mempolicy comparison */
|
||||
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
||||
{
|
||||
@@ -2095,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
||||
*/
|
||||
|
||||
/* lookup first element intersecting start-end */
|
||||
/* Caller holds sp->mutex */
|
||||
/* Caller holds sp->lock */
|
||||
static struct sp_node *
|
||||
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
|
||||
{
|
||||
@@ -2159,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
|
||||
|
||||
if (!sp->root.rb_node)
|
||||
return NULL;
|
||||
mutex_lock(&sp->mutex);
|
||||
spin_lock(&sp->lock);
|
||||
sn = sp_lookup(sp, idx, idx+1);
|
||||
if (sn) {
|
||||
mpol_get(sn->policy);
|
||||
pol = sn->policy;
|
||||
}
|
||||
mutex_unlock(&sp->mutex);
|
||||
spin_unlock(&sp->lock);
|
||||
return pol;
|
||||
}
|
||||
|
||||
@@ -2175,6 +2212,115 @@ static void sp_free(struct sp_node *n)
|
||||
kmem_cache_free(sn_cache, n);
|
||||
}
|
||||
|
||||
/**
|
||||
* mpol_misplaced - check whether current page node is valid in policy
|
||||
*
|
||||
* @page - page to be checked
|
||||
* @vma - vm area where page mapped
|
||||
* @addr - virtual address where page mapped
|
||||
*
|
||||
* Lookup current policy node id for vma,addr and "compare to" page's
|
||||
* node id.
|
||||
*
|
||||
* Returns:
|
||||
* -1 - not misplaced, page is in the right node
|
||||
* node - node id where the page should be
|
||||
*
|
||||
* Policy determination "mimics" alloc_page_vma().
|
||||
* Called from fault path where we know the vma and faulting address.
|
||||
*/
|
||||
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
struct mempolicy *pol;
|
||||
struct zone *zone;
|
||||
int curnid = page_to_nid(page);
|
||||
unsigned long pgoff;
|
||||
int polnid = -1;
|
||||
int ret = -1;
|
||||
|
||||
BUG_ON(!vma);
|
||||
|
||||
pol = get_vma_policy(current, vma, addr);
|
||||
if (!(pol->flags & MPOL_F_MOF))
|
||||
goto out;
|
||||
|
||||
switch (pol->mode) {
|
||||
case MPOL_INTERLEAVE:
|
||||
BUG_ON(addr >= vma->vm_end);
|
||||
BUG_ON(addr < vma->vm_start);
|
||||
|
||||
pgoff = vma->vm_pgoff;
|
||||
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
|
||||
polnid = offset_il_node(pol, vma, pgoff);
|
||||
break;
|
||||
|
||||
case MPOL_PREFERRED:
|
||||
if (pol->flags & MPOL_F_LOCAL)
|
||||
polnid = numa_node_id();
|
||||
else
|
||||
polnid = pol->v.preferred_node;
|
||||
break;
|
||||
|
||||
case MPOL_BIND:
|
||||
/*
|
||||
* allows binding to multiple nodes.
|
||||
* use current page if in policy nodemask,
|
||||
* else select nearest allowed node, if any.
|
||||
* If no allowed nodes, use current [!misplaced].
|
||||
*/
|
||||
if (node_isset(curnid, pol->v.nodes))
|
||||
goto out;
|
||||
(void)first_zones_zonelist(
|
||||
node_zonelist(numa_node_id(), GFP_HIGHUSER),
|
||||
gfp_zone(GFP_HIGHUSER),
|
||||
&pol->v.nodes, &zone);
|
||||
polnid = zone->node;
|
||||
break;
|
||||
|
||||
default:
|
||||
BUG();
|
||||
}

/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
int last_nid;

polnid = numa_node_id();

/*
* Multi-stage node selection is used in conjunction
* with a periodic migration fault to build a temporal
* task<->page relation. By using a two-stage filter we
* remove short/unlikely relations.
*
* Using P(p) ~ n_p / n_t as per frequentist
* probability, we can equate a task's usage of a
* particular page (n_p) per total usage of this
* page (n_t) (in a given time-span) to a probability.
*
* Our periodic faults will sample this probability and
* getting the same result twice in a row, given these
* samples are fully independent, is then given by
* P(n)^2, provided our sample period is sufficiently
* short compared to the usage pattern.
*
* This quadric squishes small probabilities, making
* it less likely we act on an unlikely task<->page
* relation.
*/
last_nid = page_xchg_last_nid(page, polnid);
if (last_nid != polnid)
goto out;
}

if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);

return ret;
}
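
Note on the MPOL_F_MORON branch above: a page is only treated as misplaced once two consecutive hinting faults nominate the same destination node, which squares the probability of acting on a fleeting task<->page relation. A simplified standalone sketch of that filter (no atomics, invented struct; the kernel uses an atomic xchg of the page's last_nid):

/* Standalone sketch (simplified): migrate only when the last recorded node
 * matches the new candidate node. */
#include <stdio.h>

struct fake_page { int last_nid; };

static int should_migrate(struct fake_page *page, int polnid)
{
    int last_nid = page->last_nid;    /* kernel: page_xchg_last_nid() */

    page->last_nid = polnid;
    return last_nid == polnid;
}

int main(void)
{
    struct fake_page p = { .last_nid = -1 };

    printf("%d\n", should_migrate(&p, 1));    /* 0: first sample on node 1 */
    printf("%d\n", should_migrate(&p, 1));    /* 1: second consecutive sample */
    printf("%d\n", should_migrate(&p, 0));    /* 0: node changed again */
    return 0;
}
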
|
||||
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
|
||||
{
|
||||
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
|
||||
@@ -2182,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
|
||||
sp_free(n);
|
||||
}
|
||||
|
||||
static void sp_node_init(struct sp_node *node, unsigned long start,
|
||||
unsigned long end, struct mempolicy *pol)
|
||||
{
|
||||
node->start = start;
|
||||
node->end = end;
|
||||
node->policy = pol;
|
||||
}
|
||||
|
||||
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
|
||||
struct mempolicy *pol)
|
||||
{
|
||||
@@ -2198,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
|
||||
return NULL;
|
||||
}
|
||||
newpol->flags |= MPOL_F_SHARED;
|
||||
|
||||
n->start = start;
|
||||
n->end = end;
|
||||
n->policy = newpol;
|
||||
sp_node_init(n, start, end, newpol);
|
||||
|
||||
return n;
|
||||
}
|
||||
@@ -2211,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
|
||||
unsigned long end, struct sp_node *new)
|
||||
{
|
||||
struct sp_node *n;
|
||||
struct sp_node *n_new = NULL;
|
||||
struct mempolicy *mpol_new = NULL;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&sp->mutex);
|
||||
restart:
|
||||
spin_lock(&sp->lock);
|
||||
n = sp_lookup(sp, start, end);
|
||||
/* Take care of old policies in the same range. */
|
||||
while (n && n->start < end) {
|
||||
@@ -2226,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
|
||||
} else {
|
||||
/* Old policy spanning whole new range. */
|
||||
if (n->end > end) {
|
||||
struct sp_node *new2;
|
||||
new2 = sp_alloc(end, n->end, n->policy);
|
||||
if (!new2) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
if (!n_new)
|
||||
goto alloc_new;
|
||||
|
||||
*mpol_new = *n->policy;
|
||||
atomic_set(&mpol_new->refcnt, 1);
|
||||
sp_node_init(n_new, n->end, end, mpol_new);
|
||||
sp_insert(sp, n_new);
|
||||
n->end = start;
|
||||
sp_insert(sp, new2);
|
||||
n_new = NULL;
|
||||
mpol_new = NULL;
|
||||
break;
|
||||
} else
|
||||
n->end = start;
|
||||
@@ -2244,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
|
||||
}
|
||||
if (new)
|
||||
sp_insert(sp, new);
|
||||
out:
|
||||
mutex_unlock(&sp->mutex);
|
||||
spin_unlock(&sp->lock);
|
||||
ret = 0;
|
||||
|
||||
err_out:
|
||||
if (mpol_new)
|
||||
mpol_put(mpol_new);
|
||||
if (n_new)
|
||||
kmem_cache_free(sn_cache, n_new);
|
||||
|
||||
return ret;
|
||||
|
||||
alloc_new:
|
||||
spin_unlock(&sp->lock);
|
||||
ret = -ENOMEM;
|
||||
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
|
||||
if (!n_new)
|
||||
goto err_out;
|
||||
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
|
||||
if (!mpol_new)
|
||||
goto err_out;
|
||||
goto restart;
|
||||
}
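
Note on the shared_policy_replace() hunk above: with sp->lock turned into a spinlock, sp_node and mempolicy objects can no longer be allocated while the lock is held, so the code drops the lock, allocates, and restarts the lookup. A rough userspace sketch of that allocate-outside-the-lock/restart pattern (pthread spinlock, hypothetical types):

/* Standalone sketch: pre-allocate outside the spinlock, then retry. */
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

static pthread_spinlock_t sp_lock;
struct sp_node_sketch { int start, end; };

static int replace_range(struct sp_node_sketch **spare)
{
    pthread_spin_lock(&sp_lock);
restart:
    if (!*spare) {
        /* can't sleep in an allocator while spinning: back off */
        pthread_spin_unlock(&sp_lock);
        *spare = malloc(sizeof(**spare));
        if (!*spare)
            return -1;               /* -ENOMEM */
        pthread_spin_lock(&sp_lock);
        goto restart;
    }
    (*spare)->start = 0;             /* use the pre-allocated node */
    (*spare)->end = 10;
    pthread_spin_unlock(&sp_lock);
    return 0;
}

int main(void)
{
    struct sp_node_sketch *spare = NULL;

    pthread_spin_init(&sp_lock, 0);
    printf("%d\n", replace_range(&spare));    /* 0 on success */
    free(spare);
    return 0;
}
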
|
||||
|
||||
/**
|
||||
@@ -2264,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
|
||||
int ret;
|
||||
|
||||
sp->root = RB_ROOT; /* empty tree == default mempolicy */
|
||||
mutex_init(&sp->mutex);
|
||||
spin_lock_init(&sp->lock);
|
||||
|
||||
if (mpol) {
|
||||
struct vm_area_struct pvma;
|
||||
@@ -2330,16 +2504,60 @@ void mpol_free_shared_policy(struct shared_policy *p)
|
||||
|
||||
if (!p->root.rb_node)
|
||||
return;
|
||||
mutex_lock(&p->mutex);
|
||||
spin_lock(&p->lock);
|
||||
next = rb_first(&p->root);
|
||||
while (next) {
|
||||
n = rb_entry(next, struct sp_node, nd);
|
||||
next = rb_next(&n->nd);
|
||||
sp_delete(p, n);
|
||||
}
|
||||
mutex_unlock(&p->mutex);
|
||||
spin_unlock(&p->lock);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static bool __initdata numabalancing_override;
|
||||
|
||||
static void __init check_numabalancing_enable(void)
|
||||
{
|
||||
bool numabalancing_default = false;
|
||||
|
||||
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
|
||||
numabalancing_default = true;
|
||||
|
||||
if (nr_node_ids > 1 && !numabalancing_override) {
|
||||
printk(KERN_INFO "Enabling automatic NUMA balancing. "
|
||||
"Configure with numa_balancing= or sysctl");
|
||||
set_numabalancing_state(numabalancing_default);
|
||||
}
|
||||
}
|
||||
|
||||
static int __init setup_numabalancing(char *str)
|
||||
{
|
||||
int ret = 0;
|
||||
if (!str)
|
||||
goto out;
|
||||
numabalancing_override = true;
|
||||
|
||||
if (!strcmp(str, "enable")) {
|
||||
set_numabalancing_state(true);
|
||||
ret = 1;
|
||||
} else if (!strcmp(str, "disable")) {
|
||||
set_numabalancing_state(false);
|
||||
ret = 1;
|
||||
}
|
||||
out:
|
||||
if (!ret)
|
||||
printk(KERN_WARNING "Unable to parse numa_balancing=\n");
|
||||
|
||||
return ret;
|
||||
}
|
||||
__setup("numa_balancing=", setup_numabalancing);
|
||||
#else
|
||||
static inline void __init check_numabalancing_enable(void)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
/* assumes fs == KERNEL_DS */
|
||||
void __init numa_policy_init(void)
|
||||
{
|
||||
@@ -2355,13 +2573,22 @@ void __init numa_policy_init(void)
|
||||
sizeof(struct sp_node),
|
||||
0, SLAB_PANIC, NULL);
|
||||
|
||||
for_each_node(nid) {
|
||||
preferred_node_policy[nid] = (struct mempolicy) {
|
||||
.refcnt = ATOMIC_INIT(1),
|
||||
.mode = MPOL_PREFERRED,
|
||||
.flags = MPOL_F_MOF | MPOL_F_MORON,
|
||||
.v = { .preferred_node = nid, },
|
||||
};
|
||||
}
|
||||
|
||||
/*
|
||||
* Set interleaving policy for system init. Interleaving is only
|
||||
* enabled across suitably sized nodes (default is >= 16MB), or
|
||||
* fall back to the largest node if they're all smaller.
|
||||
*/
|
||||
nodes_clear(interleave_nodes);
|
||||
for_each_node_state(nid, N_HIGH_MEMORY) {
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
unsigned long total_pages = node_present_pages(nid);
|
||||
|
||||
/* Preserve the largest node */
|
||||
@@ -2381,6 +2608,8 @@ void __init numa_policy_init(void)
|
||||
|
||||
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
|
||||
printk("numa_policy_init: interleaving failed\n");
|
||||
|
||||
check_numabalancing_enable();
|
||||
}
|
||||
|
||||
/* Reset policy of current process to default */
|
||||
@@ -2394,44 +2623,34 @@ void numa_default_policy(void)
|
||||
*/
|
||||
|
||||
/*
|
||||
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
|
||||
* Used only for mpol_parse_str() and mpol_to_str()
|
||||
* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
|
||||
*/
|
||||
#define MPOL_LOCAL MPOL_MAX
|
||||
static const char * const policy_modes[] =
|
||||
{
|
||||
[MPOL_DEFAULT] = "default",
|
||||
[MPOL_PREFERRED] = "prefer",
|
||||
[MPOL_BIND] = "bind",
|
||||
[MPOL_INTERLEAVE] = "interleave",
|
||||
[MPOL_LOCAL] = "local"
|
||||
[MPOL_LOCAL] = "local",
|
||||
};
|
||||
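The table above is indexed by the strings accepted by the tmpfs mpol= mount option, which mpol_parse_str() below splits as <mode>[=<flags>][:<nodelist>]. A hedged userspace sketch of that tokenization, mirroring the strchr()-based splitting in the function (error handling reduced to the essentials):

#include <stdio.h>
#include <string.h>

static const char * const modes[] = {
	"default", "prefer", "bind", "interleave", "local",
};

/* Split "mode[=flags][:nodelist]" the way mpol_parse_str() does. */
static int parse_mpol(char *str, int *mode, char **flags, char **nodelist)
{
	unsigned int i;

	*nodelist = strchr(str, ':');
	if (*nodelist)
		*(*nodelist)++ = '\0';	/* terminate the mode[=flags] part */

	*flags = strchr(str, '=');
	if (*flags)
		*(*flags)++ = '\0';	/* terminate the mode string */

	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
		if (!strcmp(str, modes[i])) {
			*mode = i;
			return 0;
		}
	}
	return -1;			/* unrecognized mode */
}

int main(void)
{
	char opt[] = "interleave=static:0-3";
	char *flags, *nodelist;
	int mode;

	if (!parse_mpol(opt, &mode, &flags, &nodelist))
		printf("mode=%s flags=%s nodes=%s\n", modes[mode],
		       flags ? flags : "-", nodelist ? nodelist : "-");
	return 0;
}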
|
||||
|
||||
#ifdef CONFIG_TMPFS
|
||||
/**
|
||||
* mpol_parse_str - parse string to mempolicy
|
||||
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
|
||||
* @str: string containing mempolicy to parse
|
||||
* @mpol: pointer to struct mempolicy pointer, returned on success.
|
||||
* @no_context: flag whether to "contextualize" the mempolicy
|
||||
*
|
||||
* Format of input:
|
||||
* <mode>[=<flags>][:<nodelist>]
|
||||
*
|
||||
* if @no_context is true, save the input nodemask in w.user_nodemask in
|
||||
* the returned mempolicy. This will be used to "clone" the mempolicy in
|
||||
* a specific context [cpuset] at a later time. Used to parse tmpfs mpol
|
||||
* mount option. Note that if 'static' or 'relative' mode flags were
|
||||
* specified, the input nodemask will already have been saved. Saving
|
||||
* it again is redundant, but safe.
|
||||
*
|
||||
* On success, returns 0, else 1
|
||||
*/
|
||||
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
|
||||
int mpol_parse_str(char *str, struct mempolicy **mpol)
|
||||
{
|
||||
struct mempolicy *new = NULL;
|
||||
unsigned short mode;
|
||||
unsigned short uninitialized_var(mode_flags);
|
||||
unsigned short mode_flags;
|
||||
nodemask_t nodes;
|
||||
char *nodelist = strchr(str, ':');
|
||||
char *flags = strchr(str, '=');
|
||||
@@ -2442,7 +2661,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
|
||||
*nodelist++ = '\0';
|
||||
if (nodelist_parse(nodelist, nodes))
|
||||
goto out;
|
||||
if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
|
||||
if (!nodes_subset(nodes, node_states[N_MEMORY]))
|
||||
goto out;
|
||||
} else
|
||||
nodes_clear(nodes);
|
||||
@@ -2450,12 +2669,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
|
||||
if (flags)
|
||||
*flags++ = '\0'; /* terminate mode string */
|
||||
|
||||
for (mode = 0; mode <= MPOL_LOCAL; mode++) {
|
||||
for (mode = 0; mode < MPOL_MAX; mode++) {
|
||||
if (!strcmp(str, policy_modes[mode])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (mode > MPOL_LOCAL)
|
||||
if (mode >= MPOL_MAX)
|
||||
goto out;
|
||||
|
||||
switch (mode) {
|
||||
@@ -2476,7 +2695,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
|
||||
* Default to online nodes with memory if no nodelist
|
||||
*/
|
||||
if (!nodelist)
|
||||
nodes = node_states[N_HIGH_MEMORY];
|
||||
nodes = node_states[N_MEMORY];
|
||||
break;
|
||||
case MPOL_LOCAL:
|
||||
/*
|
||||
@@ -2519,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
|
||||
if (IS_ERR(new))
|
||||
goto out;
|
||||
|
||||
if (no_context) {
|
||||
/* save for contextualization */
|
||||
new->w.user_nodemask = nodes;
|
||||
} else {
|
||||
int ret;
|
||||
NODEMASK_SCRATCH(scratch);
|
||||
if (scratch) {
|
||||
task_lock(current);
|
||||
ret = mpol_set_nodemask(new, &nodes, scratch);
|
||||
task_unlock(current);
|
||||
} else
|
||||
ret = -ENOMEM;
|
||||
NODEMASK_SCRATCH_FREE(scratch);
|
||||
if (ret) {
|
||||
mpol_put(new);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Save nodes for mpol_to_str() to show the tmpfs mount options
|
||||
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
|
||||
*/
|
||||
if (mode != MPOL_PREFERRED)
|
||||
new->v.nodes = nodes;
|
||||
else if (nodelist)
|
||||
new->v.preferred_node = first_node(nodes);
|
||||
else
|
||||
new->flags |= MPOL_F_LOCAL;
|
||||
|
||||
/*
|
||||
* Save nodes for contextualization: this will be used to "clone"
|
||||
* the mempolicy in a specific context [cpuset] at a later time.
|
||||
*/
|
||||
new->w.user_nodemask = nodes;
|
||||
|
||||
err = 0;
|
||||
|
||||
out:
|
||||
@@ -2556,13 +2774,12 @@ out:
|
||||
* @buffer: to contain formatted mempolicy string
|
||||
* @maxlen: length of @buffer
|
||||
* @pol: pointer to mempolicy to be formatted
|
||||
* @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
|
||||
*
|
||||
* Convert a mempolicy into a string.
|
||||
* Returns the number of characters in buffer (if positive)
|
||||
* or an error (negative)
|
||||
*/
|
||||
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
|
||||
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
|
||||
{
|
||||
char *p = buffer;
|
||||
int l;
|
||||
@@ -2588,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
|
||||
case MPOL_PREFERRED:
|
||||
nodes_clear(nodes);
|
||||
if (flags & MPOL_F_LOCAL)
|
||||
mode = MPOL_LOCAL; /* pseudo-policy */
|
||||
mode = MPOL_LOCAL;
|
||||
else
|
||||
node_set(pol->v.preferred_node, nodes);
|
||||
break;
|
||||
@@ -2596,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
|
||||
case MPOL_BIND:
|
||||
/* Fall through */
|
||||
case MPOL_INTERLEAVE:
|
||||
if (no_context)
|
||||
nodes = pol->w.user_nodemask;
|
||||
else
|
||||
nodes = pol->v.nodes;
|
||||
nodes = pol->v.nodes;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
450
mm/migrate.c
@@ -35,9 +35,13 @@
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/hugetlb_cgroup.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/balloon_compaction.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/migrate.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
|
||||
list_del(&page->lru);
|
||||
dec_zone_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
putback_lru_page(page);
|
||||
putback_lru_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Put previously isolated pages back onto the appropriate lists
|
||||
* from where they were once taken off for compaction/migration.
|
||||
*
|
||||
* This function shall be used instead of putback_lru_pages(),
|
||||
* whenever the isolated pageset has been built by isolate_migratepages_range()
|
||||
*/
|
||||
void putback_movable_pages(struct list_head *l)
|
||||
{
|
||||
struct page *page;
|
||||
struct page *page2;
|
||||
|
||||
list_for_each_entry_safe(page, page2, l, lru) {
|
||||
list_del(&page->lru);
|
||||
dec_zone_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
if (unlikely(balloon_page_movable(page)))
|
||||
balloon_page_putback(page);
|
||||
else
|
||||
putback_lru_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
swp_entry_t entry;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *ptep, pte;
|
||||
spinlock_t *ptl;
|
||||
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
goto out;
|
||||
ptl = &mm->page_table_lock;
|
||||
} else {
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (!pgd_present(*pgd))
|
||||
pmd = mm_find_pmd(mm, addr);
|
||||
if (!pmd)
|
||||
goto out;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (!pud_present(*pud))
|
||||
goto out;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_trans_huge(*pmd))
|
||||
goto out;
|
||||
if (!pmd_present(*pmd))
|
||||
goto out;
|
||||
|
||||
ptep = pte_offset_map(pmd, addr);
|
||||
|
||||
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
|
||||
struct page *newpage, struct page *page,
|
||||
struct buffer_head *head, enum migrate_mode mode)
|
||||
{
|
||||
int expected_count;
|
||||
int expected_count = 0;
|
||||
void **pslot;
|
||||
|
||||
if (!mapping) {
|
||||
/* Anonymous page without mapping */
|
||||
if (page_count(page) != 1)
|
||||
return -EAGAIN;
|
||||
return 0;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
|
||||
}
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
|
||||
return 0;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
if (!mapping) {
|
||||
if (page_count(page) != 1)
|
||||
return -EAGAIN;
|
||||
return 0;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
page_unfreeze_refs(page, expected_count - 1);
|
||||
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
return 0;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
*/
|
||||
void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
{
|
||||
if (PageHuge(page))
|
||||
if (PageHuge(page) || PageTransHuge(page))
|
||||
copy_huge_page(newpage, page);
|
||||
else
|
||||
copy_highpage(newpage, page);
|
||||
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
|
||||
|
||||
rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
|
||||
|
||||
if (rc)
|
||||
if (rc != MIGRATEPAGE_SUCCESS)
|
||||
return rc;
|
||||
|
||||
migrate_page_copy(newpage, page);
|
||||
return 0;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL(migrate_page);
|
||||
|
||||
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
|
||||
|
||||
rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
|
||||
|
||||
if (rc)
|
||||
if (rc != MIGRATEPAGE_SUCCESS)
|
||||
return rc;
|
||||
|
||||
/*
|
||||
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
|
||||
|
||||
} while (bh != head);
|
||||
|
||||
return 0;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL(buffer_migrate_page);
|
||||
#endif
|
||||
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
|
||||
*
|
||||
* Return value:
|
||||
* < 0 - error code
|
||||
* == 0 - success
|
||||
* MIGRATEPAGE_SUCCESS - success
|
||||
*/
|
||||
static int move_to_new_page(struct page *newpage, struct page *page,
|
||||
int remap_swapcache, enum migrate_mode mode)
|
||||
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
|
||||
else
|
||||
rc = fallback_migrate_page(mapping, newpage, page, mode);
|
||||
|
||||
if (rc) {
|
||||
if (rc != MIGRATEPAGE_SUCCESS) {
|
||||
newpage->mapping = NULL;
|
||||
} else {
|
||||
if (remap_swapcache)
|
||||
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
*/
|
||||
if (PageAnon(page)) {
|
||||
/*
|
||||
* Only page_lock_anon_vma() understands the subtleties of
|
||||
* Only page_lock_anon_vma_read() understands the subtleties of
|
||||
* getting a hold on an anon_vma from outside one of its mms.
|
||||
*/
|
||||
anon_vma = page_get_anon_vma(page);
|
||||
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(balloon_page_movable(page))) {
|
||||
/*
|
||||
* A ballooned page does not need any special attention from
|
||||
* physical to virtual reverse mapping procedures.
|
||||
* Skip any attempt to unmap PTEs or to remap swap cache,
|
||||
* in order to avoid burning cycles at rmap level, and perform
|
||||
* the page migration right away (protected by page lock).
|
||||
*/
|
||||
rc = balloon_page_migrate(newpage, page, mode);
|
||||
goto uncharge;
|
||||
}
|
||||
|
||||
/*
|
||||
* Corner case handling:
|
||||
* 1. When a new swap-cache page is read into, it is added to the LRU
|
||||
@@ -814,7 +843,9 @@ skip_unmap:
|
||||
put_anon_vma(anon_vma);
|
||||
|
||||
uncharge:
|
||||
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
|
||||
mem_cgroup_end_migration(mem, page, newpage,
|
||||
(rc == MIGRATEPAGE_SUCCESS ||
|
||||
rc == MIGRATEPAGE_BALLOON_SUCCESS));
|
||||
unlock:
|
||||
unlock_page(page);
|
||||
out:
|
||||
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
|
||||
goto out;
|
||||
|
||||
rc = __unmap_and_move(page, newpage, force, offlining, mode);
|
||||
|
||||
if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
|
||||
/*
|
||||
* A ballooned page has been migrated already.
|
||||
* Now it's time to wrap up counters,
* hand the page back to the buddy allocator and return.
|
||||
*/
|
||||
dec_zone_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
balloon_page_free(page);
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
out:
|
||||
if (rc != -EAGAIN) {
|
||||
/*
|
||||
@@ -958,10 +1001,11 @@ out:
|
||||
*/
|
||||
int migrate_pages(struct list_head *from,
|
||||
new_page_t get_new_page, unsigned long private, bool offlining,
|
||||
enum migrate_mode mode)
|
||||
enum migrate_mode mode, int reason)
|
||||
{
|
||||
int retry = 1;
|
||||
int nr_failed = 0;
|
||||
int nr_succeeded = 0;
|
||||
int pass = 0;
|
||||
struct page *page;
|
||||
struct page *page2;
|
||||
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
|
||||
case -EAGAIN:
|
||||
retry++;
|
||||
break;
|
||||
case 0:
|
||||
case MIGRATEPAGE_SUCCESS:
|
||||
nr_succeeded++;
|
||||
break;
|
||||
default:
|
||||
/* Permanent failure */
|
||||
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = 0;
|
||||
rc = nr_failed + retry;
|
||||
out:
|
||||
if (nr_succeeded)
|
||||
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
|
||||
if (nr_failed)
|
||||
count_vm_events(PGMIGRATE_FAIL, nr_failed);
|
||||
trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
|
||||
|
||||
if (!swapwrite)
|
||||
current->flags &= ~PF_SWAPWRITE;
|
||||
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
return nr_failed + retry;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
|
||||
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
|
||||
/* try again */
|
||||
cond_resched();
|
||||
break;
|
||||
case 0:
|
||||
case MIGRATEPAGE_SUCCESS:
|
||||
goto out;
|
||||
default:
|
||||
rc = -EIO;
|
||||
@@ -1139,7 +1187,8 @@ set_status:
|
||||
err = 0;
|
||||
if (!list_empty(&pagelist)) {
|
||||
err = migrate_pages(&pagelist, new_page_node,
|
||||
(unsigned long)pm, 0, MIGRATE_SYNC);
|
||||
(unsigned long)pm, 0, MIGRATE_SYNC,
|
||||
MR_SYSCALL);
|
||||
if (err)
|
||||
putback_lru_pages(&pagelist);
|
||||
}
|
||||
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
|
||||
if (node < 0 || node >= MAX_NUMNODES)
|
||||
goto out_pm;
|
||||
|
||||
if (!node_state(node, N_HIGH_MEMORY))
|
||||
if (!node_state(node, N_MEMORY))
|
||||
goto out_pm;
|
||||
|
||||
err = -EACCES;
|
||||
@@ -1403,4 +1452,329 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
|
||||
}
|
||||
return err;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
/*
|
||||
* Returns true if this is a safe migration target node for misplaced NUMA
|
||||
* pages. Currently it only checks the watermarks, which is crude.
|
||||
*/
|
||||
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
|
||||
int nr_migrate_pages)
|
||||
{
|
||||
int z;
|
||||
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
|
||||
struct zone *zone = pgdat->node_zones + z;
|
||||
|
||||
if (!populated_zone(zone))
|
||||
continue;
|
||||
|
||||
if (zone->all_unreclaimable)
|
||||
continue;
|
||||
|
||||
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
|
||||
if (!zone_watermark_ok(zone, 0,
|
||||
high_wmark_pages(zone) +
|
||||
nr_migrate_pages,
|
||||
0, 0))
|
||||
continue;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
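migrate_balanced_pgdat() above accepts a destination node only if some zone can absorb nr_migrate_pages while staying above its high watermark, so the transfer never pushes the target into reclaim. A small self-contained sketch of that test (plain C with illustrative numbers; the kernel uses zone_watermark_ok() against per-zone state):

#include <stdbool.h>
#include <stdio.h>

/*
 * A destination can take nr_migrate_pages without waking kswapd if its
 * free pages stay above the high watermark even after the transfer.
 */
static bool can_absorb(unsigned long free_pages,
		       unsigned long high_wmark,
		       unsigned long nr_migrate_pages)
{
	return free_pages >= high_wmark + nr_migrate_pages;
}

int main(void)
{
	/* Illustrative numbers: 10000 pages free, high watermark 8000. */
	printf("512 pages:  %s\n", can_absorb(10000, 8000, 512) ? "ok" : "too full");
	printf("4096 pages: %s\n", can_absorb(10000, 8000, 4096) ? "ok" : "too full");
	return 0;
}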
|
||||
static struct page *alloc_misplaced_dst_page(struct page *page,
|
||||
unsigned long data,
|
||||
int **result)
|
||||
{
|
||||
int nid = (int) data;
|
||||
struct page *newpage;
|
||||
|
||||
newpage = alloc_pages_exact_node(nid,
|
||||
(GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
|
||||
__GFP_NOMEMALLOC | __GFP_NORETRY |
|
||||
__GFP_NOWARN) &
|
||||
~GFP_IOFS, 0);
|
||||
if (newpage)
|
||||
page_xchg_last_nid(newpage, page_last_nid(page));
|
||||
|
||||
return newpage;
|
||||
}
|
||||
|
||||
/*
|
||||
* page migration rate limiting control.
|
||||
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
|
||||
* window of time. Default here says do not migrate more than 1280M per second.
|
||||
* If a node is rate-limited then PTE NUMA updates are also rate-limited. However
|
||||
* as it is faults that reset the window, pte updates will happen unconditionally
|
||||
* if there has not been a fault since @pteupdate_interval_millisecs after the
|
||||
* throttle window closed.
|
||||
*/
|
||||
static unsigned int migrate_interval_millisecs __read_mostly = 100;
|
||||
static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
|
||||
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
|
||||
|
||||
/* Returns true if NUMA migration is currently rate limited */
|
||||
bool migrate_ratelimited(int node)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(node);
|
||||
|
||||
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
|
||||
msecs_to_jiffies(pteupdate_interval_millisecs)))
|
||||
return false;
|
||||
|
||||
if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Returns true if the node is migrate rate-limited after the update */
|
||||
bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
|
||||
{
|
||||
bool rate_limited = false;
|
||||
|
||||
/*
|
||||
* Rate-limit the amount of data that is being migrated to a node.
|
||||
* Optimal placement is no good if the memory bus is saturated and
|
||||
* all the time is being spent migrating!
|
||||
*/
|
||||
spin_lock(&pgdat->numabalancing_migrate_lock);
|
||||
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
|
||||
pgdat->numabalancing_migrate_nr_pages = 0;
|
||||
pgdat->numabalancing_migrate_next_window = jiffies +
|
||||
msecs_to_jiffies(migrate_interval_millisecs);
|
||||
}
|
||||
if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
|
||||
rate_limited = true;
|
||||
else
|
||||
pgdat->numabalancing_migrate_nr_pages += nr_pages;
|
||||
spin_unlock(&pgdat->numabalancing_migrate_lock);
|
||||
|
||||
return rate_limited;
|
||||
}
|
||||
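numamigrate_update_ratelimit() above is a fixed-window limiter: once the window (migrate_interval_millisecs) has elapsed the page counter resets, and a request is refused when the counter has already exceeded ratelimit_pages. A hedged userspace sketch of the same scheme, with a monotonic clock standing in for jiffies and a mutex standing in for the pgdat spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define WINDOW_MS	100UL			/* migrate_interval_millisecs */
#define LIMIT_PAGES	(128UL << (20 - 12))	/* 128MB worth of 4K pages */

static pthread_mutex_t rl_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t window_end_ms;
static unsigned long window_pages;

static uint64_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/* Returns true if the caller should skip migrating nr_pages right now. */
static bool update_ratelimit(unsigned long nr_pages)
{
	bool limited = false;
	uint64_t now = now_ms();

	pthread_mutex_lock(&rl_lock);
	if (now > window_end_ms) {		/* window expired: reset */
		window_pages = 0;
		window_end_ms = now + WINDOW_MS;
	}
	if (window_pages > LIMIT_PAGES)
		limited = true;			/* over budget for this window */
	else
		window_pages += nr_pages;	/* charge the budget */
	pthread_mutex_unlock(&rl_lock);

	return limited;
}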
|
||||
int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
/* Avoid migrating to a node that is nearly full */
|
||||
if (migrate_balanced_pgdat(pgdat, 1)) {
|
||||
int page_lru;
|
||||
|
||||
if (isolate_lru_page(page)) {
|
||||
put_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Page is isolated */
|
||||
ret = 1;
|
||||
page_lru = page_is_file_cache(page);
|
||||
if (!PageTransHuge(page))
|
||||
inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
|
||||
else
|
||||
mod_zone_page_state(page_zone(page),
|
||||
NR_ISOLATED_ANON + page_lru,
|
||||
HPAGE_PMD_NR);
|
||||
}
|
||||
|
||||
/*
|
||||
* Page is either isolated or there is not enough space on the target
|
||||
* node. If isolated, then it has taken a reference count and the
|
||||
* caller's reference can be safely dropped without the page
* disappearing underneath us during migration. Otherwise the page is
* not to be migrated but the caller's reference should still be
|
||||
* dropped so it does not leak.
|
||||
*/
|
||||
put_page(page);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to migrate a misplaced page to the specified destination
|
||||
* node. Caller is expected to have an elevated reference count on
|
||||
* the page that will be dropped by this function before returning.
|
||||
*/
|
||||
int migrate_misplaced_page(struct page *page, int node)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(node);
|
||||
int isolated = 0;
|
||||
int nr_remaining;
|
||||
LIST_HEAD(migratepages);
|
||||
|
||||
/*
|
||||
* Don't migrate pages that are mapped in multiple processes.
|
||||
* TODO: Handle false sharing detection instead of this hammer
|
||||
*/
|
||||
if (page_mapcount(page) != 1) {
|
||||
put_page(page);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Rate-limit the amount of data that is being migrated to a node.
|
||||
* Optimal placement is no good if the memory bus is saturated and
|
||||
* all the time is being spent migrating!
|
||||
*/
|
||||
if (numamigrate_update_ratelimit(pgdat, 1)) {
|
||||
put_page(page);
|
||||
goto out;
|
||||
}
|
||||
|
||||
isolated = numamigrate_isolate_page(pgdat, page);
|
||||
if (!isolated)
|
||||
goto out;
|
||||
|
||||
list_add(&page->lru, &migratepages);
|
||||
nr_remaining = migrate_pages(&migratepages,
|
||||
alloc_misplaced_dst_page,
|
||||
node, false, MIGRATE_ASYNC,
|
||||
MR_NUMA_MISPLACED);
|
||||
if (nr_remaining) {
|
||||
putback_lru_pages(&migratepages);
|
||||
isolated = 0;
|
||||
} else
|
||||
count_vm_numa_event(NUMA_PAGE_MIGRATE);
|
||||
BUG_ON(!list_empty(&migratepages));
|
||||
out:
|
||||
return isolated;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
|
||||
int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
pmd_t *pmd, pmd_t entry,
|
||||
unsigned long address,
|
||||
struct page *page, int node)
|
||||
{
|
||||
unsigned long haddr = address & HPAGE_PMD_MASK;
|
||||
pg_data_t *pgdat = NODE_DATA(node);
|
||||
int isolated = 0;
|
||||
struct page *new_page = NULL;
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
int page_lru = page_is_file_cache(page);
|
||||
|
||||
/*
|
||||
* Don't migrate pages that are mapped in multiple processes.
|
||||
* TODO: Handle false sharing detection instead of this hammer
|
||||
*/
|
||||
if (page_mapcount(page) != 1)
|
||||
goto out_dropref;
|
||||
|
||||
/*
|
||||
* Rate-limit the amount of data that is being migrated to a node.
|
||||
* Optimal placement is no good if the memory bus is saturated and
|
||||
* all the time is being spent migrating!
|
||||
*/
|
||||
if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
|
||||
goto out_dropref;
|
||||
|
||||
new_page = alloc_pages_node(node,
|
||||
(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
|
||||
if (!new_page) {
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
goto out_dropref;
|
||||
}
|
||||
page_xchg_last_nid(new_page, page_last_nid(page));
|
||||
|
||||
isolated = numamigrate_isolate_page(pgdat, page);
|
||||
|
||||
/*
|
||||
* Failing to isolate or a GUP pin prevents migration. The expected
|
||||
* page count is 2. 1 for anonymous pages without a mapping and 1
|
||||
* for the caller's pin. If the page was isolated, the page will
|
||||
* need to be put back on the LRU.
|
||||
*/
|
||||
if (!isolated || page_count(page) != 2) {
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
put_page(new_page);
|
||||
if (isolated) {
|
||||
putback_lru_page(page);
|
||||
isolated = 0;
|
||||
goto out;
|
||||
}
|
||||
goto out_keep_locked;
|
||||
}
|
||||
|
||||
/* Prepare a page as a migration target */
|
||||
__set_page_locked(new_page);
|
||||
SetPageSwapBacked(new_page);
|
||||
|
||||
/* anon mapping, we can simply copy page->mapping to the new page: */
|
||||
new_page->mapping = page->mapping;
|
||||
new_page->index = page->index;
|
||||
migrate_page_copy(new_page, page);
|
||||
WARN_ON(PageLRU(new_page));
|
||||
|
||||
/* Recheck the target PMD */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (unlikely(!pmd_same(*pmd, entry))) {
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
|
||||
/* Reverse changes made by migrate_page_copy() */
|
||||
if (TestClearPageActive(new_page))
|
||||
SetPageActive(page);
|
||||
if (TestClearPageUnevictable(new_page))
|
||||
SetPageUnevictable(page);
|
||||
mlock_migrate_page(page, new_page);
|
||||
|
||||
unlock_page(new_page);
|
||||
put_page(new_page); /* Free it */
|
||||
|
||||
unlock_page(page);
|
||||
putback_lru_page(page);
|
||||
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Traditional migration needs to prepare the memcg charge
|
||||
* transaction early to prevent the old page from being
|
||||
* uncharged when installing migration entries. Here we can
|
||||
* save the potential rollback and start the charge transfer
|
||||
* only when migration is already known to end successfully.
|
||||
*/
|
||||
mem_cgroup_prepare_migration(page, new_page, &memcg);
|
||||
|
||||
entry = mk_pmd(new_page, vma->vm_page_prot);
|
||||
entry = pmd_mknonnuma(entry);
|
||||
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
|
||||
entry = pmd_mkhuge(entry);
|
||||
|
||||
page_add_new_anon_rmap(new_page, vma, haddr);
|
||||
|
||||
set_pmd_at(mm, haddr, pmd, entry);
|
||||
update_mmu_cache_pmd(vma, address, &entry);
|
||||
page_remove_rmap(page);
|
||||
/*
|
||||
* Finish the charge transaction under the page table lock to
|
||||
* prevent split_huge_page() from dividing up the charge
|
||||
* before it's fully transferred to the new page.
|
||||
*/
|
||||
mem_cgroup_end_migration(memcg, page, new_page, true);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
|
||||
unlock_page(new_page);
|
||||
unlock_page(page);
|
||||
put_page(page); /* Drop the rmap reference */
|
||||
put_page(page); /* Drop the LRU isolation reference */
|
||||
|
||||
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
|
||||
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
|
||||
|
||||
out:
|
||||
mod_zone_page_state(page_zone(page),
|
||||
NR_ISOLATED_ANON + page_lru,
|
||||
-HPAGE_PMD_NR);
|
||||
return isolated;
|
||||
|
||||
out_dropref:
|
||||
put_page(page);
|
||||
out_keep_locked:
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
#endif /* CONFIG_NUMA */
|
||||
|
569
mm/mmap.c
@@ -31,6 +31,7 @@
|
||||
#include <linux/audit.h>
|
||||
#include <linux/khugepaged.h>
|
||||
#include <linux/uprobes.h>
|
||||
#include <linux/rbtree_augmented.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/cacheflush.h>
|
||||
@@ -88,6 +89,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
|
||||
*/
|
||||
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
|
||||
|
||||
/*
|
||||
* The global memory commitment made in the system can be a metric
|
||||
* that can be used to drive ballooning decisions when Linux is hosted
|
||||
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
||||
* balancing memory across competing virtual machines that are hosted.
|
||||
* Several metrics drive this policy engine including the guest reported
|
||||
* memory commitment.
|
||||
*/
|
||||
unsigned long vm_memory_committed(void)
|
||||
{
|
||||
return percpu_counter_read_positive(&vm_committed_as);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
||||
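vm_memory_committed() above exposes the vm_committed_as counter so a balloon driver can report the guest's memory commitment to the host's policy engine. The same counter backs the Committed_AS line in /proc/meminfo, so a userspace observer can sample the figure like this (a sketch; field parsing kept minimal):

#include <stdio.h>

/* Read the Committed_AS value (in kB) from /proc/meminfo. */
static long committed_as_kb(void)
{
	char line[256];
	long kb = -1;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Committed_AS: %ld kB", &kb) == 1)
			break;
	}
	fclose(f);
	return kb;
}

int main(void)
{
	printf("Committed_AS: %ld kB\n", committed_as_kb());
	return 0;
}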
|
||||
/*
|
||||
* Check that a process has enough memory to allocate a new virtual
|
||||
* mapping. 0 means there is enough memory for the allocation to
|
||||
@@ -297,40 +312,88 @@ out:
|
||||
return retval;
|
||||
}
|
||||
|
||||
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long max, subtree_gap;
|
||||
max = vma->vm_start;
|
||||
if (vma->vm_prev)
|
||||
max -= vma->vm_prev->vm_end;
|
||||
if (vma->vm_rb.rb_left) {
|
||||
subtree_gap = rb_entry(vma->vm_rb.rb_left,
|
||||
struct vm_area_struct, vm_rb)->rb_subtree_gap;
|
||||
if (subtree_gap > max)
|
||||
max = subtree_gap;
|
||||
}
|
||||
if (vma->vm_rb.rb_right) {
|
||||
subtree_gap = rb_entry(vma->vm_rb.rb_right,
|
||||
struct vm_area_struct, vm_rb)->rb_subtree_gap;
|
||||
if (subtree_gap > max)
|
||||
max = subtree_gap;
|
||||
}
|
||||
return max;
|
||||
}
|
||||
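vma_compute_subtree_gap() above defines the augmented value kept in each rbtree node: rb_subtree_gap is the largest gap appearing immediately before any vma in that node's subtree, i.e. the maximum of the node's own preceding gap and the cached values of its two children. A toy illustration of the same recurrence on a plain binary-tree node (not kernel code):

struct area {
	unsigned long start, end;	/* this mapping */
	unsigned long prev_end;		/* end of the previous mapping, or 0 */
	unsigned long subtree_gap;	/* cached max gap in this subtree */
	struct area *left, *right;
};

/* Largest gap preceding any area in the subtree rooted at @a. */
static unsigned long compute_subtree_gap(const struct area *a)
{
	unsigned long max = a->start - a->prev_end;	/* own gap */

	if (a->left && a->left->subtree_gap > max)
		max = a->left->subtree_gap;
	if (a->right && a->right->subtree_gap > max)
		max = a->right->subtree_gap;
	return max;
}

When a vma's vm_start or its predecessor's vm_end changes, the cached values on the path to the root have to be refreshed, which is what vma_gap_update() does via the propagate callback generated by RB_DECLARE_CALLBACKS() further down.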
|
||||
#ifdef CONFIG_DEBUG_VM_RB
|
||||
static int browse_rb(struct rb_root *root)
|
||||
{
|
||||
int i = 0, j;
|
||||
int i = 0, j, bug = 0;
|
||||
struct rb_node *nd, *pn = NULL;
|
||||
unsigned long prev = 0, pend = 0;
|
||||
|
||||
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
|
||||
struct vm_area_struct *vma;
|
||||
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
|
||||
if (vma->vm_start < prev)
|
||||
printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
|
||||
if (vma->vm_start < pend)
|
||||
if (vma->vm_start < prev) {
|
||||
printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
|
||||
bug = 1;
|
||||
}
|
||||
if (vma->vm_start < pend) {
|
||||
printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
|
||||
if (vma->vm_start > vma->vm_end)
|
||||
printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
|
||||
bug = 1;
|
||||
}
|
||||
if (vma->vm_start > vma->vm_end) {
|
||||
printk("vm_end %lx < vm_start %lx\n",
|
||||
vma->vm_end, vma->vm_start);
|
||||
bug = 1;
|
||||
}
|
||||
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
|
||||
printk("free gap %lx, correct %lx\n",
|
||||
vma->rb_subtree_gap,
|
||||
vma_compute_subtree_gap(vma));
|
||||
bug = 1;
|
||||
}
|
||||
i++;
|
||||
pn = nd;
|
||||
prev = vma->vm_start;
|
||||
pend = vma->vm_end;
|
||||
}
|
||||
j = 0;
|
||||
for (nd = pn; nd; nd = rb_prev(nd)) {
|
||||
for (nd = pn; nd; nd = rb_prev(nd))
|
||||
j++;
|
||||
if (i != j) {
|
||||
printk("backwards %d, forwards %d\n", j, i);
|
||||
bug = 1;
|
||||
}
|
||||
return bug ? -1 : i;
|
||||
}
|
||||
|
||||
static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
|
||||
{
|
||||
struct rb_node *nd;
|
||||
|
||||
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
|
||||
struct vm_area_struct *vma;
|
||||
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
|
||||
BUG_ON(vma != ignore &&
|
||||
vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
|
||||
}
|
||||
if (i != j)
|
||||
printk("backwards %d, forwards %d\n", j, i), i = 0;
|
||||
return i;
|
||||
}
|
||||
|
||||
void validate_mm(struct mm_struct *mm)
|
||||
{
|
||||
int bug = 0;
|
||||
int i = 0;
|
||||
unsigned long highest_address = 0;
|
||||
struct vm_area_struct *vma = mm->mmap;
|
||||
while (vma) {
|
||||
struct anon_vma_chain *avc;
|
||||
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
|
||||
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
|
||||
anon_vma_interval_tree_verify(avc);
|
||||
vma_unlock_anon_vma(vma);
|
||||
highest_address = vma->vm_end;
|
||||
vma = vma->vm_next;
|
||||
i++;
|
||||
}
|
||||
if (i != mm->map_count)
|
||||
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
|
||||
if (i != mm->map_count) {
|
||||
printk("map_count %d vm_next %d\n", mm->map_count, i);
|
||||
bug = 1;
|
||||
}
|
||||
if (highest_address != mm->highest_vm_end) {
|
||||
printk("mm->highest_vm_end %lx, found %lx\n",
|
||||
mm->highest_vm_end, highest_address);
|
||||
bug = 1;
|
||||
}
|
||||
i = browse_rb(&mm->mm_rb);
|
||||
if (i != mm->map_count)
|
||||
printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
|
||||
if (i != mm->map_count) {
|
||||
printk("map_count %d rb %d\n", mm->map_count, i);
|
||||
bug = 1;
|
||||
}
|
||||
BUG_ON(bug);
|
||||
}
|
||||
#else
|
||||
#define validate_mm_rb(root, ignore) do { } while (0)
|
||||
#define validate_mm(mm) do { } while (0)
|
||||
#endif
|
||||
|
||||
RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
|
||||
unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
|
||||
|
||||
/*
|
||||
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
|
||||
* vma->vm_prev->vm_end values changed, without modifying the vma's position
|
||||
* in the rbtree.
|
||||
*/
|
||||
static void vma_gap_update(struct vm_area_struct *vma)
|
||||
{
|
||||
/*
|
||||
* As it turns out, RB_DECLARE_CALLBACKS() already created a callback
|
||||
* function that does exactly what we want.
|
||||
*/
|
||||
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
|
||||
}
|
||||
|
||||
static inline void vma_rb_insert(struct vm_area_struct *vma,
|
||||
struct rb_root *root)
|
||||
{
|
||||
/* All rb_subtree_gap values must be consistent prior to insertion */
|
||||
validate_mm_rb(root, NULL);
|
||||
|
||||
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
|
||||
}
|
||||
|
||||
static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
|
||||
{
|
||||
/*
|
||||
* All rb_subtree_gap values must be consistent prior to erase,
|
||||
* with the possible exception of the vma being erased.
|
||||
*/
|
||||
validate_mm_rb(root, vma);
|
||||
|
||||
/*
|
||||
* Note rb_erase_augmented is a fairly large inline function,
|
||||
* so make sure we instantiate it only once with our desired
|
||||
* augmented rbtree callbacks.
|
||||
*/
|
||||
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
|
||||
}
|
||||
|
||||
/*
|
||||
* vma has some anon_vma assigned, and is already inserted on that
|
||||
* anon_vma's interval trees.
|
||||
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
|
||||
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct rb_node **rb_link, struct rb_node *rb_parent)
|
||||
{
|
||||
/* Update tracking information for the gap following the new vma. */
|
||||
if (vma->vm_next)
|
||||
vma_gap_update(vma->vm_next);
|
||||
else
|
||||
mm->highest_vm_end = vma->vm_end;
|
||||
|
||||
/*
|
||||
* vma->vm_prev wasn't known when we followed the rbtree to find the
|
||||
* correct insertion point for that vma. As a result, we could not
|
||||
* update the vma vm_rb parents rb_subtree_gap values on the way down.
|
||||
* So, we first insert the vma with a zero rb_subtree_gap value
|
||||
* (to be consistent with what we did on the way down), and then
|
||||
* immediately update the gap to the correct value. Finally we
|
||||
* rebalance the rbtree after all augmented values have been set.
|
||||
*/
|
||||
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
|
||||
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
|
||||
vma->rb_subtree_gap = 0;
|
||||
vma_gap_update(vma);
|
||||
vma_rb_insert(vma, &mm->mm_rb);
|
||||
}
|
||||
|
||||
static void __vma_link_file(struct vm_area_struct *vma)
|
||||
@@ -498,12 +631,12 @@ static inline void
|
||||
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev)
|
||||
{
|
||||
struct vm_area_struct *next = vma->vm_next;
|
||||
struct vm_area_struct *next;
|
||||
|
||||
prev->vm_next = next;
|
||||
vma_rb_erase(vma, &mm->mm_rb);
|
||||
prev->vm_next = next = vma->vm_next;
|
||||
if (next)
|
||||
next->vm_prev = prev;
|
||||
rb_erase(&vma->vm_rb, &mm->mm_rb);
|
||||
if (mm->mmap_cache == vma)
|
||||
mm->mmap_cache = prev;
|
||||
}
|
||||
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
struct rb_root *root = NULL;
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
struct file *file = vma->vm_file;
|
||||
bool start_changed = false, end_changed = false;
|
||||
long adjust_next = 0;
|
||||
int remove_next = 0;
|
||||
|
||||
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
|
||||
if (anon_vma) {
|
||||
VM_BUG_ON(adjust_next && next->anon_vma &&
|
||||
anon_vma != next->anon_vma);
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_interval_tree_pre_update_vma(vma);
|
||||
if (adjust_next)
|
||||
anon_vma_interval_tree_pre_update_vma(next);
|
||||
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
|
||||
vma_interval_tree_remove(next, root);
|
||||
}
|
||||
|
||||
vma->vm_start = start;
|
||||
vma->vm_end = end;
|
||||
if (start != vma->vm_start) {
|
||||
vma->vm_start = start;
|
||||
start_changed = true;
|
||||
}
|
||||
if (end != vma->vm_end) {
|
||||
vma->vm_end = end;
|
||||
end_changed = true;
|
||||
}
|
||||
vma->vm_pgoff = pgoff;
|
||||
if (adjust_next) {
|
||||
next->vm_start += adjust_next << PAGE_SHIFT;
|
||||
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
|
||||
* (it may either follow vma or precede it).
|
||||
*/
|
||||
__insert_vm_struct(mm, insert);
|
||||
} else {
|
||||
if (start_changed)
|
||||
vma_gap_update(vma);
|
||||
if (end_changed) {
|
||||
if (!next)
|
||||
mm->highest_vm_end = end;
|
||||
else if (!adjust_next)
|
||||
vma_gap_update(next);
|
||||
}
|
||||
}
|
||||
|
||||
if (anon_vma) {
|
||||
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
|
||||
* we must remove another next too. It would clutter
|
||||
* up the code too much to do both in one go.
|
||||
*/
|
||||
if (remove_next == 2) {
|
||||
next = vma->vm_next;
|
||||
next = vma->vm_next;
|
||||
if (remove_next == 2)
|
||||
goto again;
|
||||
}
|
||||
else if (next)
|
||||
vma_gap_update(next);
|
||||
else
|
||||
mm->highest_vm_end = end;
|
||||
}
|
||||
if (insert && file)
|
||||
uprobe_mmap(insert);
|
||||
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
|
||||
* memory so no accounting is necessary
|
||||
*/
|
||||
file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
|
||||
VM_NORESERVE, &user,
|
||||
HUGETLB_ANONHUGE_INODE);
|
||||
VM_NORESERVE,
|
||||
&user, HUGETLB_ANONHUGE_INODE,
|
||||
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
}
|
||||
@@ -1335,7 +1488,11 @@ munmap_back:
|
||||
*
|
||||
* Answer: Yes, several device drivers can do it in their
|
||||
* f_op->mmap method. -DaveM
|
||||
* Bug: If addr is changed, prev, rb_link, rb_parent should
|
||||
* be updated for vma_link()
|
||||
*/
|
||||
WARN_ON_ONCE(addr != vma->vm_start);
|
||||
|
||||
addr = vma->vm_start;
|
||||
pgoff = vma->vm_pgoff;
|
||||
vm_flags = vma->vm_flags;
|
||||
@@ -1400,6 +1557,206 @@ unacct_error:
|
||||
return error;
|
||||
}
|
||||
|
||||
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
|
||||
{
|
||||
/*
|
||||
* We implement the search by looking for an rbtree node that
|
||||
* immediately follows a suitable gap. That is,
|
||||
* - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
|
||||
* - gap_end = vma->vm_start >= info->low_limit + length;
|
||||
* - gap_end - gap_start >= length
|
||||
*/
|
||||
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long length, low_limit, high_limit, gap_start, gap_end;
|
||||
|
||||
/* Adjust search length to account for worst case alignment overhead */
|
||||
length = info->length + info->align_mask;
|
||||
if (length < info->length)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Adjust search limits by the desired length */
|
||||
if (info->high_limit < length)
|
||||
return -ENOMEM;
|
||||
high_limit = info->high_limit - length;
|
||||
|
||||
if (info->low_limit > high_limit)
|
||||
return -ENOMEM;
|
||||
low_limit = info->low_limit + length;
|
||||
|
||||
/* Check if rbtree root looks promising */
|
||||
if (RB_EMPTY_ROOT(&mm->mm_rb))
|
||||
goto check_highest;
|
||||
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
|
||||
if (vma->rb_subtree_gap < length)
|
||||
goto check_highest;
|
||||
|
||||
while (true) {
|
||||
/* Visit left subtree if it looks promising */
|
||||
gap_end = vma->vm_start;
|
||||
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
|
||||
struct vm_area_struct *left =
|
||||
rb_entry(vma->vm_rb.rb_left,
|
||||
struct vm_area_struct, vm_rb);
|
||||
if (left->rb_subtree_gap >= length) {
|
||||
vma = left;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
|
||||
check_current:
|
||||
/* Check if current node has a suitable gap */
|
||||
if (gap_start > high_limit)
|
||||
return -ENOMEM;
|
||||
if (gap_end >= low_limit && gap_end - gap_start >= length)
|
||||
goto found;
|
||||
|
||||
/* Visit right subtree if it looks promising */
|
||||
if (vma->vm_rb.rb_right) {
|
||||
struct vm_area_struct *right =
|
||||
rb_entry(vma->vm_rb.rb_right,
|
||||
struct vm_area_struct, vm_rb);
|
||||
if (right->rb_subtree_gap >= length) {
|
||||
vma = right;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Go back up the rbtree to find next candidate node */
|
||||
while (true) {
|
||||
struct rb_node *prev = &vma->vm_rb;
|
||||
if (!rb_parent(prev))
|
||||
goto check_highest;
|
||||
vma = rb_entry(rb_parent(prev),
|
||||
struct vm_area_struct, vm_rb);
|
||||
if (prev == vma->vm_rb.rb_left) {
|
||||
gap_start = vma->vm_prev->vm_end;
|
||||
gap_end = vma->vm_start;
|
||||
goto check_current;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
check_highest:
|
||||
/* Check highest gap, which does not precede any rbtree node */
|
||||
gap_start = mm->highest_vm_end;
|
||||
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
|
||||
if (gap_start > high_limit)
|
||||
return -ENOMEM;
|
||||
|
||||
found:
|
||||
/* We found a suitable gap. Clip it with the original low_limit. */
|
||||
if (gap_start < info->low_limit)
|
||||
gap_start = info->low_limit;
|
||||
|
||||
/* Adjust gap address to the desired alignment */
|
||||
gap_start += (info->align_offset - gap_start) & info->align_mask;
|
||||
|
||||
VM_BUG_ON(gap_start + info->length > info->high_limit);
|
||||
VM_BUG_ON(gap_start + info->length > gap_end);
|
||||
return gap_start;
|
||||
}
|
||||
|
||||
unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long length, low_limit, high_limit, gap_start, gap_end;
|
||||
|
||||
/* Adjust search length to account for worst case alignment overhead */
|
||||
length = info->length + info->align_mask;
|
||||
if (length < info->length)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* Adjust search limits by the desired length.
|
||||
* See implementation comment at top of unmapped_area().
|
||||
*/
|
||||
gap_end = info->high_limit;
|
||||
if (gap_end < length)
|
||||
return -ENOMEM;
|
||||
high_limit = gap_end - length;
|
||||
|
||||
if (info->low_limit > high_limit)
|
||||
return -ENOMEM;
|
||||
low_limit = info->low_limit + length;
|
||||
|
||||
/* Check highest gap, which does not precede any rbtree node */
|
||||
gap_start = mm->highest_vm_end;
|
||||
if (gap_start <= high_limit)
|
||||
goto found_highest;
|
||||
|
||||
/* Check if rbtree root looks promising */
|
||||
if (RB_EMPTY_ROOT(&mm->mm_rb))
|
||||
return -ENOMEM;
|
||||
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
|
||||
if (vma->rb_subtree_gap < length)
|
||||
return -ENOMEM;
|
||||
|
||||
while (true) {
|
||||
/* Visit right subtree if it looks promising */
|
||||
gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
|
||||
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
|
||||
struct vm_area_struct *right =
|
||||
rb_entry(vma->vm_rb.rb_right,
|
||||
struct vm_area_struct, vm_rb);
|
||||
if (right->rb_subtree_gap >= length) {
|
||||
vma = right;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
check_current:
|
||||
/* Check if current node has a suitable gap */
|
||||
gap_end = vma->vm_start;
|
||||
if (gap_end < low_limit)
|
||||
return -ENOMEM;
|
||||
if (gap_start <= high_limit && gap_end - gap_start >= length)
|
||||
goto found;
|
||||
|
||||
/* Visit left subtree if it looks promising */
|
||||
if (vma->vm_rb.rb_left) {
|
||||
struct vm_area_struct *left =
|
||||
rb_entry(vma->vm_rb.rb_left,
|
||||
struct vm_area_struct, vm_rb);
|
||||
if (left->rb_subtree_gap >= length) {
|
||||
vma = left;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Go back up the rbtree to find next candidate node */
|
||||
while (true) {
|
||||
struct rb_node *prev = &vma->vm_rb;
|
||||
if (!rb_parent(prev))
|
||||
return -ENOMEM;
|
||||
vma = rb_entry(rb_parent(prev),
|
||||
struct vm_area_struct, vm_rb);
|
||||
if (prev == vma->vm_rb.rb_right) {
|
||||
gap_start = vma->vm_prev ?
|
||||
vma->vm_prev->vm_end : 0;
|
||||
goto check_current;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
found:
|
||||
/* We found a suitable gap. Clip it with the original high_limit. */
|
||||
if (gap_end > info->high_limit)
|
||||
gap_end = info->high_limit;
|
||||
|
||||
found_highest:
|
||||
/* Compute highest gap address at the desired alignment */
|
||||
gap_end -= info->length;
|
||||
gap_end -= (gap_end - info->align_offset) & info->align_mask;
|
||||
|
||||
VM_BUG_ON(gap_end < info->low_limit);
|
||||
VM_BUG_ON(gap_end < gap_start);
|
||||
return gap_end;
|
||||
}
|
||||
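Both searches above walk the rbtree and prune any subtree whose cached rb_subtree_gap is smaller than the required length, instead of linearly scanning vmas the way the removed free_area_cache code below did. A toy recursive version of the bottom-up search over the same kind of augmented node (the kernel walks iteratively and also applies low_limit/high_limit clipping and alignment, omitted here):

#include <stdio.h>

struct area {
	unsigned long start, end;	/* this mapping */
	unsigned long prev_end;		/* end of the previous mapping, or 0 */
	unsigned long subtree_gap;	/* largest gap in this subtree */
	struct area *left, *right;
};

#define NO_GAP ((unsigned long)-1)

/* Lowest gap of at least @length in the subtree rooted at @a. */
static unsigned long find_gap(const struct area *a, unsigned long length)
{
	unsigned long addr;

	if (!a || a->subtree_gap < length)
		return NO_GAP;			/* prune: nothing big enough here */

	addr = find_gap(a->left, length);	/* prefer the lowest address */
	if (addr != NO_GAP)
		return addr;

	if (a->start - a->prev_end >= length)
		return a->prev_end;		/* gap right before this area */

	return find_gap(a->right, length);
}

int main(void)
{
	/* Two mappings: [0x1000,0x2000) and [0x8000,0x9000). */
	struct area lo = { 0x1000, 0x2000, 0x0000, 0x1000, NULL, NULL };
	struct area hi = { 0x8000, 0x9000, 0x2000, 0x6000, &lo, NULL };

	printf("0x3000-byte gap at %#lx\n", find_gap(&hi, 0x3000));
	return 0;
}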
|
||||
/* Get an address range which is currently unmapped.
|
||||
* For shmat() with addr=0.
|
||||
*
|
||||
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long start_addr;
|
||||
struct vm_unmapped_area_info info;
|
||||
|
||||
if (len > TASK_SIZE)
|
||||
return -ENOMEM;
|
||||
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
(!vma || addr + len <= vma->vm_start))
|
||||
return addr;
|
||||
}
|
||||
if (len > mm->cached_hole_size) {
|
||||
start_addr = addr = mm->free_area_cache;
|
||||
} else {
|
||||
start_addr = addr = TASK_UNMAPPED_BASE;
|
||||
mm->cached_hole_size = 0;
|
||||
}
|
||||
|
||||
full_search:
|
||||
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
|
||||
/* At this point: (!vma || addr < vma->vm_end). */
|
||||
if (TASK_SIZE - len < addr) {
|
||||
/*
|
||||
* Start a new search - just in case we missed
|
||||
* some holes.
|
||||
*/
|
||||
if (start_addr != TASK_UNMAPPED_BASE) {
|
||||
addr = TASK_UNMAPPED_BASE;
|
||||
start_addr = addr;
|
||||
mm->cached_hole_size = 0;
|
||||
goto full_search;
|
||||
}
|
||||
return -ENOMEM;
|
||||
}
|
||||
if (!vma || addr + len <= vma->vm_start) {
|
||||
/*
|
||||
* Remember the place where we stopped the search:
|
||||
*/
|
||||
mm->free_area_cache = addr + len;
|
||||
return addr;
|
||||
}
|
||||
if (addr + mm->cached_hole_size < vma->vm_start)
|
||||
mm->cached_hole_size = vma->vm_start - addr;
|
||||
addr = vma->vm_end;
|
||||
}
|
||||
info.flags = 0;
|
||||
info.length = len;
|
||||
info.low_limit = TASK_UNMAPPED_BASE;
|
||||
info.high_limit = TASK_SIZE;
|
||||
info.align_mask = 0;
|
||||
return vm_unmapped_area(&info);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
struct mm_struct *mm = current->mm;
|
||||
unsigned long addr = addr0, start_addr;
|
||||
unsigned long addr = addr0;
|
||||
struct vm_unmapped_area_info info;
|
||||
|
||||
/* requested length too big for entire address space */
|
||||
if (len > TASK_SIZE)
|
||||
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
|
||||
return addr;
|
||||
}
|
||||
|
||||
/* check if free_area_cache is useful for us */
|
||||
if (len <= mm->cached_hole_size) {
|
||||
mm->cached_hole_size = 0;
|
||||
mm->free_area_cache = mm->mmap_base;
|
||||
}
|
||||
|
||||
try_again:
|
||||
/* either no address requested or can't fit in requested address hole */
|
||||
start_addr = addr = mm->free_area_cache;
|
||||
|
||||
if (addr < len)
|
||||
goto fail;
|
||||
|
||||
addr -= len;
|
||||
do {
|
||||
/*
|
||||
* Lookup failure means no vma is above this address,
|
||||
* else if new region fits below vma->vm_start,
|
||||
* return with success:
|
||||
*/
|
||||
vma = find_vma(mm, addr);
|
||||
if (!vma || addr+len <= vma->vm_start)
|
||||
/* remember the address as a hint for next time */
|
||||
return (mm->free_area_cache = addr);
|
||||
|
||||
/* remember the largest hole we saw so far */
|
||||
if (addr + mm->cached_hole_size < vma->vm_start)
|
||||
mm->cached_hole_size = vma->vm_start - addr;
|
||||
|
||||
/* try just below the current vma->vm_start */
|
||||
addr = vma->vm_start-len;
|
||||
} while (len < vma->vm_start);
|
||||
|
||||
fail:
|
||||
/*
|
||||
* if hint left us with no space for the requested
|
||||
* mapping then try again:
|
||||
*
|
||||
* Note: this is different from the bottom-up case,
* which does a full linear search; here we use find_vma(),
* which causes some holes to be skipped.
|
||||
*/
|
||||
if (start_addr != mm->mmap_base) {
|
||||
mm->free_area_cache = mm->mmap_base;
|
||||
mm->cached_hole_size = 0;
|
||||
goto try_again;
|
||||
}
|
||||
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
|
||||
info.length = len;
|
||||
info.low_limit = PAGE_SIZE;
|
||||
info.high_limit = mm->mmap_base;
|
||||
info.align_mask = 0;
|
||||
addr = vm_unmapped_area(&info);
|
||||
|
||||
/*
|
||||
* A failed mmap() very likely causes application failure,
|
||||
@@ -1563,14 +1853,13 @@ fail:
|
||||
* can happen with large stack limits and large mmap()
|
||||
* allocations.
|
||||
*/
|
||||
mm->cached_hole_size = ~0UL;
|
||||
mm->free_area_cache = TASK_UNMAPPED_BASE;
|
||||
addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
|
||||
/*
|
||||
* Restore the topdown base:
|
||||
*/
|
||||
mm->free_area_cache = mm->mmap_base;
|
||||
mm->cached_hole_size = ~0UL;
|
||||
if (addr & ~PAGE_MASK) {
|
||||
VM_BUG_ON(addr != -ENOMEM);
|
||||
info.flags = 0;
|
||||
info.low_limit = TASK_UNMAPPED_BASE;
|
||||
info.high_limit = TASK_SIZE;
|
||||
addr = vm_unmapped_area(&info);
|
||||
}
|
||||
|
||||
return addr;
|
||||
}
|
||||
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
|
||||
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
|
||||
error = acct_stack_growth(vma, size, grow);
|
||||
if (!error) {
|
||||
/*
|
||||
* vma_gap_update() doesn't support concurrent
|
||||
* updates, but we only hold a shared mmap_sem
|
||||
* lock here, so we need to protect against
|
||||
* concurrent vma expansions.
|
||||
* vma_lock_anon_vma() doesn't help here, as
|
||||
* we don't guarantee that all growable vmas
|
||||
* in a mm share the same root anon vma.
|
||||
* So, we reuse mm->page_table_lock to guard
|
||||
* against concurrent vma expansions.
|
||||
*/
|
||||
spin_lock(&vma->vm_mm->page_table_lock);
|
||||
anon_vma_interval_tree_pre_update_vma(vma);
|
||||
vma->vm_end = address;
|
||||
anon_vma_interval_tree_post_update_vma(vma);
|
||||
if (vma->vm_next)
|
||||
vma_gap_update(vma->vm_next);
|
||||
else
|
||||
vma->vm_mm->highest_vm_end = address;
|
||||
spin_unlock(&vma->vm_mm->page_table_lock);
|
||||
|
||||
perf_event_mmap(vma);
|
||||
}
|
||||
}
|
||||
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
|
||||
if (grow <= vma->vm_pgoff) {
|
||||
error = acct_stack_growth(vma, size, grow);
|
||||
if (!error) {
|
||||
/*
|
||||
* vma_gap_update() doesn't support concurrent
|
||||
* updates, but we only hold a shared mmap_sem
|
||||
* lock here, so we need to protect against
|
||||
* concurrent vma expansions.
|
||||
* vma_lock_anon_vma() doesn't help here, as
|
||||
* we don't guarantee that all growable vmas
|
||||
* in a mm share the same root anon vma.
|
||||
* So, we reuse mm->page_table_lock to guard
|
||||
* against concurrent vma expansions.
|
||||
*/
|
||||
spin_lock(&vma->vm_mm->page_table_lock);
|
||||
anon_vma_interval_tree_pre_update_vma(vma);
|
||||
vma->vm_start = address;
|
||||
vma->vm_pgoff -= grow;
|
||||
anon_vma_interval_tree_post_update_vma(vma);
|
||||
vma_gap_update(vma);
|
||||
spin_unlock(&vma->vm_mm->page_table_lock);
|
||||
|
||||
perf_event_mmap(vma);
|
||||
}
|
||||
}
|
||||
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
|
||||
vma->vm_prev = NULL;
|
||||
do {
|
||||
rb_erase(&vma->vm_rb, &mm->mm_rb);
|
||||
vma_rb_erase(vma, &mm->mm_rb);
|
||||
mm->map_count--;
|
||||
tail_vma = vma;
|
||||
vma = vma->vm_next;
|
||||
} while (vma && vma->vm_start < end);
|
||||
*insertion_point = vma;
|
||||
if (vma)
|
||||
if (vma) {
|
||||
vma->vm_prev = prev;
|
||||
vma_gap_update(vma);
|
||||
} else
|
||||
mm->highest_vm_end = prev ? prev->vm_end : 0;
|
||||
tail_vma->vm_next = NULL;
|
||||
if (mm->unmap_area == arch_unmap_area)
|
||||
addr = prev ? prev->vm_end : mm->mmap_base;
|
||||
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
|
||||
* The LSB of head.next can't change from under us
|
||||
* because we hold the mm_all_locks_mutex.
|
||||
*/
|
||||
mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
|
||||
down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
|
||||
/*
|
||||
* We can safely modify head.next after taking the
|
||||
* anon_vma->root->mutex. If some other vma in this mm shares
|
||||
* anon_vma->root->rwsem. If some other vma in this mm shares
|
||||
* the same anon_vma we won't take it again.
|
||||
*
|
||||
* No need of atomic instructions here, head.next
|
||||
* can't change from under us thanks to the
|
||||
* anon_vma->root->mutex.
|
||||
* anon_vma->root->rwsem.
|
||||
*/
|
||||
if (__test_and_set_bit(0, (unsigned long *)
|
||||
&anon_vma->root->rb_root.rb_node))
|
||||
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
|
||||
*
|
||||
* No need of atomic instructions here, head.next
|
||||
* can't change from under us until we release the
|
||||
* anon_vma->root->mutex.
|
||||
* anon_vma->root->rwsem.
|
||||
*/
|
||||
if (!__test_and_clear_bit(0, (unsigned long *)
|
||||
&anon_vma->root->rb_root.rb_node))
|
||||
|
151
mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
|
||||
}
|
||||
#endif
|
||||
|
||||
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
|
||||
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot,
|
||||
int dirty_accountable)
|
||||
int dirty_accountable, int prot_numa, bool *ret_all_same_node)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pte_t *pte, oldpte;
|
||||
spinlock_t *ptl;
|
||||
unsigned long pages = 0;
|
||||
bool all_same_node = true;
|
||||
int last_nid = -1;
|
||||
|
||||
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
|
||||
oldpte = *pte;
|
||||
if (pte_present(oldpte)) {
|
||||
pte_t ptent;
|
||||
bool updated = false;
|
||||
|
||||
ptent = ptep_modify_prot_start(mm, addr, pte);
|
||||
ptent = pte_modify(ptent, newprot);
|
||||
if (!prot_numa) {
|
||||
ptent = pte_modify(ptent, newprot);
|
||||
updated = true;
|
||||
} else {
|
||||
struct page *page;
|
||||
|
||||
page = vm_normal_page(vma, addr, oldpte);
|
||||
if (page) {
|
||||
int this_nid = page_to_nid(page);
|
||||
if (last_nid == -1)
|
||||
last_nid = this_nid;
|
||||
if (last_nid != this_nid)
|
||||
all_same_node = false;
|
||||
|
||||
/* only check non-shared pages */
|
||||
if (!pte_numa(oldpte) &&
|
||||
page_mapcount(page) == 1) {
|
||||
ptent = pte_mknuma(ptent);
|
||||
updated = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Avoid taking write faults for pages we know to be
|
||||
* dirty.
|
||||
*/
|
||||
if (dirty_accountable && pte_dirty(ptent))
|
||||
if (dirty_accountable && pte_dirty(ptent)) {
|
||||
ptent = pte_mkwrite(ptent);
|
||||
updated = true;
|
||||
}
|
||||
|
||||
if (updated)
|
||||
pages++;
|
||||
ptep_modify_prot_commit(mm, addr, pte, ptent);
|
||||
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
|
||||
swp_entry_t entry = pte_to_swp_entry(oldpte);
|
||||
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
|
||||
set_pte_at(mm, addr, pte,
|
||||
swp_entry_to_pte(entry));
|
||||
}
|
||||
pages++;
|
||||
}
|
||||
} while (pte++, addr += PAGE_SIZE, addr != end);
|
||||
arch_leave_lazy_mmu_mode();
|
||||
pte_unmap_unlock(pte - 1, ptl);
|
||||
|
||||
*ret_all_same_node = all_same_node;
|
||||
return pages;
|
||||
}
|
||||
|
||||
static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot,
|
||||
int dirty_accountable)
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmd)
|
||||
{
|
||||
spin_lock(&mm->page_table_lock);
|
||||
set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
}
|
||||
#else
|
||||
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmd)
|
||||
{
|
||||
BUG();
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
|
||||
pud_t *pud, unsigned long addr, unsigned long end,
|
||||
pgprot_t newprot, int dirty_accountable, int prot_numa)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
unsigned long next;
|
||||
unsigned long pages = 0;
|
||||
bool all_same_node;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
if (next - addr != HPAGE_PMD_SIZE)
|
||||
split_huge_page_pmd(vma->vm_mm, pmd);
|
||||
else if (change_huge_pmd(vma, pmd, addr, newprot))
|
||||
split_huge_page_pmd(vma, addr, pmd);
|
||||
else if (change_huge_pmd(vma, pmd, addr, newprot,
|
||||
prot_numa)) {
|
||||
pages += HPAGE_PMD_NR;
|
||||
continue;
|
||||
}
|
||||
/* fall through */
|
||||
}
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
continue;
|
||||
change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
|
||||
dirty_accountable);
|
||||
pages += change_pte_range(vma, pmd, addr, next, newprot,
|
||||
dirty_accountable, prot_numa, &all_same_node);
|
||||
|
||||
/*
|
||||
* If we are changing protections for NUMA hinting faults then
|
||||
* set pmd_numa if the examined pages were all on the same
|
||||
* node. This allows a regular PMD to be handled as one fault
|
||||
* and effectively batches the taking of the PTL
|
||||
*/
|
||||
if (prot_numa && all_same_node)
|
||||
change_pmd_protnuma(vma->vm_mm, addr, pmd);
|
||||
} while (pmd++, addr = next, addr != end);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot,
|
||||
int dirty_accountable)
|
||||
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
|
||||
pgd_t *pgd, unsigned long addr, unsigned long end,
|
||||
pgprot_t newprot, int dirty_accountable, int prot_numa)
|
||||
{
|
||||
pud_t *pud;
|
||||
unsigned long next;
|
||||
unsigned long pages = 0;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
do {
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
continue;
|
||||
change_pmd_range(vma, pud, addr, next, newprot,
|
||||
dirty_accountable);
|
||||
pages += change_pmd_range(vma, pud, addr, next, newprot,
|
||||
dirty_accountable, prot_numa);
|
||||
} while (pud++, addr = next, addr != end);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
static void change_protection(struct vm_area_struct *vma,
|
||||
static unsigned long change_protection_range(struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end, pgprot_t newprot,
|
||||
int dirty_accountable)
|
||||
int dirty_accountable, int prot_numa)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
unsigned long next;
|
||||
unsigned long start = addr;
|
||||
unsigned long pages = 0;
|
||||
|
||||
BUG_ON(addr >= end);
|
||||
pgd = pgd_offset(mm, addr);
|
||||
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
continue;
|
||||
change_pud_range(vma, pgd, addr, next, newprot,
|
||||
dirty_accountable);
|
||||
pages += change_pud_range(vma, pgd, addr, next, newprot,
|
||||
dirty_accountable, prot_numa);
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
flush_tlb_range(vma, start, end);
|
||||
|
||||
/* Only flush the TLB if we actually modified any entries: */
|
||||
if (pages)
|
||||
flush_tlb_range(vma, start, end);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, pgprot_t newprot,
|
||||
int dirty_accountable, int prot_numa)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long pages;
|
||||
|
||||
mmu_notifier_invalidate_range_start(mm, start, end);
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
pages = hugetlb_change_protection(vma, start, end, newprot);
|
||||
else
|
||||
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
|
||||
mmu_notifier_invalidate_range_end(mm, start, end);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
int
|
||||
@@ -213,12 +305,9 @@ success:
|
||||
dirty_accountable = 1;
|
||||
}
|
||||
|
||||
mmu_notifier_invalidate_range_start(mm, start, end);
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
|
||||
else
|
||||
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
|
||||
mmu_notifier_invalidate_range_end(mm, start, end);
|
||||
change_protection(vma, start, end, vma->vm_page_prot,
|
||||
dirty_accountable, 0);
|
||||
|
||||
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
|
||||
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
|
||||
perf_event_mmap(vma);
|
||||
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
|
||||
error = -EINVAL;
|
||||
if (!(vma->vm_flags & VM_GROWSDOWN))
|
||||
goto out;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if (vma->vm_start > start)
|
||||
goto out;
|
||||
if (unlikely(grows & PROT_GROWSUP)) {
|
||||
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
|
||||
for (nstart = start ; ; ) {
|
||||
unsigned long newflags;
|
||||
|
||||
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
||||
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
||||
|
||||
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
|
||||
newflags = vm_flags;
|
||||
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
|
||||
|
||||
/* newflags >> 4 shift VM_MAY% in place of VM_% */
|
||||
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
|
||||
|
mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
|
||||
}
|
||||
if (vma->anon_vma) {
|
||||
anon_vma = vma->anon_vma;
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_write(anon_vma);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
|
||||
need_flush = true;
|
||||
continue;
|
||||
} else if (!err) {
|
||||
split_huge_page_pmd(vma->vm_mm, old_pmd);
|
||||
split_huge_page_pmd(vma, old_addr, old_pmd);
|
||||
}
|
||||
VM_BUG_ON(pmd_trans_huge(*old_pmd));
|
||||
}
|
||||
|
mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
|
||||
return count;
|
||||
}
|
||||
|
||||
static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
|
||||
{
|
||||
struct zone *z;
|
||||
|
||||
/*
|
||||
* In free_area_init_core(), highmem zone's managed_pages is set to
|
||||
* present_pages, and bootmem allocator doesn't allocate from highmem
|
||||
* zones. So there's no need to recalculate managed_pages because all
|
||||
* highmem pages will be managed by the buddy system. Here highmem
|
||||
* zone also includes highmem movable zone.
|
||||
*/
|
||||
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
|
||||
if (!is_highmem(z))
|
||||
z->managed_pages = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* free_all_bootmem - release free pages to the buddy allocator
|
||||
*
|
||||
@@ -144,6 +160,11 @@ unsigned long __init free_low_memory_core_early(int nodeid)
|
||||
*/
|
||||
unsigned long __init free_all_bootmem(void)
|
||||
{
|
||||
struct pglist_data *pgdat;
|
||||
|
||||
for_each_online_pgdat(pgdat)
|
||||
reset_node_lowmem_managed_pages(pgdat);
|
||||
|
||||
/*
|
||||
* We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
|
||||
* because in some case like Node0 doesn't have RAM installed
|
||||
|
15
mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
|
||||
|
||||
atomic_long_t mmap_pages_allocated;
|
||||
|
||||
/*
|
||||
* The global memory commitment made in the system can be a metric
|
||||
* that can be used to drive ballooning decisions when Linux is hosted
|
||||
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
||||
* balancing memory across competing virtual machines that are hosted.
|
||||
* Several metrics drive this policy engine including the guest reported
|
||||
* memory commitment.
|
||||
*/
|
||||
unsigned long vm_memory_committed(void)
|
||||
{
|
||||
return percpu_counter_read_positive(&vm_committed_as);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
||||
|
||||
EXPORT_SYMBOL(mem_map);
|
||||
EXPORT_SYMBOL(num_physpages);
|
||||
|
||||
|
138
mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
|
||||
int sysctl_oom_dump_tasks = 1;
|
||||
static DEFINE_SPINLOCK(zone_scan_lock);
|
||||
|
||||
/*
|
||||
* compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
|
||||
* @old_val: old oom_score_adj for compare
|
||||
* @new_val: new oom_score_adj for swap
|
||||
*
|
||||
* Sets the oom_score_adj value for current to @new_val iff its present value is
|
||||
* @old_val. Usually used to reinstate a previous value to prevent racing with
|
||||
* userspacing tuning the value in the interim.
|
||||
*/
|
||||
void compare_swap_oom_score_adj(int old_val, int new_val)
|
||||
{
|
||||
struct sighand_struct *sighand = current->sighand;
|
||||
|
||||
spin_lock_irq(&sighand->siglock);
|
||||
if (current->signal->oom_score_adj == old_val)
|
||||
current->signal->oom_score_adj = new_val;
|
||||
trace_oom_score_adj_update(current);
|
||||
spin_unlock_irq(&sighand->siglock);
|
||||
}
|
||||
|
||||
/**
|
||||
* test_set_oom_score_adj() - set current's oom_score_adj and return old value
|
||||
* @new_val: new oom_score_adj value
|
||||
*
|
||||
* Sets the oom_score_adj value for current to @new_val with proper
|
||||
* synchronization and returns the old value. Usually used to temporarily
|
||||
* set a value, save the old value in the caller, and then reinstate it later.
|
||||
*/
|
||||
int test_set_oom_score_adj(int new_val)
|
||||
{
|
||||
struct sighand_struct *sighand = current->sighand;
|
||||
int old_val;
|
||||
|
||||
spin_lock_irq(&sighand->siglock);
|
||||
old_val = current->signal->oom_score_adj;
|
||||
current->signal->oom_score_adj = new_val;
|
||||
trace_oom_score_adj_update(current);
|
||||
spin_unlock_irq(&sighand->siglock);
|
||||
|
||||
return old_val;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/**
|
||||
* has_intersects_mems_allowed() - check task eligiblity for kill
|
||||
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
||||
if (!p)
|
||||
return 0;
|
||||
|
||||
adj = p->signal->oom_score_adj;
|
||||
adj = (long)p->signal->oom_score_adj;
|
||||
if (adj == OOM_SCORE_ADJ_MIN) {
|
||||
task_unlock(p);
|
||||
return 0;
|
||||
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||
* the page allocator means a mempolicy is in effect. Cpuset policy
|
||||
* is enforced in get_page_from_freelist().
|
||||
*/
|
||||
if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
|
||||
if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
|
||||
*totalpages = total_swap_pages;
|
||||
for_each_node_mask(nid, *nodemask)
|
||||
*totalpages += node_spanned_pages(nid);
|
||||
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
if (!task->mm)
|
||||
return OOM_SCAN_CONTINUE;
|
||||
|
||||
if (task->flags & PF_EXITING) {
|
||||
/*
|
||||
* If task is allocating a lot of memory and has been marked to be
|
||||
* killed first if it triggers an oom, then select it.
|
||||
*/
|
||||
if (oom_task_origin(task))
|
||||
return OOM_SCAN_SELECT;
|
||||
|
||||
if (task->flags & PF_EXITING && !force_kill) {
|
||||
/*
|
||||
* If task is current and is in the process of releasing memory,
|
||||
* allow the "kill" to set TIF_MEMDIE, which will allow it to
|
||||
* access memory reserves. Otherwise, it may stall forever.
|
||||
*
|
||||
* The iteration isn't broken here, however, in case other
|
||||
* threads are found to have already been oom killed.
|
||||
* If this task is not being ptraced on exit, then wait for it
|
||||
* to finish before killing some other task unnecessarily.
|
||||
*/
|
||||
if (task == current)
|
||||
return OOM_SCAN_SELECT;
|
||||
else if (!force_kill) {
|
||||
/*
|
||||
* If this task is not being ptraced on exit, then wait
|
||||
* for it to finish before killing some other task
|
||||
* unnecessarily.
|
||||
*/
|
||||
if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
|
||||
return OOM_SCAN_ABORT;
|
||||
}
|
||||
if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
|
||||
return OOM_SCAN_ABORT;
|
||||
}
|
||||
return OOM_SCAN_OK;
|
||||
}
|
||||
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
|
||||
continue;
|
||||
}
|
||||
|
||||
pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
|
||||
pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
|
||||
task->pid, from_kuid(&init_user_ns, task_uid(task)),
|
||||
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
|
||||
task->mm->nr_ptes,
|
||||
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
{
|
||||
task_lock(current);
|
||||
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
|
||||
"oom_score_adj=%d\n",
|
||||
"oom_score_adj=%hd\n",
|
||||
current->comm, gfp_mask, order,
|
||||
current->signal->oom_score_adj);
|
||||
cpuset_print_task_mems_allowed(current);
|
||||
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
|
||||
spin_unlock(&zone_scan_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to acquire the oom killer lock for all system zones. Returns zero if a
|
||||
* parallel oom killing is taking place, otherwise locks all zones and returns
|
||||
* non-zero.
|
||||
*/
|
||||
static int try_set_system_oom(void)
|
||||
{
|
||||
struct zone *zone;
|
||||
int ret = 1;
|
||||
|
||||
spin_lock(&zone_scan_lock);
|
||||
for_each_populated_zone(zone)
|
||||
if (zone_is_oom_locked(zone)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
for_each_populated_zone(zone)
|
||||
zone_set_flag(zone, ZONE_OOM_LOCKED);
|
||||
out:
|
||||
spin_unlock(&zone_scan_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
|
||||
* attempts or page faults may now recall the oom killer, if necessary.
|
||||
*/
|
||||
static void clear_system_oom(void)
|
||||
{
|
||||
struct zone *zone;
|
||||
|
||||
spin_lock(&zone_scan_lock);
|
||||
for_each_populated_zone(zone)
|
||||
zone_clear_flag(zone, ZONE_OOM_LOCKED);
|
||||
spin_unlock(&zone_scan_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* out_of_memory - kill the "best" process when we run out of memory
|
||||
* @zonelist: zonelist pointer
|
||||
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
return;
|
||||
|
||||
/*
|
||||
* If current has a pending SIGKILL, then automatically select it. The
|
||||
* goal is to allow it to allocate so that it may quickly exit and free
|
||||
* its memory.
|
||||
* If current has a pending SIGKILL or is exiting, then automatically
|
||||
* select it. The goal is to allow it to allocate so that it may
|
||||
* quickly exit and free its memory.
|
||||
*/
|
||||
if (fatal_signal_pending(current)) {
|
||||
if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
|
||||
set_thread_flag(TIF_MEMDIE);
|
||||
return;
|
||||
}
|
||||
@@ -756,15 +671,16 @@ out:
|
||||
|
||||
/*
|
||||
* The pagefault handler calls here because it is out of memory, so kill a
|
||||
* memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
|
||||
* oom killing is already in progress so do nothing. If a task is found with
|
||||
* TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
|
||||
* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
|
||||
* parallel oom killing is already in progress so do nothing.
|
||||
*/
|
||||
void pagefault_out_of_memory(void)
|
||||
{
|
||||
if (try_set_system_oom()) {
|
||||
struct zonelist *zonelist = node_zonelist(first_online_node,
|
||||
GFP_KERNEL);
|
||||
|
||||
if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
|
||||
out_of_memory(NULL, 0, 0, NULL, false);
|
||||
clear_system_oom();
|
||||
clear_zonelist_oom(zonelist, GFP_KERNEL);
|
||||
}
|
||||
schedule_timeout_killable(1);
|
||||
}
|
||||
|
mm/page-writeback.c
@@ -200,6 +200,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
|
||||
x += zone_page_state(z, NR_FREE_PAGES) +
|
||||
zone_reclaimable_pages(z) - z->dirty_balance_reserve;
|
||||
}
|
||||
/*
|
||||
* Unreclaimable memory (kernel memory or anonymous memory
|
||||
* without swap) can bring down the dirtyable pages below
|
||||
* the zone's dirty balance reserve and the above calculation
|
||||
* will underflow. However we still want to add in nodes
|
||||
* which are below threshold (negative values) to get a more
|
||||
* accurate calculation but make sure that the total never
|
||||
* underflows.
|
||||
*/
|
||||
if ((long)x < 0)
|
||||
x = 0;
|
||||
|
||||
/*
|
||||
* Make sure that the number of highmem pages is never larger
|
||||
* than the number of the total dirtyable memory. This can only
|
||||
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void)
|
||||
{
|
||||
unsigned long x;
|
||||
|
||||
x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
|
||||
dirty_balance_reserve;
|
||||
x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
|
||||
x -= min(x, dirty_balance_reserve);
|
||||
|
||||
if (!vm_highmem_is_dirtyable)
|
||||
x -= highmem_dirtyable_memory(x);
|
||||
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
|
||||
* highmem zone can hold its share of dirty pages, so we don't
|
||||
* care about vm_highmem_is_dirtyable here.
|
||||
*/
|
||||
return zone_page_state(zone, NR_FREE_PAGES) +
|
||||
zone_reclaimable_pages(zone) -
|
||||
zone->dirty_balance_reserve;
|
||||
unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
|
||||
zone_reclaimable_pages(zone);
|
||||
|
||||
/* don't allow this to underflow */
|
||||
nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
|
||||
return nr_pages;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1069,7 +1084,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
|
||||
}
|
||||
|
||||
/*
|
||||
* After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
|
||||
* After a task dirtied this many pages, balance_dirty_pages_ratelimited()
|
||||
* will look to see if it needs to start dirty throttling.
|
||||
*
|
||||
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
|
||||
@@ -1436,9 +1451,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
|
||||
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
|
||||
|
||||
/**
|
||||
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
|
||||
* balance_dirty_pages_ratelimited - balance dirty memory state
|
||||
* @mapping: address_space which was dirtied
|
||||
* @nr_pages_dirtied: number of pages which the caller has just dirtied
|
||||
*
|
||||
* Processes which are dirtying memory should call in here once for each page
|
||||
* which was newly dirtied. The function will periodically check the system's
|
||||
@@ -1449,8 +1463,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
|
||||
* limit we decrease the ratelimiting by a lot, to prevent individual processes
|
||||
* from overshooting the limit by (ratelimit_pages) each.
|
||||
*/
|
||||
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
|
||||
unsigned long nr_pages_dirtied)
|
||||
void balance_dirty_pages_ratelimited(struct address_space *mapping)
|
||||
{
|
||||
struct backing_dev_info *bdi = mapping->backing_dev_info;
|
||||
int ratelimit;
|
||||
@@ -1484,6 +1497,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
|
||||
*/
|
||||
p = &__get_cpu_var(dirty_throttle_leaks);
|
||||
if (*p > 0 && current->nr_dirtied < ratelimit) {
|
||||
unsigned long nr_pages_dirtied;
|
||||
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
|
||||
*p -= nr_pages_dirtied;
|
||||
current->nr_dirtied += nr_pages_dirtied;
|
||||
@@ -1493,7 +1507,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
|
||||
if (unlikely(current->nr_dirtied >= ratelimit))
|
||||
balance_dirty_pages(mapping, current->nr_dirtied);
|
||||
}
|
||||
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
|
||||
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
|
||||
|
||||
void throttle_vm_writeout(gfp_t gfp_mask)
|
||||
{
|
||||
|
421
mm/page_alloc.c
@@ -89,6 +89,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
|
||||
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
[N_HIGH_MEMORY] = { { [0] = 1UL } },
|
||||
#endif
|
||||
#ifdef CONFIG_MOVABLE_NODE
|
||||
[N_MEMORY] = { { [0] = 1UL } },
|
||||
#endif
|
||||
[N_CPU] = { { [0] = 1UL } },
|
||||
#endif /* NUMA */
|
||||
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
|
||||
|
||||
int page_group_by_mobility_disabled __read_mostly;
|
||||
|
||||
/*
|
||||
* NOTE:
|
||||
* Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
|
||||
* Instead, use {un}set_pageblock_isolate.
|
||||
*/
|
||||
void set_pageblock_migratetype(struct page *page, int migratetype)
|
||||
{
|
||||
|
||||
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
|
||||
int nr_pages = 1 << order;
|
||||
int bad = 0;
|
||||
|
||||
if (unlikely(compound_order(page) != order) ||
|
||||
unlikely(!PageHead(page))) {
|
||||
if (unlikely(compound_order(page) != order)) {
|
||||
bad_page(page);
|
||||
bad++;
|
||||
}
|
||||
@@ -523,7 +520,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
|
||||
* If a block is freed, and its buddy is also free, then this
|
||||
* triggers coalescing into a block of larger size.
|
||||
*
|
||||
* -- wli
|
||||
* -- nyc
|
||||
*/
|
||||
|
||||
static inline void __free_one_page(struct page *page,
|
||||
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page)
|
||||
bad_page(page);
|
||||
return 1;
|
||||
}
|
||||
reset_page_last_nid(page);
|
||||
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
|
||||
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
return 0;
|
||||
@@ -667,11 +665,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
|
||||
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
|
||||
__free_one_page(page, zone, 0, mt);
|
||||
trace_mm_page_pcpu_drain(page, 0, mt);
|
||||
if (is_migrate_cma(mt))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
|
||||
if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
|
||||
__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
|
||||
if (is_migrate_cma(mt))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
|
||||
}
|
||||
} while (--to_free && --batch_free && !list_empty(list));
|
||||
}
|
||||
__mod_zone_page_state(zone, NR_FREE_PAGES, count);
|
||||
spin_unlock(&zone->lock);
|
||||
}
|
||||
|
||||
@@ -730,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read access to zone->managed_pages is safe because it's unsigned long,
|
||||
* but we still need to serialize writers. Currently all callers of
|
||||
* __free_pages_bootmem() except put_page_bootmem() should only be used
|
||||
* at boot time. So for shorter boot time, we shift the burden to
|
||||
* put_page_bootmem() to serialize writers.
|
||||
*/
|
||||
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
|
||||
{
|
||||
unsigned int nr_pages = 1 << order;
|
||||
@@ -745,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
|
||||
set_page_count(p, 0);
|
||||
}
|
||||
|
||||
page_zone(page)->managed_pages += 1 << order;
|
||||
set_page_refcounted(page);
|
||||
__free_pages(page, order);
|
||||
}
|
||||
@@ -780,7 +788,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
|
||||
* large block of memory acted on by a series of small allocations.
|
||||
* This behavior is a critical factor in sglist merging's success.
|
||||
*
|
||||
* -- wli
|
||||
* -- nyc
|
||||
*/
|
||||
static inline void expand(struct zone *zone, struct page *page,
|
||||
int low, int high, struct free_area *area,
|
||||
@@ -1376,14 +1384,8 @@ void split_page(struct page *page, unsigned int order)
|
||||
set_page_refcounted(page + i);
|
||||
}
|
||||
|
||||
/*
|
||||
* Similar to the split_page family of functions except that the page
|
||||
* required at the given order and being isolated now to prevent races
|
||||
* with parallel allocators
|
||||
*/
|
||||
int capture_free_page(struct page *page, int alloc_order, int migratetype)
|
||||
static int __isolate_free_page(struct page *page, unsigned int order)
|
||||
{
|
||||
unsigned int order;
|
||||
unsigned long watermark;
|
||||
struct zone *zone;
|
||||
int mt;
|
||||
@@ -1391,27 +1393,23 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
|
||||
BUG_ON(!PageBuddy(page));
|
||||
|
||||
zone = page_zone(page);
|
||||
order = page_order(page);
|
||||
mt = get_pageblock_migratetype(page);
|
||||
|
||||
/* Obey watermarks as if the page was being allocated */
|
||||
watermark = low_wmark_pages(zone) + (1 << order);
|
||||
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
|
||||
return 0;
|
||||
if (mt != MIGRATE_ISOLATE) {
|
||||
/* Obey watermarks as if the page was being allocated */
|
||||
watermark = low_wmark_pages(zone) + (1 << order);
|
||||
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
|
||||
return 0;
|
||||
|
||||
__mod_zone_freepage_state(zone, -(1UL << order), mt);
|
||||
}
|
||||
|
||||
/* Remove page from free list */
|
||||
list_del(&page->lru);
|
||||
zone->free_area[order].nr_free--;
|
||||
rmv_page_order(page);
|
||||
|
||||
mt = get_pageblock_migratetype(page);
|
||||
if (unlikely(mt != MIGRATE_ISOLATE))
|
||||
__mod_zone_freepage_state(zone, -(1UL << order), mt);
|
||||
|
||||
if (alloc_order != order)
|
||||
expand(zone, page, alloc_order, order,
|
||||
&zone->free_area[order], migratetype);
|
||||
|
||||
/* Set the pageblock if the captured page is at least a pageblock */
|
||||
/* Set the pageblock if the isolated page is at least a pageblock */
|
||||
if (order >= pageblock_order - 1) {
|
||||
struct page *endpage = page + (1 << order) - 1;
|
||||
for (; page < endpage; page += pageblock_nr_pages) {
|
||||
@@ -1440,10 +1438,9 @@ int split_free_page(struct page *page)
|
||||
unsigned int order;
|
||||
int nr_pages;
|
||||
|
||||
BUG_ON(!PageBuddy(page));
|
||||
order = page_order(page);
|
||||
|
||||
nr_pages = capture_free_page(page, order, 0);
|
||||
nr_pages = __isolate_free_page(page, order);
|
||||
if (!nr_pages)
|
||||
return 0;
|
||||
|
||||
@@ -1641,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
|
||||
{
|
||||
if (unlikely(zone->nr_pageblock_isolate))
|
||||
return zone->nr_pageblock_isolate * pageblock_nr_pages;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
int classzone_idx, int alloc_flags)
|
||||
{
|
||||
@@ -1670,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
|
||||
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
|
||||
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
|
||||
|
||||
/*
|
||||
* If the zone has MIGRATE_ISOLATE type free pages, we should consider
|
||||
* it. nr_zone_isolate_freepages is never accurate so kswapd might not
|
||||
* sleep although it could do so. But this is more desirable for memory
|
||||
* hotplug than sleeping which can cause a livelock in the direct
|
||||
* reclaim path.
|
||||
*/
|
||||
free_pages -= nr_zone_isolate_freepages(z);
|
||||
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
|
||||
free_pages);
|
||||
}
|
||||
@@ -1692,7 +1667,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
|
||||
*
|
||||
* If the zonelist cache is present in the passed in zonelist, then
|
||||
* returns a pointer to the allowed node mask (either the current
|
||||
* tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
|
||||
* tasks mems_allowed, or node_states[N_MEMORY].)
|
||||
*
|
||||
* If the zonelist cache is not available for this zonelist, does
|
||||
* nothing and returns NULL.
|
||||
@@ -1721,7 +1696,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
|
||||
|
||||
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
|
||||
&cpuset_current_mems_allowed :
|
||||
&node_states[N_HIGH_MEMORY];
|
||||
&node_states[N_MEMORY];
|
||||
return allowednodes;
|
||||
}
|
||||
|
||||
@@ -1871,7 +1846,7 @@ zonelist_scan:
|
||||
*/
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
||||
high_zoneidx, nodemask) {
|
||||
if (NUMA_BUILD && zlc_active &&
|
||||
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
|
||||
!zlc_zone_worth_trying(zonelist, z, allowednodes))
|
||||
continue;
|
||||
if ((alloc_flags & ALLOC_CPUSET) &&
|
||||
@@ -1917,7 +1892,8 @@ zonelist_scan:
|
||||
classzone_idx, alloc_flags))
|
||||
goto try_this_zone;
|
||||
|
||||
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
|
||||
if (IS_ENABLED(CONFIG_NUMA) &&
|
||||
!did_zlc_setup && nr_online_nodes > 1) {
|
||||
/*
|
||||
* we do zlc_setup if there are multiple nodes
|
||||
* and before considering the first zone allowed
|
||||
@@ -1936,7 +1912,7 @@ zonelist_scan:
|
||||
* As we may have just activated ZLC, check if the first
|
||||
* eligible zone has failed zone_reclaim recently.
|
||||
*/
|
||||
if (NUMA_BUILD && zlc_active &&
|
||||
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
|
||||
!zlc_zone_worth_trying(zonelist, z, allowednodes))
|
||||
continue;
|
||||
|
||||
@@ -1962,11 +1938,11 @@ try_this_zone:
|
||||
if (page)
|
||||
break;
|
||||
this_zone_full:
|
||||
if (NUMA_BUILD)
|
||||
if (IS_ENABLED(CONFIG_NUMA))
|
||||
zlc_mark_zone_full(zonelist, z);
|
||||
}
|
||||
|
||||
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
|
||||
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
|
||||
/* Disable zlc cache for second zonelist scan */
|
||||
zlc_active = 0;
|
||||
goto zonelist_scan;
|
||||
@@ -2148,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
bool *contended_compaction, bool *deferred_compaction,
|
||||
unsigned long *did_some_progress)
|
||||
{
|
||||
struct page *page = NULL;
|
||||
|
||||
if (!order)
|
||||
return NULL;
|
||||
|
||||
@@ -2161,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
current->flags |= PF_MEMALLOC;
|
||||
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
|
||||
nodemask, sync_migration,
|
||||
contended_compaction, &page);
|
||||
contended_compaction);
|
||||
current->flags &= ~PF_MEMALLOC;
|
||||
|
||||
/* If compaction captured a page, prep and use it */
|
||||
if (page) {
|
||||
prep_new_page(page, order, gfp_mask);
|
||||
goto got_page;
|
||||
}
|
||||
|
||||
if (*did_some_progress != COMPACT_SKIPPED) {
|
||||
struct page *page;
|
||||
|
||||
/* Page migration frees to the PCP lists but we want merging */
|
||||
drain_pages(get_cpu());
|
||||
put_cpu();
|
||||
@@ -2180,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
alloc_flags & ~ALLOC_NO_WATERMARKS,
|
||||
preferred_zone, migratetype);
|
||||
if (page) {
|
||||
got_page:
|
||||
preferred_zone->compact_blockskip_flush = false;
|
||||
preferred_zone->compact_considered = 0;
|
||||
preferred_zone->compact_defer_shift = 0;
|
||||
@@ -2266,7 +2235,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
|
||||
return NULL;
|
||||
|
||||
/* After successful reclaim, reconsider all zones for allocation */
|
||||
if (NUMA_BUILD)
|
||||
if (IS_ENABLED(CONFIG_NUMA))
|
||||
zlc_clear_zones_full(zonelist);
|
||||
|
||||
retry:
|
||||
@@ -2412,12 +2381,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
* allowed per node queues are empty and that nodes are
|
||||
* over allocated.
|
||||
*/
|
||||
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
|
||||
if (IS_ENABLED(CONFIG_NUMA) &&
|
||||
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
|
||||
goto nopage;
|
||||
|
||||
restart:
|
||||
wake_all_kswapd(order, zonelist, high_zoneidx,
|
||||
zone_idx(preferred_zone));
|
||||
if (!(gfp_mask & __GFP_NO_KSWAPD))
|
||||
wake_all_kswapd(order, zonelist, high_zoneidx,
|
||||
zone_idx(preferred_zone));
|
||||
|
||||
/*
|
||||
* OK, we're below the kswapd watermark and have kicked background
|
||||
@@ -2494,7 +2465,7 @@ rebalance:
|
||||
* system then fail the allocation instead of entering direct reclaim.
|
||||
*/
|
||||
if ((deferred_compaction || contended_compaction) &&
|
||||
(gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
|
||||
(gfp_mask & __GFP_NO_KSWAPD))
|
||||
goto nopage;
|
||||
|
||||
/* Try direct reclaim and then allocating */
|
||||
@@ -2595,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
|
||||
int migratetype = allocflags_to_migratetype(gfp_mask);
|
||||
unsigned int cpuset_mems_cookie;
|
||||
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
|
||||
gfp_mask &= gfp_allowed_mask;
|
||||
|
||||
@@ -2613,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
|
||||
if (unlikely(!zonelist->_zonerefs->zone))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Will only have any effect when __GFP_KMEMCG is set. This is
|
||||
* verified in the (always inline) callee
|
||||
*/
|
||||
if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
|
||||
return NULL;
|
||||
|
||||
retry_cpuset:
|
||||
cpuset_mems_cookie = get_mems_allowed();
|
||||
|
||||
@@ -2648,6 +2627,8 @@ out:
|
||||
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
|
||||
goto retry_cpuset;
|
||||
|
||||
memcg_kmem_commit_charge(page, memcg, order);
|
||||
|
||||
return page;
|
||||
}
|
||||
EXPORT_SYMBOL(__alloc_pages_nodemask);
|
||||
@@ -2700,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order)
|
||||
|
||||
EXPORT_SYMBOL(free_pages);
|
||||
|
||||
/*
|
||||
* __free_memcg_kmem_pages and free_memcg_kmem_pages will free
|
||||
* pages allocated with __GFP_KMEMCG.
|
||||
*
|
||||
* Those pages are accounted to a particular memcg, embedded in the
|
||||
* corresponding page_cgroup. To avoid adding a hit in the allocator to search
|
||||
* for that information only to find out that it is NULL for users who have no
|
||||
* interest in that whatsoever, we provide these functions.
|
||||
*
|
||||
* The caller knows better which flags it relies on.
|
||||
*/
|
||||
void __free_memcg_kmem_pages(struct page *page, unsigned int order)
|
||||
{
|
||||
memcg_kmem_uncharge_pages(page, order);
|
||||
__free_pages(page, order);
|
||||
}
|
||||
|
||||
void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
|
||||
{
|
||||
if (addr != 0) {
|
||||
VM_BUG_ON(!virt_addr_valid((void *)addr));
|
||||
__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
|
||||
}
|
||||
}
|
||||
|
||||
static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
|
||||
{
|
||||
if (addr) {
|
||||
@@ -2818,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void)
|
||||
|
||||
static inline void show_node(struct zone *zone)
|
||||
{
|
||||
if (NUMA_BUILD)
|
||||
if (IS_ENABLED(CONFIG_NUMA))
|
||||
printk("Node %d ", zone_to_nid(zone));
|
||||
}
|
||||
|
||||
@@ -2876,6 +2882,31 @@ out:
|
||||
|
||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||
|
||||
static void show_migration_types(unsigned char type)
|
||||
{
|
||||
static const char types[MIGRATE_TYPES] = {
|
||||
[MIGRATE_UNMOVABLE] = 'U',
|
||||
[MIGRATE_RECLAIMABLE] = 'E',
|
||||
[MIGRATE_MOVABLE] = 'M',
|
||||
[MIGRATE_RESERVE] = 'R',
|
||||
#ifdef CONFIG_CMA
|
||||
[MIGRATE_CMA] = 'C',
|
||||
#endif
|
||||
[MIGRATE_ISOLATE] = 'I',
|
||||
};
|
||||
char tmp[MIGRATE_TYPES + 1];
|
||||
char *p = tmp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < MIGRATE_TYPES; i++) {
|
||||
if (type & (1 << i))
|
||||
*p++ = types[i];
|
||||
}
|
||||
|
||||
*p = '\0';
|
||||
printk("(%s) ", tmp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Show free area list (used inside shift_scroll-lock stuff)
|
||||
* We also calculate the percentage fragmentation. We do this by counting the
|
||||
@@ -2950,6 +2981,7 @@ void show_free_areas(unsigned int filter)
|
||||
" isolated(anon):%lukB"
|
||||
" isolated(file):%lukB"
|
||||
" present:%lukB"
|
||||
" managed:%lukB"
|
||||
" mlocked:%lukB"
|
||||
" dirty:%lukB"
|
||||
" writeback:%lukB"
|
||||
@@ -2979,6 +3011,7 @@ void show_free_areas(unsigned int filter)
|
||||
K(zone_page_state(zone, NR_ISOLATED_ANON)),
|
||||
K(zone_page_state(zone, NR_ISOLATED_FILE)),
|
||||
K(zone->present_pages),
|
||||
K(zone->managed_pages),
|
||||
K(zone_page_state(zone, NR_MLOCK)),
|
||||
K(zone_page_state(zone, NR_FILE_DIRTY)),
|
||||
K(zone_page_state(zone, NR_WRITEBACK)),
|
||||
@@ -3004,6 +3037,7 @@ void show_free_areas(unsigned int filter)
|
||||
|
||||
for_each_populated_zone(zone) {
|
||||
unsigned long nr[MAX_ORDER], flags, order, total = 0;
|
||||
unsigned char types[MAX_ORDER];
|
||||
|
||||
if (skip_free_areas_node(filter, zone_to_nid(zone)))
|
||||
continue;
|
||||
@@ -3012,12 +3046,24 @@ void show_free_areas(unsigned int filter)
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
for (order = 0; order < MAX_ORDER; order++) {
|
||||
nr[order] = zone->free_area[order].nr_free;
|
||||
struct free_area *area = &zone->free_area[order];
|
||||
int type;
|
||||
|
||||
nr[order] = area->nr_free;
|
||||
total += nr[order] << order;
|
||||
|
||||
types[order] = 0;
|
||||
for (type = 0; type < MIGRATE_TYPES; type++) {
|
||||
if (!list_empty(&area->free_list[type]))
|
||||
types[order] |= 1 << type;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
for (order = 0; order < MAX_ORDER; order++)
|
||||
for (order = 0; order < MAX_ORDER; order++) {
|
||||
printk("%lu*%lukB ", nr[order], K(1UL) << order);
|
||||
if (nr[order])
|
||||
show_migration_types(types[order]);
|
||||
}
|
||||
printk("= %lukB\n", K(total));
|
||||
}
|
||||
|
||||
@@ -3194,7 +3240,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
|
||||
return node;
|
||||
}
|
||||
|
||||
for_each_node_state(n, N_HIGH_MEMORY) {
|
||||
for_each_node_state(n, N_MEMORY) {
|
||||
|
||||
/* Don't want a node to appear more than once */
|
||||
if (node_isset(n, *used_node_mask))
|
||||
@@ -3336,7 +3382,7 @@ static int default_zonelist_order(void)
|
||||
* local memory, NODE_ORDER may be suitable.
|
||||
*/
|
||||
average_size = total_size /
|
||||
(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
|
||||
(nodes_weight(node_states[N_MEMORY]) + 1);
|
||||
for_each_online_node(nid) {
|
||||
low_kmem_size = 0;
|
||||
total_size = 0;
|
||||
@@ -3826,6 +3872,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
|
||||
mminit_verify_page_links(page, zone, nid, pfn);
|
||||
init_page_count(page);
|
||||
reset_page_mapcount(page);
|
||||
reset_page_last_nid(page);
|
||||
SetPageReserved(page);
|
||||
/*
|
||||
* Mark the block movable so that blocks are reserved for
|
||||
@@ -4432,6 +4479,26 @@ void __init set_pageblock_order(void)
|
||||
|
||||
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
|
||||
|
||||
static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
|
||||
unsigned long present_pages)
|
||||
{
|
||||
unsigned long pages = spanned_pages;
|
||||
|
||||
/*
|
||||
* Provide a more accurate estimation if there are holes within
|
||||
* the zone and SPARSEMEM is in use. If there are holes within the
|
||||
* zone, each populated memory region may cost us one or two extra
|
||||
* memmap pages due to alignment because memmap pages for each
|
||||
* populated regions may not naturally algined on page boundary.
|
||||
* So the (present_pages >> 4) heuristic is a tradeoff for that.
|
||||
*/
|
||||
if (spanned_pages > present_pages + (present_pages >> 4) &&
|
||||
IS_ENABLED(CONFIG_SPARSEMEM))
|
||||
pages = present_pages;
|
||||
|
||||
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up the zone data structures:
|
||||
* - mark all pages reserved
|
||||
@@ -4449,54 +4516,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
int ret;
|
||||
|
||||
pgdat_resize_init(pgdat);
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
spin_lock_init(&pgdat->numabalancing_migrate_lock);
|
||||
pgdat->numabalancing_migrate_nr_pages = 0;
|
||||
pgdat->numabalancing_migrate_next_window = jiffies;
|
||||
#endif
|
||||
init_waitqueue_head(&pgdat->kswapd_wait);
|
||||
init_waitqueue_head(&pgdat->pfmemalloc_wait);
|
||||
pgdat_page_cgroup_init(pgdat);
|
||||
|
||||
for (j = 0; j < MAX_NR_ZONES; j++) {
|
||||
struct zone *zone = pgdat->node_zones + j;
|
||||
unsigned long size, realsize, memmap_pages;
|
||||
unsigned long size, realsize, freesize, memmap_pages;
|
||||
|
||||
size = zone_spanned_pages_in_node(nid, j, zones_size);
|
||||
realsize = size - zone_absent_pages_in_node(nid, j,
|
||||
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
|
||||
zholes_size);
|
||||
|
||||
/*
|
||||
* Adjust realsize so that it accounts for how much memory
|
||||
* Adjust freesize so that it accounts for how much memory
|
||||
* is used by this zone for memmap. This affects the watermark
|
||||
* and per-cpu initialisations
|
||||
*/
|
||||
memmap_pages =
|
||||
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
|
||||
if (realsize >= memmap_pages) {
|
||||
realsize -= memmap_pages;
|
||||
memmap_pages = calc_memmap_size(size, realsize);
|
||||
if (freesize >= memmap_pages) {
|
||||
freesize -= memmap_pages;
|
||||
if (memmap_pages)
|
||||
printk(KERN_DEBUG
|
||||
" %s zone: %lu pages used for memmap\n",
|
||||
zone_names[j], memmap_pages);
|
||||
} else
|
||||
printk(KERN_WARNING
|
||||
" %s zone: %lu pages exceeds realsize %lu\n",
|
||||
zone_names[j], memmap_pages, realsize);
|
||||
" %s zone: %lu pages exceeds freesize %lu\n",
|
||||
zone_names[j], memmap_pages, freesize);
|
||||
|
||||
/* Account for reserved pages */
|
||||
if (j == 0 && realsize > dma_reserve) {
|
||||
realsize -= dma_reserve;
|
||||
if (j == 0 && freesize > dma_reserve) {
|
||||
freesize -= dma_reserve;
|
||||
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
|
||||
zone_names[0], dma_reserve);
|
||||
}
|
||||
|
||||
if (!is_highmem_idx(j))
|
||||
nr_kernel_pages += realsize;
|
||||
nr_all_pages += realsize;
|
||||
nr_kernel_pages += freesize;
|
||||
/* Charge for highmem memmap if there are enough kernel pages */
|
||||
else if (nr_kernel_pages > memmap_pages * 2)
|
||||
nr_kernel_pages -= memmap_pages;
|
||||
nr_all_pages += freesize;
|
||||
|
||||
zone->spanned_pages = size;
|
||||
zone->present_pages = realsize;
|
||||
zone->present_pages = freesize;
|
||||
/*
|
||||
* Set an approximate value for lowmem here, it will be adjusted
|
||||
* when the bootmem allocator frees pages into the buddy system.
|
||||
* And all highmem pages will be managed by the buddy system.
|
||||
*/
|
||||
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
|
||||
#ifdef CONFIG_NUMA
|
||||
zone->node = nid;
|
||||
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
|
||||
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
|
||||
/ 100;
|
||||
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
|
||||
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
|
||||
#endif
|
||||
zone->name = zone_names[j];
|
||||
spin_lock_init(&zone->lock);
|
||||
@@ -4687,7 +4767,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
|
||||
/*
|
||||
* early_calculate_totalpages()
|
||||
* Sum pages in active regions for movable zone.
|
||||
* Populate N_HIGH_MEMORY for calculating usable_nodes.
|
||||
* Populate N_MEMORY for calculating usable_nodes.
|
||||
*/
|
||||
static unsigned long __init early_calculate_totalpages(void)
|
||||
{
|
||||
@@ -4700,7 +4780,7 @@ static unsigned long __init early_calculate_totalpages(void)
|
||||
|
||||
totalpages += pages;
|
||||
if (pages)
|
||||
node_set_state(nid, N_HIGH_MEMORY);
|
||||
node_set_state(nid, N_MEMORY);
|
||||
}
|
||||
return totalpages;
|
||||
}
|
||||
@@ -4717,9 +4797,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
|
||||
unsigned long usable_startpfn;
|
||||
unsigned long kernelcore_node, kernelcore_remaining;
|
||||
/* save the state before borrow the nodemask */
|
||||
nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
|
||||
nodemask_t saved_node_state = node_states[N_MEMORY];
|
||||
unsigned long totalpages = early_calculate_totalpages();
|
||||
int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
|
||||
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
|
||||
|
||||
/*
|
||||
* If movablecore was specified, calculate what size of
|
||||
@@ -4754,7 +4834,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
|
||||
restart:
|
||||
/* Spread kernelcore memory as evenly as possible throughout nodes */
|
||||
kernelcore_node = required_kernelcore / usable_nodes;
|
||||
for_each_node_state(nid, N_HIGH_MEMORY) {
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
unsigned long start_pfn, end_pfn;
|
||||
|
||||
/*
|
||||
@@ -4846,23 +4926,27 @@ restart:
|
||||
|
||||
out:
|
||||
/* restore the node_state */
|
||||
node_states[N_HIGH_MEMORY] = saved_node_state;
|
||||
node_states[N_MEMORY] = saved_node_state;
|
||||
}
|
||||
|
||||
/* Any regular memory on that node ? */
|
||||
static void __init check_for_regular_memory(pg_data_t *pgdat)
|
||||
/* Any regular or high memory on that node ? */
|
||||
static void check_for_memory(pg_data_t *pgdat, int nid)
|
||||
{
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
enum zone_type zone_type;
|
||||
|
||||
for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
|
||||
if (N_MEMORY == N_NORMAL_MEMORY)
|
||||
return;
|
||||
|
||||
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
|
||||
struct zone *zone = &pgdat->node_zones[zone_type];
|
||||
if (zone->present_pages) {
|
||||
node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
|
||||
node_set_state(nid, N_HIGH_MEMORY);
|
||||
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
|
||||
zone_type <= ZONE_NORMAL)
|
||||
node_set_state(nid, N_NORMAL_MEMORY);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -4945,8 +5029,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
|
||||
|
||||
/* Any memory on that node */
|
||||
if (pgdat->node_present_pages)
|
||||
node_set_state(nid, N_HIGH_MEMORY);
|
||||
check_for_regular_memory(pgdat);
|
||||
node_set_state(nid, N_MEMORY);
|
||||
check_for_memory(pgdat, nid);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5174,10 +5258,6 @@ static void __setup_per_zone_wmarks(void)
|
||||
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
|
||||
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
|
||||
|
||||
zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
|
||||
zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
|
||||
zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
|
||||
|
||||
setup_zone_migrate_reserve(zone);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
}
|
||||
@@ -5505,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
|
||||
pfn &= (PAGES_PER_SECTION-1);
|
||||
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
|
||||
#else
|
||||
pfn = pfn - zone->zone_start_pfn;
|
||||
pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
|
||||
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
|
||||
#endif /* CONFIG_SPARSEMEM */
|
||||
}
|
||||
@@ -5575,7 +5655,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
|
||||
* MIGRATE_MOVABLE block might include unmovable pages. It means you can't
|
||||
* expect this function should be exact.
|
||||
*/
|
||||
bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
|
||||
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
|
||||
bool skip_hwpoisoned_pages)
|
||||
{
|
||||
unsigned long pfn, iter, found;
|
||||
int mt;
|
||||
@@ -5610,6 +5691,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* The HWPoisoned page may be not in buddy system, and
|
||||
* page_count() is not 0.
|
||||
*/
|
||||
if (skip_hwpoisoned_pages && PageHWPoison(page))
|
||||
continue;
|
||||
|
||||
if (!PageLRU(page))
|
||||
found++;
|
||||
/*
|
||||
@@ -5652,7 +5740,7 @@ bool is_pageblock_removable_nolock(struct page *page)
|
||||
zone->zone_start_pfn + zone->spanned_pages <= pfn)
|
||||
return false;
|
||||
|
||||
return !has_unmovable_pages(zone, page, 0);
|
||||
return !has_unmovable_pages(zone, page, 0, true);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
@@ -5679,7 +5767,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
|
||||
unsigned int tries = 0;
|
||||
int ret = 0;
|
||||
|
||||
migrate_prep_local();
|
||||
migrate_prep();
|
||||
|
||||
while (pfn < end || !list_empty(&cc->migratepages)) {
|
||||
if (fatal_signal_pending(current)) {
|
||||
@@ -5707,61 +5795,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
|
||||
|
||||
ret = migrate_pages(&cc->migratepages,
|
||||
alloc_migrate_target,
|
||||
0, false, MIGRATE_SYNC);
|
||||
0, false, MIGRATE_SYNC,
|
||||
MR_CMA);
|
||||
}
|
||||
|
||||
putback_lru_pages(&cc->migratepages);
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
return ret > 0 ? 0 : ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update zone's cma pages counter used for watermark level calculation.
|
||||
*/
|
||||
static inline void __update_cma_watermarks(struct zone *zone, int count)
|
||||
{
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
zone->min_cma_pages += count;
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
setup_per_zone_wmarks();
|
||||
}
|
||||
|
||||
/*
|
||||
* Trigger memory pressure bump to reclaim some pages in order to be able to
|
||||
* allocate 'count' pages in single page units. Does similar work as
|
||||
*__alloc_pages_slowpath() function.
|
||||
*/
|
||||
static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
|
||||
{
|
||||
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
|
||||
struct zonelist *zonelist = node_zonelist(0, gfp_mask);
|
||||
int did_some_progress = 0;
|
||||
int order = 1;
|
||||
|
||||
/*
|
||||
* Increase level of watermarks to force kswapd do his job
|
||||
* to stabilise at new watermark level.
|
||||
*/
|
||||
__update_cma_watermarks(zone, count);
|
||||
|
||||
/* Obey watermarks as if the page was being allocated */
|
||||
while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
|
||||
wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
|
||||
|
||||
did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
|
||||
NULL);
|
||||
if (!did_some_progress) {
|
||||
/* Exhausted what can be done so it's blamo time */
|
||||
out_of_memory(zonelist, gfp_mask, order, NULL, false);
|
||||
}
|
||||
}
|
||||
|
||||
/* Restore original watermark levels. */
|
||||
__update_cma_watermarks(zone, -count);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* alloc_contig_range() -- tries to allocate given range of pages
|
||||
* @start: start PFN to allocate
|
||||
@@ -5785,7 +5826,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
|
||||
int alloc_contig_range(unsigned long start, unsigned long end,
|
||||
unsigned migratetype)
|
||||
{
|
||||
struct zone *zone = page_zone(pfn_to_page(start));
|
||||
unsigned long outer_start, outer_end;
|
||||
int ret = 0, order;
|
||||
|
||||
@@ -5823,7 +5863,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
|
||||
*/
|
||||
|
||||
ret = start_isolate_page_range(pfn_max_align_down(start),
|
||||
pfn_max_align_up(end), migratetype);
|
||||
pfn_max_align_up(end), migratetype,
|
||||
false);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -5862,18 +5903,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
|
||||
}
|
||||
|
||||
/* Make sure the range is really isolated. */
|
||||
if (test_pages_isolated(outer_start, end)) {
|
||||
if (test_pages_isolated(outer_start, end, false)) {
|
||||
pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
|
||||
outer_start, end);
|
||||
ret = -EBUSY;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reclaim enough pages to make sure that contiguous allocation
|
||||
* will not starve the system.
|
||||
*/
|
||||
__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
|
||||
|
||||
/* Grab isolated pages from freelists. */
|
||||
outer_end = isolate_freepages_range(&cc, outer_start, end);
|
||||
@@ -5896,8 +5932,15 @@ done:
|
||||
|
||||
void free_contig_range(unsigned long pfn, unsigned nr_pages)
|
||||
{
|
||||
for (; nr_pages--; ++pfn)
|
||||
__free_page(pfn_to_page(pfn));
|
||||
unsigned int count = 0;
|
||||
|
||||
for (; nr_pages--; pfn++) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
count += page_count(page) != 1;
|
||||
__free_page(page);
|
||||
}
|
||||
WARN(count != 0, "%d pages are still in use!\n", count);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -5931,7 +5974,6 @@ void __meminit zone_pcp_update(struct zone *zone)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
void zone_pcp_reset(struct zone *zone)
|
||||
{
|
||||
unsigned long flags;
|
||||
@@ -5951,6 +5993,7 @@ void zone_pcp_reset(struct zone *zone)
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
|
||||
* All pages in the range must be isolated before calling this.
|
||||
*/
|
||||
@@ -5977,6 +6020,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
|
||||
continue;
|
||||
}
|
||||
page = pfn_to_page(pfn);
|
||||
/*
|
||||
* The HWPoisoned page may be not in buddy system, and
|
||||
* page_count() is not 0.
|
||||
*/
|
||||
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
|
||||
pfn++;
|
||||
SetPageReserved(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
BUG_ON(page_count(page));
|
||||
BUG_ON(!PageBuddy(page));
|
||||
order = page_order(page);
|
||||
@@ -5987,8 +6040,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
|
||||
list_del(&page->lru);
|
||||
rmv_page_order(page);
|
||||
zone->free_area[order].nr_free--;
|
||||
__mod_zone_page_state(zone, NR_FREE_PAGES,
|
||||
- (1UL << order));
|
||||
for (i = 0; i < (1 << order); i++)
|
||||
SetPageReserved((page+i));
|
||||
pfn += (1 << order);
|
||||
|
mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
|
||||
mn->nr_pages, mn->status_change_nid);
|
||||
break;
|
||||
case MEM_CANCEL_ONLINE:
|
||||
offline_page_cgroup(mn->start_pfn,
|
||||
mn->nr_pages, mn->status_change_nid);
|
||||
break;
|
||||
case MEM_GOING_OFFLINE:
|
||||
break;
|
||||
case MEM_ONLINE:
|
||||
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
|
||||
for_each_node_state(nid, N_HIGH_MEMORY) {
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
unsigned long start_pfn, end_pfn;
|
||||
|
||||
start_pfn = node_start_pfn(nid);
|
||||
|
@@ -8,29 +8,7 @@
#include <linux/memory.h>
#include "internal.h"

/* called while holding zone->lock */
static void set_pageblock_isolate(struct page *page)
{
if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
return;

set_pageblock_migratetype(page, MIGRATE_ISOLATE);
page_zone(page)->nr_pageblock_isolate++;
}

/* called while holding zone->lock */
static void restore_pageblock_isolate(struct page *page, int migratetype)
{
struct zone *zone = page_zone(page);
if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
return;

BUG_ON(zone->nr_pageblock_isolate <= 0);
set_pageblock_migratetype(page, migratetype);
zone->nr_pageblock_isolate--;
}

int set_migratetype_isolate(struct page *page)
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -66,7 +44,8 @@ int set_migratetype_isolate(struct page *page)
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
if (!has_unmovable_pages(zone, page, arg.pages_found))
if (!has_unmovable_pages(zone, page, arg.pages_found,
skip_hwpoisoned_pages))
ret = 0;

/*
@@ -79,7 +58,7 @@ out:
unsigned long nr_pages;
int migratetype = get_pageblock_migratetype(page);

set_pageblock_isolate(page);
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);

__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -102,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
goto out;
nr_pages = move_freepages_block(zone, page, migratetype);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
restore_pageblock_isolate(page, migratetype);
set_pageblock_migratetype(page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -134,7 +113,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* Returns 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype)
unsigned migratetype, bool skip_hwpoisoned_pages)
{
unsigned long pfn;
unsigned long undo_pfn;
@@ -147,7 +126,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (page && set_migratetype_isolate(page)) {
if (page &&
set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
undo_pfn = pfn;
goto undo;
}
@@ -190,7 +170,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Returns 1 if all pages in the range are isolated.
*/
static int
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
struct page *page;

@@ -220,6 +201,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
else if (page_count(page) == 0 &&
get_freepage_migratetype(page) == MIGRATE_ISOLATE)
pfn += 1;
else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
/*
* The HWPoisoned page may be not in buddy
* system, and page_count() is not 0.
*/
pfn++;
continue;
}
else
break;
}
@@ -228,7 +217,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
return 1;
}

int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, flags;
struct page *page;
@@ -251,7 +241,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
/* Check all pages are free or Marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}

@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;

split_huge_page_pmd(walk->mm, pmd);
split_huge_page_pmd_mm(walk->mm, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);

@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
if (!chunk)
return;
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
kfree(chunk);
pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}

/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

static int __init percpu_alloc_setup(char *str)
{
if (!str)
return -EINVAL;

if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK

@@ -12,8 +12,8 @@

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
* Only sets the access flags (dirty, accessed, and
* writable). Furthermore, we know it always gets set to a "more
* Only sets the access flags (dirty, accessed), as well as write
* permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
* instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
flush_tlb_page(vma, address);
flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pte_t pte;
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
flush_tlb_page(vma, address);
if (pte_accessible(pte))
flush_tlb_page(vma, address);
return pte;
}
#endif

134
mm/rmap.c
@@ -24,7 +24,7 @@
|
||||
* mm->mmap_sem
|
||||
* page->flags PG_locked (lock_page)
|
||||
* mapping->i_mmap_mutex
|
||||
* anon_vma->mutex
|
||||
* anon_vma->rwsem
|
||||
* mm->page_table_lock or pte_lock
|
||||
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
|
||||
* swap_lock (in swap_duplicate, swap_info_get)
|
||||
@@ -37,7 +37,7 @@
|
||||
* in arch-dependent flush_dcache_mmap_lock,
|
||||
* within bdi.wb->list_lock in __sync_single_inode)
|
||||
*
|
||||
* anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
|
||||
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
|
||||
* ->tasklist_lock
|
||||
* pte map lock
|
||||
*/
|
||||
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
|
||||
VM_BUG_ON(atomic_read(&anon_vma->refcount));
|
||||
|
||||
/*
|
||||
* Synchronize against page_lock_anon_vma() such that
|
||||
* Synchronize against page_lock_anon_vma_read() such that
|
||||
* we can safely hold the lock without the anon_vma getting
|
||||
* freed.
|
||||
*
|
||||
* Relies on the full mb implied by the atomic_dec_and_test() from
|
||||
* put_anon_vma() against the acquire barrier implied by
|
||||
* mutex_trylock() from page_lock_anon_vma(). This orders:
|
||||
* down_read_trylock() from page_lock_anon_vma_read(). This orders:
|
||||
*
|
||||
* page_lock_anon_vma() VS put_anon_vma()
|
||||
* mutex_trylock() atomic_dec_and_test()
|
||||
* page_lock_anon_vma_read() VS put_anon_vma()
|
||||
* down_read_trylock() atomic_dec_and_test()
|
||||
* LOCK MB
|
||||
* atomic_read() mutex_is_locked()
|
||||
* atomic_read() rwsem_is_locked()
|
||||
*
|
||||
* LOCK should suffice since the actual taking of the lock must
|
||||
* happen _before_ what follows.
|
||||
*/
|
||||
if (mutex_is_locked(&anon_vma->root->mutex)) {
|
||||
anon_vma_lock(anon_vma);
|
||||
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_unlock(anon_vma);
|
||||
}
|
||||
|
||||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
|
||||
* allocate a new one.
|
||||
*
|
||||
* Anon-vma allocations are very subtle, because we may have
|
||||
* optimistically looked up an anon_vma in page_lock_anon_vma()
|
||||
* optimistically looked up an anon_vma in page_lock_anon_vma_read()
|
||||
* and that may actually touch the spinlock even in the newly
|
||||
* allocated vma (it depends on RCU to make sure that the
|
||||
* anon_vma isn't actually destroyed).
|
||||
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
|
||||
allocated = anon_vma;
|
||||
}
|
||||
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_write(anon_vma);
|
||||
/* page_table_lock to protect against threads */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (likely(!vma->anon_vma)) {
|
||||
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
|
||||
struct anon_vma *new_root = anon_vma->root;
|
||||
if (new_root != root) {
|
||||
if (WARN_ON_ONCE(root))
|
||||
mutex_unlock(&root->mutex);
|
||||
up_write(&root->rwsem);
|
||||
root = new_root;
|
||||
mutex_lock(&root->mutex);
|
||||
down_write(&root->rwsem);
|
||||
}
|
||||
return root;
|
||||
}
|
||||
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
|
||||
static inline void unlock_anon_vma_root(struct anon_vma *root)
|
||||
{
|
||||
if (root)
|
||||
mutex_unlock(&root->mutex);
|
||||
up_write(&root->rwsem);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
|
||||
get_anon_vma(anon_vma->root);
|
||||
/* Mark this anon_vma as the one where our new (COWed) pages go. */
|
||||
vma->anon_vma = anon_vma;
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_chain_link(vma, avc, anon_vma);
|
||||
anon_vma_unlock(anon_vma);
|
||||
|
||||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
|
||||
/*
|
||||
* Iterate the list once more, it now only contains empty and unlinked
|
||||
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
|
||||
* needing to acquire the anon_vma->root->mutex.
|
||||
* needing to write-acquire the anon_vma->root->rwsem.
|
||||
*/
|
||||
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
|
||||
struct anon_vma *anon_vma = avc->anon_vma;
|
||||
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
|
||||
{
|
||||
struct anon_vma *anon_vma = data;
|
||||
|
||||
mutex_init(&anon_vma->mutex);
|
||||
init_rwsem(&anon_vma->rwsem);
|
||||
atomic_set(&anon_vma->refcount, 0);
|
||||
anon_vma->rb_root = RB_ROOT;
|
||||
}
|
||||
@@ -442,7 +442,7 @@ out:
|
||||
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
|
||||
* reference like with page_get_anon_vma() and then block on the mutex.
|
||||
*/
|
||||
struct anon_vma *page_lock_anon_vma(struct page *page)
|
||||
struct anon_vma *page_lock_anon_vma_read(struct page *page)
|
||||
{
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
struct anon_vma *root_anon_vma;
|
||||
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
|
||||
|
||||
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
|
||||
root_anon_vma = ACCESS_ONCE(anon_vma->root);
|
||||
if (mutex_trylock(&root_anon_vma->mutex)) {
|
||||
if (down_read_trylock(&root_anon_vma->rwsem)) {
|
||||
/*
|
||||
* If the page is still mapped, then this anon_vma is still
|
||||
* its anon_vma, and holding the mutex ensures that it will
|
||||
* not go away, see anon_vma_free().
|
||||
*/
|
||||
if (!page_mapped(page)) {
|
||||
mutex_unlock(&root_anon_vma->mutex);
|
||||
up_read(&root_anon_vma->rwsem);
|
||||
anon_vma = NULL;
|
||||
}
|
||||
goto out;
|
||||
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
|
||||
|
||||
/* we pinned the anon_vma, its safe to sleep */
|
||||
rcu_read_unlock();
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_read(anon_vma);
|
||||
|
||||
if (atomic_dec_and_test(&anon_vma->refcount)) {
|
||||
/*
|
||||
* Oops, we held the last refcount, release the lock
|
||||
* and bail -- can't simply use put_anon_vma() because
|
||||
* we'll deadlock on the anon_vma_lock() recursion.
|
||||
* we'll deadlock on the anon_vma_lock_write() recursion.
|
||||
*/
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
__put_anon_vma(anon_vma);
|
||||
anon_vma = NULL;
|
||||
}
|
||||
@@ -504,9 +504,9 @@ out:
|
||||
return anon_vma;
|
||||
}
|
||||
|
||||
void page_unlock_anon_vma(struct anon_vma *anon_vma)
|
||||
void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
|
||||
{
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
|
||||
return address;
|
||||
}
|
||||
|
||||
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd = NULL;
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
goto out;
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
if (!pud_present(*pud))
|
||||
goto out;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (!pmd_present(*pmd))
|
||||
pmd = NULL;
|
||||
out:
|
||||
return pmd;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that @page is mapped at @address into @mm.
|
||||
*
|
||||
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
|
||||
pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
|
||||
unsigned long address, spinlock_t **ptlp, int sync)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
|
||||
goto check;
|
||||
}
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
pmd = mm_find_pmd(mm, address);
|
||||
if (!pmd)
|
||||
return NULL;
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
if (!pud_present(*pud))
|
||||
return NULL;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (!pmd_present(*pmd))
|
||||
return NULL;
|
||||
if (pmd_trans_huge(*pmd))
|
||||
return NULL;
|
||||
|
||||
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
|
||||
struct anon_vma_chain *avc;
|
||||
int referenced = 0;
|
||||
|
||||
anon_vma = page_lock_anon_vma(page);
|
||||
anon_vma = page_lock_anon_vma_read(page);
|
||||
if (!anon_vma)
|
||||
return referenced;
|
||||
|
||||
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
|
||||
break;
|
||||
}
|
||||
|
||||
page_unlock_anon_vma(anon_vma);
|
||||
page_unlock_anon_vma_read(anon_vma);
|
||||
return referenced;
|
||||
}
|
||||
|
||||
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
|
||||
* containing the swap entry, but page not yet written to swap.
|
||||
*
|
||||
* And we can skip it on file pages, so long as the filesystem
|
||||
* participates in dirty tracking; but need to catch shm and tmpfs
|
||||
* and ramfs pages which have been modified since creation by read
|
||||
* fault.
|
||||
* participates in dirty tracking (note that this is not only an
|
||||
* optimization but also solves problems caused by dirty flag in
|
||||
* storage key getting set by a write from inside kernel); but need to
|
||||
* catch shm and tmpfs and ramfs pages which have been modified since
|
||||
* creation by read fault.
|
||||
*
|
||||
* Note that mapping must be decided above, before decrementing
|
||||
* mapcount (which luckily provides a barrier): once page is unmapped,
|
||||
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
update_hiwater_rss(mm);
|
||||
|
||||
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
|
||||
if (PageAnon(page))
|
||||
dec_mm_counter(mm, MM_ANONPAGES);
|
||||
else
|
||||
dec_mm_counter(mm, MM_FILEPAGES);
|
||||
if (!PageHuge(page)) {
|
||||
if (PageAnon(page))
|
||||
dec_mm_counter(mm, MM_ANONPAGES);
|
||||
else
|
||||
dec_mm_counter(mm, MM_FILEPAGES);
|
||||
}
|
||||
set_pte_at(mm, address, pte,
|
||||
swp_entry_to_pte(make_hwpoison_entry(page)));
|
||||
swp_entry_to_pte(make_hwpoison_entry(page)));
|
||||
} else if (PageAnon(page)) {
|
||||
swp_entry_t entry = { .val = page_private(page) };
|
||||
|
||||
@@ -1299,7 +1315,7 @@ out_mlock:
|
||||
/*
|
||||
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
|
||||
* unstable result and race. Plus, We can't wait here because
|
||||
* we now hold anon_vma->mutex or mapping->i_mmap_mutex.
|
||||
* we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
|
||||
* if trylock failed, the page remain in evictable lru and later
|
||||
* vmscan could retry to move the page to unevictable lru if the
|
||||
* page is actually mlocked.
|
||||
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
|
||||
struct vm_area_struct *vma, struct page *check_page)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
pte_t pteval;
|
||||
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
|
||||
if (end > vma->vm_end)
|
||||
end = vma->vm_end;
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
return ret;
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
if (!pud_present(*pud))
|
||||
return ret;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (!pmd_present(*pmd))
|
||||
pmd = mm_find_pmd(mm, address);
|
||||
if (!pmd)
|
||||
return ret;
|
||||
|
||||
mmun_start = address;
|
||||
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
|
||||
struct anon_vma_chain *avc;
|
||||
int ret = SWAP_AGAIN;
|
||||
|
||||
anon_vma = page_lock_anon_vma(page);
|
||||
anon_vma = page_lock_anon_vma_read(page);
|
||||
if (!anon_vma)
|
||||
return ret;
|
||||
|
||||
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
|
||||
break;
|
||||
}
|
||||
|
||||
page_unlock_anon_vma(anon_vma);
|
||||
page_unlock_anon_vma_read(anon_vma);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
|
||||
int ret = SWAP_AGAIN;
|
||||
|
||||
/*
|
||||
* Note: remove_migration_ptes() cannot use page_lock_anon_vma()
|
||||
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
|
||||
* because that depends on page_mapped(); but not all its usages
|
||||
* are holding mmap_sem. Users without mmap_sem are required to
|
||||
* take a reference count to prevent the anon_vma disappearing
|
||||
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
|
||||
anon_vma = page_anon_vma(page);
|
||||
if (!anon_vma)
|
||||
return ret;
|
||||
anon_vma_lock(anon_vma);
|
||||
anon_vma_lock_read(anon_vma);
|
||||
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
|
||||
struct vm_area_struct *vma = avc->vma;
|
||||
unsigned long address = vma_address(page, vma);
|
||||
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
|
||||
if (ret != SWAP_AGAIN)
|
||||
break;
|
||||
}
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
146
mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
|
||||
if (!mpol || mpol->mode == MPOL_DEFAULT)
|
||||
return; /* show nothing */
|
||||
|
||||
mpol_to_str(buffer, sizeof(buffer), mpol, 1);
|
||||
mpol_to_str(buffer, sizeof(buffer), mpol);
|
||||
|
||||
seq_printf(seq, ",mpol=%s", buffer);
|
||||
}
|
||||
@@ -909,26 +909,9 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
|
||||
|
||||
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
|
||||
struct shmem_inode_info *info, pgoff_t index)
|
||||
{
|
||||
struct mempolicy mpol, *spol;
|
||||
struct vm_area_struct pvma;
|
||||
|
||||
spol = mpol_cond_copy(&mpol,
|
||||
mpol_shared_policy_lookup(&info->policy, index));
|
||||
|
||||
/* Create a pseudo vma that just contains the policy */
|
||||
pvma.vm_start = 0;
|
||||
/* Bias interleave by inode number to distribute better across nodes */
|
||||
pvma.vm_pgoff = index + info->vfs_inode.i_ino;
|
||||
pvma.vm_ops = NULL;
|
||||
pvma.vm_policy = spol;
|
||||
return swapin_readahead(swap, gfp, &pvma, 0);
|
||||
}
|
||||
|
||||
static struct page *shmem_alloc_page(gfp_t gfp,
|
||||
struct shmem_inode_info *info, pgoff_t index)
|
||||
{
|
||||
struct vm_area_struct pvma;
|
||||
struct page *page;
|
||||
|
||||
/* Create a pseudo vma that just contains the policy */
|
||||
pvma.vm_start = 0;
|
||||
@@ -937,10 +920,33 @@ static struct page *shmem_alloc_page(gfp_t gfp,
|
||||
pvma.vm_ops = NULL;
|
||||
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
|
||||
|
||||
/*
|
||||
* alloc_page_vma() will drop the shared policy reference
|
||||
*/
|
||||
return alloc_page_vma(gfp, &pvma, 0);
|
||||
page = swapin_readahead(swap, gfp, &pvma, 0);
|
||||
|
||||
/* Drop reference taken by mpol_shared_policy_lookup() */
|
||||
mpol_cond_put(pvma.vm_policy);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
static struct page *shmem_alloc_page(gfp_t gfp,
|
||||
struct shmem_inode_info *info, pgoff_t index)
|
||||
{
|
||||
struct vm_area_struct pvma;
|
||||
struct page *page;
|
||||
|
||||
/* Create a pseudo vma that just contains the policy */
|
||||
pvma.vm_start = 0;
|
||||
/* Bias interleave by inode number to distribute better across nodes */
|
||||
pvma.vm_pgoff = index + info->vfs_inode.i_ino;
|
||||
pvma.vm_ops = NULL;
|
||||
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
|
||||
|
||||
page = alloc_page_vma(gfp, &pvma, 0);
|
||||
|
||||
/* Drop reference taken by mpol_shared_policy_lookup() */
|
||||
mpol_cond_put(pvma.vm_policy);
|
||||
|
||||
return page;
|
||||
}
|
||||
#else /* !CONFIG_NUMA */
|
||||
#ifdef CONFIG_TMPFS
|
||||
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
|
||||
*/
|
||||
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
|
||||
pgoff_t index, pgoff_t end, int whence)
|
||||
{
|
||||
struct page *page;
|
||||
struct pagevec pvec;
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
bool done = false;
|
||||
int i;
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
pvec.nr = 1; /* start small: we may be there already */
|
||||
while (!done) {
|
||||
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
|
||||
pvec.nr, pvec.pages, indices);
|
||||
if (!pvec.nr) {
|
||||
if (whence == SEEK_DATA)
|
||||
index = end;
|
||||
break;
|
||||
}
|
||||
for (i = 0; i < pvec.nr; i++, index++) {
|
||||
if (index < indices[i]) {
|
||||
if (whence == SEEK_HOLE) {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
index = indices[i];
|
||||
}
|
||||
page = pvec.pages[i];
|
||||
if (page && !radix_tree_exceptional_entry(page)) {
|
||||
if (!PageUptodate(page))
|
||||
page = NULL;
|
||||
}
|
||||
if (index >= end ||
|
||||
(page && whence == SEEK_DATA) ||
|
||||
(!page && whence == SEEK_HOLE)) {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
shmem_deswap_pagevec(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
pvec.nr = PAGEVEC_SIZE;
|
||||
cond_resched();
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
|
||||
{
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
pgoff_t start, end;
|
||||
loff_t new_offset;
|
||||
|
||||
if (whence != SEEK_DATA && whence != SEEK_HOLE)
|
||||
return generic_file_llseek_size(file, offset, whence,
|
||||
MAX_LFS_FILESIZE, i_size_read(inode));
|
||||
mutex_lock(&inode->i_mutex);
|
||||
/* We're holding i_mutex so we can access i_size directly */
|
||||
|
||||
if (offset < 0)
|
||||
offset = -EINVAL;
|
||||
else if (offset >= inode->i_size)
|
||||
offset = -ENXIO;
|
||||
else {
|
||||
start = offset >> PAGE_CACHE_SHIFT;
|
||||
end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
||||
new_offset = shmem_seek_hole_data(mapping, start, end, whence);
|
||||
new_offset <<= PAGE_CACHE_SHIFT;
|
||||
if (new_offset > offset) {
|
||||
if (new_offset < inode->i_size)
|
||||
offset = new_offset;
|
||||
else if (whence == SEEK_DATA)
|
||||
offset = -ENXIO;
|
||||
else
|
||||
offset = inode->i_size;
|
||||
}
|
||||
}
|
||||
|
||||
if (offset >= 0 && offset != file->f_pos) {
|
||||
file->f_pos = offset;
|
||||
file->f_version = 0;
|
||||
}
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
return offset;
|
||||
}
|
||||
|
||||
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
|
||||
loff_t len)
|
||||
{
|
||||
@@ -2367,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
|
||||
if (!gid_valid(sbinfo->gid))
|
||||
goto bad_val;
|
||||
} else if (!strcmp(this_char,"mpol")) {
|
||||
if (mpol_parse_str(value, &sbinfo->mpol, 1))
|
||||
if (mpol_parse_str(value, &sbinfo->mpol))
|
||||
goto bad_val;
|
||||
} else {
|
||||
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
|
||||
@@ -2580,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
|
||||
static const struct file_operations shmem_file_operations = {
|
||||
.mmap = shmem_mmap,
|
||||
#ifdef CONFIG_TMPFS
|
||||
.llseek = generic_file_llseek,
|
||||
.llseek = shmem_file_llseek,
|
||||
.read = do_sync_read,
|
||||
.write = do_sync_write,
|
||||
.aio_read = shmem_file_aio_read,
|
||||
|
381
mm/slab.c
@@ -87,7 +87,6 @@
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include "slab.h"
|
||||
#include <linux/mm.h>
|
||||
#include <linux/poison.h>
|
||||
#include <linux/swap.h>
|
||||
@@ -128,6 +127,8 @@
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
#include "slab.h"
|
||||
|
||||
/*
|
||||
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
|
||||
* 0 for faster, smaller code (especially in the critical paths).
|
||||
@@ -162,23 +163,6 @@
|
||||
*/
|
||||
static bool pfmemalloc_active __read_mostly;
|
||||
|
||||
/* Legal flag mask for kmem_cache_create(). */
|
||||
#if DEBUG
|
||||
# define CREATE_MASK (SLAB_RED_ZONE | \
|
||||
SLAB_POISON | SLAB_HWCACHE_ALIGN | \
|
||||
SLAB_CACHE_DMA | \
|
||||
SLAB_STORE_USER | \
|
||||
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
|
||||
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
|
||||
SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
|
||||
#else
|
||||
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
|
||||
SLAB_CACHE_DMA | \
|
||||
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
|
||||
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
|
||||
SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* kmem_bufctl_t:
|
||||
*
|
||||
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
|
||||
#undef CACHE
|
||||
};
|
||||
|
||||
static struct arraycache_init initarray_cache __initdata =
|
||||
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
|
||||
static struct arraycache_init initarray_generic =
|
||||
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
|
||||
|
||||
/* internal cache of cache description objs */
|
||||
static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
|
||||
static struct kmem_cache kmem_cache_boot = {
|
||||
.nodelists = kmem_cache_nodelists,
|
||||
.batchcount = 1,
|
||||
.limit = BOOT_CPUCACHE_ENTRIES,
|
||||
.shared = 1,
|
||||
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
|
||||
}
|
||||
}
|
||||
|
||||
static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
|
||||
{
|
||||
struct kmem_list3 *l3;
|
||||
l3 = cachep->nodelists[q];
|
||||
if (!l3)
|
||||
return;
|
||||
|
||||
slab_set_lock_classes(cachep, &on_slab_l3_key,
|
||||
&on_slab_alc_key, q);
|
||||
}
|
||||
|
||||
static inline void on_slab_lock_classes(struct kmem_cache *cachep)
|
||||
{
|
||||
int node;
|
||||
|
||||
VM_BUG_ON(OFF_SLAB(cachep));
|
||||
for_each_node(node)
|
||||
on_slab_lock_classes_node(cachep, node);
|
||||
}
|
||||
|
||||
static inline void init_lock_keys(void)
|
||||
{
|
||||
int node;
|
||||
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void on_slab_lock_classes(struct kmem_cache *cachep)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
|
||||
{
|
||||
}
|
||||
|
||||
static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
|
||||
{
|
||||
}
|
||||
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
|
||||
free_alien_cache(alien);
|
||||
if (cachep->flags & SLAB_DEBUG_OBJECTS)
|
||||
slab_set_debugobj_lock_classes_node(cachep, node);
|
||||
else if (!OFF_SLAB(cachep) &&
|
||||
!(cachep->flags & SLAB_DESTROY_BY_RCU))
|
||||
on_slab_lock_classes_node(cachep, node);
|
||||
}
|
||||
init_node_lock_keys(node);
|
||||
|
||||
@@ -1576,29 +1587,34 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The memory after the last cpu cache pointer is used for the
|
||||
* the nodelists pointer.
|
||||
*/
|
||||
static void setup_nodelists_pointer(struct kmem_cache *cachep)
|
||||
{
|
||||
cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialisation. Called after the page allocator have been initialised and
|
||||
* before smp_init().
|
||||
*/
|
||||
void __init kmem_cache_init(void)
|
||||
{
|
||||
size_t left_over;
|
||||
struct cache_sizes *sizes;
|
||||
struct cache_names *names;
|
||||
int i;
|
||||
int order;
|
||||
int node;
|
||||
|
||||
kmem_cache = &kmem_cache_boot;
|
||||
setup_nodelists_pointer(kmem_cache);
|
||||
|
||||
if (num_possible_nodes() == 1)
|
||||
use_alien_caches = 0;
|
||||
|
||||
for (i = 0; i < NUM_INIT_LISTS; i++) {
|
||||
for (i = 0; i < NUM_INIT_LISTS; i++)
|
||||
kmem_list3_init(&initkmem_list3[i]);
|
||||
if (i < MAX_NUMNODES)
|
||||
kmem_cache->nodelists[i] = NULL;
|
||||
}
|
||||
|
||||
set_up_list3s(kmem_cache, CACHE_CACHE);
|
||||
|
||||
/*
|
||||
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
|
||||
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
|
||||
*/
|
||||
|
||||
node = numa_mem_id();
|
||||
|
||||
/* 1) create the kmem_cache */
|
||||
INIT_LIST_HEAD(&slab_caches);
|
||||
list_add(&kmem_cache->list, &slab_caches);
|
||||
kmem_cache->colour_off = cache_line_size();
|
||||
kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
|
||||
kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
|
||||
|
||||
/*
|
||||
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
|
||||
*/
|
||||
kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
|
||||
nr_node_ids * sizeof(struct kmem_list3 *);
|
||||
kmem_cache->object_size = kmem_cache->size;
|
||||
kmem_cache->size = ALIGN(kmem_cache->object_size,
|
||||
cache_line_size());
|
||||
kmem_cache->reciprocal_buffer_size =
|
||||
reciprocal_value(kmem_cache->size);
|
||||
|
||||
for (order = 0; order < MAX_ORDER; order++) {
|
||||
cache_estimate(order, kmem_cache->size,
|
||||
cache_line_size(), 0, &left_over, &kmem_cache->num);
|
||||
if (kmem_cache->num)
|
||||
break;
|
||||
}
|
||||
BUG_ON(!kmem_cache->num);
|
||||
kmem_cache->gfporder = order;
|
||||
kmem_cache->colour = left_over / kmem_cache->colour_off;
|
||||
kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
|
||||
sizeof(struct slab), cache_line_size());
|
||||
create_boot_cache(kmem_cache, "kmem_cache",
|
||||
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
|
||||
nr_node_ids * sizeof(struct kmem_list3 *),
|
||||
SLAB_HWCACHE_ALIGN);
|
||||
list_add(&kmem_cache->list, &slab_caches);
|
||||
|
||||
/* 2+3) create the kmalloc caches */
|
||||
sizes = malloc_sizes;
|
||||
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
|
||||
* bug.
|
||||
*/
|
||||
|
||||
sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
|
||||
sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
|
||||
sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
|
||||
sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
|
||||
__kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
|
||||
list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
|
||||
sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
|
||||
sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
|
||||
|
||||
if (INDEX_AC != INDEX_L3) {
|
||||
sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
|
||||
sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
|
||||
sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
|
||||
sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
|
||||
__kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
|
||||
list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
|
||||
}
|
||||
if (INDEX_AC != INDEX_L3)
|
||||
sizes[INDEX_L3].cs_cachep =
|
||||
create_kmalloc_cache(names[INDEX_L3].name,
|
||||
sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
|
||||
|
||||
slab_early_init = 0;
|
||||
|
||||
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
|
||||
* Note for systems short on memory removing the alignment will
|
||||
* allow tighter packing of the smaller caches.
|
||||
*/
|
||||
if (!sizes->cs_cachep) {
|
||||
sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
sizes->cs_cachep->name = names->name;
|
||||
sizes->cs_cachep->size = sizes->cs_size;
|
||||
sizes->cs_cachep->object_size = sizes->cs_size;
|
||||
sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
|
||||
__kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
|
||||
list_add(&sizes->cs_cachep->list, &slab_caches);
|
||||
}
|
||||
if (!sizes->cs_cachep)
|
||||
sizes->cs_cachep = create_kmalloc_cache(names->name,
|
||||
sizes->cs_size, ARCH_KMALLOC_FLAGS);
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
sizes->cs_dmacachep->name = names->name_dma;
|
||||
sizes->cs_dmacachep->size = sizes->cs_size;
|
||||
sizes->cs_dmacachep->object_size = sizes->cs_size;
|
||||
sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
|
||||
__kmem_cache_create(sizes->cs_dmacachep,
|
||||
ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
|
||||
list_add(&sizes->cs_dmacachep->list, &slab_caches);
|
||||
sizes->cs_dmacachep = create_kmalloc_cache(
|
||||
names->name_dma, sizes->cs_size,
|
||||
SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
|
||||
#endif
|
||||
sizes++;
|
||||
names++;
|
||||
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
|
||||
|
||||
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
|
||||
|
||||
BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
|
||||
memcpy(ptr, cpu_cache_get(kmem_cache),
|
||||
sizeof(struct arraycache_init));
|
||||
/*
|
||||
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
|
||||
if (page->pfmemalloc)
|
||||
SetPageSlabPfmemalloc(page + i);
|
||||
}
|
||||
memcg_bind_pages(cachep, cachep->gfporder);
|
||||
|
||||
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
|
||||
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
|
||||
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
|
||||
__ClearPageSlab(page);
|
||||
page++;
|
||||
}
|
||||
|
||||
memcg_release_pages(cachep, cachep->gfporder);
|
||||
if (current->reclaim_state)
|
||||
current->reclaim_state->reclaimed_slab += nr_freed;
|
||||
free_pages((unsigned long)addr, cachep->gfporder);
|
||||
free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
|
||||
}
|
||||
|
||||
static void kmem_rcu_free(struct rcu_head *head)
|
||||
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
|
||||
if (slab_state == DOWN) {
|
||||
/*
|
||||
* Note: the first kmem_cache_create must create the cache
|
||||
* Note: Creation of first cache (kmem_cache).
|
||||
* The setup_list3s is taken care
|
||||
* of by the caller of __kmem_cache_create
|
||||
*/
|
||||
cachep->array[smp_processor_id()] = &initarray_generic.cache;
|
||||
slab_state = PARTIAL;
|
||||
} else if (slab_state == PARTIAL) {
|
||||
/*
|
||||
* Note: the second kmem_cache_create must create the cache
|
||||
* that's used by kmalloc(24), otherwise the creation of
|
||||
* further caches will BUG().
|
||||
*/
|
||||
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
|
||||
/*
|
||||
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
|
||||
* the first cache, then we need to set up all its list3s,
|
||||
* the second cache, then we need to set up all its list3s,
|
||||
* otherwise the creation of further caches will BUG().
|
||||
*/
|
||||
set_up_list3s(cachep, SIZE_AC);
|
||||
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
else
|
||||
slab_state = PARTIAL_ARRAYCACHE;
|
||||
} else {
|
||||
/* Remaining boot caches */
|
||||
cachep->array[smp_processor_id()] =
|
||||
kmalloc(sizeof(struct arraycache_init), gfp);
|
||||
|
||||
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
|
||||
/**
|
||||
* __kmem_cache_create - Create a cache.
|
||||
* @name: A string which is used in /proc/slabinfo to identify this cache.
|
||||
* @size: The size of objects to be created in this cache.
|
||||
* @align: The required alignment for the objects.
|
||||
* @cachep: cache management descriptor
|
||||
* @flags: SLAB flags
|
||||
* @ctor: A constructor for the objects.
|
||||
*
|
||||
* Returns a ptr to the cache on success, NULL on failure.
|
||||
* Cannot be called within a int, but can be interrupted.
|
||||
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
|
||||
if (flags & SLAB_DESTROY_BY_RCU)
|
||||
BUG_ON(flags & SLAB_POISON);
|
||||
#endif
|
||||
/*
|
||||
* Always checks flags, a caller might be expecting debug support which
|
||||
* isn't available.
|
||||
*/
|
||||
BUG_ON(flags & ~CREATE_MASK);
|
||||
|
||||
/*
|
||||
* Check that size is in terms of words. This is needed to avoid
|
||||
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
|
||||
size &= ~(BYTES_PER_WORD - 1);
|
||||
}
|
||||
|
||||
/* calculate the final buffer alignment: */
|
||||
|
||||
/* 1) arch recommendation: can be overridden for debug */
|
||||
if (flags & SLAB_HWCACHE_ALIGN) {
|
||||
/*
|
||||
* Default alignment: as specified by the arch code. Except if
|
||||
* an object is really small, then squeeze multiple objects into
|
||||
* one cacheline.
|
||||
*/
|
||||
ralign = cache_line_size();
|
||||
while (size <= ralign / 2)
|
||||
ralign /= 2;
|
||||
} else {
|
||||
ralign = BYTES_PER_WORD;
|
||||
}
|
||||
|
||||
/*
|
||||
* Redzoning and user store require word alignment or possibly larger.
|
||||
* Note this will be overridden by architecture or caller mandated
|
||||
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
|
||||
size &= ~(REDZONE_ALIGN - 1);
|
||||
}
|
||||
|
||||
/* 2) arch mandated alignment */
|
||||
if (ralign < ARCH_SLAB_MINALIGN) {
|
||||
ralign = ARCH_SLAB_MINALIGN;
|
||||
}
|
||||
/* 3) caller mandated alignment */
|
||||
if (ralign < cachep->align) {
|
||||
ralign = cachep->align;
|
||||
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
|
||||
else
|
||||
gfp = GFP_NOWAIT;
|
||||
|
||||
cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
|
||||
setup_nodelists_pointer(cachep);
|
||||
#if DEBUG
|
||||
|
||||
/*
|
||||
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
|
||||
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
|
||||
|
||||
slab_set_debugobj_lock_classes(cachep);
|
||||
}
|
||||
} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
|
||||
on_slab_lock_classes(cachep);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
|
||||
if (slab_should_failslab(cachep, flags))
|
||||
return NULL;
|
||||
|
||||
cachep = memcg_kmem_get_cache(cachep, flags);
|
||||
|
||||
cache_alloc_debugcheck_before(cachep, flags);
|
||||
local_irq_save(save_flags);
|
||||
|
||||
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
|
||||
if (slab_should_failslab(cachep, flags))
|
||||
return NULL;
|
||||
|
||||
cachep = memcg_kmem_get_cache(cachep, flags);
|
||||
|
||||
cache_alloc_debugcheck_before(cachep, flags);
|
||||
local_irq_save(save_flags);
|
||||
objp = __do_cache_alloc(cachep, flags);
|
||||
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
|
||||
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
|
||||
{
|
||||
unsigned long flags;
|
||||
cachep = cache_from_obj(cachep, objp);
|
||||
if (!cachep)
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
debug_check_no_locks_freed(objp, cachep->object_size);
|
||||
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
|
||||
}
|
||||
EXPORT_SYMBOL(kfree);
|
||||
|
||||
unsigned int kmem_cache_size(struct kmem_cache *cachep)
|
||||
{
|
||||
return cachep->object_size;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_size);
|
||||
|
||||
/*
|
||||
* This initializes kmem_list3 or resizes various caches for all nodes.
|
||||
*/
|
||||
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
|
||||
}
|
||||
|
||||
/* Always called with the slab_mutex held */
|
||||
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
|
||||
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
|
||||
int batchcount, int shared, gfp_t gfp)
|
||||
{
|
||||
struct ccupdate_struct *new;
|
||||
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
|
||||
return alloc_kmemlist(cachep, gfp);
|
||||
}
|
||||
|
||||
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
|
||||
int batchcount, int shared, gfp_t gfp)
|
||||
{
|
||||
int ret;
|
||||
struct kmem_cache *c = NULL;
|
||||
int i = 0;
|
||||
|
||||
ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
|
||||
|
||||
if (slab_state < FULL)
|
||||
return ret;
|
||||
|
||||
if ((ret < 0) || !is_root_cache(cachep))
|
||||
return ret;
|
||||
|
||||
VM_BUG_ON(!mutex_is_locked(&slab_mutex));
|
||||
for_each_memcg_cache_index(i) {
|
||||
c = cache_from_memcg(cachep, i);
|
||||
if (c)
|
||||
/* return value determined by the parent cache only */
|
||||
__do_tune_cpucache(c, limit, batchcount, shared, gfp);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Called with slab_mutex held always */
|
||||
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
{
|
||||
int err;
|
||||
int limit, shared;
|
||||
int limit = 0;
|
||||
int shared = 0;
|
||||
int batchcount = 0;
|
||||
|
||||
if (!is_root_cache(cachep)) {
|
||||
struct kmem_cache *root = memcg_root_cache(cachep);
|
||||
limit = root->limit;
|
||||
shared = root->shared;
|
||||
batchcount = root->batchcount;
|
||||
}
|
||||
|
||||
if (limit && shared && batchcount)
|
||||
goto skip_setup;
|
||||
/*
|
||||
* The head array serves three purposes:
|
||||
* - create a LIFO ordering, i.e. return objects that are cache-warm
|
||||
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
if (limit > 32)
|
||||
limit = 32;
|
||||
#endif
|
||||
err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
|
||||
batchcount = (limit + 1) / 2;
|
||||
skip_setup:
|
||||
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
|
||||
if (err)
|
||||
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
|
||||
cachep->name, -err);
|
||||
@@ -4276,54 +4275,8 @@ out:
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SLABINFO
|
||||
|
||||
static void print_slabinfo_header(struct seq_file *m)
|
||||
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
|
||||
{
|
||||
/*
|
||||
* Output format version, so at least we can change it
|
||||
* without _too_ many complaints.
|
||||
*/
|
||||
#if STATS
|
||||
seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
|
||||
#else
|
||||
seq_puts(m, "slabinfo - version: 2.1\n");
|
||||
#endif
|
||||
seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
|
||||
"<objperslab> <pagesperslab>");
|
||||
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
|
||||
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
|
||||
#if STATS
|
||||
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
|
||||
"<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
|
||||
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
|
||||
#endif
|
||||
seq_putc(m, '\n');
|
||||
}
|
||||
|
||||
static void *s_start(struct seq_file *m, loff_t *pos)
|
||||
{
|
||||
loff_t n = *pos;
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
if (!n)
|
||||
print_slabinfo_header(m);
|
||||
|
||||
return seq_list_start(&slab_caches, *pos);
|
||||
}
|
||||
|
||||
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
return seq_list_next(p, &slab_caches, pos);
|
||||
}
|
||||
|
||||
static void s_stop(struct seq_file *m, void *p)
|
||||
{
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
|
||||
static int s_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
|
||||
struct slab *slabp;
|
||||
unsigned long active_objs;
|
||||
unsigned long num_objs;
|
||||
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
|
||||
if (error)
|
||||
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
|
||||
|
||||
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
|
||||
name, active_objs, num_objs, cachep->size,
|
||||
cachep->num, (1 << cachep->gfporder));
|
||||
seq_printf(m, " : tunables %4u %4u %4u",
|
||||
cachep->limit, cachep->batchcount, cachep->shared);
|
||||
seq_printf(m, " : slabdata %6lu %6lu %6lu",
|
||||
active_slabs, num_slabs, shared_avail);
|
||||
sinfo->active_objs = active_objs;
|
||||
sinfo->num_objs = num_objs;
|
||||
sinfo->active_slabs = active_slabs;
|
||||
sinfo->num_slabs = num_slabs;
|
||||
sinfo->shared_avail = shared_avail;
|
||||
sinfo->limit = cachep->limit;
|
||||
sinfo->batchcount = cachep->batchcount;
|
||||
sinfo->shared = cachep->shared;
|
||||
sinfo->objects_per_slab = cachep->num;
|
||||
sinfo->cache_order = cachep->gfporder;
|
||||
}
|
||||
|
||||
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
|
||||
{
|
||||
#if STATS
|
||||
{ /* list3 stats */
|
||||
unsigned long high = cachep->high_mark;
|
||||
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
|
||||
allochit, allocmiss, freehit, freemiss);
|
||||
}
|
||||
#endif
|
||||
seq_putc(m, '\n');
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* slabinfo_op - iterator that generates /proc/slabinfo
|
||||
*
|
||||
* Output layout:
|
||||
* cache-name
|
||||
* num-active-objs
|
||||
* total-objs
|
||||
* object size
|
||||
* num-active-slabs
|
||||
* total-slabs
|
||||
* num-pages-per-slab
|
||||
* + further values on SMP and with statistics enabled
|
||||
*/
|
||||
|
||||
static const struct seq_operations slabinfo_op = {
|
||||
.start = s_start,
|
||||
.next = s_next,
|
||||
.stop = s_stop,
|
||||
.show = s_show,
|
||||
};
|
||||
|
||||
#define MAX_SLABINFO_WRITE 128
|
||||
/**
|
||||
* slabinfo_write - Tuning for the slab allocator
|
||||
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
|
||||
* @count: data length
|
||||
* @ppos: unused
|
||||
*/
|
||||
static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
|
||||
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
|
||||
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
|
||||
return res;
|
||||
}
|
||||
|
||||
static int slabinfo_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &slabinfo_op);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_slabinfo_operations = {
|
||||
.open = slabinfo_open,
|
||||
.read = seq_read,
|
||||
.write = slabinfo_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEBUG_SLAB_LEAK
|
||||
|
||||
static void *leaks_start(struct seq_file *m, loff_t *pos)
|
||||
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
return seq_list_next(p, &slab_caches, pos);
|
||||
}
|
||||
|
||||
static void s_stop(struct seq_file *m, void *p)
|
||||
{
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
|
||||
static const struct seq_operations slabstats_op = {
|
||||
.start = leaks_start,
|
||||
.next = s_next,
|
||||
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
|
||||
|
||||
static int __init slab_proc_init(void)
|
||||
{
|
||||
proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
|
||||
#ifdef CONFIG_DEBUG_SLAB_LEAK
|
||||
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
|
||||
#endif
|
||||
|
190
mm/slab.h
@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
|
||||
/* The slab cache that manages slab cache information */
|
||||
extern struct kmem_cache *kmem_cache;
|
||||
|
||||
unsigned long calculate_alignment(unsigned long flags,
|
||||
unsigned long align, unsigned long size);
|
||||
|
||||
/* Functions provided by the slab allocators */
|
||||
extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
|
||||
|
||||
extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
|
||||
unsigned long flags);
|
||||
extern void create_boot_cache(struct kmem_cache *, const char *name,
|
||||
size_t size, unsigned long flags);
|
||||
|
||||
struct mem_cgroup;
|
||||
#ifdef CONFIG_SLUB
|
||||
struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *));
|
||||
struct kmem_cache *
|
||||
__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *));
|
||||
#else
|
||||
static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *))
|
||||
static inline struct kmem_cache *
|
||||
__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *))
|
||||
{ return NULL; }
|
||||
#endif
|
||||
|
||||
|
||||
/* Legal flag mask for kmem_cache_create(), for various configurations */
|
||||
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
|
||||
SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
|
||||
|
||||
#if defined(CONFIG_DEBUG_SLAB)
|
||||
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
|
||||
#elif defined(CONFIG_SLUB_DEBUG)
|
||||
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
|
||||
SLAB_TRACE | SLAB_DEBUG_FREE)
|
||||
#else
|
||||
#define SLAB_DEBUG_FLAGS (0)
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_SLAB)
|
||||
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
|
||||
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
|
||||
#elif defined(CONFIG_SLUB)
|
||||
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
|
||||
SLAB_TEMPORARY | SLAB_NOTRACK)
|
||||
#else
|
||||
#define SLAB_CACHE_FLAGS (0)
|
||||
#endif
|
||||
|
||||
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
|
||||
|
||||
int __kmem_cache_shutdown(struct kmem_cache *);
|
||||
|
||||
struct seq_file;
|
||||
struct file;
|
||||
|
||||
struct slabinfo {
|
||||
unsigned long active_objs;
|
||||
unsigned long num_objs;
|
||||
unsigned long active_slabs;
|
||||
unsigned long num_slabs;
|
||||
unsigned long shared_avail;
|
||||
unsigned int limit;
|
||||
unsigned int batchcount;
|
||||
unsigned int shared;
|
||||
unsigned int objects_per_slab;
|
||||
unsigned int cache_order;
|
||||
};
|
||||
|
||||
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
|
||||
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
|
||||
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
|
||||
size_t count, loff_t *ppos);
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
static inline bool is_root_cache(struct kmem_cache *s)
|
||||
{
|
||||
return !s->memcg_params || s->memcg_params->is_root_cache;
|
||||
}
|
||||
|
||||
static inline bool cache_match_memcg(struct kmem_cache *cachep,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
return (is_root_cache(cachep) && !memcg) ||
|
||||
(cachep->memcg_params->memcg == memcg);
|
||||
}
|
||||
|
||||
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
|
||||
{
|
||||
if (!is_root_cache(s))
|
||||
atomic_add(1 << order, &s->memcg_params->nr_pages);
|
||||
}
|
||||
|
||||
static inline void memcg_release_pages(struct kmem_cache *s, int order)
|
||||
{
|
||||
if (is_root_cache(s))
|
||||
return;
|
||||
|
||||
if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
|
||||
mem_cgroup_destroy_cache(s);
|
||||
}
|
||||
|
||||
static inline bool slab_equal_or_root(struct kmem_cache *s,
|
||||
struct kmem_cache *p)
|
||||
{
|
||||
return (p == s) ||
|
||||
(s->memcg_params && (p == s->memcg_params->root_cache));
|
||||
}
|
||||
|
||||
/*
|
||||
* We use suffixes to the name in memcg because we can't have caches
|
||||
* created in the system with the same name. But when we print them
|
||||
* locally, better refer to them with the base name
|
||||
*/
|
||||
static inline const char *cache_name(struct kmem_cache *s)
|
||||
{
|
||||
if (!is_root_cache(s))
|
||||
return s->memcg_params->root_cache->name;
|
||||
return s->name;
|
||||
}
|
||||
|
||||
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
|
||||
{
|
||||
return s->memcg_params->memcg_caches[idx];
|
||||
}
|
||||
|
||||
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
|
||||
{
|
||||
if (is_root_cache(s))
|
||||
return s;
|
||||
return s->memcg_params->root_cache;
|
||||
}
|
||||
#else
|
||||
static inline bool is_root_cache(struct kmem_cache *s)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool cache_match_memcg(struct kmem_cache *cachep,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void memcg_release_pages(struct kmem_cache *s, int order)
|
||||
{
|
||||
}
|
||||
|
||||
static inline bool slab_equal_or_root(struct kmem_cache *s,
|
||||
struct kmem_cache *p)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline const char *cache_name(struct kmem_cache *s)
|
||||
{
|
||||
return s->name;
|
||||
}
|
||||
|
||||
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
|
||||
{
|
||||
return s;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
|
||||
{
|
||||
struct kmem_cache *cachep;
|
||||
struct page *page;
|
||||
|
||||
/*
* When kmemcg is not being used, both assignments should return the
* same value, but we don't want to pay the assignment price in that
* case. If it is not compiled in, the compiler should be smart enough
* to not do even the assignment. In that case, slab_equal_or_root
* will also be a constant.
*/
|
||||
if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
|
||||
return s;
|
||||
|
||||
page = virt_to_head_page(x);
|
||||
cachep = page->slab_cache;
|
||||
if (slab_equal_or_root(cachep, s))
|
||||
return cachep;
|
||||
|
||||
pr_err("%s: Wrong slab cache. %s but object is from %s\n",
|
||||
__FUNCTION__, cachep->name, s->name);
|
||||
WARN_ON_ONCE(1);
|
||||
return s;
|
||||
}
|
||||
#endif
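cache_from_obj() above trusts the object rather than the caller: it recovers the owning cache from the object's page and only warns on a mismatch. A hedged userspace analogue of that check, using a per-object header where the kernel uses page->slab_cache (all names here are invented):

#include <stdio.h>
#include <stdlib.h>

struct toy_cache {
        const char *name;
        size_t object_size;
};

struct obj_header {                     /* stands in for page->slab_cache */
        struct toy_cache *cache;
};

static void *toy_alloc(struct toy_cache *c)
{
        struct obj_header *h = malloc(sizeof(*h) + c->object_size);

        if (!h)
                return NULL;
        h->cache = c;
        return h + 1;
}

/* Mirror of cache_from_obj(): prefer the cache recorded in the object. */
static struct toy_cache *toy_cache_from_obj(struct toy_cache *claimed, void *x)
{
        struct obj_header *h = (struct obj_header *)x - 1;

        if (h->cache == claimed)
                return h->cache;

        fprintf(stderr, "%s: wrong cache. %s but object is from %s\n",
                __func__, claimed->name, h->cache->name);
        return claimed;                 /* warn and fall back, as the kernel does */
}

int main(void)
{
        struct toy_cache a = { "cache-a", 32 }, b = { "cache-b", 64 };
        void *obj = toy_alloc(&a);

        if (!obj)
                return 1;
        toy_cache_from_obj(&a, obj);    /* silent: caller was right */
        toy_cache_from_obj(&b, obj);    /* prints a warning: mismatch */
        free((struct obj_header *)obj - 1);
        return 0;
}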
|
||||
|
292
mm/slab_common.c
@@ -13,9 +13,12 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/page.h>
|
||||
#include <linux/memcontrol.h>
|
||||
|
||||
#include "slab.h"
|
||||
|
||||
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
|
||||
struct kmem_cache *kmem_cache;
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
static int kmem_cache_sanity_check(const char *name, size_t size)
|
||||
static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
|
||||
size_t size)
|
||||
{
|
||||
struct kmem_cache *s = NULL;
|
||||
|
||||
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp(s->name, name)) {
|
||||
/*
|
||||
* For simplicity, we won't check this in the list of memcg
|
||||
* caches. We have control over memcg naming, and if there
|
||||
* aren't duplicates in the global list, there won't be any
|
||||
* duplicates in the memcg lists as well.
|
||||
*/
|
||||
if (!memcg && !strcmp(s->name, name)) {
|
||||
pr_err("%s (%s): Cache name already exists.\n",
|
||||
__func__, name);
|
||||
dump_stack();
|
||||
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline int kmem_cache_sanity_check(const char *name, size_t size)
|
||||
static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
|
||||
const char *name, size_t size)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
int memcg_update_all_caches(int num_memcgs)
|
||||
{
|
||||
struct kmem_cache *s;
|
||||
int ret = 0;
|
||||
mutex_lock(&slab_mutex);
|
||||
|
||||
list_for_each_entry(s, &slab_caches, list) {
|
||||
if (!is_root_cache(s))
|
||||
continue;
|
||||
|
||||
ret = memcg_update_cache_size(s, num_memcgs);
|
||||
/*
|
||||
* See comment in memcontrol.c, memcg_update_cache_size:
|
||||
* Instead of freeing the memory, we'll just leave the caches
|
||||
* up to this point in an updated state.
|
||||
*/
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcg_update_array_size(num_memcgs);
|
||||
out:
|
||||
mutex_unlock(&slab_mutex);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Figure out what the alignment of the objects will be given a set of
|
||||
* flags, a user specified alignment and the size of the objects.
|
||||
*/
|
||||
unsigned long calculate_alignment(unsigned long flags,
|
||||
unsigned long align, unsigned long size)
|
||||
{
|
||||
/*
|
||||
* If the user wants hardware cache aligned objects then follow that
|
||||
* suggestion if the object is sufficiently large.
|
||||
*
|
||||
* The hardware cache alignment cannot override the specified
|
||||
* alignment though. If that is greater then use it.
|
||||
*/
|
||||
if (flags & SLAB_HWCACHE_ALIGN) {
|
||||
unsigned long ralign = cache_line_size();
|
||||
while (size <= ralign / 2)
|
||||
ralign /= 2;
|
||||
align = max(align, ralign);
|
||||
}
|
||||
|
||||
if (align < ARCH_SLAB_MINALIGN)
|
||||
align = ARCH_SLAB_MINALIGN;
|
||||
|
||||
return ALIGN(align, sizeof(void *));
|
||||
}
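A standalone sketch of the alignment rule implemented by calculate_alignment() above, with assumed constants standing in for cache_line_size() and ARCH_SLAB_MINALIGN so it can run in userspace:

#include <stdio.h>

#define DEMO_CACHE_LINE     64UL        /* assumed cache_line_size() */
#define DEMO_SLAB_MINALIGN   8UL        /* assumed ARCH_SLAB_MINALIGN */
#define DEMO_HWCACHE_ALIGN   0x1UL
#define DEMO_ALIGN(x, a)    (((x) + (a) - 1) & ~((a) - 1))

static unsigned long demo_calculate_alignment(unsigned long flags,
                                              unsigned long align,
                                              unsigned long size)
{
        if (flags & DEMO_HWCACHE_ALIGN) {
                unsigned long ralign = DEMO_CACHE_LINE;

                /* Small objects only get as much alignment as they need. */
                while (size <= ralign / 2)
                        ralign /= 2;
                align = align > ralign ? align : ralign;
        }

        if (align < DEMO_SLAB_MINALIGN)
                align = DEMO_SLAB_MINALIGN;

        return DEMO_ALIGN(align, sizeof(void *));
}

int main(void)
{
        /* 24-byte objects don't force a full 64-byte cache-line alignment... */
        printf("size 24,  HWCACHE_ALIGN: %lu\n",
               demo_calculate_alignment(DEMO_HWCACHE_ALIGN, 0, 24));
        /* ...while 200-byte objects do. */
        printf("size 200, HWCACHE_ALIGN: %lu\n",
               demo_calculate_alignment(DEMO_HWCACHE_ALIGN, 0, 200));
        return 0;
}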
|
||||
|
||||
|
||||
/*
|
||||
* kmem_cache_create - Create a cache.
|
||||
* @name: A string which is used in /proc/slabinfo to identify this cache.
|
||||
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
|
||||
* as davem.
|
||||
*/
|
||||
|
||||
struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
|
||||
unsigned long flags, void (*ctor)(void *))
|
||||
struct kmem_cache *
|
||||
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *),
|
||||
struct kmem_cache *parent_cache)
|
||||
{
|
||||
struct kmem_cache *s = NULL;
|
||||
int err = 0;
|
||||
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
|
||||
get_online_cpus();
|
||||
mutex_lock(&slab_mutex);
|
||||
|
||||
if (!kmem_cache_sanity_check(name, size) == 0)
|
||||
if (!kmem_cache_sanity_check(memcg, name, size) == 0)
|
||||
goto out_locked;
|
||||
|
||||
/*
|
||||
* Some allocators will constraint the set of valid flags to a subset
|
||||
* of all flags. We expect them to define CACHE_CREATE_MASK in this
|
||||
* case, and we'll just provide them with a sanitized version of the
|
||||
* passed flags.
|
||||
*/
|
||||
flags &= CACHE_CREATE_MASK;
|
||||
|
||||
s = __kmem_cache_alias(name, size, align, flags, ctor);
|
||||
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
|
||||
if (s)
|
||||
goto out_locked;
|
||||
|
||||
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
|
||||
if (s) {
|
||||
s->object_size = s->size = size;
|
||||
s->align = align;
|
||||
s->align = calculate_alignment(flags, align, size);
|
||||
s->ctor = ctor;
|
||||
|
||||
if (memcg_register_cache(memcg, s, parent_cache)) {
|
||||
kmem_cache_free(kmem_cache, s);
|
||||
err = -ENOMEM;
|
||||
goto out_locked;
|
||||
}
|
||||
|
||||
s->name = kstrdup(name, GFP_KERNEL);
|
||||
if (!s->name) {
|
||||
kmem_cache_free(kmem_cache, s);
|
||||
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
|
||||
|
||||
err = __kmem_cache_create(s, flags);
|
||||
if (!err) {
|
||||
|
||||
s->refcount = 1;
|
||||
list_add(&s->list, &slab_caches);
|
||||
|
||||
memcg_cache_list_add(memcg, s);
|
||||
} else {
|
||||
kfree(s->name);
|
||||
kmem_cache_free(kmem_cache, s);
|
||||
@@ -157,10 +239,20 @@ out_locked:
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
struct kmem_cache *
|
||||
kmem_cache_create(const char *name, size_t size, size_t align,
|
||||
unsigned long flags, void (*ctor)(void *))
|
||||
{
|
||||
return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_create);
|
||||
|
||||
void kmem_cache_destroy(struct kmem_cache *s)
|
||||
{
|
||||
/* Destroy all the children caches if we aren't a memcg cache */
|
||||
kmem_cache_destroy_memcg_children(s);
|
||||
|
||||
get_online_cpus();
|
||||
mutex_lock(&slab_mutex);
|
||||
s->refcount--;
|
||||
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
|
||||
if (s->flags & SLAB_DESTROY_BY_RCU)
|
||||
rcu_barrier();
|
||||
|
||||
memcg_release_cache(s);
|
||||
kfree(s->name);
|
||||
kmem_cache_free(kmem_cache, s);
|
||||
} else {
|
||||
@@ -192,3 +285,182 @@ int slab_is_available(void)
|
||||
{
|
||||
return slab_state >= UP;
|
||||
}
|
||||
|
||||
#ifndef CONFIG_SLOB
|
||||
/* Create a cache during boot when no slab services are available yet */
|
||||
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
|
||||
unsigned long flags)
|
||||
{
|
||||
int err;
|
||||
|
||||
s->name = name;
|
||||
s->size = s->object_size = size;
|
||||
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
|
||||
err = __kmem_cache_create(s, flags);
|
||||
|
||||
if (err)
|
||||
panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
|
||||
name, size, err);
|
||||
|
||||
s->refcount = -1; /* Exempt from merging for now */
|
||||
}
|
||||
|
||||
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
|
||||
if (!s)
|
||||
panic("Out of memory when creating slab %s\n", name);
|
||||
|
||||
create_boot_cache(s, name, size, flags);
|
||||
list_add(&s->list, &slab_caches);
|
||||
s->refcount = 1;
|
||||
return s;
|
||||
}
|
||||
|
||||
#endif /* !CONFIG_SLOB */
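create_boot_cache() above initializes a descriptor the caller already owns (static storage during early boot), while create_kmalloc_cache() allocates the descriptor first and then delegates. A rough userspace sketch of that two-phase split, with a toy descriptor in place of struct kmem_cache:

#include <stdio.h>
#include <stdlib.h>

struct toy_cache {
        const char *name;
        size_t size;
};

/* Phase 1: fill in caller-provided storage (works before any allocator). */
static void init_boot_cache(struct toy_cache *c, const char *name, size_t size)
{
        c->name = name;
        c->size = size;
}

/* Phase 2: once allocation works, allocate the descriptor and delegate. */
static struct toy_cache *create_cache(const char *name, size_t size)
{
        struct toy_cache *c = malloc(sizeof(*c));

        if (!c) {
                fprintf(stderr, "out of memory creating %s\n", name);
                exit(1);                /* the kernel version panics here */
        }
        init_boot_cache(c, name, size);
        return c;
}

int main(void)
{
        static struct toy_cache boot_cache;     /* "static __initdata" analogue */
        struct toy_cache *later;

        init_boot_cache(&boot_cache, "boot", 64);
        later = create_cache("kmalloc-128", 128);

        printf("%s:%zu  %s:%zu\n", boot_cache.name, boot_cache.size,
               later->name, later->size);
        free(later);
        return 0;
}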
|
||||
|
||||
|
||||
#ifdef CONFIG_SLABINFO
|
||||
void print_slabinfo_header(struct seq_file *m)
|
||||
{
|
||||
/*
|
||||
* Output format version, so at least we can change it
|
||||
* without _too_ many complaints.
|
||||
*/
|
||||
#ifdef CONFIG_DEBUG_SLAB
|
||||
seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
|
||||
#else
|
||||
seq_puts(m, "slabinfo - version: 2.1\n");
|
||||
#endif
|
||||
seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
|
||||
"<objperslab> <pagesperslab>");
|
||||
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
|
||||
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
|
||||
#ifdef CONFIG_DEBUG_SLAB
|
||||
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
|
||||
"<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
|
||||
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
|
||||
#endif
|
||||
seq_putc(m, '\n');
|
||||
}
|
||||
|
||||
static void *s_start(struct seq_file *m, loff_t *pos)
|
||||
{
|
||||
loff_t n = *pos;
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
if (!n)
|
||||
print_slabinfo_header(m);
|
||||
|
||||
return seq_list_start(&slab_caches, *pos);
|
||||
}
|
||||
|
||||
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
return seq_list_next(p, &slab_caches, pos);
|
||||
}
|
||||
|
||||
static void s_stop(struct seq_file *m, void *p)
|
||||
{
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
|
||||
static void
|
||||
memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
|
||||
{
|
||||
struct kmem_cache *c;
|
||||
struct slabinfo sinfo;
|
||||
int i;
|
||||
|
||||
if (!is_root_cache(s))
|
||||
return;
|
||||
|
||||
for_each_memcg_cache_index(i) {
|
||||
c = cache_from_memcg(s, i);
|
||||
if (!c)
|
||||
continue;
|
||||
|
||||
memset(&sinfo, 0, sizeof(sinfo));
|
||||
get_slabinfo(c, &sinfo);
|
||||
|
||||
info->active_slabs += sinfo.active_slabs;
|
||||
info->num_slabs += sinfo.num_slabs;
|
||||
info->shared_avail += sinfo.shared_avail;
|
||||
info->active_objs += sinfo.active_objs;
|
||||
info->num_objs += sinfo.num_objs;
|
||||
}
|
||||
}
|
||||
|
||||
int cache_show(struct kmem_cache *s, struct seq_file *m)
|
||||
{
|
||||
struct slabinfo sinfo;
|
||||
|
||||
memset(&sinfo, 0, sizeof(sinfo));
|
||||
get_slabinfo(s, &sinfo);
|
||||
|
||||
memcg_accumulate_slabinfo(s, &sinfo);
|
||||
|
||||
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
|
||||
cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
|
||||
sinfo.objects_per_slab, (1 << sinfo.cache_order));
|
||||
|
||||
seq_printf(m, " : tunables %4u %4u %4u",
|
||||
sinfo.limit, sinfo.batchcount, sinfo.shared);
|
||||
seq_printf(m, " : slabdata %6lu %6lu %6lu",
|
||||
sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
|
||||
slabinfo_show_stats(m, s);
|
||||
seq_putc(m, '\n');
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int s_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
|
||||
|
||||
if (!is_root_cache(s))
|
||||
return 0;
|
||||
return cache_show(s, m);
|
||||
}
|
||||
|
||||
/*
|
||||
* slabinfo_op - iterator that generates /proc/slabinfo
|
||||
*
|
||||
* Output layout:
|
||||
* cache-name
|
||||
* num-active-objs
|
||||
* total-objs
|
||||
* object size
|
||||
* num-active-slabs
|
||||
* total-slabs
|
||||
* num-pages-per-slab
|
||||
* + further values on SMP and with statistics enabled
|
||||
*/
|
||||
static const struct seq_operations slabinfo_op = {
|
||||
.start = s_start,
|
||||
.next = s_next,
|
||||
.stop = s_stop,
|
||||
.show = s_show,
|
||||
};
|
||||
|
||||
static int slabinfo_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &slabinfo_op);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_slabinfo_operations = {
|
||||
.open = slabinfo_open,
|
||||
.read = seq_read,
|
||||
.write = slabinfo_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static int __init slab_proc_init(void)
|
||||
{
|
||||
proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
|
||||
return 0;
|
||||
}
|
||||
module_init(slab_proc_init);
|
||||
#endif /* CONFIG_SLABINFO */
|
||||
|
48
mm/slob.c
@@ -28,9 +28,8 @@
|
||||
* from kmalloc are prepended with a 4-byte header with the kmalloc size.
|
||||
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
|
||||
* alloc_pages() directly, allocating compound pages so the page order
|
||||
* does not have to be separately tracked, and also stores the exact
|
||||
* allocation size in page->private so that it can be used to accurately
|
||||
* provide ksize(). These objects are detected in kfree() because slob_page()
|
||||
* does not have to be separately tracked.
|
||||
* These objects are detected in kfree() because PageSlab()
|
||||
* is false for them.
|
||||
*
|
||||
* SLAB is emulated on top of SLOB by simply calling constructors and
|
||||
@@ -59,7 +58,6 @@
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include "slab.h"
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h> /* struct reclaim_state */
|
||||
@@ -74,6 +72,7 @@
|
||||
|
||||
#include <linux/atomic.h>
|
||||
|
||||
#include "slab.h"
|
||||
/*
|
||||
* slob_block has a field 'units', which indicates size of block if +ve,
|
||||
* or offset of next block if -ve (in SLOB_UNITs).
|
||||
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
|
||||
|
||||
#define SLOB_UNIT sizeof(slob_t)
|
||||
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
|
||||
#define SLOB_ALIGN L1_CACHE_BYTES
|
||||
|
||||
/*
|
||||
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
|
||||
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
|
||||
if (likely(order))
|
||||
gfp |= __GFP_COMP;
|
||||
ret = slob_new_pages(gfp, order, node);
|
||||
if (ret) {
|
||||
struct page *page;
|
||||
page = virt_to_page(ret);
|
||||
page->private = size;
|
||||
}
|
||||
|
||||
trace_kmalloc_node(caller, ret,
|
||||
size, PAGE_SIZE << order, gfp, node);
|
||||
@@ -506,7 +499,7 @@ void kfree(const void *block)
|
||||
unsigned int *m = (unsigned int *)(block - align);
|
||||
slob_free(m, *m + align);
|
||||
} else
|
||||
put_page(sp);
|
||||
__free_pages(sp, compound_order(sp));
|
||||
}
|
||||
EXPORT_SYMBOL(kfree);
|
||||
|
||||
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
|
||||
size_t ksize(const void *block)
|
||||
{
|
||||
struct page *sp;
|
||||
int align;
|
||||
unsigned int *m;
|
||||
|
||||
BUG_ON(!block);
|
||||
if (unlikely(block == ZERO_SIZE_PTR))
|
||||
return 0;
|
||||
|
||||
sp = virt_to_page(block);
|
||||
if (PageSlab(sp)) {
|
||||
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
|
||||
unsigned int *m = (unsigned int *)(block - align);
|
||||
return SLOB_UNITS(*m) * SLOB_UNIT;
|
||||
} else
|
||||
return sp->private;
|
||||
if (unlikely(!PageSlab(sp)))
|
||||
return PAGE_SIZE << compound_order(sp);
|
||||
|
||||
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
|
||||
m = (unsigned int *)(block - align);
|
||||
return SLOB_UNITS(*m) * SLOB_UNIT;
|
||||
}
|
||||
EXPORT_SYMBOL(ksize);
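SLOB's kfree() and ksize() above recover the allocation size either from the compound page (large allocations) or from a small header written just before the returned pointer. A minimal userspace sketch of the header trick alone, assuming a fixed 8-byte prefix in place of ARCH_KMALLOC_MINALIGN:

#include <stdio.h>
#include <stdlib.h>

#define DEMO_MINALIGN 8u                /* stand-in for ARCH_KMALLOC_MINALIGN */

/* Allocate with the requested size stashed in a header before the object. */
static void *demo_kmalloc(unsigned int size)
{
        unsigned char *base = malloc(DEMO_MINALIGN + size);

        if (!base)
                return NULL;
        *(unsigned int *)base = size;   /* the 4-byte size header */
        return base + DEMO_MINALIGN;
}

/* ksize() analogue: read the header back. */
static unsigned int demo_ksize(const void *block)
{
        const unsigned char *base = (const unsigned char *)block - DEMO_MINALIGN;

        return *(const unsigned int *)base;
}

static void demo_kfree(void *block)
{
        free((unsigned char *)block - DEMO_MINALIGN);
}

int main(void)
{
        void *p = demo_kmalloc(100);

        if (!p)
                return 1;
        printf("ksize = %u\n", demo_ksize(p));  /* prints 100 */
        demo_kfree(p);
        return 0;
}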
|
||||
|
||||
int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
|
||||
{
|
||||
size_t align = c->size;
|
||||
|
||||
if (flags & SLAB_DESTROY_BY_RCU) {
|
||||
/* leave room for rcu footer at the end of object */
|
||||
c->size += sizeof(struct slob_rcu);
|
||||
}
|
||||
c->flags = flags;
|
||||
/* ignore alignment unless it's forced */
|
||||
c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
|
||||
if (c->align < ARCH_SLAB_MINALIGN)
|
||||
c->align = ARCH_SLAB_MINALIGN;
|
||||
if (c->align < align)
|
||||
c->align = align;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
|
||||
|
||||
if (c->size < PAGE_SIZE) {
|
||||
b = slob_alloc(c->size, flags, c->align, node);
|
||||
trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
|
||||
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
|
||||
SLOB_UNITS(c->size) * SLOB_UNIT,
|
||||
flags, node);
|
||||
} else {
|
||||
b = slob_new_pages(flags, get_order(c->size), node);
|
||||
trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
|
||||
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
|
||||
PAGE_SIZE << get_order(c->size),
|
||||
flags, node);
|
||||
}
|
||||
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_free);
|
||||
|
||||
unsigned int kmem_cache_size(struct kmem_cache *c)
|
||||
{
|
||||
return c->size;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_size);
|
||||
|
||||
int __kmem_cache_shutdown(struct kmem_cache *c)
|
||||
{
|
||||
/* No way to check for remaining objects */
|
||||
|
453
mm/slub.c
@@ -31,6 +31,7 @@
|
||||
#include <linux/fault-inject.h>
|
||||
#include <linux/stacktrace.h>
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/memcontrol.h>
|
||||
|
||||
#include <trace/events/kmem.h>
|
||||
|
||||
@@ -112,9 +113,6 @@
|
||||
* the fast path and disables lockless freelists.
|
||||
*/
|
||||
|
||||
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
|
||||
SLAB_TRACE | SLAB_DEBUG_FREE)
|
||||
|
||||
static inline int kmem_cache_debug(struct kmem_cache *s)
|
||||
{
|
||||
#ifdef CONFIG_SLUB_DEBUG
|
||||
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
|
||||
#define __OBJECT_POISON 0x80000000UL /* Poison object */
|
||||
#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
|
||||
|
||||
static int kmem_size = sizeof(struct kmem_cache);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static struct notifier_block slab_notifier;
|
||||
#endif
|
||||
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
|
||||
static int sysfs_slab_add(struct kmem_cache *);
|
||||
static int sysfs_slab_alias(struct kmem_cache *, const char *);
|
||||
static void sysfs_slab_remove(struct kmem_cache *);
|
||||
|
||||
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
|
||||
#else
|
||||
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
|
||||
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
|
||||
{ return 0; }
|
||||
static inline void sysfs_slab_remove(struct kmem_cache *s) { }
|
||||
|
||||
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
|
||||
#endif
|
||||
|
||||
static inline void stat(const struct kmem_cache *s, enum stat_item si)
|
||||
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
|
||||
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
|
||||
goto out;
|
||||
|
||||
if (unlikely(s != page->slab)) {
|
||||
if (unlikely(s != page->slab_cache)) {
|
||||
if (!PageSlab(page)) {
|
||||
slab_err(s, page, "Attempt to free object(0x%p) "
|
||||
"outside of slab", object);
|
||||
} else if (!page->slab) {
|
||||
} else if (!page->slab_cache) {
|
||||
printk(KERN_ERR
|
||||
"SLUB <none>: no slab for object 0x%p.\n",
|
||||
object);
|
||||
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
|
||||
void *start;
|
||||
void *last;
|
||||
void *p;
|
||||
int order;
|
||||
|
||||
BUG_ON(flags & GFP_SLAB_BUG_MASK);
|
||||
|
||||
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
|
||||
if (!page)
|
||||
goto out;
|
||||
|
||||
order = compound_order(page);
|
||||
inc_slabs_node(s, page_to_nid(page), page->objects);
|
||||
page->slab = s;
|
||||
memcg_bind_pages(s, order);
|
||||
page->slab_cache = s;
|
||||
__SetPageSlab(page);
|
||||
if (page->pfmemalloc)
|
||||
SetPageSlabPfmemalloc(page);
|
||||
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
|
||||
start = page_address(page);
|
||||
|
||||
if (unlikely(s->flags & SLAB_POISON))
|
||||
memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
|
||||
memset(start, POISON_INUSE, PAGE_SIZE << order);
|
||||
|
||||
last = start;
|
||||
for_each_object(p, s, start, page->objects) {
|
||||
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
|
||||
|
||||
__ClearPageSlabPfmemalloc(page);
|
||||
__ClearPageSlab(page);
|
||||
|
||||
memcg_release_pages(s, order);
|
||||
reset_page_mapcount(page);
|
||||
if (current->reclaim_state)
|
||||
current->reclaim_state->reclaimed_slab += pages;
|
||||
__free_pages(page, order);
|
||||
__free_memcg_kmem_pages(page, order);
|
||||
}
|
||||
|
||||
#define need_reserve_slab_rcu \
|
||||
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
|
||||
else
|
||||
page = container_of((struct list_head *)h, struct page, lru);
|
||||
|
||||
__free_slab(page->slab, page);
|
||||
__free_slab(page->slab_cache, page);
|
||||
}
|
||||
|
||||
static void free_slab(struct kmem_cache *s, struct page *page)
|
||||
@@ -1872,12 +1874,14 @@ redo:
|
||||
/*
|
||||
* Unfreeze all the cpu partial slabs.
|
||||
*
|
||||
* This function must be called with interrupt disabled.
|
||||
* This function must be called with interrupts disabled
|
||||
* for the cpu using c (or some other guarantee must be there
|
||||
* to guarantee no concurrent accesses).
|
||||
*/
|
||||
static void unfreeze_partials(struct kmem_cache *s)
|
||||
static void unfreeze_partials(struct kmem_cache *s,
|
||||
struct kmem_cache_cpu *c)
|
||||
{
|
||||
struct kmem_cache_node *n = NULL, *n2 = NULL;
|
||||
struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
|
||||
struct page *page, *discard_page = NULL;
|
||||
|
||||
while ((page = c->partial)) {
|
||||
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
|
||||
* set to the per node partial list.
|
||||
*/
|
||||
local_irq_save(flags);
|
||||
unfreeze_partials(s);
|
||||
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
|
||||
local_irq_restore(flags);
|
||||
oldpage = NULL;
|
||||
pobjects = 0;
|
||||
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
|
||||
if (c->page)
|
||||
flush_slab(s, c);
|
||||
|
||||
unfreeze_partials(s);
|
||||
unfreeze_partials(s, c);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
|
||||
if (slab_pre_alloc_hook(s, gfpflags))
|
||||
return NULL;
|
||||
|
||||
s = memcg_kmem_get_cache(s, gfpflags);
|
||||
redo:
|
||||
|
||||
/*
|
||||
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
|
||||
void *prior;
|
||||
void **object = (void *)x;
|
||||
int was_frozen;
|
||||
int inuse;
|
||||
struct page new;
|
||||
unsigned long counters;
|
||||
struct kmem_cache_node *n = NULL;
|
||||
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
|
||||
return;
|
||||
|
||||
do {
|
||||
if (unlikely(n)) {
|
||||
spin_unlock_irqrestore(&n->list_lock, flags);
|
||||
n = NULL;
|
||||
}
|
||||
prior = page->freelist;
|
||||
counters = page->counters;
|
||||
set_freepointer(s, object, prior);
|
||||
new.counters = counters;
|
||||
was_frozen = new.frozen;
|
||||
new.inuse--;
|
||||
if ((!new.inuse || !prior) && !was_frozen && !n) {
|
||||
if ((!new.inuse || !prior) && !was_frozen) {
|
||||
|
||||
if (!kmem_cache_debug(s) && !prior)
|
||||
|
||||
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
|
||||
|
||||
}
|
||||
}
|
||||
inuse = new.inuse;
|
||||
|
||||
} while (!cmpxchg_double_slab(s, page,
|
||||
prior, counters,
|
||||
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* was_frozen may have been set after we acquired the list_lock in
|
||||
* an earlier loop. So we need to check it here again.
|
||||
*/
|
||||
if (was_frozen)
|
||||
stat(s, FREE_FROZEN);
|
||||
else {
|
||||
if (unlikely(!inuse && n->nr_partial > s->min_partial))
|
||||
goto slab_empty;
|
||||
if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
|
||||
goto slab_empty;
|
||||
|
||||
/*
|
||||
* Objects left in the slab. If it was not on the partial list before
|
||||
* then add it.
|
||||
*/
|
||||
if (unlikely(!prior)) {
|
||||
remove_full(s, page);
|
||||
add_partial(n, page, DEACTIVATE_TO_TAIL);
|
||||
stat(s, FREE_ADD_PARTIAL);
|
||||
}
|
||||
/*
|
||||
* Objects left in the slab. If it was not on the partial list before
|
||||
* then add it.
|
||||
*/
|
||||
if (kmem_cache_debug(s) && unlikely(!prior)) {
|
||||
remove_full(s, page);
|
||||
add_partial(n, page, DEACTIVATE_TO_TAIL);
|
||||
stat(s, FREE_ADD_PARTIAL);
|
||||
}
|
||||
spin_unlock_irqrestore(&n->list_lock, flags);
|
||||
return;
|
||||
@@ -2619,19 +2618,10 @@ redo:
|
||||
|
||||
void kmem_cache_free(struct kmem_cache *s, void *x)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
page = virt_to_head_page(x);
|
||||
|
||||
if (kmem_cache_debug(s) && page->slab != s) {
|
||||
pr_err("kmem_cache_free: Wrong slab cache. %s but object"
|
||||
" is from %s\n", page->slab->name, s->name);
|
||||
WARN_ON_ONCE(1);
|
||||
s = cache_from_obj(s, x);
|
||||
if (!s)
|
||||
return;
|
||||
}
|
||||
|
||||
slab_free(s, page, x, _RET_IP_);
|
||||
|
||||
slab_free(s, virt_to_head_page(x), x, _RET_IP_);
|
||||
trace_kmem_cache_free(_RET_IP_, x);
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_free);
|
||||
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Figure out what the alignment of the objects will be.
|
||||
*/
|
||||
static unsigned long calculate_alignment(unsigned long flags,
|
||||
unsigned long align, unsigned long size)
|
||||
{
|
||||
/*
|
||||
* If the user wants hardware cache aligned objects then follow that
|
||||
* suggestion if the object is sufficiently large.
|
||||
*
|
||||
* The hardware cache alignment cannot override the specified
|
||||
* alignment though. If that is greater then use it.
|
||||
*/
|
||||
if (flags & SLAB_HWCACHE_ALIGN) {
|
||||
unsigned long ralign = cache_line_size();
|
||||
while (size <= ralign / 2)
|
||||
ralign /= 2;
|
||||
align = max(align, ralign);
|
||||
}
|
||||
|
||||
if (align < ARCH_SLAB_MINALIGN)
|
||||
align = ARCH_SLAB_MINALIGN;
|
||||
|
||||
return ALIGN(align, sizeof(void *));
|
||||
}
|
||||
|
||||
static void
|
||||
init_kmem_cache_node(struct kmem_cache_node *n)
|
||||
{
|
||||
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
|
||||
{
|
||||
unsigned long flags = s->flags;
|
||||
unsigned long size = s->object_size;
|
||||
unsigned long align = s->align;
|
||||
int order;
|
||||
|
||||
/*
|
||||
@@ -2999,20 +2962,12 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
|
||||
size += sizeof(void *);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Determine the alignment based on various parameters that the
|
||||
* user specified and the dynamic determination of cache line size
|
||||
* on bootup.
|
||||
*/
|
||||
align = calculate_alignment(flags, align, s->object_size);
|
||||
s->align = align;
|
||||
|
||||
/*
|
||||
* SLUB stores one object immediately after another beginning from
|
||||
* offset 0. In order to align the objects we have to simply size
|
||||
* each object to conform to the alignment.
|
||||
*/
|
||||
size = ALIGN(size, align);
|
||||
size = ALIGN(size, s->align);
|
||||
s->size = size;
|
||||
if (forced_order >= 0)
|
||||
order = forced_order;
|
||||
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
|
||||
s->max = s->oo;
|
||||
|
||||
return !!oo_objects(s->oo);
|
||||
|
||||
}
|
||||
|
||||
static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
|
||||
@@ -3127,15 +3081,6 @@ error:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the size of a slab object
|
||||
*/
|
||||
unsigned int kmem_cache_size(struct kmem_cache *s)
|
||||
{
|
||||
return s->object_size;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_size);
|
||||
|
||||
static void list_slab_objects(struct kmem_cache *s, struct page *page,
|
||||
const char *text)
|
||||
{
|
||||
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
|
||||
{
|
||||
int rc = kmem_cache_close(s);
|
||||
|
||||
if (!rc)
|
||||
if (!rc) {
|
||||
/*
|
||||
* We do the same lock strategy around sysfs_slab_add, see
|
||||
* __kmem_cache_create. Because this is pretty much the last
|
||||
* operation we do and the lock will be released shortly after
|
||||
* that in slab_common.c, we could just move sysfs_slab_remove
|
||||
* to a later point in common code. We should do that when we
|
||||
* have a common sysfs framework for all allocators.
|
||||
*/
|
||||
mutex_unlock(&slab_mutex);
|
||||
sysfs_slab_remove(s);
|
||||
mutex_lock(&slab_mutex);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
|
||||
|
||||
__setup("slub_nomerge", setup_slub_nomerge);
|
||||
|
||||
static struct kmem_cache *__init create_kmalloc_cache(const char *name,
|
||||
int size, unsigned int flags)
|
||||
{
|
||||
struct kmem_cache *s;
|
||||
|
||||
s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
|
||||
s->name = name;
|
||||
s->size = s->object_size = size;
|
||||
s->align = ARCH_KMALLOC_MINALIGN;
|
||||
|
||||
/*
|
||||
* This function is called with IRQs disabled during early-boot on
|
||||
* single CPU so there's no need to take slab_mutex here.
|
||||
*/
|
||||
if (kmem_cache_open(s, flags))
|
||||
goto panic;
|
||||
|
||||
list_add(&s->list, &slab_caches);
|
||||
return s;
|
||||
|
||||
panic:
|
||||
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Conversion table for small slabs sizes / 8 to the index in the
|
||||
* kmalloc array. This is necessary for slabs < 192 since we have non power
|
||||
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
|
||||
struct page *page;
|
||||
void *ptr = NULL;
|
||||
|
||||
flags |= __GFP_COMP | __GFP_NOTRACK;
|
||||
flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
|
||||
page = alloc_pages_node(node, flags, get_order(size));
|
||||
if (page)
|
||||
ptr = page_address(page);
|
||||
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
|
||||
return PAGE_SIZE << compound_order(page);
|
||||
}
|
||||
|
||||
return slab_ksize(page->slab);
|
||||
return slab_ksize(page->slab_cache);
|
||||
}
|
||||
EXPORT_SYMBOL(ksize);
|
||||
|
||||
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
|
||||
}
|
||||
|
||||
slab_lock(page);
|
||||
if (on_freelist(page->slab, page, object)) {
|
||||
object_err(page->slab, page, object, "Object is on free-list");
|
||||
if (on_freelist(page->slab_cache, page, object)) {
|
||||
object_err(page->slab_cache, page, object, "Object is on free-list");
|
||||
rv = false;
|
||||
} else {
|
||||
rv = true;
|
||||
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
|
||||
if (unlikely(!PageSlab(page))) {
|
||||
BUG_ON(!PageCompound(page));
|
||||
kmemleak_free(x);
|
||||
__free_pages(page, compound_order(page));
|
||||
__free_memcg_kmem_pages(page, compound_order(page));
|
||||
return;
|
||||
}
|
||||
slab_free(page->slab, page, object, _RET_IP_);
|
||||
slab_free(page->slab_cache, page, object, _RET_IP_);
|
||||
}
|
||||
EXPORT_SYMBOL(kfree);
|
||||
|
||||
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
|
||||
struct memory_notify *marg = arg;
|
||||
int offline_node;
|
||||
|
||||
offline_node = marg->status_change_nid;
|
||||
offline_node = marg->status_change_nid_normal;
|
||||
|
||||
/*
|
||||
* If the node still has available memory. we need kmem_cache_node
|
||||
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
|
||||
struct kmem_cache_node *n;
|
||||
struct kmem_cache *s;
|
||||
struct memory_notify *marg = arg;
|
||||
int nid = marg->status_change_nid;
|
||||
int nid = marg->status_change_nid_normal;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
|
||||
|
||||
/*
|
||||
* Used for early kmem_cache structures that were allocated using
|
||||
* the page allocator
|
||||
* the page allocator. Allocate them properly then fix up the pointers
|
||||
* that may be pointing to the wrong kmem_cache structure.
|
||||
*/
|
||||
|
||||
static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
|
||||
static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
|
||||
{
|
||||
int node;
|
||||
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
|
||||
|
||||
list_add(&s->list, &slab_caches);
|
||||
s->refcount = -1;
|
||||
memcpy(s, static_cache, kmem_cache->object_size);
|
||||
|
||||
for_each_node_state(node, N_NORMAL_MEMORY) {
|
||||
struct kmem_cache_node *n = get_node(s, node);
|
||||
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
|
||||
|
||||
if (n) {
|
||||
list_for_each_entry(p, &n->partial, lru)
|
||||
p->slab = s;
|
||||
p->slab_cache = s;
|
||||
|
||||
#ifdef CONFIG_SLUB_DEBUG
|
||||
list_for_each_entry(p, &n->full, lru)
|
||||
p->slab = s;
|
||||
p->slab_cache = s;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
list_add(&s->list, &slab_caches);
|
||||
return s;
|
||||
}
|
||||
|
||||
void __init kmem_cache_init(void)
|
||||
{
|
||||
static __initdata struct kmem_cache boot_kmem_cache,
|
||||
boot_kmem_cache_node;
|
||||
int i;
|
||||
int caches = 0;
|
||||
struct kmem_cache *temp_kmem_cache;
|
||||
int order;
|
||||
struct kmem_cache *temp_kmem_cache_node;
|
||||
unsigned long kmalloc_size;
|
||||
int caches = 2;
|
||||
|
||||
if (debug_guardpage_minorder())
|
||||
slub_max_order = 0;
|
||||
|
||||
kmem_size = offsetof(struct kmem_cache, node) +
|
||||
nr_node_ids * sizeof(struct kmem_cache_node *);
|
||||
kmem_cache_node = &boot_kmem_cache_node;
|
||||
kmem_cache = &boot_kmem_cache;
|
||||
|
||||
/* Allocate two kmem_caches from the page allocator */
|
||||
kmalloc_size = ALIGN(kmem_size, cache_line_size());
|
||||
order = get_order(2 * kmalloc_size);
|
||||
kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
|
||||
|
||||
/*
|
||||
* Must first have the slab cache available for the allocations of the
|
||||
* struct kmem_cache_node's. There is special bootstrap code in
|
||||
* kmem_cache_open for slab_state == DOWN.
|
||||
*/
|
||||
kmem_cache_node = (void *)kmem_cache + kmalloc_size;
|
||||
|
||||
kmem_cache_node->name = "kmem_cache_node";
|
||||
kmem_cache_node->size = kmem_cache_node->object_size =
|
||||
sizeof(struct kmem_cache_node);
|
||||
kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
|
||||
create_boot_cache(kmem_cache_node, "kmem_cache_node",
|
||||
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
|
||||
|
||||
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
|
||||
|
||||
/* Able to allocate the per node structures */
|
||||
slab_state = PARTIAL;
|
||||
|
||||
temp_kmem_cache = kmem_cache;
|
||||
kmem_cache->name = "kmem_cache";
|
||||
kmem_cache->size = kmem_cache->object_size = kmem_size;
|
||||
kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
|
||||
create_boot_cache(kmem_cache, "kmem_cache",
|
||||
offsetof(struct kmem_cache, node) +
|
||||
nr_node_ids * sizeof(struct kmem_cache_node *),
|
||||
SLAB_HWCACHE_ALIGN);
|
||||
|
||||
kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
|
||||
memcpy(kmem_cache, temp_kmem_cache, kmem_size);
|
||||
kmem_cache = bootstrap(&boot_kmem_cache);
|
||||
|
||||
/*
|
||||
* Allocate kmem_cache_node properly from the kmem_cache slab.
|
||||
* kmem_cache_node is separately allocated so no need to
|
||||
* update any list pointers.
|
||||
*/
|
||||
temp_kmem_cache_node = kmem_cache_node;
|
||||
|
||||
kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
|
||||
memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
|
||||
|
||||
kmem_cache_bootstrap_fixup(kmem_cache_node);
|
||||
|
||||
caches++;
|
||||
kmem_cache_bootstrap_fixup(kmem_cache);
|
||||
caches++;
|
||||
/* Free temporary boot structure */
|
||||
free_pages((unsigned long)temp_kmem_cache, order);
|
||||
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
|
||||
|
||||
/* Now we can use the kmem_cache to allocate kmalloc slabs */
|
||||
|
||||
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct kmem_cache *find_mergeable(size_t size,
|
||||
static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
|
||||
size_t align, unsigned long flags, const char *name,
|
||||
void (*ctor)(void *))
|
||||
{
|
||||
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
|
||||
if (s->size - size >= sizeof(void *))
|
||||
continue;
|
||||
|
||||
if (!cache_match_memcg(s, memcg))
|
||||
continue;
|
||||
|
||||
return s;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *))
|
||||
struct kmem_cache *
|
||||
__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
|
||||
size_t align, unsigned long flags, void (*ctor)(void *))
|
||||
{
|
||||
struct kmem_cache *s;
|
||||
|
||||
s = find_mergeable(size, align, flags, name, ctor);
|
||||
s = find_mergeable(memcg, size, align, flags, name, ctor);
|
||||
if (s) {
|
||||
s->refcount++;
|
||||
/*
|
||||
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* Mutex is not taken during early boot */
|
||||
if (slab_state <= UP)
|
||||
return 0;
|
||||
|
||||
memcg_propagate_slab_attrs(s);
|
||||
mutex_unlock(&slab_mutex);
|
||||
err = sysfs_slab_add(s);
|
||||
mutex_lock(&slab_mutex);
|
||||
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
|
||||
return -EIO;
|
||||
|
||||
err = attribute->store(s, buf, len);
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
|
||||
int i;
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
if (s->max_attr_size < len)
|
||||
s->max_attr_size = len;
|
||||
|
||||
/*
|
||||
* This is a best effort propagation, so this function's return
|
||||
* value will be determined by the parent cache only. This is
|
||||
* basically because not all attributes will have a well
|
||||
* defined semantics for rollbacks - most of the actions will
|
||||
* have permanent effects.
|
||||
*
|
||||
* Returning the error value of any of the children that fail
|
||||
* is not 100 % defined, in the sense that users seeing the
|
||||
* error code won't be able to know anything about the state of
|
||||
* the cache.
|
||||
*
|
||||
* Only returning the error code for the parent cache at least
|
||||
* has well defined semantics. The cache being written to
|
||||
* directly either failed or succeeded, in which case we loop
|
||||
* through the descendants with best-effort propagation.
|
||||
*/
|
||||
for_each_memcg_cache_index(i) {
|
||||
struct kmem_cache *c = cache_from_memcg(s, i);
|
||||
if (c)
|
||||
attribute->store(c, buf, len);
|
||||
}
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
#endif
|
||||
return err;
|
||||
}
|
||||
|
||||
static void memcg_propagate_slab_attrs(struct kmem_cache *s)
|
||||
{
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
int i;
|
||||
char *buffer = NULL;
|
||||
|
||||
if (!is_root_cache(s))
|
||||
return;
|
||||
|
||||
/*
* This means no attribute was ever written to this cache. Therefore,
* there is no point in copying default values around.
*/
|
||||
if (!s->max_attr_size)
|
||||
return;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
|
||||
char mbuf[64];
|
||||
char *buf;
|
||||
struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
|
||||
|
||||
if (!attr || !attr->store || !attr->show)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* It is really bad that we have to allocate here, so we will
|
||||
* do it only as a fallback. If we actually allocate, though,
|
||||
* we can just use the allocated buffer until the end.
|
||||
*
|
||||
* Most of the slub attributes will tend to be very small in
|
||||
* size, but sysfs allows buffers up to a page, so they can
|
||||
* theoretically happen.
|
||||
*/
|
||||
if (buffer)
|
||||
buf = buffer;
|
||||
else if (s->max_attr_size < ARRAY_SIZE(mbuf))
|
||||
buf = mbuf;
|
||||
else {
|
||||
buffer = (char *) get_zeroed_page(GFP_KERNEL);
|
||||
if (WARN_ON(!buffer))
|
||||
continue;
|
||||
buf = buffer;
|
||||
}
|
||||
|
||||
attr->show(s->memcg_params->root_cache, buf);
|
||||
attr->store(s, buf, strlen(buf));
|
||||
}
|
||||
|
||||
if (buffer)
|
||||
free_page((unsigned long)buffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
static const struct sysfs_ops slab_sysfs_ops = {
|
||||
.show = slab_attr_show,
|
||||
.store = slab_attr_store,
|
||||
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
|
||||
if (p != name + 1)
|
||||
*p++ = '-';
|
||||
p += sprintf(p, "%07d", s->size);
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
if (!is_root_cache(s))
|
||||
p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
|
||||
#endif
|
||||
|
||||
BUG_ON(p > name + ID_STR_LENGTH - 1);
|
||||
return name;
|
||||
}
|
||||
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
|
||||
{
|
||||
int err;
|
||||
const char *name;
|
||||
int unmergeable;
|
||||
int unmergeable = slab_unmergeable(s);
|
||||
|
||||
if (slab_state < FULL)
|
||||
/* Defer until later */
|
||||
return 0;
|
||||
|
||||
unmergeable = slab_unmergeable(s);
|
||||
if (unmergeable) {
|
||||
/*
|
||||
* Slabcache can never be merged so we can use the name proper.
|
||||
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
|
||||
* The /proc/slabinfo ABI
|
||||
*/
|
||||
#ifdef CONFIG_SLABINFO
|
||||
static void print_slabinfo_header(struct seq_file *m)
|
||||
{
|
||||
seq_puts(m, "slabinfo - version: 2.1\n");
|
||||
seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
|
||||
"<objperslab> <pagesperslab>");
|
||||
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
|
||||
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
|
||||
seq_putc(m, '\n');
|
||||
}
|
||||
|
||||
static void *s_start(struct seq_file *m, loff_t *pos)
|
||||
{
|
||||
loff_t n = *pos;
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
if (!n)
|
||||
print_slabinfo_header(m);
|
||||
|
||||
return seq_list_start(&slab_caches, *pos);
|
||||
}
|
||||
|
||||
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
return seq_list_next(p, &slab_caches, pos);
|
||||
}
|
||||
|
||||
static void s_stop(struct seq_file *m, void *p)
|
||||
{
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
|
||||
static int s_show(struct seq_file *m, void *p)
|
||||
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
|
||||
{
|
||||
unsigned long nr_partials = 0;
|
||||
unsigned long nr_slabs = 0;
|
||||
unsigned long nr_inuse = 0;
|
||||
unsigned long nr_objs = 0;
|
||||
unsigned long nr_free = 0;
|
||||
struct kmem_cache *s;
|
||||
int node;
|
||||
|
||||
s = list_entry(p, struct kmem_cache, list);
|
||||
|
||||
for_each_online_node(node) {
|
||||
struct kmem_cache_node *n = get_node(s, node);
|
||||
|
||||
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
|
||||
nr_free += count_partial(n, count_free);
|
||||
}
|
||||
|
||||
nr_inuse = nr_objs - nr_free;
|
||||
|
||||
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
|
||||
nr_objs, s->size, oo_objects(s->oo),
|
||||
(1 << oo_order(s->oo)));
|
||||
seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
|
||||
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
|
||||
0UL);
|
||||
seq_putc(m, '\n');
|
||||
return 0;
|
||||
sinfo->active_objs = nr_objs - nr_free;
|
||||
sinfo->num_objs = nr_objs;
|
||||
sinfo->active_slabs = nr_slabs;
|
||||
sinfo->num_slabs = nr_slabs;
|
||||
sinfo->objects_per_slab = oo_objects(s->oo);
|
||||
sinfo->cache_order = oo_order(s->oo);
|
||||
}
|
||||
|
||||
static const struct seq_operations slabinfo_op = {
|
||||
.start = s_start,
|
||||
.next = s_next,
|
||||
.stop = s_stop,
|
||||
.show = s_show,
|
||||
};
|
||||
|
||||
static int slabinfo_open(struct inode *inode, struct file *file)
|
||||
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
|
||||
{
|
||||
return seq_open(file, &slabinfo_op);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_slabinfo_operations = {
|
||||
.open = slabinfo_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static int __init slab_proc_init(void)
|
||||
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
|
||||
return 0;
|
||||
return -EIO;
|
||||
}
|
||||
module_init(slab_proc_init);
|
||||
#endif /* CONFIG_SLABINFO */
|
||||
|
35
mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
|
||||
{
|
||||
return; /* XXX: Not implemented yet */
|
||||
}
|
||||
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
|
||||
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
|
||||
{
|
||||
}
|
||||
#else
|
||||
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
|
||||
got_map_page:
|
||||
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
|
||||
got_map_ptr:
|
||||
memset(ret, 0, memmap_size);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
|
||||
get_order(sizeof(struct page) * nr_pages));
|
||||
}
|
||||
|
||||
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
|
||||
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
|
||||
{
|
||||
unsigned long maps_section_nr, removing_section_nr, i;
|
||||
unsigned long magic;
|
||||
struct page *page = virt_to_page(memmap);
|
||||
|
||||
for (i = 0; i < nr_pages; i++, page++) {
|
||||
magic = (unsigned long) page->lru.next;
|
||||
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
|
||||
*/
|
||||
|
||||
if (memmap) {
|
||||
struct page *memmap_page;
|
||||
memmap_page = virt_to_page(memmap);
|
||||
|
||||
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
|
||||
>> PAGE_SHIFT;
|
||||
|
||||
free_map_bootmem(memmap_page, nr_pages);
|
||||
free_map_bootmem(memmap, nr_pages);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(memmap, 0, sizeof(struct page) * nr_pages);
|
||||
|
||||
ms->section_mem_map |= SECTION_MARKED_PRESENT;
|
||||
|
||||
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
|
||||
@@ -773,6 +772,27 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!memmap)
|
||||
return;
|
||||
|
||||
for (i = 0; i < PAGES_PER_SECTION; i++) {
|
||||
if (PageHWPoison(&memmap[i])) {
|
||||
atomic_long_sub(1, &mce_bad_pages);
|
||||
ClearPageHWPoison(&memmap[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
|
||||
{
|
||||
struct page *memmap = NULL;
|
||||
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
|
||||
ms->pageblock_flags = NULL;
|
||||
}
|
||||
|
||||
clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
|
||||
free_section_usemap(memmap, usemap);
|
||||
}
|
||||
#endif
|
||||
|
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
|
||||
return generic_swapfile_activate(sis, swap_file, span);
|
||||
}
|
||||
|
||||
static void enable_swap_info(struct swap_info_struct *p, int prio,
|
||||
static void _enable_swap_info(struct swap_info_struct *p, int prio,
|
||||
unsigned char *swap_map,
|
||||
unsigned long *frontswap_map)
|
||||
{
|
||||
int i, prev;
|
||||
|
||||
spin_lock(&swap_lock);
|
||||
if (prio >= 0)
|
||||
p->prio = prio;
|
||||
else
|
||||
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
|
||||
swap_list.head = swap_list.next = p->type;
|
||||
else
|
||||
swap_info[prev]->next = p->type;
|
||||
}
|
||||
|
||||
static void enable_swap_info(struct swap_info_struct *p, int prio,
|
||||
unsigned char *swap_map,
|
||||
unsigned long *frontswap_map)
|
||||
{
|
||||
spin_lock(&swap_lock);
|
||||
_enable_swap_info(p, prio, swap_map, frontswap_map);
|
||||
frontswap_init(p->type);
|
||||
spin_unlock(&swap_lock);
|
||||
}
|
||||
|
||||
static void reinsert_swap_info(struct swap_info_struct *p)
|
||||
{
|
||||
spin_lock(&swap_lock);
|
||||
_enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
|
||||
spin_unlock(&swap_lock);
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
||||
{
|
||||
struct swap_info_struct *p = NULL;
|
||||
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
||||
struct address_space *mapping;
|
||||
struct inode *inode;
|
||||
struct filename *pathname;
|
||||
int oom_score_adj;
|
||||
int i, type, prev;
|
||||
int err;
|
||||
|
||||
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
||||
p->flags &= ~SWP_WRITEOK;
|
||||
spin_unlock(&swap_lock);
|
||||
|
||||
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
|
||||
set_current_oom_origin();
|
||||
err = try_to_unuse(type, false, 0); /* force all pages to be unused */
|
||||
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
|
||||
clear_current_oom_origin();
|
||||
|
||||
if (err) {
|
||||
/*
|
||||
* reading p->prio and p->swap_map outside the lock is
|
||||
* safe here because only sys_swapon and sys_swapoff
|
||||
* change them, and there can be no other sys_swapon or
|
||||
* sys_swapoff for this swap_info_struct at this point.
|
||||
*/
|
||||
/* re-insert swap space back into swap_list */
|
||||
enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
|
||||
reinsert_swap_info(p);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
|
@@ -576,29 +576,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize)
|
||||
}
|
||||
EXPORT_SYMBOL(truncate_setsize);
|
||||
|
||||
/**
|
||||
* vmtruncate - unmap mappings "freed" by truncate() syscall
|
||||
* @inode: inode of the file used
|
||||
* @newsize: file offset to start truncating
|
||||
*
|
||||
* This function is deprecated and truncate_setsize or truncate_pagecache
|
||||
* should be used instead, together with filesystem specific block truncation.
|
||||
*/
|
||||
int vmtruncate(struct inode *inode, loff_t newsize)
|
||||
{
|
||||
int error;
|
||||
|
||||
error = inode_newsize_ok(inode, newsize);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
truncate_setsize(inode, newsize);
|
||||
if (inode->i_op->truncate)
|
||||
inode->i_op->truncate(inode);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(vmtruncate);
|
||||
|
||||
/**
|
||||
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
|
||||
* @inode: inode
|
||||
|
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
|
||||
*
|
||||
* The contents of the object pointed to are preserved up to the
|
||||
* lesser of the new and old sizes. If @p is %NULL, krealloc()
|
||||
* behaves exactly like kmalloc(). If @size is 0 and @p is not a
|
||||
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
|
||||
* %NULL pointer, the object pointed to is freed.
|
||||
*/
|
||||
void *krealloc(const void *p, size_t new_size, gfp_t flags)
|
||||
|
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
|
||||
|
||||
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
|
||||
{
|
||||
if (NUMA_BUILD) {
|
||||
if (IS_ENABLED(CONFIG_NUMA)) {
|
||||
unsigned int nr, *counters = m->private;
|
||||
|
||||
if (!counters)
|
||||
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
|
||||
unsigned int *ptr = NULL;
|
||||
int ret;
|
||||
|
||||
if (NUMA_BUILD) {
|
||||
if (IS_ENABLED(CONFIG_NUMA)) {
|
||||
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
|
||||
if (ptr == NULL)
|
||||
return -ENOMEM;
|
||||
|
242
mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
|
||||
}
|
||||
|
||||
/*
* Are there way too many processes in the direct reclaim path already?
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get rescheduled. When there is a massive number of tasks doing page
* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
|
||||
static int too_many_isolated(struct zone *zone, int file,
|
||||
struct scan_control *sc)
|
||||
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
|
||||
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
|
||||
}
|
||||
|
||||
/*
|
||||
* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
|
||||
* won't get blocked by normal direct-reclaimers, forming a circular
|
||||
* deadlock.
|
||||
*/
|
||||
if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
|
||||
inactive >>= 3;
|
||||
|
||||
return isolated > inactive;
|
||||
}
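too_many_isolated() above throttles a direct reclaimer once more pages are isolated than remain inactive, and the new hunk shifts inactive right by three for callers whose gfp_mask allows both IO and FS, so GFP_NOIO/GFP_NOFS reclaimers keep more headroom. A standalone sketch of just that comparison, with plain numbers instead of real vmstat counters:

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirror of the check above: callers allowed to do IO and FS are throttled
 * earlier (inactive/8), which leaves headroom for GFP_NOIO/GFP_NOFS
 * reclaimers that must not block behind them.
 */
static bool demo_too_many_isolated(unsigned long isolated,
                                   unsigned long inactive,
                                   bool gfp_allows_io_fs)
{
        if (gfp_allows_io_fs)
                inactive >>= 3;

        return isolated > inactive;
}

int main(void)
{
        /* 500 isolated vs 3000 inactive: a GFP_NOFS caller is not throttled... */
        printf("GFP_NOFS caller  : %d\n",
               (int)demo_too_many_isolated(500, 3000, false));
        /* ...while an ordinary GFP_KERNEL caller already is (500 > 3000/8). */
        printf("GFP_KERNEL caller: %d\n",
               (int)demo_too_many_isolated(500, 3000, true));
        return 0;
}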
|
||||
|
||||
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
|
||||
|
||||
if (global_reclaim(sc)) {
|
||||
free = zone_page_state(zone, NR_FREE_PAGES);
|
||||
/* If we have very few page cache pages,
|
||||
force-scan anon pages. */
|
||||
if (unlikely(file + free <= high_wmark_pages(zone))) {
|
||||
/*
|
||||
* If we have very few page cache pages, force-scan
|
||||
* anon pages.
|
||||
*/
|
||||
fraction[0] = 1;
|
||||
fraction[1] = 0;
|
||||
denominator = 1;
|
||||
goto out;
|
||||
} else if (!inactive_file_is_low_global(zone)) {
|
||||
/*
|
||||
* There is enough inactive page cache, do not
|
||||
* reclaim anything from the working set right now.
|
||||
*/
|
||||
fraction[0] = 0;
|
||||
fraction[1] = 1;
|
||||
denominator = 1;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1752,7 +1775,7 @@ out:
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
	if (COMPACTION_BUILD && sc->order &&
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
			 sc->priority < DEF_PRIORITY - 2))
		return true;
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
		if (zone->all_unreclaimable &&
				sc->priority != DEF_PRIORITY)
			continue;	/* Let kswapd poll it */
		if (COMPACTION_BUILD) {
		if (IS_ENABLED(CONFIG_COMPACTION)) {
			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
@@ -2207,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zone *zone;
@@ -2224,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		return;
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory
	 */
	if (fatal_signal_pending(current))
		goto out;

	/* Check if the pfmemalloc reserves are ok */
	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
	pgdat = zone->zone_pgdat;
	if (pfmemalloc_watermark_ok(pgdat))
		return;
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2246,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);
		return;

		goto check_pending;
	}

	/* Throttle until kswapd wakes the process */
	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
		pfmemalloc_watermark_ok(pgdat));

check_pending:
	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2273,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
		.gfp_mask = sc.gfp_mask,
	};

	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);

	/*
	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
	 * that the page allocator does not consider triggering OOM
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
	if (fatal_signal_pending(current))
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

	trace_mm_vmscan_direct_reclaim_begin(order,
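The conversion above turns throttle_direct_reclaim() from a void helper plus a separate fatal_signal_pending() check into a single bool contract: true means a fatal signal arrived while throttled, and try_to_free_pages() returns 1 so the allocator neither reclaims nor OOM kills. A small standalone sketch of that calling convention (names are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Pretend throttling helper: returns true if the caller should give up. */
static bool throttle_sketch(bool fatal_signal_during_wait)
{
	/* ... sleep until watermarks recover or a signal arrives ... */
	return fatal_signal_during_wait;
}

static unsigned long try_to_reclaim_sketch(bool fatal_signal_during_wait)
{
	/*
	 * Mirror of the new try_to_free_pages() flow: if the throttle reports a
	 * fatal signal, claim progress (return 1) so the caller does not fall
	 * back to the OOM killer, and skip reclaim entirely.
	 */
	if (throttle_sketch(fatal_signal_during_wait))
		return 1;

	/* ... real reclaim would run here ... */
	return 0;
}

int main(void)
{
	printf("%lu\n", try_to_reclaim_sketch(true));	/* 1: bail out early */
	printf("%lu\n", try_to_reclaim_sketch(false));	/* 0: proceed to reclaim */
	return 0;
}
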
@@ -2397,13 +2437,31 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
	} while (memcg);
}

static bool zone_balanced(struct zone *zone, int order,
			  unsigned long balance_gap, int classzone_idx)
{
	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
				    balance_gap, classzone_idx, 0))
		return false;

	if (IS_ENABLED(CONFIG_COMPACTION) && order &&
	    !compaction_suitable(zone, order))
		return false;

	return true;
}

/*
 * pgdat_balanced is used when checking if a node is balanced for high-order
 * allocations. Only zones that meet watermarks and are in a zone allowed
 * by the callers classzone_idx are added to balanced_pages. The total of
 * balanced pages must be at least 25% of the zones allowed by classzone_idx
 * for the node to be considered balanced. Forcing all zones to be balanced
 * for high orders can cause excessive reclaim when there are imbalanced zones.
 * pgdat_balanced() is used when checking if a node is balanced.
 *
 * For order-0, all zones must be balanced!
 *
 * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the callers classzone_idx are added to balanced_pages. The
 * total of balanced pages must be at least 25% of the zones allowed by
 * classzone_idx for the node to be considered balanced. Forcing all zones to
 * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonable sized machine
@@ -2413,17 +2471,43 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 * Similarly, on x86-64 the Normal zone would need to be at least 1G
 * to balance a node on its own. These seemed like reasonable ratios.
 */
static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
						int classzone_idx)
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	unsigned long present_pages = 0;
	unsigned long balanced_pages = 0;
	int i;

	for (i = 0; i <= classzone_idx; i++)
		present_pages += pgdat->node_zones[i].present_pages;
	/* Check the watermark levels */
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *zone = pgdat->node_zones + i;

	/* A special case here: if zone has no page, we think it's balanced */
	return balanced_pages >= (present_pages >> 2);
		if (!populated_zone(zone))
			continue;

		present_pages += zone->present_pages;

		/*
		 * A special case here:
		 *
		 * balance_pgdat() skips over all_unreclaimable after
		 * DEF_PRIORITY. Effectively, it considers them balanced so
		 * they must be considered balanced here as well!
		 */
		if (zone->all_unreclaimable) {
			balanced_pages += zone->present_pages;
			continue;
		}

		if (zone_balanced(zone, order, 0, i))
			balanced_pages += zone->present_pages;
		else if (!order)
			return false;
	}

	if (order)
		return balanced_pages >= (present_pages >> 2);
	else
		return true;
}

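Concretely, for order > 0 the rewritten pgdat_balanced() declares the node balanced once the balanced zones cover at least a quarter of the pages up to classzone_idx (present_pages >> 2), while order-0 still demands every populated zone pass zone_balanced(). A worked example of that threshold as a standalone check:

#include <stdbool.h>
#include <stdio.h>

/* The high-order criterion from pgdat_balanced(): >= 25% of present pages balanced. */
static bool high_order_balanced(unsigned long balanced_pages,
				unsigned long present_pages)
{
	return balanced_pages >= (present_pages >> 2);
}

int main(void)
{
	/* e.g. a node with 1,048,576 present pages (4 GiB of 4 KiB pages) */
	unsigned long present = 1048576;

	printf("%d\n", high_order_balanced(262144, present));	/* exactly 25% -> 1 */
	printf("%d\n", high_order_balanced(262143, present));	/* just under  -> 0 */
	return 0;
}
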
/*
@@ -2435,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
					int classzone_idx)
{
	int i;
	unsigned long balanced = 0;
	bool all_zones_ok = true;

	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return false;
@@ -2457,40 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
		return false;
	}

	/* Check the watermark levels */
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;

		/*
		 * balance_pgdat() skips over all_unreclaimable after
		 * DEF_PRIORITY. Effectively, it considers them balanced so
		 * they must be considered balanced here as well if kswapd
		 * is to sleep
		 */
		if (zone->all_unreclaimable) {
			balanced += zone->present_pages;
			continue;
		}

		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
							i, 0))
			all_zones_ok = false;
		else
			balanced += zone->present_pages;
	}

	/*
	 * For high-order requests, the balanced zones must contain at least
	 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
	 * must be balanced
	 */
	if (order)
		return pgdat_balanced(pgdat, balanced, classzone_idx);
	else
		return all_zones_ok;
	return pgdat_balanced(pgdat, order, classzone_idx);
}

/*
@@ -2517,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int all_zones_ok;
	unsigned long balanced;
	struct zone *unbalanced_zone;
	int i;
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long total_scanned;
@@ -2551,8 +2597,7 @@ loop_again:
		unsigned long lru_pages = 0;
		int has_under_min_watermark_zone = 0;

		all_zones_ok = 1;
		balanced = 0;
		unbalanced_zone = NULL;

		/*
		 * Scan in the highmem->dma direction for the highest
@@ -2585,8 +2630,7 @@ loop_again:
				break;
			}

			if (!zone_watermark_ok_safe(zone, order,
					high_wmark_pages(zone), 0, 0)) {
			if (!zone_balanced(zone, order, 0, 0)) {
				end_zone = i;
				break;
			} else {
@@ -2656,15 +2700,14 @@ loop_again:
			 * Do not reclaim more than needed for compaction.
			 */
			testorder = order;
			if (COMPACTION_BUILD && order &&
			if (IS_ENABLED(CONFIG_COMPACTION) && order &&
					compaction_suitable(zone, order) !=
						COMPACT_SKIPPED)
				testorder = 0;

			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
			    !zone_watermark_ok_safe(zone, testorder,
					high_wmark_pages(zone) + balance_gap,
					end_zone, 0)) {
			    !zone_balanced(zone, testorder,
					   balance_gap, end_zone)) {
				shrink_zone(zone, &sc);

				reclaim_state->reclaimed_slab = 0;
@@ -2691,9 +2734,8 @@ loop_again:
				continue;
			}

			if (!zone_watermark_ok_safe(zone, testorder,
					high_wmark_pages(zone), end_zone, 0)) {
				all_zones_ok = 0;
			if (!zone_balanced(zone, testorder, 0, end_zone)) {
				unbalanced_zone = zone;
				/*
				 * We are still under min water mark.  This
				 * means that we have a GFP_ATOMIC allocation
@@ -2711,8 +2753,6 @@ loop_again:
				 * speculatively avoid congestion waits
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				if (i <= *classzone_idx)
					balanced += zone->present_pages;
			}

		}
@@ -2726,7 +2766,7 @@ loop_again:
				pfmemalloc_watermark_ok(pgdat))
			wake_up(&pgdat->pfmemalloc_wait);

		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
		if (pgdat_balanced(pgdat, order, *classzone_idx))
			break;		/* kswapd: all done */
		/*
		 * OK, kswapd is getting into trouble.  Take a nap, then take
@@ -2735,8 +2775,8 @@ loop_again:
		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
			if (has_under_min_watermark_zone)
				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
			else
				congestion_wait(BLK_RW_ASYNC, HZ/10);
			else if (unbalanced_zone)
				wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
		}

		/*
@@ -2750,12 +2790,7 @@ loop_again:
	} while (--sc.priority >= 0);
out:

	/*
	 * order-0: All zones must meet high watermark for a balanced node
	 * high-order: Balanced zones must make up at least 25% of the node
	 *             for the node to be balanced
	 */
	if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
	if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
		cond_resched();

		try_to_freeze();
@@ -2797,29 +2832,10 @@ out:
		if (!populated_zone(zone))
			continue;

		if (zone->all_unreclaimable &&
				sc.priority != DEF_PRIORITY)
			continue;

		/* Would compaction fail due to lack of free memory? */
		if (COMPACTION_BUILD &&
		    compaction_suitable(zone, order) == COMPACT_SKIPPED)
			goto loop_again;

		/* Confirm the zone is balanced for order-0 */
		if (!zone_watermark_ok(zone, 0,
				high_wmark_pages(zone), 0, 0)) {
			order = sc.order = 0;
			goto loop_again;
		}

		/* Check if the memory needs to be defragmented. */
		if (zone_watermark_ok(zone, order,
				low_wmark_pages(zone), *classzone_idx, 0))
			zones_need_compaction = 0;

		/* If balanced, clear the congested flag */
		zone_clear_flag(zone, ZONE_CONGESTED);
	}

	if (zones_need_compaction)
@@ -2944,7 +2960,7 @@ static int kswapd(void *p)
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
	balanced_classzone_idx = classzone_idx;
	for ( ; ; ) {
		int ret;
		bool ret;

		/*
		 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3106,13 +3122,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
			void *hcpu)
{
	int nid;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
		for_each_node_state(nid, N_HIGH_MEMORY) {
		for_each_node_state(nid, N_MEMORY) {
			pg_data_t *pgdat = NODE_DATA(nid);
			const struct cpumask *mask;

@@ -3168,7 +3184,7 @@ static int __init kswapd_init(void)
	int nid;

	swap_setup();
	for_each_node_state(nid, N_HIGH_MEMORY)
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;

28
mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {

	"pgrotated",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
#endif
#ifdef CONFIG_COMPACTION
	"compact_blocks_moved",
	"compact_pages_moved",
	"compact_pagemigrate_failed",
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_split",
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
#endif

#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
		   "\n high %lu"
		   "\n scanned %lu"
		   "\n spanned %lu"
		   "\n present %lu",
		   "\n present %lu"
		   "\n managed %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->pages_scanned,
		   zone->spanned_pages,
		   zone->present_pages);
		   zone->present_pages,
		   zone->managed_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n %-12s %lu", vmstat_text[i],
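zoneinfo_show_print() now emits a "managed" line next to "present", so /proc/zoneinfo exposes zone->managed_pages as well. A small userspace sketch that reads both fields back out (assuming the per-zone "present"/"managed" line format shown above):

#include <stdio.h>

/* Print the "present" and "managed" counters found in /proc/zoneinfo. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/zoneinfo", "r");

	if (!f) {
		perror("/proc/zoneinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned long val;

		if (sscanf(line, " present %lu", &val) == 1)
			printf("present %lu\n", val);
		else if (sscanf(line, " managed %lu", &val) == 1)
			printf("managed %lu\n", val);
	}
	fclose(f);
	return 0;
}
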
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	walk_zones_in_node(m, pgdat, unusable_show_print);