Merge remote-tracking branch 'origin/x86/boot' into x86/mm2

Coming patches to x86/mm2 require the changes and advanced baseline in
x86/boot.

Resolved Conflicts:
	arch/x86/kernel/setup.c
	mm/nobootmem.c

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
This commit is contained in:
H. Peter Anvin
2013-01-29 14:59:09 -08:00
11256개의 변경된 파일524356개의 추가작업 그리고 299996개의 파일을 삭제

파일 보기

@@ -143,6 +143,25 @@ config NO_BOOTMEM
config MEMORY_ISOLATION
boolean
config MOVABLE_NODE
boolean "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
depends on X86_64
depends on NUMA
default n
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
memory device cannot be hotplugged. This option allows users to
online all the memory of a node as movable memory so that the whole
node can be hotplugged. Users who don't use the memory hotplug
feature are fine with this option on since they don't online memory
as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -187,6 +206,21 @@ config SPLIT_PTLOCK_CPUS
default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
default "4"
#
# support for memory balloon compaction
config BALLOON_COMPACTION
bool "Allow for balloon memory compaction/migration"
def_bool y
depends on COMPACTION && VIRTIO_BALLOON
help
Memory fragmentation introduced by ballooning might reduce
significantly the number of 2MB contiguous memory blocks that can be
used within a guest, thus imposing performance penalties associated
with the reduced number of transparent huge pages that could be used
by the guest workload. Allowing the compaction & migration for memory
pages enlisted as being part of memory balloon devices avoids the
scenario aforementioned and helps improving memory defragmentation.
#
# support for memory compaction
config COMPACTION

파일 보기

@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o interval_tree.o $(mmu-y)
compaction.o balloon_compaction.o \
interval_tree.o $(mmu-y)
obj-y += init-mm.o

302
mm/balloon_compaction.c Normal file
파일 보기

@@ -0,0 +1,302 @@
/*
* mm/balloon_compaction.c
*
* Common interface for making balloon pages movable by compaction.
*
* Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/balloon_compaction.h>
/*
* balloon_devinfo_alloc - allocates a balloon device information descriptor.
* @balloon_dev_descriptor: pointer to reference the balloon device which
* this struct balloon_dev_info will be servicing.
*
* Driver must call it to properly allocate and initialize an instance of
* struct balloon_dev_info which will be used to reference a balloon device
* as well as to keep track of the balloon device page list.
*/
struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
{
struct balloon_dev_info *b_dev_info;
b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
if (!b_dev_info)
return ERR_PTR(-ENOMEM);
b_dev_info->balloon_device = balloon_dev_descriptor;
b_dev_info->mapping = NULL;
b_dev_info->isolated_pages = 0;
spin_lock_init(&b_dev_info->pages_lock);
INIT_LIST_HEAD(&b_dev_info->pages);
return b_dev_info;
}
EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
* page list.
* @b_dev_info: balloon device decriptor where we will insert a new page to
*
* Driver must call it to properly allocate a new enlisted balloon page
* before definetively removing it from the guest system.
* This function returns the page address for the recently enqueued page or
* NULL in the case we fail to allocate a new page this turn.
*/
struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
__GFP_NOMEMALLOC | __GFP_NORETRY);
if (!page)
return NULL;
/*
* Block others from accessing the 'page' when we get around to
* establishing additional references. We should be the only one
* holding a reference to the 'page' at this point.
*/
BUG_ON(!trylock_page(page));
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
return page;
}
EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
* the its address to allow the driver release the page.
* @b_dev_info: balloon device decriptor where we will grab a page from.
*
* Driver must call it to properly de-allocate a previous enlisted balloon page
* before definetively releasing it back to the guest system.
* This function returns the page address for the recently dequeued page or
* NULL in the case we find balloon's page list temporarily empty due to
* compaction isolated pages.
*/
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
{
struct page *page, *tmp;
unsigned long flags;
bool dequeued_page;
dequeued_page = false;
list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
/*
* Block others from accessing the 'page' while we get around
* establishing additional references and preparing the 'page'
* to be released by the balloon driver.
*/
if (trylock_page(page)) {
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
/*
* Raise the page refcount here to prevent any wrong
* attempt to isolate this page, in case of coliding
* with balloon_page_isolate() just after we release
* the page lock.
*
* balloon_page_free() will take care of dropping
* this extra refcount later.
*/
get_page(page);
balloon_page_delete(page);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
dequeued_page = true;
break;
}
}
if (!dequeued_page) {
/*
* If we are unable to dequeue a balloon page because the page
* list is empty and there is no isolated pages, then something
* went out of track and some balloon pages are lost.
* BUG() here, otherwise the balloon driver may get stuck into
* an infinite loop while attempting to release all its pages.
*/
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
if (unlikely(list_empty(&b_dev_info->pages) &&
!b_dev_info->isolated_pages))
BUG();
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
page = NULL;
}
return page;
}
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
/*
* balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
* @b_dev_info: holds the balloon device information descriptor.
* @a_ops: balloon_mapping address_space_operations descriptor.
*
* Driver must call it to properly allocate and initialize an instance of
* struct address_space which will be used as the special page->mapping for
* balloon device enlisted page instances.
*/
struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
const struct address_space_operations *a_ops)
{
struct address_space *mapping;
mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
if (!mapping)
return ERR_PTR(-ENOMEM);
/*
* Give a clean 'zeroed' status to all elements of this special
* balloon page->mapping struct address_space instance.
*/
address_space_init_once(mapping);
/*
* Set mapping->flags appropriately, to allow balloon pages
* ->mapping identification.
*/
mapping_set_balloon(mapping);
mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
/* balloon's page->mapping->a_ops callback descriptor */
mapping->a_ops = a_ops;
/*
* Establish a pointer reference back to the balloon device descriptor
* this particular page->mapping will be servicing.
* This is used by compaction / migration procedures to identify and
* access the balloon device pageset while isolating / migrating pages.
*
* As some balloon drivers can register multiple balloon devices
* for a single guest, this also helps compaction / migration to
* properly deal with multiple balloon pagesets, when required.
*/
mapping->private_data = b_dev_info;
b_dev_info->mapping = mapping;
return mapping;
}
EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
static inline void __isolate_balloon_page(struct page *page)
{
struct balloon_dev_info *b_dev_info = page->mapping->private_data;
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_del(&page->lru);
b_dev_info->isolated_pages++;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
static inline void __putback_balloon_page(struct page *page)
{
struct balloon_dev_info *b_dev_info = page->mapping->private_data;
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
static inline int __migrate_balloon_page(struct address_space *mapping,
struct page *newpage, struct page *page, enum migrate_mode mode)
{
return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
}
/* __isolate_lru_page() counterpart for a ballooned page */
bool balloon_page_isolate(struct page *page)
{
/*
* Avoid burning cycles with pages that are yet under __free_pages(),
* or just got freed under us.
*
* In case we 'win' a race for a balloon page being freed under us and
* raise its refcount preventing __free_pages() from doing its job
* the put_page() at the end of this block will take care of
* release this page, thus avoiding a nasty leakage.
*/
if (likely(get_page_unless_zero(page))) {
/*
* As balloon pages are not isolated from LRU lists, concurrent
* compaction threads can race against page migration functions
* as well as race against the balloon driver releasing a page.
*
* In order to avoid having an already isolated balloon page
* being (wrongly) re-isolated while it is under migration,
* or to avoid attempting to isolate pages being released by
* the balloon driver, lets be sure we have the page lock
* before proceeding with the balloon page isolation steps.
*/
if (likely(trylock_page(page))) {
/*
* A ballooned page, by default, has just one refcount.
* Prevent concurrent compaction threads from isolating
* an already isolated balloon page by refcount check.
*/
if (__is_movable_balloon_page(page) &&
page_count(page) == 2) {
__isolate_balloon_page(page);
unlock_page(page);
return true;
}
unlock_page(page);
}
put_page(page);
}
return false;
}
/* putback_lru_page() counterpart for a ballooned page */
void balloon_page_putback(struct page *page)
{
/*
* 'lock_page()' stabilizes the page and prevents races against
* concurrent isolation threads attempting to re-isolate it.
*/
lock_page(page);
if (__is_movable_balloon_page(page)) {
__putback_balloon_page(page);
/* drop the extra ref count taken for page isolation */
put_page(page);
} else {
WARN_ON(1);
dump_page(page);
}
unlock_page(page);
}
/* move_to_new_page() counterpart for a ballooned page */
int balloon_page_migrate(struct page *newpage,
struct page *page, enum migrate_mode mode)
{
struct address_space *mapping;
int rc = -EAGAIN;
/*
* Block others from accessing the 'newpage' when we get around to
* establishing additional references. We should be the only one
* holding a reference to the 'newpage' at this point.
*/
BUG_ON(!trylock_page(newpage));
if (WARN_ON(!__is_movable_balloon_page(page))) {
dump_page(page);
unlock_page(newpage);
return rc;
}
mapping = page->mapping;
if (mapping)
rc = __migrate_balloon_page(mapping, newpage, page, mode);
unlock_page(newpage);
return rc;
}
#endif /* CONFIG_BALLOON_COMPACTION */

파일 보기

@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
* @addr: starting physical address of the range
* @size: size of the range in bytes
*
* This is only useful when the bootmem allocator has already been torn
* down, but we are still initializing the system. Pages are given directly
* to the page allocator, no bootmem metadata is updated because it is gone.
*/
void __init free_bootmem_late(unsigned long addr, unsigned long size)
void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
{
unsigned long cursor, end;
kmemleak_free_part(__va(addr), size);
kmemleak_free_part(__va(physaddr), size);
cursor = PFN_UP(addr);
end = PFN_DOWN(addr + size);
cursor = PFN_UP(physaddr);
end = PFN_DOWN(physaddr + size);
for (; cursor < end; cursor++) {
__free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
while (start < end) {
unsigned long *map, idx, vec;
unsigned shift;
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
* vec holds at most BITS_PER_LONG map bits,
* bit 0 corresponds to start.
*/
vec = ~map[idx / BITS_PER_LONG];
if (shift) {
vec >>= shift;
if (end - start >= BITS_PER_LONG)
vec |= ~map[idx / BITS_PER_LONG + 1] <<
(BITS_PER_LONG - shift);
}
/*
* If we have a properly aligned and fully unreserved
* BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
unsigned long off = 0;
unsigned long cur = start;
vec >>= start & (BITS_PER_LONG - 1);
while (vec) {
start = ALIGN(start + 1, BITS_PER_LONG);
while (vec && cur != start) {
if (vec & 1) {
page = pfn_to_page(start + off);
page = pfn_to_page(cur);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
off++;
++cur;
}
start = ALIGN(start + 1, BITS_PER_LONG);
}
}
@@ -229,6 +241,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
/*
* In free_area_init_core(), highmem zone's managed_pages is set to
* present_pages, and bootmem allocator doesn't allocate from highmem
* zones. So there's no need to recalculate managed_pages because all
* highmem pages will be managed by the buddy system. Here highmem
* zone also includes highmem movable zone.
*/
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
if (!is_highmem(z))
z->managed_pages = 0;
}
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
* @pgdat: node to be released
@@ -238,6 +266,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
reset_node_lowmem_managed_pages(pgdat);
return free_all_bootmem_core(pgdat->bdata);
}
@@ -250,6 +279,10 @@ unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat)
reset_node_lowmem_managed_pages(pgdat);
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +410,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
/**
* free_bootmem - mark a page range as usable
* @addr: starting address of the range
* @addr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
*
* The range must be contiguous but may span node boundaries.
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
unsigned long start, end;
kmemleak_free_part(__va(addr), size);
kmemleak_free_part(__va(physaddr), size);
start = PFN_UP(addr);
end = PFN_DOWN(addr + size);
start = PFN_UP(physaddr);
end = PFN_DOWN(physaddr + size);
mark_bootmem(start, end, 0, 0);
}
@@ -439,12 +472,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags);
}
int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
return reserve_bootmem(phys, len, flags);
}
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
@@ -575,27 +602,6 @@ find_block:
return NULL;
}
static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc(size, GFP_NOWAIT);
#ifdef CONFIG_HAVE_ARCH_BOOTMEM
{
bootmem_data_t *p_bdata;
p_bdata = bootmem_arch_preferred_node(bdata, size, align,
goal, limit);
if (p_bdata)
return alloc_bootmem_bdata(p_bdata, size, align,
goal, limit);
}
#endif
return NULL;
}
static void * __init alloc_bootmem_core(unsigned long size,
unsigned long align,
unsigned long goal,
@@ -604,9 +610,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
bootmem_data_t *bdata;
void *region;
region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
if (region)
return region;
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc(size, GFP_NOWAIT);
list_for_each_entry(bdata, &bdata_list, list) {
if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +709,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
{
void *ptr;
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc(size, GFP_NOWAIT);
again:
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
align, goal, limit);
if (ptr)
return ptr;
/* do not panic in alloc_bootmem_bdata() */
if (limit && goal + size > limit)

파일 보기

@@ -14,8 +14,24 @@
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
count_vm_event(item);
}
static inline void count_compact_events(enum vm_event_item item, long delta)
{
count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
#define CREATE_TRACE_POINTS
@@ -214,60 +230,6 @@ static bool suitable_migration_target(struct page *page)
return false;
}
static void compact_capture_page(struct compact_control *cc)
{
unsigned long flags;
int mtype, mtype_low, mtype_high;
if (!cc->page || *cc->page)
return;
/*
* For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
* regardless of the migratetype of the freelist is is captured from.
* This is fine because the order for a high-order MIGRATE_MOVABLE
* allocation is typically at least a pageblock size and overall
* fragmentation is not impaired. Other allocation types must
* capture pages from their own migratelist because otherwise they
* could pollute other pageblocks like MIGRATE_MOVABLE with
* difficult to move pages and making fragmentation worse overall.
*/
if (cc->migratetype == MIGRATE_MOVABLE) {
mtype_low = 0;
mtype_high = MIGRATE_PCPTYPES;
} else {
mtype_low = cc->migratetype;
mtype_high = cc->migratetype + 1;
}
/* Speculatively examine the free lists without zone lock */
for (mtype = mtype_low; mtype < mtype_high; mtype++) {
int order;
for (order = cc->order; order < MAX_ORDER; order++) {
struct page *page;
struct free_area *area;
area = &(cc->zone->free_area[order]);
if (list_empty(&area->free_list[mtype]))
continue;
/* Take the lock and attempt capture of the page */
if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
return;
if (!list_empty(&area->free_list[mtype])) {
page = list_entry(area->free_list[mtype].next,
struct page, lru);
if (capture_free_page(page, cc->order, mtype)) {
spin_unlock_irqrestore(&cc->zone->lock,
flags);
*cc->page = page;
return;
}
}
spin_unlock_irqrestore(&cc->zone->lock, flags);
}
}
}
/*
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -356,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
return total_isolated;
}
@@ -565,9 +530,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
goto next_pageblock;
}
/* Check may be lockless but that's ok as we recheck later */
if (!PageLRU(page))
/*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
*/
if (!PageLRU(page)) {
if (unlikely(balloon_page_movable(page))) {
if (locked && balloon_page_isolate(page)) {
/* Successfully isolated */
cc->finished_update_migrate = true;
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
goto check_compact_cluster;
}
}
continue;
}
/*
* PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
cc->nr_migratepages++;
nr_isolated++;
check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -646,6 +627,10 @@ next_pageblock:
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
return low_pfn;
}
@@ -713,7 +698,15 @@ static void isolate_freepages(struct zone *zone,
/* Found a block suitable for isolating free pages from */
isolated = 0;
end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
/*
* As pfn may not start aligned, pfn+pageblock_nr_page
* may cross a MAX_ORDER_NR_PAGES boundary and miss
* a pfn_valid check. Ensure isolate_freepages_block()
* only scans within a pageblock
*/
end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
end_pfn = min(end_pfn, zone_end_pfn);
isolated = isolate_freepages_block(cc, pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
@@ -823,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
unsigned int order;
unsigned long watermark;
if (fatal_signal_pending(current))
@@ -857,22 +851,16 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
if (cc->page) {
/* Was a suitable page captured? */
if (*cc->page)
return COMPACT_PARTIAL;
} else {
unsigned int order;
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[cc->order];
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[cc->migratetype]))
return COMPACT_PARTIAL;
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
/* Job done if allocation would set block type */
if (cc->order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
}
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[cc->migratetype]))
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
if (cc->order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
}
return COMPACT_CONTINUE;
@@ -978,7 +966,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_PARTIAL;
putback_lru_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
@@ -990,29 +978,23 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
MR_COMPACTION);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
count_vm_event(COMPACTBLOCKS);
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
if (nr_remaining)
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
/* Release LRU pages not migrated */
/* Release isolated pages not migrated */
if (err) {
putback_lru_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
if (err == -ENOMEM) {
ret = COMPACT_PARTIAL;
goto out;
}
}
/* Capture a page now if it is a suitable size */
compact_capture_page(cc);
}
out:
@@ -1025,8 +1007,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
bool sync, bool *contended,
struct page **page)
bool sync, bool *contended)
{
unsigned long ret;
struct compact_control cc = {
@@ -1036,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
.page = page,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1066,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
bool sync, bool *contended, struct page **page)
bool sync, bool *contended)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1080,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
count_compact_event(COMPACTSTALL);
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -1092,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
int status;
status = compact_zone_order(zone, order, gfp_mask, sync,
contended, page);
contended);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
@@ -1148,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
struct compact_control cc = {
.order = order,
.sync = false,
.page = NULL,
};
return __compact_pgdat(pgdat, &cc);
@@ -1159,14 +1138,13 @@ static int compact_node(int nid)
struct compact_control cc = {
.order = -1,
.sync = true,
.page = NULL,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
static int compact_nodes(void)
static void compact_nodes(void)
{
int nid;
@@ -1175,8 +1153,6 @@ static int compact_nodes(void)
for_each_online_node(nid)
compact_node(nid);
return COMPACT_COMPLETE;
}
/* The written value is actually unused, all memory is compacted */
@@ -1187,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
if (write)
return compact_nodes();
compact_nodes();
return 0;
}

파일 보기

@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
size_t allocation;
size_t boundary;
char name[32];
wait_queue_head_t waitq;
struct list_head pools;
};
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
unsigned int offset;
};
#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
static DEFINE_MUTEX(pools_lock);
static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
init_waitqueue_head(&retval->waitq);
if (dev) {
int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
pool_initialise_page(pool, page);
list_add(&page->page_list, &pool->page_list);
page->in_use = 0;
page->offset = 0;
} else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
might_sleep_if(mem_flags & __GFP_WAIT);
spin_lock_irqsave(&pool->lock, flags);
restart:
list_for_each_entry(page, &pool->page_list, page_list) {
if (page->offset < pool->allocation)
goto ready;
}
page = pool_alloc_page(pool, GFP_ATOMIC);
if (!page) {
if (mem_flags & __GFP_WAIT) {
DECLARE_WAITQUEUE(wait, current);
__set_current_state(TASK_UNINTERRUPTIBLE);
__add_wait_queue(&pool->waitq, &wait);
spin_unlock_irqrestore(&pool->lock, flags);
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
spin_unlock_irqrestore(&pool->lock, flags);
schedule_timeout(POOL_TIMEOUT_JIFFIES);
page = pool_alloc_page(pool, mem_flags);
if (!page)
return NULL;
spin_lock_irqsave(&pool->lock, flags);
__remove_wait_queue(&pool->waitq, &wait);
goto restart;
}
retval = NULL;
goto done;
}
spin_lock_irqsave(&pool->lock, flags);
list_add(&page->page_list, &pool->page_list);
ready:
page->in_use++;
offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
retval = offset + page->vaddr;
*handle = offset + page->dma;
#ifdef DMAPOOL_DEBUG
{
int i;
u8 *data = retval;
/* page->offset is stored in first 4 bytes */
for (i = sizeof(page->offset); i < pool->size; i++) {
if (data[i] == POOL_POISON_FREED)
continue;
if (pool->dev)
dev_err(pool->dev,
"dma_pool_alloc %s, %p (corruped)\n",
pool->name, retval);
else
pr_err("dma_pool_alloc %s, %p (corruped)\n",
pool->name, retval);
/*
* Dump the first 4 bytes even if they are not
* POOL_POISON_FREED
*/
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
data, pool->size, 1);
break;
}
}
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
done:
spin_unlock_irqrestore(&pool->lock, flags);
return retval;
}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
if (waitqueue_active(&pool->waitq))
wake_up_locked(&pool->waitq);
/*
* Resist a temptation to do
* if (!is_page_busy(page)) pool_free_page(pool, page);

파일 보기

@@ -99,12 +99,13 @@ struct page *kmap_to_page(void *vaddr)
unsigned long addr = (unsigned long)vaddr;
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
int i = PKMAP_NR(addr);
return pte_page(pkmap_page_table[i]);
}
return virt_to_page(addr);
}
EXPORT_SYMBOL(kmap_to_page);
static void flush_all_zero_pkmaps(void)
{
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void)
* So no dangers, even with speculative execution.
*/
page = pte_page(pkmap_page_table[i]);
pte_clear(&init_mm, (unsigned long)page_address(page),
&pkmap_page_table[i]);
pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
need_flush = 1;
@@ -324,11 +324,7 @@ struct page_address_map {
struct list_head list;
};
/*
* page_address_map freelist, allocated from page_address_maps.
*/
static struct list_head page_address_pool; /* freelist */
static spinlock_t pool_lock; /* protects page_address_pool */
static struct page_address_map page_address_maps[LAST_PKMAP];
/*
* Hash table bucket
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
pas = page_slot(page);
if (virtual) { /* Add */
BUG_ON(list_empty(&page_address_pool));
spin_lock_irqsave(&pool_lock, flags);
pam = list_entry(page_address_pool.next,
struct page_address_map, list);
list_del(&pam->list);
spin_unlock_irqrestore(&pool_lock, flags);
pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
pam->page = page;
pam->virtual = virtual;
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
if (pam->page == page) {
list_del(&pam->list);
spin_unlock_irqrestore(&pas->lock, flags);
spin_lock_irqsave(&pool_lock, flags);
list_add_tail(&pam->list, &page_address_pool);
spin_unlock_irqrestore(&pool_lock, flags);
goto done;
}
}
@@ -425,20 +411,14 @@ done:
return;
}
static struct page_address_map page_address_maps[LAST_PKMAP];
void __init page_address_init(void)
{
int i;
INIT_LIST_HEAD(&page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
list_add(&page_address_maps[i].list, &page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
spin_lock_init(&pool_lock);
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다. Load Diff

파일 보기

@@ -1,6 +1,6 @@
/*
* Generic hugetlb support.
* (C) William Irwin, April 2004
* (C) Nadia Yvette Chambers, April 2004
*/
#include <linux/list.h>
#include <linux/init.h>
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
}
}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
int nr_nodes = nodes_weight(node_states[N_MEMORY]);
while (nr_nodes) {
void *addr;
addr = __alloc_bootmem_node_nopanic(
NODE_DATA(hstate_next_node_to_alloc(h,
&node_states[N_HIGH_MEMORY])),
&node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
&node_states[N_HIGH_MEMORY]))
&node_states[N_MEMORY]))
break;
}
h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
nodes_allowed = &node_states[N_HIGH_MEMORY];
nodes_allowed = &node_states[N_MEMORY];
}
} else if (nodes_allowed) {
/*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
init_nodemask_of_node(nodes_allowed, nid);
} else
nodes_allowed = &node_states[N_HIGH_MEMORY];
nodes_allowed = &node_states[N_MEMORY];
h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
return len;
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
* remove hstate attributes from any nodes that have them.
*/
for (nid = 0; nid < nr_node_ids; nid++)
hugetlb_unregister_node(&node_devices[nid]);
hugetlb_unregister_node(node_devices[nid]);
}
/*
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
{
int nid;
for_each_node_state(nid, N_HIGH_MEMORY) {
struct node *node = &node_devices[nid];
for_each_node_state(nid, N_MEMORY) {
struct node *node = node_devices[nid];
if (node->dev.id == nid)
hugetlb_register_node(node);
}
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
hugetlb_init_hstates();
gather_bootmem_prealloc();
report_hugepages();
hugetlb_sysfs_init();
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
return 0;
}
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
h->next_nid_to_free = first_node(node_states[N_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
/*
* Add cgroup control files only if the huge page consists
* of more than two normal pages. This is because we use
* page[2].lru.next for storing cgoup details.
*/
if (order >= HUGETLB_CGROUP_MIN_ORDER)
hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
nodes_allowed = &node_states[N_HIGH_MEMORY];
nodes_allowed = &node_states[N_MEMORY];
}
h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
out:
@@ -2386,8 +2377,10 @@ again:
/*
* HWPoisoned hugepage is already unmapped and dropped reference
*/
if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
pte_clear(mm, address, ptep);
continue;
}
page = pte_page(pte);
/*
@@ -3014,7 +3007,7 @@ same_page:
return i ? i : -EFAULT;
}
void hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
if (huge_pmd_unshare(mm, &address, ptep))
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
continue;
}
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
pages++;
}
}
spin_unlock(&mm->page_table_lock);
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
return pages << h->order;
}
int hugetlb_reserve_pages(struct inode *inode,
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
spin_lock(&hugetlb_lock);
if (is_hugepage_on_freelist(hpage)) {
list_del(&hpage->lru);
/*
* Hwpoisoned hugepage isn't linked to activelist or freelist,
* but dangling hpage->lru can trigger list-debug warnings
* (this happens when we call unpoison_memory() on it),
* so let it point to itself with list_del_init().
*/
list_del_init(&hpage->lru);
set_page_refcounted(hpage);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;

파일 보기

@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
return false;
}
static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
{
int idx;
struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
return &h_cgroup->css;
}
static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
{
struct hugetlb_cgroup *h_cgroup;
@@ -155,18 +155,13 @@ out:
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup.
*/
static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
{
struct hstate *h;
struct page *page;
int ret = 0, idx = 0;
int idx = 0;
do {
if (cgroup_task_count(cgroup) ||
!list_empty(&cgroup->children)) {
ret = -EBUSY;
goto out;
}
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
}
cond_resched();
} while (hugetlb_cgroup_have_usage(cgroup));
out:
return ret;
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
return buf;
}
int __init hugetlb_cgroup_file_init(int idx)
static void __init __hugetlb_cgroup_file_init(int idx)
{
char buf[32];
struct cftype *cft;
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
return 0;
return;
}
void __init hugetlb_cgroup_file_init(void)
{
struct hstate *h;
for_each_hstate(h) {
/*
* Add cgroup control files only if the huge page consists
* of more than two normal pages. This is because we use
* page[2].lru.next for storing cgroup details.
*/
if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
__hugetlb_cgroup_file_init(hstate_index(h));
}
}
/*
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
struct cgroup_subsys hugetlb_subsys = {
.name = "hugetlb",
.create = hugetlb_cgroup_create,
.pre_destroy = hugetlb_cgroup_pre_destroy,
.destroy = hugetlb_cgroup_destroy,
.subsys_id = hugetlb_subsys_id,
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
.subsys_id = hugetlb_subsys_id,
};

파일 보기

@@ -91,6 +91,11 @@ extern unsigned long highest_memmap_pfn;
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
/*
* in mm/rmap.c:
*/
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
*/
@@ -130,7 +135,6 @@ struct compact_control {
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
bool contended; /* True if a lock was contended */
struct page **page; /* Page captured of requested size */
};
unsigned long
@@ -212,15 +216,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
int nr_pages = hpage_nr_pages(page);
local_irq_save(flags);
__dec_zone_page_state(page, NR_MLOCK);
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
__inc_zone_page_state(newpage, NR_MLOCK);
__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
local_irq_restore(flags);
}
}
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);

파일 보기

@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
struct kmemleak_object *object;
unsigned long addr;
addr= simple_strtoul(str, NULL, 0);
if (kstrtoul(str, 0, &addr))
return -EINVAL;
object = find_and_get_object(addr, 0);
if (!object) {
pr_info("Unknown object at 0x%08lx\n", addr);

파일 보기

@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (addr == -EFAULT)
goto out;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
pud = pud_offset(pgd, addr);
if (!pud_present(*pud))
goto out;
pmd = pmd_offset(pud, addr);
BUG_ON(pmd_trans_huge(*pmd));
if (!pmd_present(*pmd))
goto out;
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
@@ -1634,7 +1624,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
anon_vma_lock(anon_vma);
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1658,7 +1648,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
if (!mapcount)
goto out;
}
@@ -1688,7 +1678,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
anon_vma_lock(anon_vma);
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1707,11 +1697,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
goto out;
}
}
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1741,7 +1731,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
anon_vma_lock(anon_vma);
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1759,11 +1749,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
goto out;
}
}
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
int oom_score_adj;
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
set_current_oom_origin();
err = unmerge_and_remove_all_rmap_items();
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
oom_score_adj);
clear_current_oom_origin();
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;

파일 보기

@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
}
this->size += next->size;
memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
/* move forward from next + 1, index of which is i + 2 */
memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
type->cnt--;
}
}

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다. Load Diff

파일 보기

@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av;
pgoff_t pgoff;
av = page_lock_anon_vma(page);
av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
page_unlock_anon_vma(av);
page_unlock_anon_vma_read(av);
}
/*
@@ -781,16 +781,16 @@ static struct page_state {
{ compound, compound, "huge", me_huge_page },
#endif
{ sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
{ sc|dirty, sc, "swapcache", me_swapcache_clean },
{ sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
{ sc|dirty, sc, "clean swapcache", me_swapcache_clean },
{ unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
{ unevict, unevict, "unevictable LRU", me_pagecache_clean},
{ unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
{ unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
{ mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
{ mlock, mlock, "mlocked LRU", me_pagecache_clean },
{ mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
{ mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
{ lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
{ lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
/*
@@ -812,14 +812,14 @@ static struct page_state {
#undef slab
#undef reserved
/*
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
static void action_result(unsigned long pfn, char *msg, int result)
{
struct page *page = pfn_to_page(pfn);
printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
pfn,
PageDirty(page) ? "dirty " : "",
msg, action_name[result]);
pr_err("MCE %#lx: %s page recovery: %s\n",
pfn, msg, action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* Isolate the page, so that it doesn't get reallocated if it
* was free.
*/
set_migratetype_isolate(p);
set_migratetype_isolate(p, true);
/*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_trans_head(page);
if (PageHuge(page))
return soft_offline_huge_page(page, flags);
if (PageTransHuge(hpage)) {
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
pr_info("soft offline: %#lx: failed to split THP\n",
pfn);
return -EBUSY;
}
}
ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1558,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
false, MIGRATE_SYNC);
false, MIGRATE_SYNC,
MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",

파일 보기

@@ -57,6 +57,8 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
return 1;
}
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return 0;
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;
tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
tlb->batch_count = 0;
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
#ifndef is_zero_pfn
static inline int is_zero_pfn(unsigned long pfn)
{
return pfn == zero_pfn;
}
#endif
#ifndef my_zero_pfn
static inline unsigned long my_zero_pfn(unsigned long addr)
{
return zero_pfn;
}
#endif
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
split_huge_page_pmd(vma->vm_mm, pmd);
split_huge_page_pmd(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
goto no_page_table;
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
split_huge_page_pmd(mm, pmd);
split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
spin_lock(&mm->page_table_lock);
@@ -1546,6 +1541,8 @@ split_fallthrough:
pte = *ptep;
if (!pte_present(pte))
goto no_page;
if ((flags & FOLL_NUMA) && pte_numa(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= (gup_flags & FOLL_FORCE) ?
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
/*
* If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
* would be called on PROT_NONE ranges. We must never invoke
* handle_mm_fault on PROT_NONE ranges or the NUMA hinting
* page faults would unprotect the PROT_NONE ranges if
* _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
* bitflag. So to avoid that, don't set FOLL_NUMA if
* FOLL_FORCE is set.
*/
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;
i = 0;
do {
@@ -2794,13 +2804,8 @@ unlock:
oom_free_new:
page_cache_release(new_page);
oom:
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
page_cache_release(old_page);
}
if (old_page)
page_cache_release(old_page);
}
return VM_FAULT_OOM;
unwritable_page:
@@ -3431,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int current_nid)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (current_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
return mpol_misplaced(page, vma, addr);
}
int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
struct page *page = NULL;
spinlock_t *ptl;
int current_nid = -1;
int target_nid;
bool migrated = false;
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*
* ptep_modify_prot_start is not called as this is clearing
* the _PAGE_NUMA bit and it is not really expected that there
* would be concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*ptep, pte))) {
pte_unmap_unlock(ptep, ptl);
goto out;
}
pte = pte_mknonnuma(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (!page) {
pte_unmap_unlock(ptep, ptl);
return 0;
}
current_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, addr, current_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
/*
* Account for the fault against the current node if it not
* being replaced regardless of where the page is located.
*/
current_nid = numa_node_id();
put_page(page);
goto out;
}
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, target_nid);
if (migrated)
current_nid = target_nid;
out:
if (current_nid != -1)
task_numa_fault(current_nid, 1, migrated);
return 0;
}
/* NUMA hinting page fault entry point for regular pmds */
#ifdef CONFIG_NUMA_BALANCING
static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
pmd_t pmd;
pte_t *pte, *orig_pte;
unsigned long _addr = addr & PMD_MASK;
unsigned long offset;
spinlock_t *ptl;
bool numa = false;
int local_nid = numa_node_id();
spin_lock(&mm->page_table_lock);
pmd = *pmdp;
if (pmd_numa(pmd)) {
set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
numa = true;
}
spin_unlock(&mm->page_table_lock);
if (!numa)
return 0;
/* we're in a page fault so some vma must be in the range */
BUG_ON(!vma);
BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
offset = max(_addr, vma->vm_start) & ~PMD_MASK;
VM_BUG_ON(offset >= PMD_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
pte += offset >> PAGE_SHIFT;
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
int curr_nid = local_nid;
int target_nid;
bool migrated;
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
continue;
if (addr >= vma->vm_end) {
vma = find_vma(mm, addr);
/* there's a pte present so there must be a vma */
BUG_ON(!vma);
BUG_ON(addr < vma->vm_start);
}
if (pte_numa(pteval)) {
pteval = pte_mknonnuma(pteval);
set_pte_at(mm, addr, pte, pteval);
}
page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page))
continue;
/* only check non-shared pages */
if (unlikely(page_mapcount(page) != 1))
continue;
/*
* Note that the NUMA fault is later accounted to either
* the node that is currently running or where the page is
* migrated to.
*/
curr_nid = local_nid;
target_nid = numa_migrate_prep(page, vma, addr,
page_to_nid(page));
if (target_nid == -1) {
put_page(page);
continue;
}
/* Migrate to the requested node */
pte_unmap_unlock(pte, ptl);
migrated = migrate_misplaced_page(page, target_nid);
if (migrated)
curr_nid = target_nid;
task_numa_fault(curr_nid, 1, migrated);
pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
pte_unmap_unlock(orig_pte, ptl);
return 0;
}
#else
static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
BUG();
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3469,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
if (pte_numa(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
@@ -3537,9 +3709,21 @@ retry:
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
!pmd_trans_splitting(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
/*
* If the pmd is splitting, return and retry the
* the fault. Alternative: wait until the split
* is done, and goto retry.
*/
if (pmd_trans_splitting(orig_pmd))
return 0;
if (pmd_numa(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
/*
@@ -3550,17 +3734,25 @@ retry:
if (unlikely(ret & VM_FAULT_OOM))
goto retry;
return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
}
return 0;
}
}
if (pmd_numa(*pmd))
return do_pmd_numa_page(mm, vma, address, pmd);
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -3940,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
char *p, *s;
char *p;
p = d_path(&f->f_path, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
s = strrchr(p, '/');
if (s)
p = s+1;
printk("%s%s[%lx+%lx]", prefix, p,
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);

파일 보기

@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
void __ref put_page_bootmem(struct page *page)
{
unsigned long type;
static DEFINE_MUTEX(ppb_lock);
type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
/*
* Please refer to comment for __free_pages_bootmem()
* for why we serialize here.
*/
mutex_lock(&ppb_lock);
__free_pages_bootmem(page, 0);
mutex_unlock(&ppb_lock);
}
}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writelock(zone);
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
if (start_pfn < zone->zone_start_pfn)
if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
zone->zone_start_pfn = start_pfn;
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
static void resize_zone(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
zone_span_writelock(zone);
if (end_pfn - start_pfn) {
zone->zone_start_pfn = start_pfn;
zone->spanned_pages = end_pfn - start_pfn;
} else {
/*
* make it consist as free_area_init_core(),
* if spanned_pages = 0, then keep start_pfn = 0
*/
zone->zone_start_pfn = 0;
zone->spanned_pages = 0;
}
zone_span_writeunlock(zone);
}
static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
enum zone_type zid = zone_idx(zone);
int nid = zone->zone_pgdat->node_id;
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++)
set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}
static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
unsigned long start_pfn, unsigned long end_pfn)
{
int ret;
unsigned long flags;
unsigned long z1_start_pfn;
if (!z1->wait_table) {
ret = init_currently_empty_zone(z1, start_pfn,
end_pfn - start_pfn, MEMMAP_HOTPLUG);
if (ret)
return ret;
}
pgdat_resize_lock(z1->zone_pgdat, &flags);
/* can't move pfns which are higher than @z2 */
if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
goto out_fail;
/* the move out part mast at the left most of @z2 */
if (start_pfn > z2->zone_start_pfn)
goto out_fail;
/* must included/overlap */
if (end_pfn <= z2->zone_start_pfn)
goto out_fail;
/* use start_pfn for z1's start_pfn if z1 is empty */
if (z1->spanned_pages)
z1_start_pfn = z1->zone_start_pfn;
else
z1_start_pfn = start_pfn;
resize_zone(z1, z1_start_pfn, end_pfn);
resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
pgdat_resize_unlock(z1->zone_pgdat, &flags);
fix_zone_id(z1, start_pfn, end_pfn);
return 0;
out_fail:
pgdat_resize_unlock(z1->zone_pgdat, &flags);
return -1;
}
static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
unsigned long start_pfn, unsigned long end_pfn)
{
int ret;
unsigned long flags;
unsigned long z2_end_pfn;
if (!z2->wait_table) {
ret = init_currently_empty_zone(z2, start_pfn,
end_pfn - start_pfn, MEMMAP_HOTPLUG);
if (ret)
return ret;
}
pgdat_resize_lock(z1->zone_pgdat, &flags);
/* can't move pfns which are lower than @z1 */
if (z1->zone_start_pfn > start_pfn)
goto out_fail;
/* the move out part mast at the right most of @z1 */
if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
goto out_fail;
/* must included/overlap */
if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
goto out_fail;
/* use end_pfn for z2's end_pfn if z2 is empty */
if (z2->spanned_pages)
z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
else
z2_end_pfn = end_pfn;
resize_zone(z1, z1->zone_start_pfn, start_pfn);
resize_zone(z2, start_pfn, z2_end_pfn);
pgdat_resize_unlock(z1->zone_pgdat, &flags);
fix_zone_id(z2, start_pfn, end_pfn);
return 0;
out_fail:
pgdat_resize_unlock(z1->zone_pgdat, &flags);
return -1;
}
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long old_pgdat_end_pfn =
pgdat->node_start_pfn + pgdat->node_spanned_pages;
if (start_pfn < pgdat->node_start_pfn)
if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
return 0;
}
#ifdef CONFIG_MOVABLE_NODE
/*
* When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
* normal memory.
*/
static bool can_online_high_movable(struct zone *zone)
{
return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
{
int nid = zone_to_nid(zone);
enum zone_type zone_last = ZONE_NORMAL;
/*
* If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
* contains nodes which have zones of 0...ZONE_NORMAL,
* set zone_last to ZONE_NORMAL.
*
* If we don't have HIGHMEM nor movable node,
* node_states[N_NORMAL_MEMORY] contains nodes which have zones of
* 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
*/
if (N_MEMORY == N_NORMAL_MEMORY)
zone_last = ZONE_MOVABLE;
/*
* if the memory to be online is in a zone of 0...zone_last, and
* the zones of 0...zone_last don't have memory before online, we will
* need to set the node to node_states[N_NORMAL_MEMORY] after
* the memory is online.
*/
if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
else
arg->status_change_nid_normal = -1;
#ifdef CONFIG_HIGHMEM
/*
* If we have movable node, node_states[N_HIGH_MEMORY]
* contains nodes which have zones of 0...ZONE_HIGHMEM,
* set zone_last to ZONE_HIGHMEM.
*
* If we don't have movable node, node_states[N_NORMAL_MEMORY]
* contains nodes which have zones of 0...ZONE_MOVABLE,
* set zone_last to ZONE_MOVABLE.
*/
zone_last = ZONE_HIGHMEM;
if (N_MEMORY == N_HIGH_MEMORY)
zone_last = ZONE_MOVABLE;
if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
else
arg->status_change_nid_high = -1;
#else
arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
/*
* if the node don't have memory befor online, we will need to
* set the node to node_states[N_MEMORY] after the memory
* is online.
*/
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
else
arg->status_change_nid = -1;
}
static void node_states_set_node(int node, struct memory_notify *arg)
{
if (arg->status_change_nid_normal >= 0)
node_set_state(node, N_NORMAL_MEMORY);
if (arg->status_change_nid_high >= 0)
node_set_state(node, N_HIGH_MEMORY);
node_set_state(node, N_MEMORY);
}
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long onlined_pages = 0;
struct zone *zone;
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
struct memory_notify arg;
lock_memory_hotplug();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
* memory_block->state_mutex.
*/
zone = page_zone(pfn_to_page(pfn));
if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
!can_online_high_movable(zone)) {
unlock_memory_hotplug();
return -1;
}
if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
unlock_memory_hotplug();
return -1;
}
}
if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
unlock_memory_hotplug();
return -1;
}
}
/* Previous code may changed the zone of the pfn range */
zone = page_zone(pfn_to_page(pfn));
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
node_states_check_changes_online(nr_pages, zone, &arg);
nid = page_to_nid(pfn_to_page(pfn));
if (node_present_pages(nid) == 0)
arg.status_change_nid = nid;
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -486,24 +733,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
unlock_memory_hotplug();
return ret;
}
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
* memory_block->state_mutex.
*/
zone = page_zone(pfn_to_page(pfn));
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
mutex_lock(&zonelists_mutex);
if (!populated_zone(zone))
if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
build_all_zonelists(NULL, zone);
}
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
return ret;
}
zone->managed_pages += onlined_pages;
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
if (onlined_pages) {
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
node_states_set_node(zone_to_nid(zone), &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, zone);
build_all_zonelists(NULL, NULL);
else
zone_pcp_update(zone);
}
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* migrate_pages returns # of failed pages.
*/
ret = migrate_pages(&source, alloc_migrate_target, 0,
true, MIGRATE_SYNC);
true, MIGRATE_SYNC,
MR_MEMORY_HOTPLUG);
if (ret)
putback_lru_pages(&source);
}
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
{
int ret;
long offlined = *(long *)data;
ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
offlined = nr_pages;
if (!ret)
*(long *)data += offlined;
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
#ifdef CONFIG_MOVABLE_NODE
/*
* When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
* normal memory.
*/
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long present_pages = 0;
enum zone_type zt;
for (zt = 0; zt <= ZONE_NORMAL; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
if (present_pages > nr_pages)
return true;
present_pages = 0;
for (; zt <= ZONE_MOVABLE; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
/*
* we can't offline the last normal memory until all
* higher memory is offlined.
*/
return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long present_pages = 0;
enum zone_type zt, zone_last = ZONE_NORMAL;
/*
* If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
* contains nodes which have zones of 0...ZONE_NORMAL,
* set zone_last to ZONE_NORMAL.
*
* If we don't have HIGHMEM nor movable node,
* node_states[N_NORMAL_MEMORY] contains nodes which have zones of
* 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
*/
if (N_MEMORY == N_NORMAL_MEMORY)
zone_last = ZONE_MOVABLE;
/*
* check whether node_states[N_NORMAL_MEMORY] will be changed.
* If the memory to be offline is in a zone of 0...zone_last,
* and it is the last present memory, 0...zone_last will
* become empty after offline , thus we can determind we will
* need to clear the node from node_states[N_NORMAL_MEMORY].
*/
for (zt = 0; zt <= zone_last; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
else
arg->status_change_nid_normal = -1;
#ifdef CONFIG_HIGHMEM
/*
* If we have movable node, node_states[N_HIGH_MEMORY]
* contains nodes which have zones of 0...ZONE_HIGHMEM,
* set zone_last to ZONE_HIGHMEM.
*
* If we don't have movable node, node_states[N_NORMAL_MEMORY]
* contains nodes which have zones of 0...ZONE_MOVABLE,
* set zone_last to ZONE_MOVABLE.
*/
zone_last = ZONE_HIGHMEM;
if (N_MEMORY == N_HIGH_MEMORY)
zone_last = ZONE_MOVABLE;
for (; zt <= zone_last; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
arg->status_change_nid_high = zone_to_nid(zone);
else
arg->status_change_nid_high = -1;
#else
arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
/*
* node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
*/
zone_last = ZONE_MOVABLE;
/*
* check whether node_states[N_HIGH_MEMORY] will be changed
* If we try to offline the last present @nr_pages from the node,
* we can determind we will need to clear the node from
* node_states[N_HIGH_MEMORY].
*/
for (; zt <= zone_last; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
if (nr_pages >= present_pages)
arg->status_change_nid = zone_to_nid(zone);
else
arg->status_change_nid = -1;
}
static void node_states_clear_node(int node, struct memory_notify *arg)
{
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
if ((N_MEMORY != N_NORMAL_MEMORY) &&
(arg->status_change_nid_high >= 0))
node_clear_state(node, N_HIGH_MEMORY);
if ((N_MEMORY != N_HIGH_MEMORY) &&
(arg->status_change_nid >= 0))
node_clear_state(node, N_MEMORY);
}
static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
ret = -EINVAL;
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
goto out;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
if (ret)
goto out;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
if (nr_pages >= node_present_pages(node))
arg.status_change_nid = node;
node_states_check_changes_offline(nr_pages, zone, &arg);
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
ret = notifier_to_errno(ret);
@@ -943,10 +1319,10 @@ repeat:
goto repeat;
}
}
/* drain all zone's lru pagevec, this is asyncronous... */
/* drain all zone's lru pagevec, this is asynchronous... */
lru_add_drain_all();
yield();
/* drain pcp pages , this is synchrouns. */
/* drain pcp pages, this is synchronous. */
drain_all_pages();
/* check again */
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -955,12 +1331,13 @@ repeat:
goto failed_removal;
}
printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is islaoted.
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
/* reset pagetype flags and makes migrate type to be MOVABLE */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
/* removal success */
zone->managed_pages -= offlined_pages;
zone->present_pages -= offlined_pages;
zone->zone_pgdat->node_present_pages -= offlined_pages;
totalram_pages -= offlined_pages;
@@ -975,10 +1352,9 @@ repeat:
} else
zone_pcp_update(zone);
if (!node_present_pages(node)) {
node_clear_state(node, N_HIGH_MEMORY);
node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0)
kswapd_stop(node);
}
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();

파일 보기

@@ -90,6 +90,7 @@
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
.flags = MPOL_F_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
static struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
int node;
if (!pol) {
node = numa_node_id();
if (node != -1)
pol = &preferred_node_policy[node];
/* preferred_node_policy is not initialised early in boot */
if (!pol->mode)
pol = NULL;
}
return pol;
}
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
/* Check N_HIGH_MEMORY */
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
return NULL; /* simply delete any existing policy */
return NULL;
}
VM_BUG_ON(!nodes);
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
split_huge_page_pmd(vma->vm_mm, pmd);
split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
return 0;
}
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
* faults, pages may be migrated for better NUMA placement.
*
* This is assuming that NUMA faults are handled using PROT_NONE. If
* an architecture makes a different choice, it will need further
* changes to the core.
*/
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
return 0;
}
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
}
if (!is_vm_hugetlb_page(vma) &&
((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma)))) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
if (is_vm_hugetlb_page(vma))
goto next;
if (flags & MPOL_MF_LAZY) {
change_prot_numa(vma, start, endvma);
goto next;
}
if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma))) {
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
break;
}
}
next:
prev = vma;
}
return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
false, MIGRATE_SYNC);
false, MIGRATE_SYNC,
MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
if (flags & MPOL_MF_LAZY)
new->flags |= MPOL_F_MOF;
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
err = PTR_ERR(vma);
if (!IS_ERR(vma)) {
int nr_failed = 0;
err = PTR_ERR(vma); /* maybe ... */
if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
false, MIGRATE_SYNC);
false, MIGRATE_SYNC,
MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
if (!nodes_subset(*new, node_states[N_MEMORY])) {
err = -EINVAL;
goto out_put;
}
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
struct mempolicy *pol = get_task_policy(task);
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
struct mempolicy *pol;
struct zonelist *zl;
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -1926,23 +1997,11 @@ retry_cpuset:
return page;
}
zl = policy_zonelist(gfp, pol, node);
if (unlikely(mpol_needs_cond_ref(pol))) {
/*
* slow path: ref counted shared policy
*/
struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
/*
* fast path: default or task policy
*/
page = __alloc_pages_nodemask(gfp, order, zl,
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, node),
policy_nodemask(gfp, pol));
if (unlikely(mpol_needs_cond_ref(pol)))
__mpol_put(pol);
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
@@ -1969,7 +2028,7 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -2037,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
return new;
}
/*
* If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
* eliminate the * MPOL_F_* flags that require conditional ref and
* [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
* after return. Use the returned value.
*
* Allows use of a mempolicy for, e.g., multiple allocations with a single
* policy lookup, even if the policy needs/has extra ref on lookup.
* shmem_readahead needs this.
*/
struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
struct mempolicy *frompol)
{
if (!mpol_needs_cond_ref(frompol))
return frompol;
*tompol = *frompol;
tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
__mpol_put(frompol);
return tompol;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -2095,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
/* Caller holds sp->mutex */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2159,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
mutex_lock(&sp->mutex);
spin_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
mutex_unlock(&sp->mutex);
spin_unlock(&sp->lock);
return pol;
}
@@ -2175,6 +2212,115 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* @page - page to be checked
* @vma - vm area where page mapped
* @addr - virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
* node id.
*
* Returns:
* -1 - not misplaced, page is in the right node
* node - node id where the page should be
*
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
int polnid = -1;
int ret = -1;
BUG_ON(!vma);
pol = get_vma_policy(current, vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
BUG_ON(addr >= vma->vm_end);
BUG_ON(addr < vma->vm_start);
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, vma, pgoff);
break;
case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
polnid = numa_node_id();
else
polnid = pol->v.preferred_node;
break;
case MPOL_BIND:
/*
* allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
(void)first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes, &zone);
polnid = zone->node;
break;
default:
BUG();
}
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
int last_nid;
polnid = numa_node_id();
/*
* Multi-stage node selection is used in conjunction
* with a periodic migration fault to build a temporal
* task<->page relation. By using a two-stage filter we
* remove short/unlikely relations.
*
* Using P(p) ~ n_p / n_t as per frequentist
* probability, we can equate a task's usage of a
* particular page (n_p) per total usage of this
* page (n_t) (in a given time-span) to a probability.
*
* Our periodic faults will sample this probability and
* getting the same result twice in a row, given these
* samples are fully independent, is then given by
* P(n)^2, provided our sample period is sufficiently
* short compared to the usage pattern.
*
* This quadric squishes small probabilities, making
* it less likely we act on an unlikely task<->page
* relation.
*/
last_nid = page_xchg_last_nid(page, polnid);
if (last_nid != polnid)
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2182,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
sp_free(n);
}
static void sp_node_init(struct sp_node *node, unsigned long start,
unsigned long end, struct mempolicy *pol)
{
node->start = start;
node->end = end;
node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
@@ -2198,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
n->start = start;
n->end = end;
n->policy = newpol;
sp_node_init(n, start, end, newpol);
return n;
}
@@ -2211,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
int ret = 0;
mutex_lock(&sp->mutex);
restart:
spin_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2226,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
struct sp_node *new2;
new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
ret = -ENOMEM;
goto out;
}
if (!n_new)
goto alloc_new;
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, n->end, end, mpol_new);
sp_insert(sp, n_new);
n->end = start;
sp_insert(sp, new2);
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
@@ -2244,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
}
if (new)
sp_insert(sp, new);
out:
mutex_unlock(&sp->mutex);
spin_unlock(&sp->lock);
ret = 0;
err_out:
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
return ret;
alloc_new:
spin_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
goto restart;
}
/**
@@ -2264,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
mutex_init(&sp->mutex);
spin_lock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
@@ -2330,16 +2504,60 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
mutex_lock(&p->mutex);
spin_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
mutex_unlock(&p->mutex);
spin_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
static bool __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
if (nr_node_ids > 1 && !numabalancing_override) {
printk(KERN_INFO "Enabling automatic NUMA balancing. "
"Configure with numa_balancing= or sysctl");
set_numabalancing_state(numabalancing_default);
}
}
static int __init setup_numabalancing(char *str)
{
int ret = 0;
if (!str)
goto out;
numabalancing_override = true;
if (!strcmp(str, "enable")) {
set_numabalancing_state(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
set_numabalancing_state(false);
ret = 1;
}
out:
if (!ret)
printk(KERN_WARNING "Unable to parse numa_balancing=\n");
return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
@@ -2355,13 +2573,22 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.v = { .preferred_node = nid, },
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_HIGH_MEMORY) {
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
@@ -2381,6 +2608,8 @@ void __init numa_policy_init(void)
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
check_numabalancing_enable();
}
/* Reset policy of current process to default */
@@ -2394,44 +2623,34 @@ void numa_default_policy(void)
*/
/*
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local"
[MPOL_LOCAL] = "local",
};
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
* @no_context: flag whether to "contextualize" the mempolicy
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
* if @no_context is true, save the input nodemask in w.user_nodemask in
* the returned mempolicy. This will be used to "clone" the mempolicy in
* a specific context [cpuset] at a later time. Used to parse tmpfs mpol
* mount option. Note that if 'static' or 'relative' mode flags were
* specified, the input nodemask will already have been saved. Saving
* it again is redundant, but safe.
*
* On success, returns 0, else 1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
unsigned short mode;
unsigned short uninitialized_var(mode_flags);
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
@@ -2442,7 +2661,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
@@ -2450,12 +2669,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
for (mode = 0; mode <= MPOL_LOCAL; mode++) {
for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
if (mode > MPOL_LOCAL)
if (mode >= MPOL_MAX)
goto out;
switch (mode) {
@@ -2476,7 +2695,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
nodes = node_states[N_HIGH_MEMORY];
nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
@@ -2519,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (IS_ERR(new))
goto out;
if (no_context) {
/* save for contextualization */
new->w.user_nodemask = nodes;
} else {
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
task_lock(current);
ret = mpol_set_nodemask(new, &nodes, scratch);
task_unlock(current);
} else
ret = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
if (ret) {
mpol_put(new);
goto out;
}
}
/*
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
if (mode != MPOL_PREFERRED)
new->v.nodes = nodes;
else if (nodelist)
new->v.preferred_node = first_node(nodes);
else
new->flags |= MPOL_F_LOCAL;
/*
* Save nodes for contextualization: this will be used to "clone"
* the mempolicy in a specific context [cpuset] at a later time.
*/
new->w.user_nodemask = nodes;
err = 0;
out:
@@ -2556,13 +2774,12 @@ out:
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
* @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
*
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
int l;
@@ -2588,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_PREFERRED:
nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL; /* pseudo-policy */
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
@@ -2596,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
if (no_context)
nodes = pol->w.user_nodemask;
else
nodes = pol->v.nodes;
nodes = pol->v.nodes;
break;
default:

파일 보기

@@ -35,9 +35,13 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/balloon_compaction.h>
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
#include "internal.h"
/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
putback_lru_page(page);
}
}
/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
*
* This function shall be used instead of putback_lru_pages(),
* whenever the isolated pageset has been built by isolate_migratepages_range()
*/
void putback_movable_pages(struct list_head *l)
{
struct page *page;
struct page *page2;
list_for_each_entry_safe(page, page2, l, lru) {
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
if (unlikely(balloon_page_movable(page)))
balloon_page_putback(page);
else
putback_lru_page(page);
}
}
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
swp_entry_t entry;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
goto out;
ptl = &mm->page_table_lock;
} else {
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
pud = pud_offset(pgd, addr);
if (!pud_present(*pud))
goto out;
pmd = pmd_offset(pud, addr);
if (pmd_trans_huge(*pmd))
goto out;
if (!pmd_present(*pmd))
goto out;
ptep = pte_offset_map(pmd, addr);
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode)
{
int expected_count;
int expected_count = 0;
void **pslot;
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != 1)
return -EAGAIN;
return 0;
return MIGRATEPAGE_SUCCESS;
}
spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
spin_unlock_irq(&mapping->tree_lock);
return 0;
return MIGRATEPAGE_SUCCESS;
}
/*
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
if (!mapping) {
if (page_count(page) != 1)
return -EAGAIN;
return 0;
return MIGRATEPAGE_SUCCESS;
}
spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
return 0;
return MIGRATEPAGE_SUCCESS;
}
/*
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
if (PageHuge(page))
if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
if (rc)
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
migrate_page_copy(newpage, page);
return 0;
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
if (rc)
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
/*
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
} while (bh != head);
return 0;
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
*
* Return value:
* < 0 - error code
* == 0 - success
* MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
else
rc = fallback_migrate_page(mapping, newpage, page, mode);
if (rc) {
if (rc != MIGRATEPAGE_SUCCESS) {
newpage->mapping = NULL;
} else {
if (remap_swapcache)
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
if (PageAnon(page)) {
/*
* Only page_lock_anon_vma() understands the subtleties of
* Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
anon_vma = page_get_anon_vma(page);
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
}
if (unlikely(balloon_page_movable(page))) {
/*
* A ballooned page does not need any special attention from
* physical to virtual reverse mapping procedures.
* Skip any attempt to unmap PTEs or to remap swap cache,
* in order to avoid burning cycles at rmap level, and perform
* the page migration right away (proteced by page lock).
*/
rc = balloon_page_migrate(newpage, page, mode);
goto uncharge;
}
/*
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +843,9 @@ skip_unmap:
put_anon_vma(anon_vma);
uncharge:
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
mem_cgroup_end_migration(mem, page, newpage,
(rc == MIGRATEPAGE_SUCCESS ||
rc == MIGRATEPAGE_BALLOON_SUCCESS));
unlock:
unlock_page(page);
out:
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
goto out;
rc = __unmap_and_move(page, newpage, force, offlining, mode);
if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
/*
* A ballooned page has been migrated already.
* Now, it's the time to wrap-up counters,
* handle the page back to Buddy and return.
*/
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
balloon_page_free(page);
return MIGRATEPAGE_SUCCESS;
}
out:
if (rc != -EAGAIN) {
/*
@@ -958,10 +1001,11 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
enum migrate_mode mode)
enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
case -EAGAIN:
retry++;
break;
case 0:
case MIGRATEPAGE_SUCCESS:
nr_succeeded++;
break;
default:
/* Permanent failure */
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
}
}
}
rc = 0;
rc = nr_failed + retry;
out:
if (nr_succeeded)
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
if (nr_failed)
count_vm_events(PGMIGRATE_FAIL, nr_failed);
trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
if (rc)
return rc;
return nr_failed + retry;
return rc;
}
int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
/* try again */
cond_resched();
break;
case 0:
case MIGRATEPAGE_SUCCESS:
goto out;
default:
rc = -EIO;
@@ -1139,7 +1187,8 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
(unsigned long)pm, 0, MIGRATE_SYNC);
(unsigned long)pm, 0, MIGRATE_SYNC,
MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
if (node < 0 || node >= MAX_NUMNODES)
goto out_pm;
if (!node_state(node, N_HIGH_MEMORY))
if (!node_state(node, N_MEMORY))
goto out_pm;
err = -EACCES;
@@ -1403,4 +1452,329 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns true if this is a safe migration target node for misplaced NUMA
* pages. Currently it only checks the watermarks which crude
*/
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
int nr_migrate_pages)
{
int z;
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable)
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) +
nr_migrate_pages,
0, 0))
continue;
return true;
}
return false;
}
static struct page *alloc_misplaced_dst_page(struct page *page,
unsigned long data,
int **result)
{
int nid = (int) data;
struct page *newpage;
newpage = alloc_pages_exact_node(nid,
(GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN) &
~GFP_IOFS, 0);
if (newpage)
page_xchg_last_nid(newpage, page_last_nid(page));
return newpage;
}
/*
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
* If a node is rate-limited then PTE NUMA updates are also rate-limited. However
* as it is faults that reset the window, pte updates will happen unconditionally
* if there has not been a fault since @pteupdate_interval_millisecs after the
* throttle window closed.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
/* Returns true if NUMA migration is currently rate limited */
bool migrate_ratelimited(int node)
{
pg_data_t *pgdat = NODE_DATA(node);
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
msecs_to_jiffies(pteupdate_interval_millisecs)))
return false;
if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
return false;
return true;
}
/* Returns true if the node is migrate rate-limited after the update */
bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
{
bool rate_limited = false;
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
spin_lock(&pgdat->numabalancing_migrate_lock);
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies +
msecs_to_jiffies(migrate_interval_millisecs);
}
if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
rate_limited = true;
else
pgdat->numabalancing_migrate_nr_pages += nr_pages;
spin_unlock(&pgdat->numabalancing_migrate_lock);
return rate_limited;
}
int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int ret = 0;
/* Avoid migrating to a node that is nearly full */
if (migrate_balanced_pgdat(pgdat, 1)) {
int page_lru;
if (isolate_lru_page(page)) {
put_page(page);
return 0;
}
/* Page is isolated */
ret = 1;
page_lru = page_is_file_cache(page);
if (!PageTransHuge(page))
inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
else
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru,
HPAGE_PMD_NR);
}
/*
* Page is either isolated or there is not enough space on the target
* node. If isolated, then it has taken a reference count and the
* callers reference can be safely dropped without the page
* disappearing underneath us during migration. Otherwise the page is
* not to be migrated but the callers reference should still be
* dropped so it does not leak.
*/
put_page(page);
return ret;
}
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
int migrate_misplaced_page(struct page *page, int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
int nr_remaining;
LIST_HEAD(migratepages);
/*
* Don't migrate pages that are mapped in multiple processes.
* TODO: Handle false sharing detection instead of this hammer
*/
if (page_mapcount(page) != 1) {
put_page(page);
goto out;
}
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
if (numamigrate_update_ratelimit(pgdat, 1)) {
put_page(page);
goto out;
}
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated)
goto out;
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages,
alloc_misplaced_dst_page,
node, false, MIGRATE_ASYNC,
MR_NUMA_MISPLACED);
if (nr_remaining) {
putback_lru_pages(&migratepages);
isolated = 0;
} else
count_vm_numa_event(NUMA_PAGE_MIGRATE);
BUG_ON(!list_empty(&migratepages));
out:
return isolated;
}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct vm_area_struct *vma,
pmd_t *pmd, pmd_t entry,
unsigned long address,
struct page *page, int node)
{
unsigned long haddr = address & HPAGE_PMD_MASK;
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
/*
* Don't migrate pages that are mapped in multiple processes.
* TODO: Handle false sharing detection instead of this hammer
*/
if (page_mapcount(page) != 1)
goto out_dropref;
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
goto out_dropref;
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
if (!new_page) {
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
goto out_dropref;
}
page_xchg_last_nid(new_page, page_last_nid(page));
isolated = numamigrate_isolate_page(pgdat, page);
/*
* Failing to isolate or a GUP pin prevents migration. The expected
* page count is 2. 1 for anonymous pages without a mapping and 1
* for the callers pin. If the page was isolated, the page will
* need to be put back on the LRU.
*/
if (!isolated || page_count(page) != 2) {
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
put_page(new_page);
if (isolated) {
putback_lru_page(page);
isolated = 0;
goto out;
}
goto out_keep_locked;
}
/* Prepare a page as a migration target */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
new_page->index = page->index;
migrate_page_copy(new_page, page);
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, entry))) {
spin_unlock(&mm->page_table_lock);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
SetPageActive(page);
if (TestClearPageUnevictable(new_page))
SetPageUnevictable(page);
mlock_migrate_page(page, new_page);
unlock_page(new_page);
put_page(new_page); /* Free it */
unlock_page(page);
putback_lru_page(page);
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
goto out;
}
/*
* Traditional migration needs to prepare the memcg charge
* transaction early to prevent the old page from being
* uncharged when installing migration entries. Here we can
* save the potential rollback and start the charge transfer
* only when migration is already known to end successfully.
*/
mem_cgroup_prepare_migration(page, new_page, &memcg);
entry = mk_pmd(new_page, vma->vm_page_prot);
entry = pmd_mknonnuma(entry);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(page);
/*
* Finish the charge transaction under the page table lock to
* prevent split_huge_page() from dividing up the charge
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
spin_unlock(&mm->page_table_lock);
unlock_page(new_page);
unlock_page(page);
put_page(page); /* Drop the rmap reference */
put_page(page); /* Drop the LRU isolation reference */
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
out:
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR);
return isolated;
out_dropref:
put_page(page);
out_keep_locked:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */

569
mm/mmap.c
파일 보기

@@ -31,6 +31,7 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -88,6 +89,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
*/
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
*/
unsigned long vm_memory_committed(void)
{
return percpu_counter_read_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
@@ -297,40 +312,88 @@ out:
return retval;
}
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
unsigned long max, subtree_gap;
max = vma->vm_start;
if (vma->vm_prev)
max -= vma->vm_prev->vm_end;
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
if (subtree_gap > max)
max = subtree_gap;
}
if (vma->vm_rb.rb_right) {
subtree_gap = rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
if (subtree_gap > max)
max = subtree_gap;
}
return max;
}
#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
{
int i = 0, j;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev)
printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
if (vma->vm_start < pend)
if (vma->vm_start < prev) {
printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
if (vma->vm_start > vma->vm_end)
printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
printk("vm_end %lx < vm_start %lx\n",
vma->vm_end, vma->vm_start);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
printk("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
i++;
pn = nd;
prev = vma->vm_start;
pend = vma->vm_end;
}
j = 0;
for (nd = pn; nd; nd = rb_prev(nd)) {
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
printk("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
}
static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
{
struct rb_node *nd;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
BUG_ON(vma != ignore &&
vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
}
if (i != j)
printk("backwards %d, forwards %d\n", j, i), i = 0;
return i;
}
void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
while (vma) {
struct anon_vma_chain *avc;
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
vma_unlock_anon_vma(vma);
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
}
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
if (i != mm->map_count) {
printk("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
printk("mm->highest_vm_end %lx, found %lx\n",
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count)
printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
if (i != mm->map_count) {
printk("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
BUG_ON(bug);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif
RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
* vma->vm_prev->vm_end values changed, without modifying the vma's position
* in the rbtree.
*/
static void vma_gap_update(struct vm_area_struct *vma)
{
/*
* As it turns out, RB_DECLARE_CALLBACKS() already created a callback
* function that does exacltly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
static inline void vma_rb_insert(struct vm_area_struct *vma,
struct rb_root *root)
{
/* All rb_subtree_gap values must be consistent prior to insertion */
validate_mm_rb(root, NULL);
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
* with the possible exception of the vma being erased.
*/
validate_mm_rb(root, vma);
/*
* Note rb_erase_augmented is a fairly large inline function,
* so make sure we instantiate it only once with our desired
* augmented rbtree callbacks.
*/
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
/* Update tracking information for the gap following the new vma. */
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
mm->highest_vm_end = vma->vm_end;
/*
* vma->vm_prev wasn't known when we followed the rbtree to find the
* correct insertion point for that vma. As a result, we could not
* update the vma vm_rb parents rb_subtree_gap values on the way down.
* So, we first insert the vma with a zero rb_subtree_gap value
* (to be consistent with what we did on the way down), and then
* immediately update the gap to the correct value. Finally we
* rebalance the rbtree after all augmented values have been set.
*/
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
vma->rb_subtree_gap = 0;
vma_gap_update(vma);
vma_rb_insert(vma, &mm->mm_rb);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -498,12 +631,12 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
struct vm_area_struct *next = vma->vm_next;
struct vm_area_struct *next;
prev->vm_next = next;
vma_rb_erase(vma, &mm->mm_rb);
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
}
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma) {
VM_BUG_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
anon_vma_lock(anon_vma);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
vma_interval_tree_remove(next, root);
}
vma->vm_start = start;
vma->vm_end = end;
if (start != vma->vm_start) {
vma->vm_start = start;
start_changed = true;
}
if (end != vma->vm_end) {
vma->vm_end = end;
end_changed = true;
}
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, insert);
} else {
if (start_changed)
vma_gap_update(vma);
if (end_changed) {
if (!next)
mm->highest_vm_end = end;
else if (!adjust_next)
vma_gap_update(next);
}
}
if (anon_vma) {
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
if (remove_next == 2) {
next = vma->vm_next;
next = vma->vm_next;
if (remove_next == 2)
goto again;
}
else if (next)
vma_gap_update(next);
else
mm->highest_vm_end = end;
}
if (insert && file)
uprobe_mmap(insert);
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
VM_NORESERVE, &user,
HUGETLB_ANONHUGE_INODE);
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
@@ -1335,7 +1488,11 @@ munmap_back:
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
@@ -1400,6 +1557,206 @@ unacct_error:
return error;
}
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
* We implement the search by looking for an rbtree node that
* immediately follows a suitable gap. That is,
* - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
* - gap_end = vma->vm_start >= info->low_limit + length;
* - gap_end - gap_start >= length
*/
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
return -ENOMEM;
high_limit = info->high_limit - length;
if (info->low_limit > high_limit)
return -ENOMEM;
low_limit = info->low_limit + length;
/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
goto check_highest;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
goto check_highest;
while (true) {
/* Visit left subtree if it looks promising */
gap_end = vma->vm_start;
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb);
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}
gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
if (gap_end >= low_limit && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
if (vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
if (right->rb_subtree_gap >= length) {
vma = right;
continue;
}
}
/* Go back up the rbtree to find next candidate node */
while (true) {
struct rb_node *prev = &vma->vm_rb;
if (!rb_parent(prev))
goto check_highest;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vma->vm_prev->vm_end;
gap_end = vma->vm_start;
goto check_current;
}
}
}
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
if (gap_start < info->low_limit)
gap_start = info->low_limit;
/* Adjust gap address to the desired alignment */
gap_start += (info->align_offset - gap_start) & info->align_mask;
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
return gap_start;
}
unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
if (info->low_limit > high_limit)
return -ENOMEM;
low_limit = info->low_limit + length;
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
if (gap_start <= high_limit)
goto found_highest;
/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
return -ENOMEM;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
return -ENOMEM;
while (true) {
/* Visit right subtree if it looks promising */
gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
if (right->rb_subtree_gap >= length) {
vma = right;
continue;
}
}
check_current:
/* Check if current node has a suitable gap */
gap_end = vma->vm_start;
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
if (vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb);
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}
/* Go back up the rbtree to find next candidate node */
while (true) {
struct rb_node *prev = &vma->vm_rb;
if (!rb_parent(prev))
return -ENOMEM;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_right) {
gap_start = vma->vm_prev ?
vma->vm_prev->vm_end : 0;
goto check_current;
}
}
}
found:
/* We found a suitable gap. Clip it with the original high_limit. */
if (gap_end > info->high_limit)
gap_end = info->high_limit;
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
return gap_end;
}
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long start_addr;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE)
return -ENOMEM;
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
if (len > mm->cached_hole_size) {
start_addr = addr = mm->free_area_cache;
} else {
start_addr = addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
}
full_search:
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
* Start a new search - just in case we missed
* some holes.
*/
if (start_addr != TASK_UNMAPPED_BASE) {
addr = TASK_UNMAPPED_BASE;
start_addr = addr;
mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
if (!vma || addr + len <= vma->vm_start) {
/*
* Remember the place where we stopped the search:
*/
mm->free_area_cache = addr + len;
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
#endif
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0, start_addr;
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
/* check if free_area_cache is useful for us */
if (len <= mm->cached_hole_size) {
mm->cached_hole_size = 0;
mm->free_area_cache = mm->mmap_base;
}
try_again:
/* either no address requested or can't fit in requested address hole */
start_addr = addr = mm->free_area_cache;
if (addr < len)
goto fail;
addr -= len;
do {
/*
* Lookup failure means no vma is above this address,
* else if new region fits below vma->vm_start,
* return with success:
*/
vma = find_vma(mm, addr);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr);
/* remember the largest hole we saw so far */
if (addr + mm->cached_hole_size < vma->vm_start)
mm->cached_hole_size = vma->vm_start - addr;
/* try just below the current vma->vm_start */
addr = vma->vm_start-len;
} while (len < vma->vm_start);
fail:
/*
* if hint left us with no space for the requested
* mapping then try again:
*
* Note: this is different with the case of bottomup
* which does the fully line-search, but we use find_vma
* here that causes some holes skipped.
*/
if (start_addr != mm->mmap_base) {
mm->free_area_cache = mm->mmap_base;
mm->cached_hole_size = 0;
goto try_again;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
@@ -1563,14 +1853,13 @@ fail:
* can happen with large stack limits and large mmap()
* allocations.
*/
mm->cached_hole_size = ~0UL;
mm->free_area_cache = TASK_UNMAPPED_BASE;
addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
/*
* Restore the topdown base:
*/
mm->free_area_cache = mm->mmap_base;
mm->cached_hole_size = ~0UL;
if (addr & ~PAGE_MASK) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
}
return addr;
}
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
* vma_lock_anon_vma() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
spin_lock(&vma->vm_mm->page_table_lock);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
vma->vm_mm->highest_vm_end = address;
spin_unlock(&vma->vm_mm->page_table_lock);
perf_event_mmap(vma);
}
}
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
* vma_lock_anon_vma() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
spin_lock(&vma->vm_mm->page_table_lock);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
spin_unlock(&vma->vm_mm->page_table_lock);
perf_event_mmap(vma);
}
}
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
vma_rb_erase(vma, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
if (vma)
if (vma) {
vma->vm_prev = prev;
vma_gap_update(vma);
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
* anon_vma->root->mutex. If some other vma in this mm shares
* anon_vma->root->rwsem. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
* anon_vma->root->mutex.
* anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
* anon_vma->root->mutex.
* anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))

파일 보기

@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
bool all_same_node = true;
int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
bool updated = false;
ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
if (!prot_numa) {
ptent = pte_modify(ptent, newprot);
updated = true;
} else {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
if (page) {
int this_nid = page_to_nid(page);
if (last_nid == -1)
last_nid = this_nid;
if (last_nid != this_nid)
all_same_node = false;
/* only check non-shared pages */
if (!pte_numa(oldpte) &&
page_mapcount(page) == 1) {
ptent = pte_mknuma(ptent);
updated = true;
}
}
}
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
if (dirty_accountable && pte_dirty(ptent))
if (dirty_accountable && pte_dirty(ptent)) {
ptent = pte_mkwrite(ptent);
updated = true;
}
if (updated)
pages++;
ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
*ret_all_same_node = all_same_node;
return pages;
}
static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
#ifdef CONFIG_NUMA_BALANCING
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
pmd_t *pmd)
{
spin_lock(&mm->page_table_lock);
set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
spin_unlock(&mm->page_table_lock);
}
#else
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
pmd_t *pmd)
{
BUG();
}
#endif /* CONFIG_NUMA_BALANCING */
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma->vm_mm, pmd);
else if (change_huge_pmd(vma, pmd, addr, newprot))
split_huge_page_pmd(vma, addr, pmd);
else if (change_huge_pmd(vma, pmd, addr, newprot,
prot_numa)) {
pages += HPAGE_PMD_NR;
continue;
}
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
dirty_accountable);
pages += change_pte_range(vma, pmd, addr, next, newprot,
dirty_accountable, prot_numa, &all_same_node);
/*
* If we are changing protections for NUMA hinting faults then
* set pmd_numa if the examined pages were all on the same
* node. This allows a regular PMD to be handled as one fault
* and effectively batches the taking of the PTL
*/
if (prot_numa && all_same_node)
change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
return pages;
}
static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
pgd_t *pgd, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
change_pmd_range(vma, pud, addr, next, newprot,
dirty_accountable);
pages += change_pmd_range(vma, pud, addr, next, newprot,
dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
return pages;
}
static void change_protection(struct vm_area_struct *vma,
static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
change_pud_range(vma, pgd, addr, next, newprot,
dirty_accountable);
pages += change_pud_range(vma, pgd, addr, next, newprot,
dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
return pages;
}
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long pages;
mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
mmu_notifier_invalidate_range_end(mm, start, end);
return pages;
}
int
@@ -213,12 +305,9 @@ success:
dirty_accountable = 1;
}
mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
else
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
mmu_notifier_invalidate_range_end(mm, start, end);
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
}
else {
} else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
newflags = vm_flags;
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {

파일 보기

@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
}
if (vma->anon_vma) {
anon_vma = vma->anon_vma;
anon_vma_lock(anon_vma);
anon_vma_lock_write(anon_vma);
}
}
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
need_flush = true;
continue;
} else if (!err) {
split_huge_page_pmd(vma->vm_mm, old_pmd);
split_huge_page_pmd(vma, old_addr, old_pmd);
}
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}

파일 보기

@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
return count;
}
static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
/*
* In free_area_init_core(), highmem zone's managed_pages is set to
* present_pages, and bootmem allocator doesn't allocate from highmem
* zones. So there's no need to recalculate managed_pages because all
* highmem pages will be managed by the buddy system. Here highmem
* zone also includes highmem movable zone.
*/
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
if (!is_highmem(z))
z->managed_pages = 0;
}
/**
* free_all_bootmem - release free pages to the buddy allocator
*
@@ -144,6 +160,11 @@ unsigned long __init free_low_memory_core_early(int nodeid)
*/
unsigned long __init free_all_bootmem(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat)
reset_node_lowmem_managed_pages(pgdat);
/*
* We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed

파일 보기

@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
*/
unsigned long vm_memory_committed(void)
{
return percpu_counter_read_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(num_physpages);

파일 보기

@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
/*
* compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
* @old_val: old oom_score_adj for compare
* @new_val: new oom_score_adj for swap
*
* Sets the oom_score_adj value for current to @new_val iff its present value is
* @old_val. Usually used to reinstate a previous value to prevent racing with
* userspacing tuning the value in the interim.
*/
void compare_swap_oom_score_adj(int old_val, int new_val)
{
struct sighand_struct *sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
if (current->signal->oom_score_adj == old_val)
current->signal->oom_score_adj = new_val;
trace_oom_score_adj_update(current);
spin_unlock_irq(&sighand->siglock);
}
/**
* test_set_oom_score_adj() - set current's oom_score_adj and return old value
* @new_val: new oom_score_adj value
*
* Sets the oom_score_adj value for current to @new_val with proper
* synchronization and returns the old value. Usually used to temporarily
* set a value, save the old value in the caller, and then reinstate it later.
*/
int test_set_oom_score_adj(int new_val)
{
struct sighand_struct *sighand = current->sighand;
int old_val;
spin_lock_irq(&sighand->siglock);
old_val = current->signal->oom_score_adj;
current->signal->oom_score_adj = new_val;
trace_oom_score_adj_update(current);
spin_unlock_irq(&sighand->siglock);
return old_val;
}
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
adj = p->signal->oom_score_adj;
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
*totalpages = total_swap_pages;
for_each_node_mask(nid, *nodemask)
*totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (!task->mm)
return OOM_SCAN_CONTINUE;
if (task->flags & PF_EXITING) {
/*
* If task is allocating a lot of memory and has been marked to be
* killed first if it triggers an oom, then select it.
*/
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
if (task->flags & PF_EXITING && !force_kill) {
/*
* If task is current and is in the process of releasing memory,
* allow the "kill" to set TIF_MEMDIE, which will allow it to
* access memory reserves. Otherwise, it may stall forever.
*
* The iteration isn't broken here, however, in case other
* threads are found to have already been oom killed.
* If this task is not being ptraced on exit, then wait for it
* to finish before killing some other task unnecessarily.
*/
if (task == current)
return OOM_SCAN_SELECT;
else if (!force_kill) {
/*
* If this task is not being ptraced on exit, then wait
* for it to finish before killing some other task
* unnecessarily.
*/
if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
return OOM_SCAN_ABORT;
}
if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
return OOM_SCAN_ABORT;
}
return OOM_SCAN_OK;
}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}
pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%d\n",
"oom_score_adj=%hd\n",
current->comm, gfp_mask, order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
spin_unlock(&zone_scan_lock);
}
/*
* Try to acquire the oom killer lock for all system zones. Returns zero if a
* parallel oom killing is taking place, otherwise locks all zones and returns
* non-zero.
*/
static int try_set_system_oom(void)
{
struct zone *zone;
int ret = 1;
spin_lock(&zone_scan_lock);
for_each_populated_zone(zone)
if (zone_is_oom_locked(zone)) {
ret = 0;
goto out;
}
for_each_populated_zone(zone)
zone_set_flag(zone, ZONE_OOM_LOCKED);
out:
spin_unlock(&zone_scan_lock);
return ret;
}
/*
* Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
* attempts or page faults may now recall the oom killer, if necessary.
*/
static void clear_system_oom(void)
{
struct zone *zone;
spin_lock(&zone_scan_lock);
for_each_populated_zone(zone)
zone_clear_flag(zone, ZONE_OOM_LOCKED);
spin_unlock(&zone_scan_lock);
}
/**
* out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
return;
/*
* If current has a pending SIGKILL, then automatically select it. The
* goal is to allow it to allocate so that it may quickly exit and free
* its memory.
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current)) {
if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -756,15 +671,16 @@ out:
/*
* The pagefault handler calls here because it is out of memory, so kill a
* memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
* oom killing is already in progress so do nothing. If a task is found with
* TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
* parallel oom killing is already in progress so do nothing.
*/
void pagefault_out_of_memory(void)
{
if (try_set_system_oom()) {
struct zonelist *zonelist = node_zonelist(first_online_node,
GFP_KERNEL);
if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
clear_system_oom();
clear_zonelist_oom(zonelist, GFP_KERNEL);
}
schedule_timeout_killable(1);
}

파일 보기

@@ -200,6 +200,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
x += zone_page_state(z, NR_FREE_PAGES) +
zone_reclaimable_pages(z) - z->dirty_balance_reserve;
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
* without swap) can bring down the dirtyable pages below
* the zone's dirty balance reserve and the above calculation
* will underflow. However we still want to add in nodes
* which are below threshold (negative values) to get a more
* accurate calculation but make sure that the total never
* underflows.
*/
if ((long)x < 0)
x = 0;
/*
* Make sure that the number of highmem pages is never larger
* than the number of the total dirtyable memory. This can only
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
dirty_balance_reserve;
x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
x -= min(x, dirty_balance_reserve);
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
* highmem zone can hold its share of dirty pages, so we don't
* care about vm_highmem_is_dirtyable here.
*/
return zone_page_state(zone, NR_FREE_PAGES) +
zone_reclaimable_pages(zone) -
zone->dirty_balance_reserve;
unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
zone_reclaimable_pages(zone);
/* don't allow this to underflow */
nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
return nr_pages;
}
/**
@@ -1069,7 +1084,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
}
/*
* After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
* After a task dirtied this many pages, balance_dirty_pages_ratelimited()
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1451,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
* @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1463,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
int ratelimit;
@@ -1484,6 +1497,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
*/
p = &__get_cpu_var(dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1507,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(mapping, current->nr_dirtied);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
void throttle_vm_writeout(gfp_t gfp_mask)
{

파일 보기

@@ -89,6 +89,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
[N_MEMORY] = { { [0] = 1UL } },
#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
/*
* NOTE:
* Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
* Instead, use {un}set_pageblock_isolate.
*/
void set_pageblock_migratetype(struct page *page, int migratetype)
{
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int nr_pages = 1 << order;
int bad = 0;
if (unlikely(compound_order(page) != order) ||
unlikely(!PageHead(page))) {
if (unlikely(compound_order(page) != order)) {
bad_page(page);
bad++;
}
@@ -523,7 +520,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- wli
* -- nyc
*/
static inline void __free_one_page(struct page *page,
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
reset_page_last_nid(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -667,11 +665,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
if (is_migrate_cma(mt))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
if (is_migrate_cma(mt))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
}
} while (--to_free && --batch_free && !list_empty(list));
}
__mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -730,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
/*
* Read access to zone->managed_pages is safe because it's unsigned long,
* but we still need to serialize writers. Currently all callers of
* __free_pages_bootmem() except put_page_bootmem() should only be used
* at boot time. So for shorter boot time, we shift the burden to
* put_page_bootmem() to serialize writers.
*/
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
@@ -745,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
set_page_count(p, 0);
}
page_zone(page)->managed_pages += 1 << order;
set_page_refcounted(page);
__free_pages(page, order);
}
@@ -780,7 +788,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- wli
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
@@ -1376,14 +1384,8 @@ void split_page(struct page *page, unsigned int order)
set_page_refcounted(page + i);
}
/*
* Similar to the split_page family of functions except that the page
* required at the given order and being isolated now to prevent races
* with parallel allocators
*/
int capture_free_page(struct page *page, int alloc_order, int migratetype)
static int __isolate_free_page(struct page *page, unsigned int order)
{
unsigned int order;
unsigned long watermark;
struct zone *zone;
int mt;
@@ -1391,27 +1393,23 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
BUG_ON(!PageBuddy(page));
zone = page_zone(page);
order = page_order(page);
mt = get_pageblock_migratetype(page);
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
if (mt != MIGRATE_ISOLATE) {
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
mt = get_pageblock_migratetype(page);
if (unlikely(mt != MIGRATE_ISOLATE))
__mod_zone_freepage_state(zone, -(1UL << order), mt);
if (alloc_order != order)
expand(zone, page, alloc_order, order,
&zone->free_area[order], migratetype);
/* Set the pageblock if the captured page is at least a pageblock */
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -1440,10 +1438,9 @@ int split_free_page(struct page *page)
unsigned int order;
int nr_pages;
BUG_ON(!PageBuddy(page));
order = page_order(page);
nr_pages = capture_free_page(page, order, 0);
nr_pages = __isolate_free_page(page, order);
if (!nr_pages)
return 0;
@@ -1641,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
#ifdef CONFIG_MEMORY_ISOLATION
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
{
if (unlikely(zone->nr_pageblock_isolate))
return zone->nr_pageblock_isolate * pageblock_nr_pages;
return 0;
}
#else
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
{
return 0;
}
#endif
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
@@ -1670,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
/*
* If the zone has MIGRATE_ISOLATE type free pages, we should consider
* it. nr_zone_isolate_freepages is never accurate so kswapd might not
* sleep although it could do so. But this is more desirable for memory
* hotplug than sleeping which can cause a livelock in the direct
* reclaim path.
*/
free_pages -= nr_zone_isolate_freepages(z);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
}
@@ -1692,7 +1667,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
*
* If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
* tasks mems_allowed, or node_states[N_MEMORY].)
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
@@ -1721,7 +1696,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :
&node_states[N_HIGH_MEMORY];
&node_states[N_MEMORY];
return allowednodes;
}
@@ -1871,7 +1846,7 @@ zonelist_scan:
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1892,8 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
@@ -1936,7 +1912,7 @@ zonelist_scan:
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
if (NUMA_BUILD && zlc_active &&
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
@@ -1962,11 +1938,11 @@ try_this_zone:
if (page)
break;
this_zone_full:
if (NUMA_BUILD)
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
@@ -2148,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
struct page *page = NULL;
if (!order)
return NULL;
@@ -2161,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration,
contended_compaction, &page);
contended_compaction);
current->flags &= ~PF_MEMALLOC;
/* If compaction captured a page, prep and use it */
if (page) {
prep_new_page(page, order, gfp_mask);
goto got_page;
}
if (*did_some_progress != COMPACT_SKIPPED) {
struct page *page;
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
@@ -2180,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page) {
got_page:
preferred_zone->compact_blockskip_flush = false;
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
@@ -2266,7 +2235,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL;
/* After successful reclaim, reconsider all zones for allocation */
if (NUMA_BUILD)
if (IS_ENABLED(CONFIG_NUMA))
zlc_clear_zones_full(zonelist);
retry:
@@ -2412,12 +2381,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2465,7 @@ rebalance:
* system then fail the allocation instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
(gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
(gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* Try direct reclaim and then allocating */
@@ -2595,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2613,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
/*
* Will only have any effect when __GFP_KMEMCG is set. This is
* verified in the (always inline) callee
*/
if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
return NULL;
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
@@ -2648,6 +2627,8 @@ out:
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2700,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
* __free_memcg_kmem_pages and free_memcg_kmem_pages will free
* pages allocated with __GFP_KMEMCG.
*
* Those pages are accounted to a particular memcg, embedded in the
* corresponding page_cgroup. To avoid adding a hit in the allocator to search
* for that information only to find out that it is NULL for users who have no
* interest in that whatsoever, we provide these functions.
*
* The caller knows better which flags it relies on.
*/
void __free_memcg_kmem_pages(struct page *page, unsigned int order)
{
memcg_kmem_uncharge_pages(page, order);
__free_pages(page, order);
}
void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
}
}
static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
if (addr) {
@@ -2818,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void)
static inline void show_node(struct zone *zone)
{
if (NUMA_BUILD)
if (IS_ENABLED(CONFIG_NUMA))
printk("Node %d ", zone_to_nid(zone));
}
@@ -2876,6 +2882,31 @@ out:
#define K(x) ((x) << (PAGE_SHIFT-10))
static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
[MIGRATE_UNMOVABLE] = 'U',
[MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_MOVABLE] = 'M',
[MIGRATE_RESERVE] = 'R',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
[MIGRATE_ISOLATE] = 'I',
};
char tmp[MIGRATE_TYPES + 1];
char *p = tmp;
int i;
for (i = 0; i < MIGRATE_TYPES; i++) {
if (type & (1 << i))
*p++ = types[i];
}
*p = '\0';
printk("(%s) ", tmp);
}
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -2950,6 +2981,7 @@ void show_free_areas(unsigned int filter)
" isolated(anon):%lukB"
" isolated(file):%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
" dirty:%lukB"
" writeback:%lukB"
@@ -2979,6 +3011,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_ISOLATED_ANON)),
K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_FILE_DIRTY)),
K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3004,6 +3037,7 @@ void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
unsigned char types[MAX_ORDER];
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
@@ -3012,12 +3046,24 @@ void show_free_areas(unsigned int filter)
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
nr[order] = zone->free_area[order].nr_free;
struct free_area *area = &zone->free_area[order];
int type;
nr[order] = area->nr_free;
total += nr[order] << order;
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
if (!list_empty(&area->free_list[type]))
types[order] |= 1 << type;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++)
for (order = 0; order < MAX_ORDER; order++) {
printk("%lu*%lukB ", nr[order], K(1UL) << order);
if (nr[order])
show_migration_types(types[order]);
}
printk("= %lukB\n", K(total));
}
@@ -3194,7 +3240,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
return node;
}
for_each_node_state(n, N_HIGH_MEMORY) {
for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -3336,7 +3382,7 @@ static int default_zonelist_order(void)
* local memory, NODE_ORDER may be suitable.
*/
average_size = total_size /
(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
(nodes_weight(node_states[N_MEMORY]) + 1);
for_each_online_node(nid) {
low_kmem_size = 0;
total_size = 0;
@@ -3826,6 +3872,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
reset_page_last_nid(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4432,6 +4479,26 @@ void __init set_pageblock_order(void)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
unsigned long present_pages)
{
unsigned long pages = spanned_pages;
/*
* Provide a more accurate estimation if there are holes within
* the zone and SPARSEMEM is in use. If there are holes within the
* zone, each populated memory region may cost us one or two extra
* memmap pages due to alignment because memmap pages for each
* populated regions may not naturally algined on page boundary.
* So the (present_pages >> 4) heuristic is a tradeoff for that.
*/
if (spanned_pages > present_pages + (present_pages >> 4) &&
IS_ENABLED(CONFIG_SPARSEMEM))
pages = present_pages;
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -4449,54 +4516,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
unsigned long size, realsize, freesize, memmap_pages;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
* Adjust realsize so that it accounts for how much memory
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages =
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
memmap_pages = calc_memmap_size(size, realsize);
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds realsize %lu\n",
zone_names[j], memmap_pages, realsize);
" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
/* Account for reserved pages */
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += realsize;
nr_all_pages += realsize;
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
zone->spanned_pages = size;
zone->present_pages = realsize;
zone->present_pages = freesize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
@@ -4687,7 +4767,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
* Populate N_HIGH_MEMORY for calculating usable_nodes.
* Populate N_MEMORY for calculating usable_nodes.
*/
static unsigned long __init early_calculate_totalpages(void)
{
@@ -4700,7 +4780,7 @@ static unsigned long __init early_calculate_totalpages(void)
totalpages += pages;
if (pages)
node_set_state(nid, N_HIGH_MEMORY);
node_set_state(nid, N_MEMORY);
}
return totalpages;
}
@@ -4717,9 +4797,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
/*
* If movablecore was specified, calculate what size of
@@ -4754,7 +4834,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
for_each_node_state(nid, N_HIGH_MEMORY) {
for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
/*
@@ -4846,23 +4926,27 @@ restart:
out:
/* restore the node_state */
node_states[N_HIGH_MEMORY] = saved_node_state;
node_states[N_MEMORY] = saved_node_state;
}
/* Any regular memory on that node ? */
static void __init check_for_regular_memory(pg_data_t *pgdat)
/* Any regular or high memory on that node ? */
static void check_for_memory(pg_data_t *pgdat, int nid)
{
#ifdef CONFIG_HIGHMEM
enum zone_type zone_type;
for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
if (N_MEMORY == N_NORMAL_MEMORY)
return;
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (zone->present_pages) {
node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
node_set_state(nid, N_HIGH_MEMORY);
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
}
#endif
}
/**
@@ -4945,8 +5029,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_HIGH_MEMORY);
check_for_regular_memory(pgdat);
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
}
@@ -5174,10 +5258,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -5505,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
pfn = pfn - zone->zone_start_pfn;
pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
@@ -5575,7 +5655,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
* MIGRATE_MOVABLE block might include unmovable pages. It means you can't
* expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
int mt;
@@ -5610,6 +5691,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
continue;
}
/*
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
if (skip_hwpoisoned_pages && PageHWPoison(page))
continue;
if (!PageLRU(page))
found++;
/*
@@ -5652,7 +5740,7 @@ bool is_pageblock_removable_nolock(struct page *page)
zone->zone_start_pfn + zone->spanned_pages <= pfn)
return false;
return !has_unmovable_pages(zone, page, 0);
return !has_unmovable_pages(zone, page, 0, true);
}
#ifdef CONFIG_CMA
@@ -5679,7 +5767,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned int tries = 0;
int ret = 0;
migrate_prep_local();
migrate_prep();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
@@ -5707,61 +5795,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
ret = migrate_pages(&cc->migratepages,
alloc_migrate_target,
0, false, MIGRATE_SYNC);
0, false, MIGRATE_SYNC,
MR_CMA);
}
putback_lru_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret > 0 ? 0 : ret;
}
/*
* Update zone's cma pages counter used for watermark level calculation.
*/
static inline void __update_cma_watermarks(struct zone *zone, int count)
{
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
zone->min_cma_pages += count;
spin_unlock_irqrestore(&zone->lock, flags);
setup_per_zone_wmarks();
}
/*
* Trigger memory pressure bump to reclaim some pages in order to be able to
* allocate 'count' pages in single page units. Does similar work as
*__alloc_pages_slowpath() function.
*/
static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zonelist *zonelist = node_zonelist(0, gfp_mask);
int did_some_progress = 0;
int order = 1;
/*
* Increase level of watermarks to force kswapd do his job
* to stabilise at new watermark level.
*/
__update_cma_watermarks(zone, count);
/* Obey watermarks as if the page was being allocated */
while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
NULL);
if (!did_some_progress) {
/* Exhausted what can be done so it's blamo time */
out_of_memory(zonelist, gfp_mask, order, NULL, false);
}
}
/* Restore original watermark levels. */
__update_cma_watermarks(zone, -count);
return count;
}
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -5785,7 +5826,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype)
{
struct zone *zone = page_zone(pfn_to_page(start));
unsigned long outer_start, outer_end;
int ret = 0, order;
@@ -5823,7 +5863,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype);
pfn_max_align_up(end), migratetype,
false);
if (ret)
return ret;
@@ -5862,18 +5903,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end)) {
if (test_pages_isolated(outer_start, end, false)) {
pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
outer_start, end);
ret = -EBUSY;
goto done;
}
/*
* Reclaim enough pages to make sure that contiguous allocation
* will not starve the system.
*/
__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
/* Grab isolated pages from freelists. */
outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5896,8 +5932,15 @@ done:
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
for (; nr_pages--; ++pfn)
__free_page(pfn_to_page(pfn));
unsigned int count = 0;
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
count += page_count(page) != 1;
__free_page(page);
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
#endif
@@ -5931,7 +5974,6 @@ void __meminit zone_pcp_update(struct zone *zone)
}
#endif
#ifdef CONFIG_MEMORY_HOTREMOVE
void zone_pcp_reset(struct zone *zone)
{
unsigned long flags;
@@ -5951,6 +5993,7 @@ void zone_pcp_reset(struct zone *zone)
local_irq_restore(flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.
*/
@@ -5977,6 +6020,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
continue;
}
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
SetPageReserved(page);
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
@@ -5987,8 +6040,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
__mod_zone_page_state(zone, NR_FREE_PAGES,
- (1UL << order));
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);

파일 보기

@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
offline_page_cgroup(mn->start_pfn,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
if (mem_cgroup_disabled())
return;
for_each_node_state(nid, N_HIGH_MEMORY) {
for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
start_pfn = node_start_pfn(nid);

파일 보기

@@ -8,29 +8,7 @@
#include <linux/memory.h>
#include "internal.h"
/* called while holding zone->lock */
static void set_pageblock_isolate(struct page *page)
{
if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
return;
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
page_zone(page)->nr_pageblock_isolate++;
}
/* called while holding zone->lock */
static void restore_pageblock_isolate(struct page *page, int migratetype)
{
struct zone *zone = page_zone(page);
if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
return;
BUG_ON(zone->nr_pageblock_isolate <= 0);
set_pageblock_migratetype(page, migratetype);
zone->nr_pageblock_isolate--;
}
int set_migratetype_isolate(struct page *page)
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -66,7 +44,8 @@ int set_migratetype_isolate(struct page *page)
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
if (!has_unmovable_pages(zone, page, arg.pages_found))
if (!has_unmovable_pages(zone, page, arg.pages_found,
skip_hwpoisoned_pages))
ret = 0;
/*
@@ -79,7 +58,7 @@ out:
unsigned long nr_pages;
int migratetype = get_pageblock_migratetype(page);
set_pageblock_isolate(page);
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -102,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
goto out;
nr_pages = move_freepages_block(zone, page, migratetype);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
restore_pageblock_isolate(page, migratetype);
set_pageblock_migratetype(page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -134,7 +113,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* Returns 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype)
unsigned migratetype, bool skip_hwpoisoned_pages)
{
unsigned long pfn;
unsigned long undo_pfn;
@@ -147,7 +126,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (page && set_migratetype_isolate(page)) {
if (page &&
set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
undo_pfn = pfn;
goto undo;
}
@@ -190,7 +170,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Returns 1 if all pages in the range are isolated.
*/
static int
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
struct page *page;
@@ -220,6 +201,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
else if (page_count(page) == 0 &&
get_freepage_migratetype(page) == MIGRATE_ISOLATE)
pfn += 1;
else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
/*
* The HWPoisoned page may be not in buddy
* system, and page_count() is not 0.
*/
pfn++;
continue;
}
else
break;
}
@@ -228,7 +217,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
return 1;
}
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, flags;
struct page *page;
@@ -251,7 +241,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
/* Check all pages are free or Marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}

파일 보기

@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
split_huge_page_pmd(walk->mm, pmd);
split_huge_page_pmd_mm(walk->mm, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);

파일 보기

@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
if (!chunk)
return;
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
kfree(chunk);
pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}
/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
if (!str)
return -EINVAL;
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK

파일 보기

@@ -12,8 +12,8 @@
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
* Only sets the access flags (dirty, accessed, and
* writable). Furthermore, we know it always gets set to a "more
* Only sets the access flags (dirty, accessed), as well as write
* permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
* instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
flush_tlb_page(vma, address);
flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pte_t pte;
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
flush_tlb_page(vma, address);
if (pte_accessible(pte))
flush_tlb_page(vma, address);
return pte;
}
#endif

134
mm/rmap.c
파일 보기

@@ -24,7 +24,7 @@
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
* anon_vma->mutex
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*/
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
* Synchronize against page_lock_anon_vma() such that
* Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
* mutex_trylock() from page_lock_anon_vma(). This orders:
* down_read_trylock() from page_lock_anon_vma_read(). This orders:
*
* page_lock_anon_vma() VS put_anon_vma()
* mutex_trylock() atomic_dec_and_test()
* page_lock_anon_vma_read() VS put_anon_vma()
* down_read_trylock() atomic_dec_and_test()
* LOCK MB
* atomic_read() mutex_is_locked()
* atomic_read() rwsem_is_locked()
*
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
if (mutex_is_locked(&anon_vma->root->mutex)) {
anon_vma_lock(anon_vma);
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock(anon_vma);
}
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
* optimistically looked up an anon_vma in page_lock_anon_vma()
* optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma;
}
anon_vma_lock(anon_vma);
anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
struct anon_vma *new_root = anon_vma->root;
if (new_root != root) {
if (WARN_ON_ONCE(root))
mutex_unlock(&root->mutex);
up_write(&root->rwsem);
root = new_root;
mutex_lock(&root->mutex);
down_write(&root->rwsem);
}
return root;
}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
if (root)
mutex_unlock(&root->mutex);
up_write(&root->rwsem);
}
/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
anon_vma_lock(anon_vma);
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma_unlock(anon_vma);
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
/*
* Iterate the list once more, it now only contains empty and unlinked
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
* needing to acquire the anon_vma->root->mutex.
* needing to write-acquire the anon_vma->root->rwsem.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
mutex_init(&anon_vma->mutex);
init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT;
}
@@ -442,7 +442,7 @@ out:
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
struct anon_vma *page_lock_anon_vma(struct page *page)
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
if (mutex_trylock(&root_anon_vma->mutex)) {
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
if (!page_mapped(page)) {
mutex_unlock(&root_anon_vma->mutex);
up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
/* we pinned the anon_vma, its safe to sleep */
rcu_read_unlock();
anon_vma_lock(anon_vma);
anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
* we'll deadlock on the anon_vma_lock() recursion.
* we'll deadlock on the anon_vma_lock_write() recursion.
*/
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
@@ -504,9 +504,9 @@ out:
return anon_vma;
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
}
/*
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return address;
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
goto out;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
goto out;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
pmd = NULL;
out:
return pmd;
}
/*
* Check that @page is mapped at @address into @mm.
*
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp, int sync)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
goto check;
}
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
pmd = mm_find_pmd(mm, address);
if (!pmd)
return NULL;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return NULL;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return NULL;
if (pmd_trans_huge(*pmd))
return NULL;
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
struct anon_vma_chain *avc;
int referenced = 0;
anon_vma = page_lock_anon_vma(page);
anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return referenced;
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
break;
}
page_unlock_anon_vma(anon_vma);
page_unlock_anon_vma_read(anon_vma);
return referenced;
}
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
* containing the swap entry, but page not yet written to swap.
*
* And we can skip it on file pages, so long as the filesystem
* participates in dirty tracking; but need to catch shm and tmpfs
* and ramfs pages which have been modified since creation by read
* fault.
* participates in dirty tracking (note that this is not only an
* optimization but also solves problems caused by dirty flag in
* storage key getting set by a write from inside kernel); but need to
* catch shm and tmpfs and ramfs pages which have been modified since
* creation by read fault.
*
* Note that mapping must be decided above, before decrementing
* mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
if (PageAnon(page))
dec_mm_counter(mm, MM_ANONPAGES);
else
dec_mm_counter(mm, MM_FILEPAGES);
if (!PageHuge(page)) {
if (PageAnon(page))
dec_mm_counter(mm, MM_ANONPAGES);
else
dec_mm_counter(mm, MM_FILEPAGES);
}
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
@@ -1299,7 +1315,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
* we now hold anon_vma->mutex or mapping->i_mmap_mutex.
* we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
struct vm_area_struct *vma, struct page *check_page)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (end > vma->vm_end)
end = vma->vm_end;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return ret;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return ret;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
pmd = mm_find_pmd(mm, address);
if (!pmd)
return ret;
mmun_start = address;
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
anon_vma = page_lock_anon_vma(page);
anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return ret;
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
break;
}
page_unlock_anon_vma(anon_vma);
page_unlock_anon_vma_read(anon_vma);
return ret;
}
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
int ret = SWAP_AGAIN;
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma()
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_sem. Users without mmap_sem are required to
* take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
anon_vma_lock(anon_vma);
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
anon_vma_unlock(anon_vma);
anon_vma_unlock_read(anon_vma);
return ret;
}

파일 보기

@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
if (!mpol || mpol->mode == MPOL_DEFAULT)
return; /* show nothing */
mpol_to_str(buffer, sizeof(buffer), mpol, 1);
mpol_to_str(buffer, sizeof(buffer), mpol);
seq_printf(seq, ",mpol=%s", buffer);
}
@@ -909,26 +909,9 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
spol = mpol_cond_copy(&mpol,
mpol_shared_policy_lookup(&info->policy, index));
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
/* Bias interleave by inode number to distribute better across nodes */
pvma.vm_pgoff = index + info->vfs_inode.i_ino;
pvma.vm_ops = NULL;
pvma.vm_policy = spol;
return swapin_readahead(swap, gfp, &pvma, 0);
}
static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
@@ -937,10 +920,33 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
/*
* alloc_page_vma() will drop the shared policy reference
*/
return alloc_page_vma(gfp, &pvma, 0);
page = swapin_readahead(swap, gfp, &pvma, 0);
/* Drop reference taken by mpol_shared_policy_lookup() */
mpol_cond_put(pvma.vm_policy);
return page;
}
static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
/* Bias interleave by inode number to distribute better across nodes */
pvma.vm_pgoff = index + info->vfs_inode.i_ino;
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
page = alloc_page_vma(gfp, &pvma, 0);
/* Drop reference taken by mpol_shared_policy_lookup() */
mpol_cond_put(pvma.vm_policy);
return page;
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
/*
* llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
*/
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pgoff_t index, pgoff_t end, int whence)
{
struct page *page;
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
bool done = false;
int i;
pagevec_init(&pvec, 0);
pvec.nr = 1; /* start small: we may be there already */
while (!done) {
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
pvec.nr, pvec.pages, indices);
if (!pvec.nr) {
if (whence == SEEK_DATA)
index = end;
break;
}
for (i = 0; i < pvec.nr; i++, index++) {
if (index < indices[i]) {
if (whence == SEEK_HOLE) {
done = true;
break;
}
index = indices[i];
}
page = pvec.pages[i];
if (page && !radix_tree_exceptional_entry(page)) {
if (!PageUptodate(page))
page = NULL;
}
if (index >= end ||
(page && whence == SEEK_DATA) ||
(!page && whence == SEEK_HOLE)) {
done = true;
break;
}
}
shmem_deswap_pagevec(&pvec);
pagevec_release(&pvec);
pvec.nr = PAGEVEC_SIZE;
cond_resched();
}
return index;
}
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
pgoff_t start, end;
loff_t new_offset;
if (whence != SEEK_DATA && whence != SEEK_HOLE)
return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
mutex_lock(&inode->i_mutex);
/* We're holding i_mutex so we can access i_size directly */
if (offset < 0)
offset = -EINVAL;
else if (offset >= inode->i_size)
offset = -ENXIO;
else {
start = offset >> PAGE_CACHE_SHIFT;
end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
new_offset = shmem_seek_hole_data(mapping, start, end, whence);
new_offset <<= PAGE_CACHE_SHIFT;
if (new_offset > offset) {
if (new_offset < inode->i_size)
offset = new_offset;
else if (whence == SEEK_DATA)
offset = -ENXIO;
else
offset = inode->i_size;
}
}
if (offset >= 0 && offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
mutex_unlock(&inode->i_mutex);
return offset;
}
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -2367,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (!gid_valid(sbinfo->gid))
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
if (mpol_parse_str(value, &sbinfo->mpol, 1))
if (mpol_parse_str(value, &sbinfo->mpol))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2580,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = generic_file_llseek,
.llseek = shmem_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,

381
mm/slab.c
파일 보기

@@ -87,7 +87,6 @@
*/
#include <linux/slab.h>
#include "slab.h"
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
@@ -128,6 +127,8 @@
#include "internal.h"
#include "slab.h"
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
*/
static bool pfmemalloc_active __read_mostly;
/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK (SLAB_RED_ZONE | \
SLAB_POISON | SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif
/*
* kmem_bufctl_t:
*
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
#undef CACHE
};
static struct arraycache_init initarray_cache __initdata =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
static struct kmem_cache kmem_cache_boot = {
.nodelists = kmem_cache_nodelists,
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
}
}
static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
{
struct kmem_list3 *l3;
l3 = cachep->nodelists[q];
if (!l3)
return;
slab_set_lock_classes(cachep, &on_slab_l3_key,
&on_slab_alc_key, q);
}
static inline void on_slab_lock_classes(struct kmem_cache *cachep)
{
int node;
VM_BUG_ON(OFF_SLAB(cachep));
for_each_node(node)
on_slab_lock_classes_node(cachep, node);
}
static inline void init_lock_keys(void)
{
int node;
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
{
}
static inline void on_slab_lock_classes(struct kmem_cache *cachep)
{
}
static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
{
}
static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
}
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
free_alien_cache(alien);
if (cachep->flags & SLAB_DEBUG_OBJECTS)
slab_set_debugobj_lock_classes_node(cachep, node);
else if (!OFF_SLAB(cachep) &&
!(cachep->flags & SLAB_DESTROY_BY_RCU))
on_slab_lock_classes_node(cachep, node);
}
init_node_lock_keys(node);
@@ -1576,29 +1587,34 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
}
}
/*
* The memory after the last cpu cache pointer is used for the
* the nodelists pointer.
*/
static void setup_nodelists_pointer(struct kmem_cache *cachep)
{
cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
}
/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
*/
void __init kmem_cache_init(void)
{
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
int i;
int order;
int node;
kmem_cache = &kmem_cache_boot;
setup_nodelists_pointer(kmem_cache);
if (num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++) {
for (i = 0; i < NUM_INIT_LISTS; i++)
kmem_list3_init(&initkmem_list3[i]);
if (i < MAX_NUMNODES)
kmem_cache->nodelists[i] = NULL;
}
set_up_list3s(kmem_cache, CACHE_CACHE);
/*
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
node = numa_mem_id();
/* 1) create the kmem_cache */
INIT_LIST_HEAD(&slab_caches);
list_add(&kmem_cache->list, &slab_caches);
kmem_cache->colour_off = cache_line_size();
kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
nr_node_ids * sizeof(struct kmem_list3 *);
kmem_cache->object_size = kmem_cache->size;
kmem_cache->size = ALIGN(kmem_cache->object_size,
cache_line_size());
kmem_cache->reciprocal_buffer_size =
reciprocal_value(kmem_cache->size);
for (order = 0; order < MAX_ORDER; order++) {
cache_estimate(order, kmem_cache->size,
cache_line_size(), 0, &left_over, &kmem_cache->num);
if (kmem_cache->num)
break;
}
BUG_ON(!kmem_cache->num);
kmem_cache->gfporder = order;
kmem_cache->colour = left_over / kmem_cache->colour_off;
kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
nr_node_ids * sizeof(struct kmem_list3 *),
SLAB_HWCACHE_ALIGN);
list_add(&kmem_cache->list, &slab_caches);
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
* bug.
*/
sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
__kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
if (INDEX_AC != INDEX_L3) {
sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
__kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
}
if (INDEX_AC != INDEX_L3)
sizes[INDEX_L3].cs_cachep =
create_kmalloc_cache(names[INDEX_L3].name,
sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
slab_early_init = 0;
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
if (!sizes->cs_cachep) {
sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
sizes->cs_cachep->name = names->name;
sizes->cs_cachep->size = sizes->cs_size;
sizes->cs_cachep->object_size = sizes->cs_size;
sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
__kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
list_add(&sizes->cs_cachep->list, &slab_caches);
}
if (!sizes->cs_cachep)
sizes->cs_cachep = create_kmalloc_cache(names->name,
sizes->cs_size, ARCH_KMALLOC_FLAGS);
#ifdef CONFIG_ZONE_DMA
sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
sizes->cs_dmacachep->name = names->name_dma;
sizes->cs_dmacachep->size = sizes->cs_size;
sizes->cs_dmacachep->object_size = sizes->cs_size;
sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
__kmem_cache_create(sizes->cs_dmacachep,
ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
list_add(&sizes->cs_dmacachep->list, &slab_caches);
sizes->cs_dmacachep = create_kmalloc_cache(
names->name_dma, sizes->cs_size,
SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
#endif
sizes++;
names++;
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page + i);
}
memcg_bind_pages(cachep, cachep->gfporder);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
__ClearPageSlab(page);
page++;
}
memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
free_pages((unsigned long)addr, cachep->gfporder);
free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
if (slab_state == DOWN) {
/*
* Note: the first kmem_cache_create must create the cache
* Note: Creation of first cache (kmem_cache).
* The setup_list3s is taken care
* of by the caller of __kmem_cache_create
*/
cachep->array[smp_processor_id()] = &initarray_generic.cache;
slab_state = PARTIAL;
} else if (slab_state == PARTIAL) {
/*
* Note: the second kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
/*
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
* the first cache, then we need to set up all its list3s,
* the second cache, then we need to set up all its list3s,
* otherwise the creation of further caches will BUG().
*/
set_up_list3s(cachep, SIZE_AC);
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
else
slab_state = PARTIAL_ARRAYCACHE;
} else {
/* Remaining boot caches */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
/**
* __kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
* @cachep: cache management descriptor
* @flags: SLAB flags
* @ctor: A constructor for the objects.
*
* Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a int, but can be interrupted.
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (flags & SLAB_DESTROY_BY_RCU)
BUG_ON(flags & SLAB_POISON);
#endif
/*
* Always checks flags, a caller might be expecting debug support which
* isn't available.
*/
BUG_ON(flags & ~CREATE_MASK);
/*
* Check that size is in terms of words. This is needed to avoid
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size &= ~(BYTES_PER_WORD - 1);
}
/* calculate the final buffer alignment: */
/* 1) arch recommendation: can be overridden for debug */
if (flags & SLAB_HWCACHE_ALIGN) {
/*
* Default alignment: as specified by the arch code. Except if
* an object is really small, then squeeze multiple objects into
* one cacheline.
*/
ralign = cache_line_size();
while (size <= ralign / 2)
ralign /= 2;
} else {
ralign = BYTES_PER_WORD;
}
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size &= ~(REDZONE_ALIGN - 1);
}
/* 2) arch mandated alignment */
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
}
/* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
gfp = GFP_NOWAIT;
cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
setup_nodelists_pointer(cachep);
#if DEBUG
/*
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
slab_set_debugobj_lock_classes(cachep);
}
} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
on_slab_lock_classes(cachep);
return 0;
}
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (slab_should_failslab(cachep, flags))
return NULL;
cachep = memcg_kmem_get_cache(cachep, flags);
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (slab_should_failslab(cachep, flags))
return NULL;
cachep = memcg_kmem_get_cache(cachep, flags);
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
}
EXPORT_SYMBOL(kfree);
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
return cachep->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
}
/* Always called with the slab_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return alloc_kmemlist(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
int ret;
struct kmem_cache *c = NULL;
int i = 0;
ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (slab_state < FULL)
return ret;
if ((ret < 0) || !is_root_cache(cachep))
return ret;
VM_BUG_ON(!mutex_is_locked(&slab_mutex));
for_each_memcg_cache_index(i) {
c = cache_from_memcg(cachep, i);
if (c)
/* return value determined by the parent cache only */
__do_tune_cpucache(c, limit, batchcount, shared, gfp);
}
return ret;
}
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
int limit, shared;
int limit = 0;
int shared = 0;
int batchcount = 0;
if (!is_root_cache(cachep)) {
struct kmem_cache *root = memcg_root_cache(cachep);
limit = root->limit;
shared = root->shared;
batchcount = root->batchcount;
}
if (limit && shared && batchcount)
goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (limit > 32)
limit = 32;
#endif
err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
batchcount = (limit + 1) / 2;
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -4276,54 +4275,8 @@ out:
}
#ifdef CONFIG_SLABINFO
static void print_slabinfo_header(struct seq_file *m)
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
/*
* Output format version, so at least we can change it
* without _too_ many complaints.
*/
#if STATS
seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
seq_puts(m, "slabinfo - version: 2.1\n");
#endif
seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
"<objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
"<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
mutex_lock(&slab_mutex);
if (!n)
print_slabinfo_header(m);
return seq_list_start(&slab_caches, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
static void s_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
static int s_show(struct seq_file *m, void *p)
{
struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
if (error)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
name, active_objs, num_objs, cachep->size,
cachep->num, (1 << cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
cachep->limit, cachep->batchcount, cachep->shared);
seq_printf(m, " : slabdata %6lu %6lu %6lu",
active_slabs, num_slabs, shared_avail);
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
sinfo->num_slabs = num_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;
sinfo->shared = cachep->shared;
sinfo->objects_per_slab = cachep->num;
sinfo->cache_order = cachep->gfporder;
}
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
{
#if STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
allochit, allocmiss, freehit, freemiss);
}
#endif
seq_putc(m, '\n');
return 0;
}
/*
* slabinfo_op - iterator that generates /proc/slabinfo
*
* Output layout:
* cache-name
* num-active-objs
* total-objs
* object size
* num-active-slabs
* total-slabs
* num-pages-per-slab
* + further values on SMP and with statistics enabled
*/
static const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
#define MAX_SLABINFO_WRITE 128
/**
* slabinfo_write - Tuning for the slab allocator
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
return res;
}
static int slabinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &slabinfo_op);
}
static const struct file_operations proc_slabinfo_operations = {
.open = slabinfo_open,
.read = seq_read,
.write = slabinfo_write,
.llseek = seq_lseek,
.release = seq_release,
};
#ifdef CONFIG_DEBUG_SLAB_LEAK
static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
return 0;
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
static void s_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
static const struct seq_operations slabstats_op = {
.start = leaks_start,
.next = s_next,
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
static int __init slab_proc_init(void)
{
proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif

190
mm/slab.h
파일 보기

@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
/* The slab cache that manages slab cache information */
extern struct kmem_cache *kmem_cache;
unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size);
/* Functions provided by the slab allocators */
extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
unsigned long flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
size_t size, unsigned long flags);
struct mem_cgroup;
#ifdef CONFIG_SLUB
struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *));
struct kmem_cache *
__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *));
#else
static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
static inline struct kmem_cache *
__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{ return NULL; }
#endif
/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
#if defined(CONFIG_DEBUG_SLAB)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#elif defined(CONFIG_SLUB_DEBUG)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DEBUG_FREE)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | SLAB_NOTRACK)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
struct seq_file;
struct file;
struct slabinfo {
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs;
unsigned long num_slabs;
unsigned long shared_avail;
unsigned int limit;
unsigned int batchcount;
unsigned int shared;
unsigned int objects_per_slab;
unsigned int cache_order;
};
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
#ifdef CONFIG_MEMCG_KMEM
static inline bool is_root_cache(struct kmem_cache *s)
{
return !s->memcg_params || s->memcg_params->is_root_cache;
}
static inline bool cache_match_memcg(struct kmem_cache *cachep,
struct mem_cgroup *memcg)
{
return (is_root_cache(cachep) && !memcg) ||
(cachep->memcg_params->memcg == memcg);
}
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
if (!is_root_cache(s))
atomic_add(1 << order, &s->memcg_params->nr_pages);
}
static inline void memcg_release_pages(struct kmem_cache *s, int order)
{
if (is_root_cache(s))
return;
if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
mem_cgroup_destroy_cache(s);
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
struct kmem_cache *p)
{
return (p == s) ||
(s->memcg_params && (p == s->memcg_params->root_cache));
}
/*
* We use suffixes to the name in memcg because we can't have caches
* created in the system with the same name. But when we print them
* locally, better refer to them with the base name
*/
static inline const char *cache_name(struct kmem_cache *s)
{
if (!is_root_cache(s))
return s->memcg_params->root_cache->name;
return s->name;
}
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
return s->memcg_params->memcg_caches[idx];
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
if (is_root_cache(s))
return s;
return s->memcg_params->root_cache;
}
#else
static inline bool is_root_cache(struct kmem_cache *s)
{
return true;
}
static inline bool cache_match_memcg(struct kmem_cache *cachep,
struct mem_cgroup *memcg)
{
return true;
}
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
}
static inline void memcg_release_pages(struct kmem_cache *s, int order)
{
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
struct kmem_cache *p)
{
return true;
}
static inline const char *cache_name(struct kmem_cache *s)
{
return s->name;
}
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
return NULL;
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
return s;
}
#endif
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
struct kmem_cache *cachep;
struct page *page;
/*
* When kmemcg is not being used, both assignments should return the
* same value. but we don't want to pay the assignment price in that
* case. If it is not compiled in, the compiler should be smart enough
* to not do even the assignment. In that case, slab_equal_or_root
* will also be a constant.
*/
if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
return s;
page = virt_to_head_page(x);
cachep = page->slab_cache;
if (slab_equal_or_root(cachep, s))
return cachep;
pr_err("%s: Wrong slab cache. %s but object is from %s\n",
__FUNCTION__, cachep->name, s->name);
WARN_ON_ONCE(1);
return s;
}
#endif

파일 보기

@@ -13,9 +13,12 @@
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
#include "slab.h"
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, size_t size)
static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
size_t size)
{
struct kmem_cache *s = NULL;
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
continue;
}
if (!strcmp(s->name, name)) {
/*
* For simplicity, we won't check this in the list of memcg
* caches. We have control over memcg naming, and if there
* aren't duplicates in the global list, there won't be any
* duplicates in the memcg lists as well.
*/
if (!memcg && !strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
dump_stack();
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
return 0;
}
#else
static inline int kmem_cache_sanity_check(const char *name, size_t size)
static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
const char *name, size_t size)
{
return 0;
}
#endif
#ifdef CONFIG_MEMCG_KMEM
int memcg_update_all_caches(int num_memcgs)
{
struct kmem_cache *s;
int ret = 0;
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
if (!is_root_cache(s))
continue;
ret = memcg_update_cache_size(s, num_memcgs);
/*
* See comment in memcontrol.c, memcg_update_cache_size:
* Instead of freeing the memory, we'll just leave the caches
* up to this point in an updated state.
*/
if (ret)
goto out;
}
memcg_update_array_size(num_memcgs);
out:
mutex_unlock(&slab_mutex);
return ret;
}
#endif
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
*/
unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size)
{
/*
* If the user wants hardware cache aligned objects then follow that
* suggestion if the object is sufficiently large.
*
* The hardware cache alignment cannot override the specified
* alignment though. If that is greater then use it.
*/
if (flags & SLAB_HWCACHE_ALIGN) {
unsigned long ralign = cache_line_size();
while (size <= ralign / 2)
ralign /= 2;
align = max(align, ralign);
}
if (align < ARCH_SLAB_MINALIGN)
align = ARCH_SLAB_MINALIGN;
return ALIGN(align, sizeof(void *));
}
/*
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
* as davem.
*/
struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
struct kmem_cache *
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *),
struct kmem_cache *parent_cache)
{
struct kmem_cache *s = NULL;
int err = 0;
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
get_online_cpus();
mutex_lock(&slab_mutex);
if (!kmem_cache_sanity_check(name, size) == 0)
if (!kmem_cache_sanity_check(memcg, name, size) == 0)
goto out_locked;
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
* case, and we'll just provide them with a sanitized version of the
* passed flags.
*/
flags &= CACHE_CREATE_MASK;
s = __kmem_cache_alias(name, size, align, flags, ctor);
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
goto out_locked;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (s) {
s->object_size = s->size = size;
s->align = align;
s->align = calculate_alignment(flags, align, size);
s->ctor = ctor;
if (memcg_register_cache(memcg, s, parent_cache)) {
kmem_cache_free(kmem_cache, s);
err = -ENOMEM;
goto out_locked;
}
s->name = kstrdup(name, GFP_KERNEL);
if (!s->name) {
kmem_cache_free(kmem_cache, s);
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
err = __kmem_cache_create(s, flags);
if (!err) {
s->refcount = 1;
list_add(&s->list, &slab_caches);
memcg_cache_list_add(memcg, s);
} else {
kfree(s->name);
kmem_cache_free(kmem_cache, s);
@@ -157,10 +239,20 @@ out_locked:
return s;
}
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}
EXPORT_SYMBOL(kmem_cache_create);
void kmem_cache_destroy(struct kmem_cache *s)
{
/* Destroy all the children caches if we aren't a memcg cache */
kmem_cache_destroy_memcg_children(s);
get_online_cpus();
mutex_lock(&slab_mutex);
s->refcount--;
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
memcg_release_cache(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
} else {
@@ -192,3 +285,182 @@ int slab_is_available(void)
{
return slab_state >= UP;
}
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
unsigned long flags)
{
int err;
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
err = __kmem_cache_create(s, flags);
if (err)
panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
name, size, err);
s->refcount = -1; /* Exempt from merging for now */
}
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
unsigned long flags)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
create_boot_cache(s, name, size, flags);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
}
#endif /* !CONFIG_SLOB */
#ifdef CONFIG_SLABINFO
void print_slabinfo_header(struct seq_file *m)
{
/*
* Output format version, so at least we can change it
* without _too_ many complaints.
*/
#ifdef CONFIG_DEBUG_SLAB
seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
seq_puts(m, "slabinfo - version: 2.1\n");
#endif
seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
"<objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
"<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
mutex_lock(&slab_mutex);
if (!n)
print_slabinfo_header(m);
return seq_list_start(&slab_caches, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
static void s_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
static void
memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
{
struct kmem_cache *c;
struct slabinfo sinfo;
int i;
if (!is_root_cache(s))
return;
for_each_memcg_cache_index(i) {
c = cache_from_memcg(s, i);
if (!c)
continue;
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(c, &sinfo);
info->active_slabs += sinfo.active_slabs;
info->num_slabs += sinfo.num_slabs;
info->shared_avail += sinfo.shared_avail;
info->active_objs += sinfo.active_objs;
info->num_objs += sinfo.num_objs;
}
}
int cache_show(struct kmem_cache *s, struct seq_file *m)
{
struct slabinfo sinfo;
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(s, &sinfo);
memcg_accumulate_slabinfo(s, &sinfo);
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
sinfo.objects_per_slab, (1 << sinfo.cache_order));
seq_printf(m, " : tunables %4u %4u %4u",
sinfo.limit, sinfo.batchcount, sinfo.shared);
seq_printf(m, " : slabdata %6lu %6lu %6lu",
sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
slabinfo_show_stats(m, s);
seq_putc(m, '\n');
return 0;
}
static int s_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
if (!is_root_cache(s))
return 0;
return cache_show(s, m);
}
/*
* slabinfo_op - iterator that generates /proc/slabinfo
*
* Output layout:
* cache-name
* num-active-objs
* total-objs
* object size
* num-active-slabs
* total-slabs
* num-pages-per-slab
* + further values on SMP and with statistics enabled
*/
static const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
static int slabinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &slabinfo_op);
}
static const struct file_operations proc_slabinfo_operations = {
.open = slabinfo_open,
.read = seq_read,
.write = slabinfo_write,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init slab_proc_init(void)
{
proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
return 0;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */

파일 보기

@@ -28,9 +28,8 @@
* from kmalloc are prepended with a 4-byte header with the kmalloc size.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
* alloc_pages() directly, allocating compound pages so the page order
* does not have to be separately tracked, and also stores the exact
* allocation size in page->private so that it can be used to accurately
* provide ksize(). These objects are detected in kfree() because slob_page()
* does not have to be separately tracked.
* These objects are detected in kfree() because PageSlab()
* is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
#include <linux/atomic.h>
#include "slab.h"
/*
* slob_block has a field 'units', which indicates size of block if +ve,
* or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
#define SLOB_ALIGN L1_CACHE_BYTES
/*
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
if (likely(order))
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
if (ret) {
struct page *page;
page = virt_to_page(ret);
page->private = size;
}
trace_kmalloc_node(caller, ret,
size, PAGE_SIZE << order, gfp, node);
@@ -506,7 +499,7 @@ void kfree(const void *block)
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else
put_page(sp);
__free_pages(sp, compound_order(sp));
}
EXPORT_SYMBOL(kfree);
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
size_t ksize(const void *block)
{
struct page *sp;
int align;
unsigned int *m;
BUG_ON(!block);
if (unlikely(block == ZERO_SIZE_PTR))
return 0;
sp = virt_to_page(block);
if (PageSlab(sp)) {
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
} else
return sp->private;
if (unlikely(!PageSlab(sp)))
return PAGE_SIZE << compound_order(sp);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(ksize);
int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
{
size_t align = c->size;
if (flags & SLAB_DESTROY_BY_RCU) {
/* leave room for rcu footer at the end of object */
c->size += sizeof(struct slob_rcu);
}
c->flags = flags;
/* ignore alignment unless it's forced */
c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
if (c->align < ARCH_SLAB_MINALIGN)
c->align = ARCH_SLAB_MINALIGN;
if (c->align < align)
c->align = align;
return 0;
}
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
PAGE_SIZE << get_order(c->size),
flags, node);
}
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
unsigned int kmem_cache_size(struct kmem_cache *c)
{
return c->size;
}
EXPORT_SYMBOL(kmem_cache_size);
int __kmem_cache_shutdown(struct kmem_cache *c)
{
/* No way to check for remaining objects */

453
mm/slub.c
파일 보기

@@ -31,6 +31,7 @@
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <trace/events/kmem.h>
@@ -112,9 +113,6 @@
* the fast path and disables lockless freelists.
*/
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DEBUG_FREE)
static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#define __OBJECT_POISON 0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
static int kmem_size = sizeof(struct kmem_cache);
#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s) { }
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
goto out;
if (unlikely(s != page->slab)) {
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
slab_err(s, page, "Attempt to free object(0x%p) "
"outside of slab", object);
} else if (!page->slab) {
} else if (!page->slab_cache) {
printk(KERN_ERR
"SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
void *start;
void *last;
void *p;
int order;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
order = compound_order(page);
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
memcg_bind_pages(s, order);
page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
memset(start, POISON_INUSE, PAGE_SIZE << order);
last = start;
for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
memcg_release_pages(s, order);
reset_page_mapcount(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
__free_pages(page, order);
__free_memcg_kmem_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
else
page = container_of((struct list_head *)h, struct page, lru);
__free_slab(page->slab, page);
__free_slab(page->slab_cache, page);
}
static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1872,12 +1874,14 @@ redo:
/*
* Unfreeze all the cpu partial slabs.
*
* This function must be called with interrupt disabled.
* This function must be called with interrupts disabled
* for the cpu using c (or some other guarantee must be there
* to guarantee no concurrent accesses).
*/
static void unfreeze_partials(struct kmem_cache *s)
static void unfreeze_partials(struct kmem_cache *s,
struct kmem_cache_cpu *c)
{
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
struct page *page, *discard_page = NULL;
while ((page = c->partial)) {
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
* set to the per node partial list.
*/
local_irq_save(flags);
unfreeze_partials(s);
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
local_irq_restore(flags);
oldpage = NULL;
pobjects = 0;
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
if (c->page)
flush_slab(s, c);
unfreeze_partials(s);
unfreeze_partials(s, c);
}
}
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
s = memcg_kmem_get_cache(s, gfpflags);
redo:
/*
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
void *prior;
void **object = (void *)x;
int was_frozen;
int inuse;
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
do {
if (unlikely(n)) {
spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
prior = page->freelist;
counters = page->counters;
set_freepointer(s, object, prior);
new.counters = counters;
was_frozen = new.frozen;
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen && !n) {
if ((!new.inuse || !prior) && !was_frozen) {
if (!kmem_cache_debug(s) && !prior)
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
}
inuse = new.inuse;
} while (!cmpxchg_double_slab(s, page,
prior, counters,
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
}
/*
* was_frozen may have been set after we acquired the list_lock in
* an earlier loop. So we need to check it here again.
*/
if (was_frozen)
stat(s, FREE_FROZEN);
else {
if (unlikely(!inuse && n->nr_partial > s->min_partial))
goto slab_empty;
if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
goto slab_empty;
/*
* Objects left in the slab. If it was not on the partial list before
* then add it.
*/
if (unlikely(!prior)) {
remove_full(s, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
/*
* Objects left in the slab. If it was not on the partial list before
* then add it.
*/
if (kmem_cache_debug(s) && unlikely(!prior)) {
remove_full(s, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;
@@ -2619,19 +2618,10 @@ redo:
void kmem_cache_free(struct kmem_cache *s, void *x)
{
struct page *page;
page = virt_to_head_page(x);
if (kmem_cache_debug(s) && page->slab != s) {
pr_err("kmem_cache_free: Wrong slab cache. %s but object"
" is from %s\n", page->slab->name, s->name);
WARN_ON_ONCE(1);
s = cache_from_obj(s, x);
if (!s)
return;
}
slab_free(s, page, x, _RET_IP_);
slab_free(s, virt_to_head_page(x), x, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
return -ENOSYS;
}
/*
* Figure out what the alignment of the objects will be.
*/
static unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size)
{
/*
* If the user wants hardware cache aligned objects then follow that
* suggestion if the object is sufficiently large.
*
* The hardware cache alignment cannot override the specified
* alignment though. If that is greater then use it.
*/
if (flags & SLAB_HWCACHE_ALIGN) {
unsigned long ralign = cache_line_size();
while (size <= ralign / 2)
ralign /= 2;
align = max(align, ralign);
}
if (align < ARCH_SLAB_MINALIGN)
align = ARCH_SLAB_MINALIGN;
return ALIGN(align, sizeof(void *));
}
static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
unsigned long size = s->object_size;
unsigned long align = s->align;
int order;
/*
@@ -2999,20 +2962,12 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
size += sizeof(void *);
#endif
/*
* Determine the alignment based on various parameters that the
* user specified and the dynamic determination of cache line size
* on bootup.
*/
align = calculate_alignment(flags, align, s->object_size);
s->align = align;
/*
* SLUB stores one object immediately after another beginning from
* offset 0. In order to align the objects we have to simply size
* each object to conform to the alignment.
*/
size = ALIGN(size, align);
size = ALIGN(size, s->align);
s->size = size;
if (forced_order >= 0)
order = forced_order;
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
s->max = s->oo;
return !!oo_objects(s->oo);
}
static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3081,6 @@ error:
return -EINVAL;
}
/*
* Determine the size of a slab object
*/
unsigned int kmem_cache_size(struct kmem_cache *s)
{
return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);
static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
{
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
{
int rc = kmem_cache_close(s);
if (!rc)
if (!rc) {
/*
* We do the same lock strategy around sysfs_slab_add, see
* __kmem_cache_create. Because this is pretty much the last
* operation we do and the lock will be released shortly after
* that in slab_common.c, we could just move sysfs_slab_remove
* to a later point in common code. We should do that when we
* have a common sysfs framework for all allocators.
*/
mutex_unlock(&slab_mutex);
sysfs_slab_remove(s);
mutex_lock(&slab_mutex);
}
return rc;
}
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
static struct kmem_cache *__init create_kmalloc_cache(const char *name,
int size, unsigned int flags)
{
struct kmem_cache *s;
s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
s->name = name;
s->size = s->object_size = size;
s->align = ARCH_KMALLOC_MINALIGN;
/*
* This function is called with IRQs disabled during early-boot on
* single CPU so there's no need to take slab_mutex here.
*/
if (kmem_cache_open(s, flags))
goto panic;
list_add(&s->list, &slab_caches);
return s;
panic:
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
return NULL;
}
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
flags |= __GFP_COMP | __GFP_NOTRACK;
flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
return PAGE_SIZE << compound_order(page);
}
return slab_ksize(page->slab);
return slab_ksize(page->slab_cache);
}
EXPORT_SYMBOL(ksize);
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
}
slab_lock(page);
if (on_freelist(page->slab, page, object)) {
object_err(page->slab, page, object, "Object is on free-list");
if (on_freelist(page->slab_cache, page, object)) {
object_err(page->slab_cache, page, object, "Object is on free-list");
rv = false;
} else {
rv = true;
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kmemleak_free(x);
__free_pages(page, compound_order(page));
__free_memcg_kmem_pages(page, compound_order(page));
return;
}
slab_free(page->slab, page, object, _RET_IP_);
slab_free(page->slab_cache, page, object, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
struct memory_notify *marg = arg;
int offline_node;
offline_node = marg->status_change_nid;
offline_node = marg->status_change_nid_normal;
/*
* If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
struct kmem_cache_node *n;
struct kmem_cache *s;
struct memory_notify *marg = arg;
int nid = marg->status_change_nid;
int nid = marg->status_change_nid_normal;
int ret = 0;
/*
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
/*
* Used for early kmem_cache structures that were allocated using
* the page allocator
* the page allocator. Allocate them properly then fix up the pointers
* that may be pointing to the wrong kmem_cache structure.
*/
static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
int node;
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
list_add(&s->list, &slab_caches);
s->refcount = -1;
memcpy(s, static_cache, kmem_cache->object_size);
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
if (n) {
list_for_each_entry(p, &n->partial, lru)
p->slab = s;
p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
list_for_each_entry(p, &n->full, lru)
p->slab = s;
p->slab_cache = s;
#endif
}
}
list_add(&s->list, &slab_caches);
return s;
}
void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
int i;
int caches = 0;
struct kmem_cache *temp_kmem_cache;
int order;
struct kmem_cache *temp_kmem_cache_node;
unsigned long kmalloc_size;
int caches = 2;
if (debug_guardpage_minorder())
slub_max_order = 0;
kmem_size = offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *);
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
/* Allocate two kmem_caches from the page allocator */
kmalloc_size = ALIGN(kmem_size, cache_line_size());
order = get_order(2 * kmalloc_size);
kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
/*
* Must first have the slab cache available for the allocations of the
* struct kmem_cache_node's. There is special bootstrap code in
* kmem_cache_open for slab_state == DOWN.
*/
kmem_cache_node = (void *)kmem_cache + kmalloc_size;
kmem_cache_node->name = "kmem_cache_node";
kmem_cache_node->size = kmem_cache_node->object_size =
sizeof(struct kmem_cache_node);
kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
temp_kmem_cache = kmem_cache;
kmem_cache->name = "kmem_cache";
kmem_cache->size = kmem_cache->object_size = kmem_size;
kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN);
kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
memcpy(kmem_cache, temp_kmem_cache, kmem_size);
kmem_cache = bootstrap(&boot_kmem_cache);
/*
* Allocate kmem_cache_node properly from the kmem_cache slab.
* kmem_cache_node is separately allocated so no need to
* update any list pointers.
*/
temp_kmem_cache_node = kmem_cache_node;
kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
kmem_cache_bootstrap_fixup(kmem_cache_node);
caches++;
kmem_cache_bootstrap_fixup(kmem_cache);
caches++;
/* Free temporary boot structure */
free_pages((unsigned long)temp_kmem_cache, order);
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
static struct kmem_cache *find_mergeable(size_t size,
static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
size_t align, unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
if (s->size - size >= sizeof(void *))
continue;
if (!cache_match_memcg(s, memcg))
continue;
return s;
}
return NULL;
}
struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
struct kmem_cache *
__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
s = find_mergeable(size, align, flags, name, ctor);
s = find_mergeable(memcg, size, align, flags, name, ctor);
if (s) {
s->refcount++;
/*
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
if (err)
return err;
/* Mutex is not taken during early boot */
if (slab_state <= UP)
return 0;
memcg_propagate_slab_attrs(s);
mutex_unlock(&slab_mutex);
err = sysfs_slab_add(s);
mutex_lock(&slab_mutex);
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
int i;
mutex_lock(&slab_mutex);
if (s->max_attr_size < len)
s->max_attr_size = len;
/*
* This is a best effort propagation, so this function's return
* value will be determined by the parent cache only. This is
* basically because not all attributes will have a well
* defined semantics for rollbacks - most of the actions will
* have permanent effects.
*
* Returning the error value of any of the children that fail
* is not 100 % defined, in the sense that users seeing the
* error code won't be able to know anything about the state of
* the cache.
*
* Only returning the error code for the parent cache at least
* has well defined semantics. The cache being written to
* directly either failed or succeeded, in which case we loop
* through the descendants with best-effort propagation.
*/
for_each_memcg_cache_index(i) {
struct kmem_cache *c = cache_from_memcg(s, i);
if (c)
attribute->store(c, buf, len);
}
mutex_unlock(&slab_mutex);
}
#endif
return err;
}
static void memcg_propagate_slab_attrs(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
int i;
char *buffer = NULL;
if (!is_root_cache(s))
return;
/*
* This mean this cache had no attribute written. Therefore, no point
* in copying default values around
*/
if (!s->max_attr_size)
return;
for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
char mbuf[64];
char *buf;
struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
if (!attr || !attr->store || !attr->show)
continue;
/*
* It is really bad that we have to allocate here, so we will
* do it only as a fallback. If we actually allocate, though,
* we can just use the allocated buffer until the end.
*
* Most of the slub attributes will tend to be very small in
* size, but sysfs allows buffers up to a page, so they can
* theoretically happen.
*/
if (buffer)
buf = buffer;
else if (s->max_attr_size < ARRAY_SIZE(mbuf))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
if (WARN_ON(!buffer))
continue;
buf = buffer;
}
attr->show(s->memcg_params->root_cache, buf);
attr->store(s, buf, strlen(buf));
}
if (buffer)
free_page((unsigned long)buffer);
#endif
}
static const struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
#endif
BUG_ON(p > name + ID_STR_LENGTH - 1);
return name;
}
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
int unmergeable;
int unmergeable = slab_unmergeable(s);
if (slab_state < FULL)
/* Defer until later */
return 0;
unmergeable = slab_unmergeable(s);
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
* The /proc/slabinfo ABI
*/
#ifdef CONFIG_SLABINFO
static void print_slabinfo_header(struct seq_file *m)
{
seq_puts(m, "slabinfo - version: 2.1\n");
seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
"<objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
seq_putc(m, '\n');
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
mutex_lock(&slab_mutex);
if (!n)
print_slabinfo_header(m);
return seq_list_start(&slab_caches, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
static void s_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
static int s_show(struct seq_file *m, void *p)
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
unsigned long nr_partials = 0;
unsigned long nr_slabs = 0;
unsigned long nr_inuse = 0;
unsigned long nr_objs = 0;
unsigned long nr_free = 0;
struct kmem_cache *s;
int node;
s = list_entry(p, struct kmem_cache, list);
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
nr_free += count_partial(n, count_free);
}
nr_inuse = nr_objs - nr_free;
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
nr_objs, s->size, oo_objects(s->oo),
(1 << oo_order(s->oo)));
seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
0UL);
seq_putc(m, '\n');
return 0;
sinfo->active_objs = nr_objs - nr_free;
sinfo->num_objs = nr_objs;
sinfo->active_slabs = nr_slabs;
sinfo->num_slabs = nr_slabs;
sinfo->objects_per_slab = oo_objects(s->oo);
sinfo->cache_order = oo_order(s->oo);
}
static const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
static int slabinfo_open(struct inode *inode, struct file *file)
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
return seq_open(file, &slabinfo_op);
}
static const struct file_operations proc_slabinfo_operations = {
.open = slabinfo_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init slab_proc_init(void)
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
return 0;
return -EIO;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */

파일 보기

@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
return; /* XXX: Not implemented yet */
}
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
}
#else
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
got_map_page:
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:
memset(ret, 0, memmap_size);
return ret;
}
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
get_order(sizeof(struct page) * nr_pages));
}
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
unsigned long magic;
struct page *page = virt_to_page(memmap);
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
*/
if (memmap) {
struct page *memmap_page;
memmap_page = virt_to_page(memmap);
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
free_map_bootmem(memmap_page, nr_pages);
free_map_bootmem(memmap, nr_pages);
}
}
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
memset(memmap, 0, sizeof(struct page) * nr_pages);
ms->section_mem_map |= SECTION_MARKED_PRESENT;
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -773,6 +772,27 @@ out:
return ret;
}
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
int i;
if (!memmap)
return;
for (i = 0; i < PAGES_PER_SECTION; i++) {
if (PageHWPoison(&memmap[i])) {
atomic_long_sub(1, &mce_bad_pages);
ClearPageHWPoison(&memmap[i]);
}
}
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
struct page *memmap = NULL;
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
ms->pageblock_flags = NULL;
}
clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
free_section_usemap(memmap, usemap);
}
#endif

파일 보기

@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
unsigned long *frontswap_map)
{
int i, prev;
spin_lock(&swap_lock);
if (prio >= 0)
p->prio = prio;
else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
unsigned long *frontswap_map)
{
spin_lock(&swap_lock);
_enable_swap_info(p, prio, swap_map, frontswap_map);
frontswap_init(p->type);
spin_unlock(&swap_lock);
}
static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
_enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
spin_unlock(&swap_lock);
}
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
int oom_score_adj;
int i, type, prev;
int err;
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
set_current_oom_origin();
err = try_to_unuse(type, false, 0); /* force all pages to be unused */
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
clear_current_oom_origin();
if (err) {
/*
* reading p->prio and p->swap_map outside the lock is
* safe here because only sys_swapon and sys_swapoff
* change them, and there can be no other sys_swapon or
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
reinsert_swap_info(p);
goto out_dput;
}

파일 보기

@@ -576,29 +576,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize)
}
EXPORT_SYMBOL(truncate_setsize);
/**
* vmtruncate - unmap mappings "freed" by truncate() syscall
* @inode: inode of the file used
* @newsize: file offset to start truncating
*
* This function is deprecated and truncate_setsize or truncate_pagecache
* should be used instead, together with filesystem specific block truncation.
*/
int vmtruncate(struct inode *inode, loff_t newsize)
{
int error;
error = inode_newsize_ok(inode, newsize);
if (error)
return error;
truncate_setsize(inode, newsize);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
}
EXPORT_SYMBOL(vmtruncate);
/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode

파일 보기

@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
*
* The contents of the object pointed to are preserved up to the
* lesser of the new and old sizes. If @p is %NULL, krealloc()
* behaves exactly like kmalloc(). If @size is 0 and @p is not a
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
* %NULL pointer, the object pointed to is freed.
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)

파일 보기

@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (NUMA_BUILD) {
if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
unsigned int *ptr = NULL;
int ret;
if (NUMA_BUILD) {
if (IS_ENABLED(CONFIG_NUMA)) {
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
if (ptr == NULL)
return -ENOMEM;

파일 보기

@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
}
/*
* Are there way too many processes in the direct reclaim path already?
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get resheduled. When there are massive number of tasks doing page
* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
}
/*
* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
inactive >>= 3;
return isolated > inactive;
}
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
if (global_reclaim(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
/*
* If we have very few page cache pages, force-scan
* anon pages.
*/
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
goto out;
} else if (!inactive_file_is_low_global(zone)) {
/*
* There is enough inactive page cache, do not
* reclaim anything from the working set right now.
*/
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
goto out;
}
}
@@ -1752,7 +1775,7 @@ out:
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
if (COMPACTION_BUILD && sc->order &&
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (zone->all_unreclaimable &&
sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
if (COMPACTION_BUILD) {
if (IS_ENABLED(CONFIG_COMPACTION)) {
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
@@ -2207,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
* Throttle direct reclaimers if backing storage is backed by the network
* and the PFMEMALLOC reserve for the preferred node is getting dangerously
* depleted. kswapd will continue to make progress and wake the processes
* when the low watermark is reached
* when the low watermark is reached.
*
* Returns true if a fatal signal was delivered during throttling. If this
* happens, the page allocator should not consider triggering the OOM killer.
*/
static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
struct zone *zone;
@@ -2224,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* processes to block on log_wait_commit().
*/
if (current->flags & PF_KTHREAD)
return;
goto out;
/*
* If a fatal signal is pending, this process should not throttle.
* It should return quickly so it can exit and free its memory
*/
if (fatal_signal_pending(current))
goto out;
/* Check if the pfmemalloc reserves are ok */
first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
pgdat = zone->zone_pgdat;
if (pfmemalloc_watermark_ok(pgdat))
return;
goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2246,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat), HZ);
return;
goto check_pending;
}
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat));
check_pending:
if (fatal_signal_pending(current))
return true;
out:
return false;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2273,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.gfp_mask = sc.gfp_mask,
};
throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
/*
* Do not enter reclaim if fatal signal is pending. 1 is returned so
* that the page allocator does not consider triggering OOM
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
if (fatal_signal_pending(current))
if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2397,13 +2437,31 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
static bool zone_balanced(struct zone *zone, int order,
unsigned long balance_gap, int classzone_idx)
{
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
balance_gap, classzone_idx, 0))
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order &&
!compaction_suitable(zone, order))
return false;
return true;
}
/*
* pgdat_balanced is used when checking if a node is balanced for high-order
* allocations. Only zones that meet watermarks and are in a zone allowed
* by the callers classzone_idx are added to balanced_pages. The total of
* balanced pages must be at least 25% of the zones allowed by classzone_idx
* for the node to be considered balanced. Forcing all zones to be balanced
* for high orders can cause excessive reclaim when there are imbalanced zones.
* pgdat_balanced() is used when checking if a node is balanced.
*
* For order-0, all zones must be balanced!
*
* For high-order allocations only zones that meet watermarks and are in a
* zone allowed by the callers classzone_idx are added to balanced_pages. The
* total of balanced pages must be at least 25% of the zones allowed by
* classzone_idx for the node to be considered balanced. Forcing all zones to
* be balanced for high orders can cause excessive reclaim when there are
* imbalanced zones.
* The choice of 25% is due to
* o a 16M DMA zone that is balanced will not balance a zone on any
* reasonable sized machine
@@ -2413,17 +2471,43 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
* Similarly, on x86-64 the Normal zone would need to be at least 1G
* to balance a node on its own. These seemed like reasonable ratios.
*/
static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
int classzone_idx)
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
unsigned long present_pages = 0;
unsigned long balanced_pages = 0;
int i;
for (i = 0; i <= classzone_idx; i++)
present_pages += pgdat->node_zones[i].present_pages;
/* Check the watermark levels */
for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
/* A special case here: if zone has no page, we think it's balanced */
return balanced_pages >= (present_pages >> 2);
if (!populated_zone(zone))
continue;
present_pages += zone->present_pages;
/*
* A special case here:
*
* balance_pgdat() skips over all_unreclaimable after
* DEF_PRIORITY. Effectively, it considers them balanced so
* they must be considered balanced here as well!
*/
if (zone->all_unreclaimable) {
balanced_pages += zone->present_pages;
continue;
}
if (zone_balanced(zone, order, 0, i))
balanced_pages += zone->present_pages;
else if (!order)
return false;
}
if (order)
return balanced_pages >= (present_pages >> 2);
else
return true;
}
/*
@@ -2435,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
int i;
unsigned long balanced = 0;
bool all_zones_ok = true;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
return false;
@@ -2457,40 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
}
/* Check the watermark levels */
for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
/*
* balance_pgdat() skips over all_unreclaimable after
* DEF_PRIORITY. Effectively, it considers them balanced so
* they must be considered balanced here as well if kswapd
* is to sleep
*/
if (zone->all_unreclaimable) {
balanced += zone->present_pages;
continue;
}
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
i, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
}
/*
* For high-order requests, the balanced zones must contain at least
* 25% of the nodes pages for kswapd to sleep. For order-0, all zones
* must be balanced
*/
if (order)
return pgdat_balanced(pgdat, balanced, classzone_idx);
else
return all_zones_ok;
return pgdat_balanced(pgdat, order, classzone_idx);
}
/*
@@ -2517,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
int all_zones_ok;
unsigned long balanced;
struct zone *unbalanced_zone;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
@@ -2551,8 +2597,7 @@ loop_again:
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
all_zones_ok = 1;
balanced = 0;
unbalanced_zone = NULL;
/*
* Scan in the highmem->dma direction for the highest
@@ -2585,8 +2630,7 @@ loop_again:
break;
}
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
if (!zone_balanced(zone, order, 0, 0)) {
end_zone = i;
break;
} else {
@@ -2656,15 +2700,14 @@ loop_again:
* Do not reclaim more than needed for compaction.
*/
testorder = order;
if (COMPACTION_BUILD && order &&
if (IS_ENABLED(CONFIG_COMPACTION) && order &&
compaction_suitable(zone, order) !=
COMPACT_SKIPPED)
testorder = 0;
if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
!zone_watermark_ok_safe(zone, testorder,
high_wmark_pages(zone) + balance_gap,
end_zone, 0)) {
!zone_balanced(zone, testorder,
balance_gap, end_zone)) {
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2691,9 +2734,8 @@ loop_again:
continue;
}
if (!zone_watermark_ok_safe(zone, testorder,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
if (!zone_balanced(zone, testorder, 0, end_zone)) {
unbalanced_zone = zone;
/*
* We are still under min water mark. This
* means that we have a GFP_ATOMIC allocation
@@ -2711,8 +2753,6 @@ loop_again:
* speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
if (i <= *classzone_idx)
balanced += zone->present_pages;
}
}
@@ -2726,7 +2766,7 @@ loop_again:
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
if (pgdat_balanced(pgdat, order, *classzone_idx))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2735,8 +2775,8 @@ loop_again:
if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
else
congestion_wait(BLK_RW_ASYNC, HZ/10);
else if (unbalanced_zone)
wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
}
/*
@@ -2750,12 +2790,7 @@ loop_again:
} while (--sc.priority >= 0);
out:
/*
* order-0: All zones must meet high watermark for a balanced node
* high-order: Balanced zones must make up at least 25% of the node
* for the node to be balanced
*/
if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
cond_resched();
try_to_freeze();
@@ -2797,29 +2832,10 @@ out:
if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable &&
sc.priority != DEF_PRIORITY)
continue;
/* Would compaction fail due to lack of free memory? */
if (COMPACTION_BUILD &&
compaction_suitable(zone, order) == COMPACT_SKIPPED)
goto loop_again;
/* Confirm the zone is balanced for order-0 */
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone), 0, 0)) {
order = sc.order = 0;
goto loop_again;
}
/* Check if the memory needs to be defragmented. */
if (zone_watermark_ok(zone, order,
low_wmark_pages(zone), *classzone_idx, 0))
zones_need_compaction = 0;
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
}
if (zones_need_compaction)
@@ -2944,7 +2960,7 @@ static int kswapd(void *p)
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
int ret;
bool ret;
/*
* If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3106,13 +3122,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
int nid;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
for_each_node_state(nid, N_HIGH_MEMORY) {
for_each_node_state(nid, N_MEMORY) {
pg_data_t *pgdat = NODE_DATA(nid);
const struct cpumask *mask;
@@ -3168,7 +3184,7 @@ static int __init kswapd_init(void)
int nid;
swap_setup();
for_each_node_state(nid, N_HIGH_MEMORY)
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;

파일 보기

@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
"pgrotated",
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
"pgmigrate_fail",
#endif
#ifdef CONFIG_COMPACTION
"compact_blocks_moved",
"compact_pages_moved",
"compact_pagemigrate_failed",
"compact_migrate_scanned",
"compact_free_scanned",
"compact_isolated",
"compact_stall",
"compact_fail",
"compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_split",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
/* check memoryless node */
if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n high %lu"
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
"\n present %lu"
"\n managed %lu",
zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
zone->pages_scanned,
zone->spanned_pages,
zone->present_pages);
zone->present_pages,
zone->managed_pages);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
/* check memoryless node */
if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
walk_zones_in_node(m, pgdat, unusable_show_print);