Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - fsnotify updates - ocfs2 updates - all of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (127 commits) console: don't prefer first registered if DT specifies stdout-path cred: simpler, 1D supplementary groups CREDITS: update Pavel's information, add GPG key, remove snail mail address mailmap: add Johan Hovold .gitattributes: set git diff driver for C source code files uprobes: remove function declarations from arch/{mips,s390} spelling.txt: "modeled" is spelt correctly nmi_backtrace: generate one-line reports for idle cpus arch/tile: adopt the new nmi_backtrace framework nmi_backtrace: do a local dump_stack() instead of a self-NMI nmi_backtrace: add more trigger_*_cpu_backtrace() methods min/max: remove sparse warnings when they're nested Documentation/filesystems/proc.txt: add more description for maps/smaps mm, proc: fix region lost in /proc/self/smaps proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self proc: add LSM hook checks to /proc/<tid>/timerslack_ns proc: relax /proc/<tid>/timerslack_ns capability requirements meminfo: break apart a very long seq_printf with #ifdefs seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char proc: faster /proc/*/status ...
这个提交包含在:
14
mm/bootmem.c
14
mm/bootmem.c
@@ -11,15 +11,12 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/range.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/io.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <linux/bootmem.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
@@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
|
||||
void *ptr;
|
||||
|
||||
if (WARN_ON_ONCE(slab_is_available()))
|
||||
return kzalloc(size, GFP_NOWAIT);
|
||||
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
|
||||
again:
|
||||
|
||||
/* do not panic in alloc_bootmem_bdata() */
|
||||
@@ -738,9 +735,6 @@ again:
|
||||
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
|
||||
unsigned long align, unsigned long goal)
|
||||
{
|
||||
if (WARN_ON_ONCE(slab_is_available()))
|
||||
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
|
||||
|
||||
return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
|
||||
}
|
||||
|
||||
@@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
|
||||
|
||||
}
|
||||
|
||||
#ifndef ARCH_LOW_ADDRESS_LIMIT
|
||||
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
|
||||
#endif
|
||||
|
||||
/**
|
||||
* __alloc_bootmem_low - allocate low boot memory
|
||||
* @size: size of the request in bytes
|
||||
|
205
mm/compaction.c
205
mm/compaction.c
@@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
|
||||
#ifdef CONFIG_COMPACTION
|
||||
|
||||
/* Returns true if the page is within a block suitable for migration to */
|
||||
static bool suitable_migration_target(struct page *page)
|
||||
static bool suitable_migration_target(struct compact_control *cc,
|
||||
struct page *page)
|
||||
{
|
||||
if (cc->ignore_block_suitable)
|
||||
return true;
|
||||
|
||||
/* If the page is a large free page, then disallow migration */
|
||||
if (PageBuddy(page)) {
|
||||
/*
|
||||
@@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc)
|
||||
continue;
|
||||
|
||||
/* Check the block is suitable for migration */
|
||||
if (!suitable_migration_target(page))
|
||||
if (!suitable_migration_target(cc, page))
|
||||
continue;
|
||||
|
||||
/* If isolation recently failed, do not retry */
|
||||
@@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
|
||||
return COMPACT_CONTINUE;
|
||||
|
||||
/* Compaction run is not finished if the watermark is not met */
|
||||
watermark = low_wmark_pages(zone);
|
||||
watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
|
||||
|
||||
if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
|
||||
cc->alloc_flags))
|
||||
@@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
|
||||
|
||||
/* Job done if page is free of the right migratetype */
|
||||
if (!list_empty(&area->free_list[migratetype]))
|
||||
return COMPACT_PARTIAL;
|
||||
return COMPACT_SUCCESS;
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
|
||||
if (migratetype == MIGRATE_MOVABLE &&
|
||||
!list_empty(&area->free_list[MIGRATE_CMA]))
|
||||
return COMPACT_PARTIAL;
|
||||
return COMPACT_SUCCESS;
|
||||
#endif
|
||||
/*
|
||||
* Job done if allocation would steal freepages from
|
||||
@@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
|
||||
*/
|
||||
if (find_suitable_fallback(area, order, migratetype,
|
||||
true, &can_steal) != -1)
|
||||
return COMPACT_PARTIAL;
|
||||
return COMPACT_SUCCESS;
|
||||
}
|
||||
|
||||
return COMPACT_NO_SUITABLE_PAGE;
|
||||
@@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone,
|
||||
* compaction_suitable: Is this suitable to run compaction on this zone now?
|
||||
* Returns
|
||||
* COMPACT_SKIPPED - If there are too few free pages for compaction
|
||||
* COMPACT_PARTIAL - If the allocation would succeed without compaction
|
||||
* COMPACT_SUCCESS - If the allocation would succeed without compaction
|
||||
* COMPACT_CONTINUE - If compaction should run now
|
||||
*/
|
||||
static enum compact_result __compaction_suitable(struct zone *zone, int order,
|
||||
@@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
|
||||
int classzone_idx,
|
||||
unsigned long wmark_target)
|
||||
{
|
||||
int fragindex;
|
||||
unsigned long watermark;
|
||||
|
||||
if (is_via_compact_memory(order))
|
||||
return COMPACT_CONTINUE;
|
||||
|
||||
watermark = low_wmark_pages(zone);
|
||||
watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
|
||||
/*
|
||||
* If watermarks for high-order allocation are already met, there
|
||||
* should be no need for compaction at all.
|
||||
*/
|
||||
if (zone_watermark_ok(zone, order, watermark, classzone_idx,
|
||||
alloc_flags))
|
||||
return COMPACT_PARTIAL;
|
||||
return COMPACT_SUCCESS;
|
||||
|
||||
/*
|
||||
* Watermarks for order-0 must be met for compaction. Note the 2UL.
|
||||
* This is because during migration, copies of pages need to be
|
||||
* allocated and for a short time, the footprint is higher
|
||||
* Watermarks for order-0 must be met for compaction to be able to
|
||||
* isolate free pages for migration targets. This means that the
|
||||
* watermark and alloc_flags have to match, or be more pessimistic than
|
||||
* the check in __isolate_free_page(). We don't use the direct
|
||||
* compactor's alloc_flags, as they are not relevant for freepage
|
||||
* isolation. We however do use the direct compactor's classzone_idx to
|
||||
* skip over zones where lowmem reserves would prevent allocation even
|
||||
* if compaction succeeds.
|
||||
* For costly orders, we require low watermark instead of min for
|
||||
* compaction to proceed to increase its chances.
|
||||
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
|
||||
* suitable migration targets
|
||||
*/
|
||||
watermark += (2UL << order);
|
||||
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
|
||||
low_wmark_pages(zone) : min_wmark_pages(zone);
|
||||
watermark += compact_gap(order);
|
||||
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
|
||||
alloc_flags, wmark_target))
|
||||
ALLOC_CMA, wmark_target))
|
||||
return COMPACT_SKIPPED;
|
||||
|
||||
/*
|
||||
* fragmentation index determines if allocation failures are due to
|
||||
* low memory or external fragmentation
|
||||
*
|
||||
* index of -1000 would imply allocations might succeed depending on
|
||||
* watermarks, but we already failed the high-order watermark check
|
||||
* index towards 0 implies failure is due to lack of memory
|
||||
* index towards 1000 implies failure is due to fragmentation
|
||||
*
|
||||
* Only compact if a failure would be due to fragmentation.
|
||||
*/
|
||||
fragindex = fragmentation_index(zone, order);
|
||||
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
|
||||
return COMPACT_NOT_SUITABLE_ZONE;
|
||||
|
||||
return COMPACT_CONTINUE;
|
||||
}
|
||||
|
||||
@@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
|
||||
int classzone_idx)
|
||||
{
|
||||
enum compact_result ret;
|
||||
int fragindex;
|
||||
|
||||
ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
|
||||
zone_page_state(zone, NR_FREE_PAGES));
|
||||
/*
|
||||
* fragmentation index determines if allocation failures are due to
|
||||
* low memory or external fragmentation
|
||||
*
|
||||
* index of -1000 would imply allocations might succeed depending on
|
||||
* watermarks, but we already failed the high-order watermark check
|
||||
* index towards 0 implies failure is due to lack of memory
|
||||
* index towards 1000 implies failure is due to fragmentation
|
||||
*
|
||||
* Only compact if a failure would be due to fragmentation. Also
|
||||
* ignore fragindex for non-costly orders where the alternative to
|
||||
* a successful reclaim/compaction is OOM. Fragindex and the
|
||||
* vm.extfrag_threshold sysctl is meant as a heuristic to prevent
|
||||
* excessive compaction for costly orders, but it should not be at the
|
||||
* expense of system stability.
|
||||
*/
|
||||
if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
|
||||
fragindex = fragmentation_index(zone, order);
|
||||
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
|
||||
ret = COMPACT_NOT_SUITABLE_ZONE;
|
||||
}
|
||||
|
||||
trace_mm_compaction_suitable(zone, order, ret);
|
||||
if (ret == COMPACT_NOT_SUITABLE_ZONE)
|
||||
ret = COMPACT_SKIPPED;
|
||||
@@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
|
||||
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
|
||||
compact_result = __compaction_suitable(zone, order, alloc_flags,
|
||||
ac_classzone_idx(ac), available);
|
||||
if (compact_result != COMPACT_SKIPPED &&
|
||||
compact_result != COMPACT_NOT_SUITABLE_ZONE)
|
||||
if (compact_result != COMPACT_SKIPPED)
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
|
||||
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
|
||||
cc->classzone_idx);
|
||||
/* Compaction is likely to fail */
|
||||
if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
|
||||
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
|
||||
return ret;
|
||||
|
||||
/* huh, compaction_suitable is returning something unexpected */
|
||||
@@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
|
||||
|
||||
/*
|
||||
* Setup to move all movable pages to the end of the zone. Used cached
|
||||
* information on where the scanners should start but check that it
|
||||
* is initialised by ensuring the values are within zone boundaries.
|
||||
* information on where the scanners should start (unless we explicitly
|
||||
* want to compact the whole zone), but check that it is initialised
|
||||
* by ensuring the values are within zone boundaries.
|
||||
*/
|
||||
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
|
||||
cc->free_pfn = zone->compact_cached_free_pfn;
|
||||
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
|
||||
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
|
||||
zone->compact_cached_free_pfn = cc->free_pfn;
|
||||
}
|
||||
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
|
||||
if (cc->whole_zone) {
|
||||
cc->migrate_pfn = start_pfn;
|
||||
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
|
||||
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
|
||||
}
|
||||
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
|
||||
} else {
|
||||
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
|
||||
cc->free_pfn = zone->compact_cached_free_pfn;
|
||||
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
|
||||
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
|
||||
zone->compact_cached_free_pfn = cc->free_pfn;
|
||||
}
|
||||
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
|
||||
cc->migrate_pfn = start_pfn;
|
||||
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
|
||||
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
|
||||
}
|
||||
|
||||
if (cc->migrate_pfn == start_pfn)
|
||||
cc->whole_zone = true;
|
||||
if (cc->migrate_pfn == start_pfn)
|
||||
cc->whole_zone = true;
|
||||
}
|
||||
|
||||
cc->last_migrated_pfn = 0;
|
||||
|
||||
@@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
|
||||
.alloc_flags = alloc_flags,
|
||||
.classzone_idx = classzone_idx,
|
||||
.direct_compaction = true,
|
||||
.whole_zone = (prio == MIN_COMPACT_PRIORITY),
|
||||
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
|
||||
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
|
||||
};
|
||||
INIT_LIST_HEAD(&cc.freepages);
|
||||
INIT_LIST_HEAD(&cc.migratepages);
|
||||
@@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
|
||||
ac->nodemask) {
|
||||
enum compact_result status;
|
||||
|
||||
if (compaction_deferred(zone, order)) {
|
||||
if (prio > MIN_COMPACT_PRIORITY
|
||||
&& compaction_deferred(zone, order)) {
|
||||
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
|
||||
continue;
|
||||
}
|
||||
@@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
|
||||
alloc_flags, ac_classzone_idx(ac));
|
||||
rc = max(status, rc);
|
||||
|
||||
/* If a normal allocation would succeed, stop compacting */
|
||||
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
|
||||
ac_classzone_idx(ac), alloc_flags)) {
|
||||
/* The allocation should succeed, stop compacting */
|
||||
if (status == COMPACT_SUCCESS) {
|
||||
/*
|
||||
* We think the allocation will succeed in this zone,
|
||||
* but it is not certain, hence the false. The caller
|
||||
@@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
|
||||
|
||||
|
||||
/* Compact all zones within a node */
|
||||
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
|
||||
static void compact_node(int nid)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
int zoneid;
|
||||
struct zone *zone;
|
||||
struct compact_control cc = {
|
||||
.order = -1,
|
||||
.mode = MIGRATE_SYNC,
|
||||
.ignore_skip_hint = true,
|
||||
.whole_zone = true,
|
||||
};
|
||||
|
||||
|
||||
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
|
||||
|
||||
@@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
|
||||
if (!populated_zone(zone))
|
||||
continue;
|
||||
|
||||
cc->nr_freepages = 0;
|
||||
cc->nr_migratepages = 0;
|
||||
cc->zone = zone;
|
||||
INIT_LIST_HEAD(&cc->freepages);
|
||||
INIT_LIST_HEAD(&cc->migratepages);
|
||||
cc.nr_freepages = 0;
|
||||
cc.nr_migratepages = 0;
|
||||
cc.zone = zone;
|
||||
INIT_LIST_HEAD(&cc.freepages);
|
||||
INIT_LIST_HEAD(&cc.migratepages);
|
||||
|
||||
/*
|
||||
* When called via /proc/sys/vm/compact_memory
|
||||
* this makes sure we compact the whole zone regardless of
|
||||
* cached scanner positions.
|
||||
*/
|
||||
if (is_via_compact_memory(cc->order))
|
||||
__reset_isolation_suitable(zone);
|
||||
compact_zone(zone, &cc);
|
||||
|
||||
if (is_via_compact_memory(cc->order) ||
|
||||
!compaction_deferred(zone, cc->order))
|
||||
compact_zone(zone, cc);
|
||||
|
||||
VM_BUG_ON(!list_empty(&cc->freepages));
|
||||
VM_BUG_ON(!list_empty(&cc->migratepages));
|
||||
|
||||
if (is_via_compact_memory(cc->order))
|
||||
continue;
|
||||
|
||||
if (zone_watermark_ok(zone, cc->order,
|
||||
low_wmark_pages(zone), 0, 0))
|
||||
compaction_defer_reset(zone, cc->order, false);
|
||||
VM_BUG_ON(!list_empty(&cc.freepages));
|
||||
VM_BUG_ON(!list_empty(&cc.migratepages));
|
||||
}
|
||||
}
|
||||
|
||||
void compact_pgdat(pg_data_t *pgdat, int order)
|
||||
{
|
||||
struct compact_control cc = {
|
||||
.order = order,
|
||||
.mode = MIGRATE_ASYNC,
|
||||
};
|
||||
|
||||
if (!order)
|
||||
return;
|
||||
|
||||
__compact_pgdat(pgdat, &cc);
|
||||
}
|
||||
|
||||
static void compact_node(int nid)
|
||||
{
|
||||
struct compact_control cc = {
|
||||
.order = -1,
|
||||
.mode = MIGRATE_SYNC,
|
||||
.ignore_skip_hint = true,
|
||||
};
|
||||
|
||||
__compact_pgdat(NODE_DATA(nid), &cc);
|
||||
}
|
||||
|
||||
/* Compact all nodes in the system */
|
||||
static void compact_nodes(void)
|
||||
{
|
||||
@@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
|
||||
.ignore_skip_hint = true,
|
||||
|
||||
};
|
||||
bool success = false;
|
||||
|
||||
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
|
||||
cc.classzone_idx);
|
||||
count_vm_event(KCOMPACTD_WAKE);
|
||||
@@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
|
||||
return;
|
||||
status = compact_zone(zone, &cc);
|
||||
|
||||
if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
|
||||
cc.classzone_idx, 0)) {
|
||||
success = true;
|
||||
if (status == COMPACT_SUCCESS) {
|
||||
compaction_defer_reset(zone, cc.order, false);
|
||||
} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
|
||||
/*
|
||||
|
@@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
|
||||
|
||||
void __dump_page(struct page *page, const char *reason)
|
||||
{
|
||||
/*
|
||||
* Avoid VM_BUG_ON() in page_mapcount().
|
||||
* page->_mapcount space in struct page is used by sl[aou]b pages to
|
||||
* encode own info.
|
||||
*/
|
||||
int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
|
||||
|
||||
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
|
||||
|
@@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
|
||||
unsigned int prev_offset;
|
||||
int error = 0;
|
||||
|
||||
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
|
||||
return -EINVAL;
|
||||
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
|
||||
|
||||
index = *ppos >> PAGE_SHIFT;
|
||||
prev_index = ra->prev_pos >> PAGE_SHIFT;
|
||||
prev_offset = ra->prev_pos & (PAGE_SIZE-1);
|
||||
@@ -1721,7 +1725,9 @@ find_page:
|
||||
* wait_on_page_locked is used to avoid unnecessarily
|
||||
* serialisations and why it's safe.
|
||||
*/
|
||||
wait_on_page_locked_killable(page);
|
||||
error = wait_on_page_locked_killable(page);
|
||||
if (unlikely(error))
|
||||
goto readpage_error;
|
||||
if (PageUptodate(page))
|
||||
goto page_ok;
|
||||
|
||||
|
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
|
||||
static atomic_t huge_zero_refcount;
|
||||
struct page *huge_zero_page __read_mostly;
|
||||
|
||||
struct page *get_huge_zero_page(void)
|
||||
static struct page *get_huge_zero_page(void)
|
||||
{
|
||||
struct page *zero_page;
|
||||
retry:
|
||||
@@ -86,7 +86,7 @@ retry:
|
||||
return READ_ONCE(huge_zero_page);
|
||||
}
|
||||
|
||||
void put_huge_zero_page(void)
|
||||
static void put_huge_zero_page(void)
|
||||
{
|
||||
/*
|
||||
* Counter should never go to zero here. Only shrinker can put
|
||||
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
|
||||
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
|
||||
}
|
||||
|
||||
struct page *mm_get_huge_zero_page(struct mm_struct *mm)
|
||||
{
|
||||
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
|
||||
return READ_ONCE(huge_zero_page);
|
||||
|
||||
if (!get_huge_zero_page())
|
||||
return NULL;
|
||||
|
||||
if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
|
||||
put_huge_zero_page();
|
||||
|
||||
return READ_ONCE(huge_zero_page);
|
||||
}
|
||||
|
||||
void mm_put_huge_zero_page(struct mm_struct *mm)
|
||||
{
|
||||
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
|
||||
put_huge_zero_page();
|
||||
}
|
||||
|
||||
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
@@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page)
|
||||
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
|
||||
}
|
||||
|
||||
unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
|
||||
loff_t off, unsigned long flags, unsigned long size)
|
||||
{
|
||||
unsigned long addr;
|
||||
loff_t off_end = off + len;
|
||||
loff_t off_align = round_up(off, size);
|
||||
unsigned long len_pad;
|
||||
|
||||
if (off_end <= off_align || (off_end - off_align) < size)
|
||||
return 0;
|
||||
|
||||
len_pad = len + size;
|
||||
if (len_pad < len || (off + len_pad) < off)
|
||||
return 0;
|
||||
|
||||
addr = current->mm->get_unmapped_area(filp, 0, len_pad,
|
||||
off >> PAGE_SHIFT, flags);
|
||||
if (IS_ERR_VALUE(addr))
|
||||
return 0;
|
||||
|
||||
addr += (off - addr) & (size - 1);
|
||||
return addr;
|
||||
}
|
||||
|
||||
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
unsigned long len, unsigned long pgoff, unsigned long flags)
|
||||
{
|
||||
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
|
||||
|
||||
if (addr)
|
||||
goto out;
|
||||
if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
|
||||
goto out;
|
||||
|
||||
addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
|
||||
if (addr)
|
||||
return addr;
|
||||
|
||||
out:
|
||||
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
|
||||
|
||||
static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
|
||||
gfp_t gfp)
|
||||
{
|
||||
@@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
|
||||
pgtable = pte_alloc_one(vma->vm_mm, haddr);
|
||||
if (unlikely(!pgtable))
|
||||
return VM_FAULT_OOM;
|
||||
zero_page = get_huge_zero_page();
|
||||
zero_page = mm_get_huge_zero_page(vma->vm_mm);
|
||||
if (unlikely(!zero_page)) {
|
||||
pte_free(vma->vm_mm, pgtable);
|
||||
count_vm_event(THP_FAULT_FALLBACK);
|
||||
@@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
|
||||
}
|
||||
} else
|
||||
spin_unlock(fe->ptl);
|
||||
if (!set) {
|
||||
if (!set)
|
||||
pte_free(vma->vm_mm, pgtable);
|
||||
put_huge_zero_page();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
gfp = alloc_hugepage_direct_gfpmask(vma);
|
||||
@@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
* since we already have a zero page to copy. It just takes a
|
||||
* reference.
|
||||
*/
|
||||
zero_page = get_huge_zero_page();
|
||||
zero_page = mm_get_huge_zero_page(dst_mm);
|
||||
set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
|
||||
zero_page);
|
||||
ret = 0;
|
||||
@@ -1038,7 +1099,6 @@ alloc:
|
||||
update_mmu_cache_pmd(vma, fe->address, fe->pmd);
|
||||
if (!page) {
|
||||
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
|
||||
put_huge_zero_page();
|
||||
} else {
|
||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||
page_remove_rmap(page, true);
|
||||
@@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
|
||||
}
|
||||
smp_wmb(); /* make pte visible before pmd */
|
||||
pmd_populate(mm, pmd, pgtable);
|
||||
put_huge_zero_page();
|
||||
}
|
||||
|
||||
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
@@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
|
||||
if (!vma_is_anonymous(vma)) {
|
||||
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
|
||||
if (is_huge_zero_pmd(_pmd))
|
||||
put_huge_zero_page();
|
||||
if (vma_is_dax(vma))
|
||||
return;
|
||||
page = pmd_page(_pmd);
|
||||
@@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
if (soft_dirty)
|
||||
entry = pte_swp_mksoft_dirty(entry);
|
||||
} else {
|
||||
entry = mk_pte(page + i, vma->vm_page_prot);
|
||||
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
|
||||
entry = maybe_mkwrite(entry, vma);
|
||||
if (!write)
|
||||
entry = pte_wrprotect(entry);
|
||||
|
53
mm/hugetlb.c
53
mm/hugetlb.c
@@ -567,13 +567,13 @@ retry:
|
||||
* appear as a "reserved" entry instead of simply dangling with incorrect
|
||||
* counts.
|
||||
*/
|
||||
void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
|
||||
void hugetlb_fix_reserve_counts(struct inode *inode)
|
||||
{
|
||||
struct hugepage_subpool *spool = subpool_inode(inode);
|
||||
long rsv_adjust;
|
||||
|
||||
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
|
||||
if (restore_reserve && rsv_adjust) {
|
||||
if (rsv_adjust) {
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
|
||||
hugetlb_acct_memory(h, 1);
|
||||
@@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
|
||||
((node = hstate_next_node_to_free(hs, mask)) || 1); \
|
||||
nr_nodes--)
|
||||
|
||||
#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
|
||||
#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
|
||||
((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
|
||||
defined(CONFIG_CMA))
|
||||
static void destroy_compound_gigantic_page(struct page *page,
|
||||
@@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
|
||||
|
||||
/*
|
||||
* Dissolve a given free hugepage into free buddy pages. This function does
|
||||
* nothing for in-use (including surplus) hugepages.
|
||||
* nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
|
||||
* number of free hugepages would be reduced below the number of reserved
|
||||
* hugepages.
|
||||
*/
|
||||
static void dissolve_free_huge_page(struct page *page)
|
||||
static int dissolve_free_huge_page(struct page *page)
|
||||
{
|
||||
int rc = 0;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (PageHuge(page) && !page_count(page)) {
|
||||
struct hstate *h = page_hstate(page);
|
||||
int nid = page_to_nid(page);
|
||||
list_del(&page->lru);
|
||||
struct page *head = compound_head(page);
|
||||
struct hstate *h = page_hstate(head);
|
||||
int nid = page_to_nid(head);
|
||||
if (h->free_huge_pages - h->resv_huge_pages == 0) {
|
||||
rc = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
list_del(&head->lru);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
h->max_huge_pages--;
|
||||
update_and_free_page(h, page);
|
||||
update_and_free_page(h, head);
|
||||
}
|
||||
out:
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
|
||||
* make specified memory blocks removable from the system.
|
||||
* Note that start_pfn should aligned with (minimum) hugepage size.
|
||||
* Note that this will dissolve a free gigantic hugepage completely, if any
|
||||
* part of it lies within the given range.
|
||||
* Also note that if dissolve_free_huge_page() returns with an error, all
|
||||
* free hugepages that were dissolved before that error are lost.
|
||||
*/
|
||||
void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
|
||||
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
unsigned long pfn;
|
||||
struct page *page;
|
||||
int rc = 0;
|
||||
|
||||
if (!hugepages_supported())
|
||||
return;
|
||||
return rc;
|
||||
|
||||
VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
|
||||
dissolve_free_huge_page(pfn_to_page(pfn));
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
|
||||
page = pfn_to_page(pfn);
|
||||
if (PageHuge(page) && !page_count(page)) {
|
||||
rc = dissolve_free_huge_page(page);
|
||||
if (rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -178,8 +178,9 @@ struct compact_control {
|
||||
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
|
||||
enum migrate_mode mode; /* Async or sync migration mode */
|
||||
bool ignore_skip_hint; /* Scan blocks even if marked skip */
|
||||
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
|
||||
bool direct_compaction; /* False from kcompactd or /proc/... */
|
||||
bool whole_zone; /* Whole zone has been scanned */
|
||||
bool whole_zone; /* Whole zone should/has been scanned */
|
||||
int order; /* order a direct compactor needs */
|
||||
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
|
||||
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
|
||||
|
7
mm/ksm.c
7
mm/ksm.c
@@ -299,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
|
||||
|
||||
static inline struct stable_node *alloc_stable_node(void)
|
||||
{
|
||||
return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
|
||||
/*
|
||||
* The allocation can take too long with GFP_KERNEL when memory is under
|
||||
* pressure, which may lead to hung task warnings. Adding __GFP_HIGH
|
||||
* grants access to memory reserves, helping to avoid this problem.
|
||||
*/
|
||||
return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
|
||||
}
|
||||
|
||||
static inline void free_stable_node(struct stable_node *stable_node)
|
||||
|
@@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void)
|
||||
return memblock.memory.total_size;
|
||||
}
|
||||
|
||||
phys_addr_t __init_memblock memblock_reserved_size(void)
|
||||
{
|
||||
return memblock.reserved.total_size;
|
||||
}
|
||||
|
||||
phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
|
||||
{
|
||||
unsigned long pages = 0;
|
||||
|
154
mm/memcontrol.c
154
mm/memcontrol.c
@@ -920,6 +920,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
|
||||
iter != NULL; \
|
||||
iter = mem_cgroup_iter(NULL, iter, NULL))
|
||||
|
||||
/**
|
||||
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
|
||||
* @memcg: hierarchy root
|
||||
* @fn: function to call for each task
|
||||
* @arg: argument passed to @fn
|
||||
*
|
||||
* This function iterates over tasks attached to @memcg or to any of its
|
||||
* descendants and calls @fn for each task. If @fn returns a non-zero
|
||||
* value, the function breaks the iteration loop and returns the value.
|
||||
* Otherwise, it will iterate over all tasks and return 0.
|
||||
*
|
||||
* This function must not be called for the root memory cgroup.
|
||||
*/
|
||||
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
|
||||
int (*fn)(struct task_struct *, void *), void *arg)
|
||||
{
|
||||
struct mem_cgroup *iter;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(memcg == root_mem_cgroup);
|
||||
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
struct css_task_iter it;
|
||||
struct task_struct *task;
|
||||
|
||||
css_task_iter_start(&iter->css, &it);
|
||||
while (!ret && (task = css_task_iter_next(&it)))
|
||||
ret = fn(task, arg);
|
||||
css_task_iter_end(&it);
|
||||
if (ret) {
|
||||
mem_cgroup_iter_break(memcg, iter);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
|
||||
* @page: the page
|
||||
@@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
|
||||
/*
|
||||
* Return the memory (and swap, if configured) limit for a memcg.
|
||||
*/
|
||||
static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
{
|
||||
unsigned long limit;
|
||||
|
||||
@@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
.gfp_mask = gfp_mask,
|
||||
.order = order,
|
||||
};
|
||||
struct mem_cgroup *iter;
|
||||
unsigned long chosen_points = 0;
|
||||
unsigned long totalpages;
|
||||
unsigned int points = 0;
|
||||
struct task_struct *chosen = NULL;
|
||||
bool ret;
|
||||
|
||||
mutex_lock(&oom_lock);
|
||||
|
||||
/*
|
||||
* If current has a pending SIGKILL or is exiting, then automatically
|
||||
* select it. The goal is to allow it to allocate so that it may
|
||||
* quickly exit and free its memory.
|
||||
*/
|
||||
if (task_will_free_mem(current)) {
|
||||
mark_oom_victim(current);
|
||||
wake_oom_reaper(current);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
|
||||
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
struct css_task_iter it;
|
||||
struct task_struct *task;
|
||||
|
||||
css_task_iter_start(&iter->css, &it);
|
||||
while ((task = css_task_iter_next(&it))) {
|
||||
switch (oom_scan_process_thread(&oc, task)) {
|
||||
case OOM_SCAN_SELECT:
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
chosen = task;
|
||||
chosen_points = ULONG_MAX;
|
||||
get_task_struct(chosen);
|
||||
/* fall through */
|
||||
case OOM_SCAN_CONTINUE:
|
||||
continue;
|
||||
case OOM_SCAN_ABORT:
|
||||
css_task_iter_end(&it);
|
||||
mem_cgroup_iter_break(memcg, iter);
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
/* Set a dummy value to return "true". */
|
||||
chosen = (void *) 1;
|
||||
goto unlock;
|
||||
case OOM_SCAN_OK:
|
||||
break;
|
||||
};
|
||||
points = oom_badness(task, memcg, NULL, totalpages);
|
||||
if (!points || points < chosen_points)
|
||||
continue;
|
||||
/* Prefer thread group leaders for display purposes */
|
||||
if (points == chosen_points &&
|
||||
thread_group_leader(chosen))
|
||||
continue;
|
||||
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
chosen = task;
|
||||
chosen_points = points;
|
||||
get_task_struct(chosen);
|
||||
}
|
||||
css_task_iter_end(&it);
|
||||
}
|
||||
|
||||
if (chosen) {
|
||||
points = chosen_points * 1000 / totalpages;
|
||||
oom_kill_process(&oc, chosen, points, totalpages,
|
||||
"Memory cgroup out of memory");
|
||||
}
|
||||
unlock:
|
||||
ret = out_of_memory(&oc);
|
||||
mutex_unlock(&oom_lock);
|
||||
return chosen;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if MAX_NUMNODES > 1
|
||||
@@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
|
||||
if (!memcg)
|
||||
return false;
|
||||
|
||||
if (!handle || oom_killer_disabled)
|
||||
if (!handle)
|
||||
goto cleanup;
|
||||
|
||||
owait.memcg = memcg;
|
||||
@@ -2969,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
|
||||
/*
|
||||
* The active flag needs to be written after the static_key
|
||||
* update. This is what guarantees that the socket activation
|
||||
* function is the last one to run. See sock_update_memcg() for
|
||||
* details, and note that we don't mark any socket as belonging
|
||||
* to this memcg until that flag is up.
|
||||
* function is the last one to run. See mem_cgroup_sk_alloc()
|
||||
* for details, and note that we don't mark any socket as
|
||||
* belonging to this memcg until that flag is up.
|
||||
*
|
||||
* We need to do this, because static_keys will span multiple
|
||||
* sites, but we can't control their order. If we mark a socket
|
||||
* as accounted, but the accounting functions are not patched in
|
||||
* yet, we'll lose accounting.
|
||||
*
|
||||
* We never race with the readers in sock_update_memcg(),
|
||||
* We never race with the readers in mem_cgroup_sk_alloc(),
|
||||
* because when this value change, the code to process it is not
|
||||
* patched in yet.
|
||||
*/
|
||||
@@ -4092,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr);
|
||||
|
||||
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
|
||||
{
|
||||
VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
|
||||
atomic_add(n, &memcg->id.ref);
|
||||
}
|
||||
|
||||
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
|
||||
{
|
||||
VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
|
||||
if (atomic_sub_and_test(n, &memcg->id.ref)) {
|
||||
idr_remove(&mem_cgroup_idr, memcg->id.id);
|
||||
memcg->id.id = 0;
|
||||
@@ -4285,8 +4257,10 @@ fail:
|
||||
|
||||
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
/* Online state pins memcg ID, memcg ID pins CSS */
|
||||
mem_cgroup_id_get(mem_cgroup_from_css(css));
|
||||
atomic_set(&memcg->id.ref, 1);
|
||||
css_get(css);
|
||||
return 0;
|
||||
}
|
||||
@@ -4434,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
|
||||
* Because lookup_swap_cache() updates some statistics counter,
|
||||
* we call find_get_page() with swapper_space directly.
|
||||
*/
|
||||
page = find_get_page(swap_address_space(ent), ent.val);
|
||||
page = find_get_page(swap_address_space(ent), swp_offset(ent));
|
||||
if (do_memsw_account())
|
||||
entry->val = ent.val;
|
||||
|
||||
@@ -4472,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
|
||||
swp_entry_t swp = radix_to_swp_entry(page);
|
||||
if (do_memsw_account())
|
||||
*entry = swp;
|
||||
page = find_get_page(swap_address_space(swp), swp.val);
|
||||
page = find_get_page(swap_address_space(swp),
|
||||
swp_offset(swp));
|
||||
}
|
||||
} else
|
||||
page = find_get_page(mapping, pgoff);
|
||||
@@ -4707,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
|
||||
.mm = mm,
|
||||
};
|
||||
down_read(&mm->mmap_sem);
|
||||
walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
|
||||
walk_page_range(0, mm->highest_vm_end,
|
||||
&mem_cgroup_count_precharge_walk);
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
precharge = mc.precharge;
|
||||
@@ -4995,7 +4971,8 @@ retry:
|
||||
* When we have consumed all precharges and failed in doing
|
||||
* additional charge, the page walk just aborts.
|
||||
*/
|
||||
walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
|
||||
walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
|
||||
|
||||
up_read(&mc.mm->mmap_sem);
|
||||
atomic_dec(&mc.from->moving_account);
|
||||
}
|
||||
@@ -5674,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
|
||||
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
|
||||
EXPORT_SYMBOL(memcg_sockets_enabled_key);
|
||||
|
||||
void sock_update_memcg(struct sock *sk)
|
||||
void mem_cgroup_sk_alloc(struct sock *sk)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
/* Socket cloning can throw us here with sk_cgrp already
|
||||
if (!mem_cgroup_sockets_enabled)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Socket cloning can throw us here with sk_memcg already
|
||||
* filled. It won't however, necessarily happen from
|
||||
* process context. So the test for root memcg given
|
||||
* the current task's memcg won't help us in this case.
|
||||
@@ -5703,12 +5684,11 @@ void sock_update_memcg(struct sock *sk)
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL(sock_update_memcg);
|
||||
|
||||
void sock_release_memcg(struct sock *sk)
|
||||
void mem_cgroup_sk_free(struct sock *sk)
|
||||
{
|
||||
WARN_ON(!sk->sk_memcg);
|
||||
css_put(&sk->sk_memcg->css);
|
||||
if (sk->sk_memcg)
|
||||
css_put(&sk->sk_memcg->css);
|
||||
}
|
||||
|
||||
/**
|
||||
|
21
mm/memory.c
21
mm/memory.c
@@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot);
|
||||
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
||||
pfn_t pfn)
|
||||
{
|
||||
pgprot_t pgprot = vma->vm_page_prot;
|
||||
|
||||
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
|
||||
|
||||
if (addr < vma->vm_start || addr >= vma->vm_end)
|
||||
return -EFAULT;
|
||||
if (track_pfn_insert(vma, &pgprot, pfn))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* If we don't have pte special, then we have to use the pfn_valid()
|
||||
@@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
||||
* result in pfn_t_has_page() == false.
|
||||
*/
|
||||
page = pfn_to_page(pfn_t_to_pfn(pfn));
|
||||
return insert_page(vma, addr, page, vma->vm_page_prot);
|
||||
return insert_page(vma, addr, page, pgprot);
|
||||
}
|
||||
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
|
||||
return insert_pfn(vma, addr, pfn, pgprot);
|
||||
}
|
||||
EXPORT_SYMBOL(vm_insert_mixed);
|
||||
|
||||
@@ -3658,6 +3662,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
|
||||
mem_cgroup_oom_synchronize(false);
|
||||
}
|
||||
|
||||
/*
|
||||
* This mm has been already reaped by the oom reaper and so the
|
||||
* refault cannot be trusted in general. Anonymous refaults would
|
||||
* lose data and give a zero page instead e.g. This is especially
|
||||
* problem for use_mm() because regular tasks will just die and
|
||||
* the corrupted data will not be visible anywhere while kthread
|
||||
* will outlive the oom victim and potentially propagate the date
|
||||
* further.
|
||||
*/
|
||||
if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
|
||||
&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(handle_mm_fault);
|
||||
|
@@ -1945,7 +1945,9 @@ repeat:
|
||||
* dissolve free hugepages in the memory block before doing offlining
|
||||
* actually in order to make hugetlbfs's object counting consistent.
|
||||
*/
|
||||
dissolve_free_huge_pages(start_pfn, end_pfn);
|
||||
ret = dissolve_free_huge_pages(start_pfn, end_pfn);
|
||||
if (ret)
|
||||
goto failed_removal;
|
||||
/* check again */
|
||||
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
|
||||
if (offlined_pages < 0) {
|
||||
|
@@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void)
|
||||
*/
|
||||
struct zonelist *zonelist;
|
||||
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
|
||||
zonelist = &NODE_DATA(node)->node_zonelists[0];
|
||||
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
|
||||
z = first_zones_zonelist(zonelist, highest_zoneidx,
|
||||
&policy->v.nodes);
|
||||
return z->zone ? z->zone->node : node;
|
||||
|
@@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
goto unlock;
|
||||
|
||||
get_page(new);
|
||||
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
|
||||
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
|
||||
if (pte_swp_soft_dirty(*ptep))
|
||||
pte = pte_mksoft_dirty(pte);
|
||||
|
||||
|
@@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
|
||||
*/
|
||||
if (radix_tree_exceptional_entry(page)) {
|
||||
swp_entry_t swp = radix_to_swp_entry(page);
|
||||
page = find_get_page(swap_address_space(swp), swp.val);
|
||||
page = find_get_page(swap_address_space(swp),
|
||||
swp_offset(swp));
|
||||
}
|
||||
} else
|
||||
page = find_get_page(mapping, pgoff);
|
||||
@@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||
} else {
|
||||
#ifdef CONFIG_SWAP
|
||||
*vec = mincore_page(swap_address_space(entry),
|
||||
entry.val);
|
||||
swp_offset(entry));
|
||||
#else
|
||||
WARN_ON(1);
|
||||
*vec = 1;
|
||||
|
52
mm/mlock.c
52
mm/mlock.c
@@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
int nr_pages;
|
||||
int ret = 0;
|
||||
int lock = !!(newflags & VM_LOCKED);
|
||||
vm_flags_t old_flags = vma->vm_flags;
|
||||
|
||||
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
|
||||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
|
||||
@@ -550,6 +551,8 @@ success:
|
||||
nr_pages = (end - start) >> PAGE_SHIFT;
|
||||
if (!lock)
|
||||
nr_pages = -nr_pages;
|
||||
else if (old_flags & VM_LOCKED)
|
||||
nr_pages = 0;
|
||||
mm->locked_vm += nr_pages;
|
||||
|
||||
/*
|
||||
@@ -617,6 +620,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Go through vma areas and sum size of mlocked
|
||||
* vma pages, as return value.
|
||||
* Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
|
||||
* is also counted.
|
||||
* Return value: previously mlocked page counts
|
||||
*/
|
||||
static int count_mm_mlocked_page_nr(struct mm_struct *mm,
|
||||
unsigned long start, size_t len)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
int count = 0;
|
||||
|
||||
if (mm == NULL)
|
||||
mm = current->mm;
|
||||
|
||||
vma = find_vma(mm, start);
|
||||
if (vma == NULL)
|
||||
vma = mm->mmap;
|
||||
|
||||
for (; vma ; vma = vma->vm_next) {
|
||||
if (start >= vma->vm_end)
|
||||
continue;
|
||||
if (start + len <= vma->vm_start)
|
||||
break;
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
if (start > vma->vm_start)
|
||||
count -= (start - vma->vm_start);
|
||||
if (start + len < vma->vm_end) {
|
||||
count += start + len - vma->vm_start;
|
||||
break;
|
||||
}
|
||||
count += vma->vm_end - vma->vm_start;
|
||||
}
|
||||
}
|
||||
|
||||
return count >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
|
||||
{
|
||||
unsigned long locked;
|
||||
@@ -639,6 +681,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
|
||||
return -EINTR;
|
||||
|
||||
locked += current->mm->locked_vm;
|
||||
if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
|
||||
/*
|
||||
* It is possible that the regions requested intersect with
|
||||
* previously mlocked areas, that part area in "mm->locked_vm"
|
||||
* should not be counted to new mlock increment count. So check
|
||||
* and adjust locked count if necessary.
|
||||
*/
|
||||
locked -= count_mm_mlocked_page_nr(current->mm,
|
||||
start, len);
|
||||
}
|
||||
|
||||
/* check against resource limits */
|
||||
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
|
||||
|
238
mm/mmap.c
238
mm/mmap.c
@@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
|
||||
void vma_set_page_prot(struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long vm_flags = vma->vm_flags;
|
||||
pgprot_t vm_page_prot;
|
||||
|
||||
vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
|
||||
if (vma_wants_writenotify(vma)) {
|
||||
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
|
||||
if (vma_wants_writenotify(vma, vm_page_prot)) {
|
||||
vm_flags &= ~VM_SHARED;
|
||||
vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
|
||||
vm_flags);
|
||||
vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
|
||||
}
|
||||
/* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
|
||||
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -400,7 +402,32 @@ static inline void vma_rb_insert(struct vm_area_struct *vma,
|
||||
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
|
||||
}
|
||||
|
||||
static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
|
||||
static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
|
||||
{
|
||||
/*
|
||||
* Note rb_erase_augmented is a fairly large inline function,
|
||||
* so make sure we instantiate it only once with our desired
|
||||
* augmented rbtree callbacks.
|
||||
*/
|
||||
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
|
||||
}
|
||||
|
||||
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
|
||||
struct rb_root *root,
|
||||
struct vm_area_struct *ignore)
|
||||
{
|
||||
/*
|
||||
* All rb_subtree_gap values must be consistent prior to erase,
|
||||
* with the possible exception of the "next" vma being erased if
|
||||
* next->vm_start was reduced.
|
||||
*/
|
||||
validate_mm_rb(root, ignore);
|
||||
|
||||
__vma_rb_erase(vma, root);
|
||||
}
|
||||
|
||||
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
|
||||
struct rb_root *root)
|
||||
{
|
||||
/*
|
||||
* All rb_subtree_gap values must be consistent prior to erase,
|
||||
@@ -408,12 +435,7 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
|
||||
*/
|
||||
validate_mm_rb(root, vma);
|
||||
|
||||
/*
|
||||
* Note rb_erase_augmented is a fairly large inline function,
|
||||
* so make sure we instantiate it only once with our desired
|
||||
* augmented rbtree callbacks.
|
||||
*/
|
||||
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
|
||||
__vma_rb_erase(vma, root);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -599,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
mm->map_count++;
|
||||
}
|
||||
|
||||
static inline void
|
||||
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev)
|
||||
static __always_inline void __vma_unlink_common(struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev,
|
||||
bool has_prev,
|
||||
struct vm_area_struct *ignore)
|
||||
{
|
||||
struct vm_area_struct *next;
|
||||
|
||||
vma_rb_erase(vma, &mm->mm_rb);
|
||||
prev->vm_next = next = vma->vm_next;
|
||||
vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
|
||||
next = vma->vm_next;
|
||||
if (has_prev)
|
||||
prev->vm_next = next;
|
||||
else {
|
||||
prev = vma->vm_prev;
|
||||
if (prev)
|
||||
prev->vm_next = next;
|
||||
else
|
||||
mm->mmap = next;
|
||||
}
|
||||
if (next)
|
||||
next->vm_prev = prev;
|
||||
|
||||
@@ -614,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
vmacache_invalidate(mm);
|
||||
}
|
||||
|
||||
static inline void __vma_unlink_prev(struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev)
|
||||
{
|
||||
__vma_unlink_common(mm, vma, prev, true, vma);
|
||||
}
|
||||
|
||||
/*
|
||||
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
|
||||
* is already present in an i_mmap tree without adjusting the tree.
|
||||
@@ -621,11 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* are necessary. The "insert" vma (if any) is to be inserted
|
||||
* before we drop the necessary locks.
|
||||
*/
|
||||
int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
|
||||
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
|
||||
struct vm_area_struct *expand)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct vm_area_struct *next = vma->vm_next;
|
||||
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
|
||||
struct address_space *mapping = NULL;
|
||||
struct rb_root *root = NULL;
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
@@ -641,9 +682,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
/*
|
||||
* vma expands, overlapping all the next, and
|
||||
* perhaps the one after too (mprotect case 6).
|
||||
* The only other cases that gets here are
|
||||
* case 1, case 7 and case 8.
|
||||
*/
|
||||
remove_next = 1 + (end > next->vm_end);
|
||||
end = next->vm_end;
|
||||
if (next == expand) {
|
||||
/*
|
||||
* The only case where we don't expand "vma"
|
||||
* and we expand "next" instead is case 8.
|
||||
*/
|
||||
VM_WARN_ON(end != next->vm_end);
|
||||
/*
|
||||
* remove_next == 3 means we're
|
||||
* removing "vma" and that to do so we
|
||||
* swapped "vma" and "next".
|
||||
*/
|
||||
remove_next = 3;
|
||||
VM_WARN_ON(file != next->vm_file);
|
||||
swap(vma, next);
|
||||
} else {
|
||||
VM_WARN_ON(expand != vma);
|
||||
/*
|
||||
* case 1, 6, 7, remove_next == 2 is case 6,
|
||||
* remove_next == 1 is case 1 or 7.
|
||||
*/
|
||||
remove_next = 1 + (end > next->vm_end);
|
||||
VM_WARN_ON(remove_next == 2 &&
|
||||
end != next->vm_next->vm_end);
|
||||
VM_WARN_ON(remove_next == 1 &&
|
||||
end != next->vm_end);
|
||||
/* trim end to next, for case 6 first pass */
|
||||
end = next->vm_end;
|
||||
}
|
||||
|
||||
exporter = next;
|
||||
importer = vma;
|
||||
|
||||
@@ -651,7 +721,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
* If next doesn't have anon_vma, import from vma after
|
||||
* next, if the vma overlaps with it.
|
||||
*/
|
||||
if (remove_next == 2 && next && !next->anon_vma)
|
||||
if (remove_next == 2 && !next->anon_vma)
|
||||
exporter = next->vm_next;
|
||||
|
||||
} else if (end > next->vm_start) {
|
||||
@@ -662,6 +732,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
|
||||
exporter = next;
|
||||
importer = vma;
|
||||
VM_WARN_ON(expand != importer);
|
||||
} else if (end < vma->vm_end) {
|
||||
/*
|
||||
* vma shrinks, and !insert tells it's not
|
||||
@@ -671,6 +742,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
|
||||
exporter = vma;
|
||||
importer = next;
|
||||
VM_WARN_ON(expand != importer);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -688,7 +760,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
}
|
||||
}
|
||||
again:
|
||||
vma_adjust_trans_huge(vma, start, end, adjust_next);
|
||||
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
|
||||
|
||||
if (file) {
|
||||
mapping = file->f_mapping;
|
||||
@@ -714,8 +786,8 @@ again:
|
||||
if (!anon_vma && adjust_next)
|
||||
anon_vma = next->anon_vma;
|
||||
if (anon_vma) {
|
||||
VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
|
||||
anon_vma != next->anon_vma, next);
|
||||
VM_WARN_ON(adjust_next && next->anon_vma &&
|
||||
anon_vma != next->anon_vma);
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_interval_tree_pre_update_vma(vma);
|
||||
if (adjust_next)
|
||||
@@ -755,7 +827,19 @@ again:
|
||||
* vma_merge has merged next into vma, and needs
|
||||
* us to remove next before dropping the locks.
|
||||
*/
|
||||
__vma_unlink(mm, next, vma);
|
||||
if (remove_next != 3)
|
||||
__vma_unlink_prev(mm, next, vma);
|
||||
else
|
||||
/*
|
||||
* vma is not before next if they've been
|
||||
* swapped.
|
||||
*
|
||||
* pre-swap() next->vm_start was reduced so
|
||||
* tell validate_mm_rb to ignore pre-swap()
|
||||
* "next" (which is stored in post-swap()
|
||||
* "vma").
|
||||
*/
|
||||
__vma_unlink_common(mm, next, NULL, false, vma);
|
||||
if (file)
|
||||
__remove_shared_vm_struct(next, file, mapping);
|
||||
} else if (insert) {
|
||||
@@ -807,7 +891,27 @@ again:
|
||||
* we must remove another next too. It would clutter
|
||||
* up the code too much to do both in one go.
|
||||
*/
|
||||
next = vma->vm_next;
|
||||
if (remove_next != 3) {
|
||||
/*
|
||||
* If "next" was removed and vma->vm_end was
|
||||
* expanded (up) over it, in turn
|
||||
* "next->vm_prev->vm_end" changed and the
|
||||
* "vma->vm_next" gap must be updated.
|
||||
*/
|
||||
next = vma->vm_next;
|
||||
} else {
|
||||
/*
|
||||
* For the scope of the comment "next" and
|
||||
* "vma" considered pre-swap(): if "vma" was
|
||||
* removed, next->vm_start was expanded (down)
|
||||
* over it and the "next" gap must be updated.
|
||||
* Because of the swap() the post-swap() "vma"
|
||||
* actually points to pre-swap() "next"
|
||||
* (post-swap() "next" as opposed is now a
|
||||
* dangling pointer).
|
||||
*/
|
||||
next = vma;
|
||||
}
|
||||
if (remove_next == 2) {
|
||||
remove_next = 1;
|
||||
end = next->vm_end;
|
||||
@@ -815,8 +919,28 @@ again:
|
||||
}
|
||||
else if (next)
|
||||
vma_gap_update(next);
|
||||
else
|
||||
mm->highest_vm_end = end;
|
||||
else {
|
||||
/*
|
||||
* If remove_next == 2 we obviously can't
|
||||
* reach this path.
|
||||
*
|
||||
* If remove_next == 3 we can't reach this
|
||||
* path because pre-swap() next is always not
|
||||
* NULL. pre-swap() "next" is not being
|
||||
* removed and its next->vm_end is not altered
|
||||
* (and furthermore "end" already matches
|
||||
* next->vm_end in remove_next == 3).
|
||||
*
|
||||
* We reach this only in the remove_next == 1
|
||||
* case if the "next" vma that was removed was
|
||||
* the highest vma of the mm. However in such
|
||||
* case next->vm_end == "end" and the extended
|
||||
* "vma" has vma->vm_end == next->vm_end so
|
||||
* mm->highest_vm_end doesn't need any update
|
||||
* in remove_next == 1 case.
|
||||
*/
|
||||
VM_WARN_ON(mm->highest_vm_end != end);
|
||||
}
|
||||
}
|
||||
if (insert && file)
|
||||
uprobe_mmap(insert);
|
||||
@@ -936,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
|
||||
* cannot merge might become might become might become
|
||||
* PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
|
||||
* mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
|
||||
* mremap move: PPPPNNNNNNNN 8
|
||||
* mremap move: PPPPXXXXXXXX 8
|
||||
* AAAA
|
||||
* PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
|
||||
* might become case 1 below case 2 below case 3 below
|
||||
*
|
||||
* Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
|
||||
* mprotect_fixup updates vm_flags & vm_page_prot on successful return.
|
||||
* It is important for case 8 that the vma NNNN overlapping the
|
||||
* region AAAA is never going to be extended over XXXX. Instead XXXX must
|
||||
* be extended in region AAAA and NNNN must be removed. This way in
|
||||
* all cases where vma_merge succeeds, the moment vma_adjust drops the
|
||||
* rmap_locks, the properties of the merged vma will be already
|
||||
* correct for the whole merged range. Some of those properties like
|
||||
* vm_page_prot/vm_flags may be accessed by rmap_walks and they must
|
||||
* be correct for the whole merged range immediately after the
|
||||
* rmap_locks are released. Otherwise if XXXX would be removed and
|
||||
* NNNN would be extended over the XXXX range, remove_migration_ptes
|
||||
* or other rmap walkers (if working on addresses beyond the "end"
|
||||
* parameter) may establish ptes with the wrong permissions of NNNN
|
||||
* instead of the right permissions of XXXX.
|
||||
*/
|
||||
struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
struct vm_area_struct *prev, unsigned long addr,
|
||||
@@ -967,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
else
|
||||
next = mm->mmap;
|
||||
area = next;
|
||||
if (next && next->vm_end == end) /* cases 6, 7, 8 */
|
||||
if (area && area->vm_end == end) /* cases 6, 7, 8 */
|
||||
next = next->vm_next;
|
||||
|
||||
/* verify some invariant that must be enforced by the caller */
|
||||
VM_WARN_ON(prev && addr <= prev->vm_start);
|
||||
VM_WARN_ON(area && end > area->vm_end);
|
||||
VM_WARN_ON(addr >= end);
|
||||
|
||||
/*
|
||||
* Can it merge with the predecessor?
|
||||
*/
|
||||
@@ -990,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
is_mergeable_anon_vma(prev->anon_vma,
|
||||
next->anon_vma, NULL)) {
|
||||
/* cases 1, 6 */
|
||||
err = vma_adjust(prev, prev->vm_start,
|
||||
next->vm_end, prev->vm_pgoff, NULL);
|
||||
err = __vma_adjust(prev, prev->vm_start,
|
||||
next->vm_end, prev->vm_pgoff, NULL,
|
||||
prev);
|
||||
} else /* cases 2, 5, 7 */
|
||||
err = vma_adjust(prev, prev->vm_start,
|
||||
end, prev->vm_pgoff, NULL);
|
||||
err = __vma_adjust(prev, prev->vm_start,
|
||||
end, prev->vm_pgoff, NULL, prev);
|
||||
if (err)
|
||||
return NULL;
|
||||
khugepaged_enter_vma_merge(prev, vm_flags);
|
||||
@@ -1010,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
anon_vma, file, pgoff+pglen,
|
||||
vm_userfaultfd_ctx)) {
|
||||
if (prev && addr < prev->vm_end) /* case 4 */
|
||||
err = vma_adjust(prev, prev->vm_start,
|
||||
addr, prev->vm_pgoff, NULL);
|
||||
else /* cases 3, 8 */
|
||||
err = vma_adjust(area, addr, next->vm_end,
|
||||
next->vm_pgoff - pglen, NULL);
|
||||
err = __vma_adjust(prev, prev->vm_start,
|
||||
addr, prev->vm_pgoff, NULL, next);
|
||||
else { /* cases 3, 8 */
|
||||
err = __vma_adjust(area, addr, next->vm_end,
|
||||
next->vm_pgoff - pglen, NULL, next);
|
||||
/*
|
||||
* In case 3 area is already equal to next and
|
||||
* this is a noop, but in case 8 "area" has
|
||||
* been removed and next was expanded over it.
|
||||
*/
|
||||
area = next;
|
||||
}
|
||||
if (err)
|
||||
return NULL;
|
||||
khugepaged_enter_vma_merge(area, vm_flags);
|
||||
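These vma_merge() hunks switch the callers from vma_adjust() to __vma_adjust() so that the VMA being expanded is passed explicitly. As a rough sketch of the wrapper pattern only, in plain C with placeholder names rather than the kernel's types, the legacy entry point simply forwards with the new argument defaulted:

/* Sketch only: placeholder type standing in for vm_area_struct. */
struct vma_stub;

int __vma_adjust_sketch(struct vma_stub *vma, unsigned long start,
			unsigned long end, unsigned long pgoff,
			struct vma_stub *insert, struct vma_stub *expand);

/*
 * The old five-argument form survives as a thin wrapper that forwards with
 * expand == NULL, mirroring how vma_adjust() now wraps __vma_adjust().
 */
static inline int vma_adjust_sketch(struct vma_stub *vma, unsigned long start,
				    unsigned long end, unsigned long pgoff,
				    struct vma_stub *insert)
{
	return __vma_adjust_sketch(vma, start, end, pgoff, insert, NULL);
}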
@@ -1386,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
|
||||
* to the private version (using protection_map[] without the
|
||||
* VM_SHARED bit).
|
||||
*/
|
||||
int vma_wants_writenotify(struct vm_area_struct *vma)
|
||||
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
|
||||
{
|
||||
vm_flags_t vm_flags = vma->vm_flags;
|
||||
const struct vm_operations_struct *vm_ops = vma->vm_ops;
|
||||
@@ -1401,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
|
||||
|
||||
/* The open routine did something to the protections that pgprot_modify
|
||||
* won't preserve? */
|
||||
if (pgprot_val(vma->vm_page_prot) !=
|
||||
pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
|
||||
if (pgprot_val(vm_page_prot) !=
|
||||
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
|
||||
return 0;
|
||||
|
||||
/* Do we need to track softdirty? */
|
||||
|
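The vma_wants_writenotify() change above adds the pgprot to check as a parameter, so a caller can ask the question about a candidate protection before installing it, as the mprotect_fixup() hunk below does with vma->vm_page_prot. A minimal sketch of that API shape, with invented names and flag bits and no assumptions about the real pgprot encoding:

#include <stdbool.h>
#include <stdio.h>

#define RG_SHARED	0x1UL		/* invented flag bits for the sketch */
#define RG_WRITABLE	0x2UL

struct region {				/* hypothetical stand-in for a VMA */
	unsigned long flags;
	unsigned long page_prot;	/* currently installed protection */
};

/* The predicate judges a caller-supplied candidate, not the installed value. */
static bool wants_writenotify(const struct region *r, unsigned long candidate_prot)
{
	return (r->flags & RG_SHARED) && (candidate_prot & RG_WRITABLE);
}

int main(void)
{
	struct region r = { .flags = RG_SHARED, .page_prot = 0 };
	unsigned long new_prot = RG_WRITABLE;

	/* Decide on the prospective value first, then commit it. */
	if (wants_writenotify(&r, new_prot))
		printf("would need write notifications\n");
	r.page_prot = new_prot;
	return 0;
}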
@@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
|
||||
vma->vm_userfaultfd_ctx);
|
||||
if (*pprev) {
|
||||
vma = *pprev;
|
||||
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
|
||||
goto success;
|
||||
}
|
||||
|
||||
@@ -327,7 +328,7 @@ success:
|
||||
* held in write mode.
|
||||
*/
|
||||
vma->vm_flags = newflags;
|
||||
dirty_accountable = vma_wants_writenotify(vma);
|
||||
dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
|
||||
vma_set_page_prot(vma);
|
||||
|
||||
change_protection(vma, start, end, vma->vm_page_prot,
|
||||
|
@@ -11,18 +11,21 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/range.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/bootmem.h>
|
||||
|
||||
#include <asm/bug.h>
|
||||
#include <asm/io.h>
|
||||
#include <asm/processor.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
#ifndef CONFIG_HAVE_MEMBLOCK
|
||||
#error CONFIG_HAVE_MEMBLOCK not defined
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
||||
struct pglist_data __refdata contig_page_data;
|
||||
EXPORT_SYMBOL(contig_page_data);
|
||||
@@ -134,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void)
|
||||
for_each_reserved_mem_region(i, &start, &end)
|
||||
reserve_bootmem_region(start, end);
|
||||
|
||||
/*
|
||||
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
|
||||
* because in some case like Node0 doesn't have RAM installed
|
||||
* low ram will be on Node1
|
||||
*/
|
||||
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
|
||||
NULL)
|
||||
count += __free_memory_core(start, end);
|
||||
@@ -191,11 +199,6 @@ unsigned long __init free_all_bootmem(void)
|
||||
|
||||
reset_all_zones_managed_pages();
|
||||
|
||||
/*
|
||||
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
|
||||
* because in some case like Node0 doesn't have RAM installed
|
||||
* low ram will be on Node1
|
||||
*/
|
||||
pages = free_low_memory_core_early();
|
||||
totalram_pages += pages;
|
||||
|
||||
@@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
|
||||
return __alloc_bootmem_node(pgdat, size, align, goal);
|
||||
}
|
||||
|
||||
#ifndef ARCH_LOW_ADDRESS_LIMIT
|
||||
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
|
||||
#endif
|
||||
|
||||
/**
|
||||
* __alloc_bootmem_low - allocate low boot memory
|
||||
|
381
mm/oom_kill.c
@@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
|
||||
return oc->order == -1;
|
||||
}
|
||||
|
||||
static inline bool is_memcg_oom(struct oom_control *oc)
|
||||
{
|
||||
return oc->memcg != NULL;
|
||||
}
|
||||
|
||||
/* return true if the task is not adequate as candidate victim task. */
|
||||
static bool oom_unkillable_task(struct task_struct *p,
|
||||
struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
@@ -181,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
||||
*/
|
||||
adj = (long)p->signal->oom_score_adj;
|
||||
if (adj == OOM_SCORE_ADJ_MIN ||
|
||||
test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
|
||||
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
|
||||
in_vfork(p)) {
|
||||
task_unlock(p);
|
||||
return 0;
|
||||
@@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
||||
return points > 0 ? points : 1;
|
||||
}
|
||||
|
||||
enum oom_constraint {
|
||||
CONSTRAINT_NONE,
|
||||
CONSTRAINT_CPUSET,
|
||||
CONSTRAINT_MEMORY_POLICY,
|
||||
CONSTRAINT_MEMCG,
|
||||
};
|
||||
|
||||
/*
|
||||
* Determine the type of allocation constraint.
|
||||
*/
|
||||
#ifdef CONFIG_NUMA
|
||||
static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
unsigned long *totalpages)
|
||||
static enum oom_constraint constrained_alloc(struct oom_control *oc)
|
||||
{
|
||||
struct zone *zone;
|
||||
struct zoneref *z;
|
||||
@@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
bool cpuset_limited = false;
|
||||
int nid;
|
||||
|
||||
if (is_memcg_oom(oc)) {
|
||||
oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
|
||||
return CONSTRAINT_MEMCG;
|
||||
}
|
||||
|
||||
/* Default to all available memory */
|
||||
*totalpages = totalram_pages + total_swap_pages;
|
||||
oc->totalpages = totalram_pages + total_swap_pages;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_NUMA))
|
||||
return CONSTRAINT_NONE;
|
||||
|
||||
if (!oc->zonelist)
|
||||
return CONSTRAINT_NONE;
|
||||
@@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
*/
|
||||
if (oc->nodemask &&
|
||||
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
|
||||
*totalpages = total_swap_pages;
|
||||
oc->totalpages = total_swap_pages;
|
||||
for_each_node_mask(nid, *oc->nodemask)
|
||||
*totalpages += node_spanned_pages(nid);
|
||||
oc->totalpages += node_spanned_pages(nid);
|
||||
return CONSTRAINT_MEMORY_POLICY;
|
||||
}
|
||||
|
||||
@@ -259,98 +277,84 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
cpuset_limited = true;
|
||||
|
||||
if (cpuset_limited) {
|
||||
*totalpages = total_swap_pages;
|
||||
oc->totalpages = total_swap_pages;
|
||||
for_each_node_mask(nid, cpuset_current_mems_allowed)
|
||||
*totalpages += node_spanned_pages(nid);
|
||||
oc->totalpages += node_spanned_pages(nid);
|
||||
return CONSTRAINT_CPUSET;
|
||||
}
|
||||
return CONSTRAINT_NONE;
|
||||
}
|
||||
#else
|
||||
static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
unsigned long *totalpages)
|
||||
{
|
||||
*totalpages = totalram_pages + total_swap_pages;
|
||||
return CONSTRAINT_NONE;
|
||||
}
|
||||
#endif
|
||||
|
||||
enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
|
||||
struct task_struct *task)
|
||||
static int oom_evaluate_task(struct task_struct *task, void *arg)
|
||||
{
|
||||
struct oom_control *oc = arg;
|
||||
unsigned long points;
|
||||
|
||||
if (oom_unkillable_task(task, NULL, oc->nodemask))
|
||||
return OOM_SCAN_CONTINUE;
|
||||
goto next;
|
||||
|
||||
/*
|
||||
* This task already has access to memory reserves and is being killed.
|
||||
* Don't allow any other task to have access to the reserves unless
|
||||
* the task has MMF_OOM_REAPED because chances that it would release
|
||||
* the task has MMF_OOM_SKIP because chances that it would release
|
||||
* any memory is quite low.
|
||||
*/
|
||||
if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
|
||||
struct task_struct *p = find_lock_task_mm(task);
|
||||
enum oom_scan_t ret = OOM_SCAN_ABORT;
|
||||
|
||||
if (p) {
|
||||
if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
|
||||
ret = OOM_SCAN_CONTINUE;
|
||||
task_unlock(p);
|
||||
}
|
||||
|
||||
return ret;
|
||||
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
|
||||
if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
|
||||
goto next;
|
||||
goto abort;
|
||||
}
|
||||
|
||||
/*
|
||||
* If task is allocating a lot of memory and has been marked to be
|
||||
* killed first if it triggers an oom, then select it.
|
||||
*/
|
||||
if (oom_task_origin(task))
|
||||
return OOM_SCAN_SELECT;
|
||||
if (oom_task_origin(task)) {
|
||||
points = ULONG_MAX;
|
||||
goto select;
|
||||
}
|
||||
|
||||
return OOM_SCAN_OK;
|
||||
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
|
||||
if (!points || points < oc->chosen_points)
|
||||
goto next;
|
||||
|
||||
/* Prefer thread group leaders for display purposes */
|
||||
if (points == oc->chosen_points && thread_group_leader(oc->chosen))
|
||||
goto next;
|
||||
select:
|
||||
if (oc->chosen)
|
||||
put_task_struct(oc->chosen);
|
||||
get_task_struct(task);
|
||||
oc->chosen = task;
|
||||
oc->chosen_points = points;
|
||||
next:
|
||||
return 0;
|
||||
abort:
|
||||
if (oc->chosen)
|
||||
put_task_struct(oc->chosen);
|
||||
oc->chosen = (void *)-1UL;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Simple selection loop. We chose the process with the highest
|
||||
* number of 'points'. Returns -1 on scan abort.
|
||||
* Simple selection loop. We choose the process with the highest number of
|
||||
* 'points'. In case scan was aborted, oc->chosen is set to -1.
|
||||
*/
|
||||
static struct task_struct *select_bad_process(struct oom_control *oc,
|
||||
unsigned int *ppoints, unsigned long totalpages)
|
||||
static void select_bad_process(struct oom_control *oc)
|
||||
{
|
||||
struct task_struct *p;
|
||||
struct task_struct *chosen = NULL;
|
||||
unsigned long chosen_points = 0;
|
||||
if (is_memcg_oom(oc))
|
||||
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
|
||||
else {
|
||||
struct task_struct *p;
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_process(p) {
|
||||
unsigned int points;
|
||||
|
||||
switch (oom_scan_process_thread(oc, p)) {
|
||||
case OOM_SCAN_SELECT:
|
||||
chosen = p;
|
||||
chosen_points = ULONG_MAX;
|
||||
/* fall through */
|
||||
case OOM_SCAN_CONTINUE:
|
||||
continue;
|
||||
case OOM_SCAN_ABORT:
|
||||
rcu_read_unlock();
|
||||
return (struct task_struct *)(-1UL);
|
||||
case OOM_SCAN_OK:
|
||||
break;
|
||||
};
|
||||
points = oom_badness(p, NULL, oc->nodemask, totalpages);
|
||||
if (!points || points < chosen_points)
|
||||
continue;
|
||||
|
||||
chosen = p;
|
||||
chosen_points = points;
|
||||
rcu_read_lock();
|
||||
for_each_process(p)
|
||||
if (oom_evaluate_task(p, oc))
|
||||
break;
|
||||
rcu_read_unlock();
|
||||
}
|
||||
if (chosen)
|
||||
get_task_struct(chosen);
|
||||
rcu_read_unlock();
|
||||
|
||||
*ppoints = chosen_points * 1000 / totalpages;
|
||||
return chosen;
|
||||
oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
|
||||
}
|
||||
|
||||
/**
|
||||
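With this rewrite, select_bad_process() drives a single oom_evaluate_task() callback over either a memcg's tasks or the global task list, accumulating the winner in the oom_control itself. A small userspace model of that callback shape, with invented types and a toy badness score rather than the kernel's scoring:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins: a candidate task and a selection context. */
struct task { const char *name; unsigned long badness; int abort_scan; };
struct selection { struct task *chosen; unsigned long chosen_points; };

/* Mirrors the shape of oom_evaluate_task(): return nonzero to stop the walk. */
static int evaluate(struct task *t, void *arg)
{
	struct selection *sel = arg;

	if (t->abort_scan) {		/* e.g. an existing victim is still exiting */
		sel->chosen = (void *)-1UL;
		return 1;
	}
	if (t->badness > sel->chosen_points) {
		sel->chosen = t;
		sel->chosen_points = t->badness;
	}
	return 0;			/* keep scanning */
}

int main(void)
{
	struct task tasks[] = { { "a", 10, 0 }, { "b", 40, 0 }, { "c", 25, 0 } };
	struct selection sel = { NULL, 0 };

	for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		if (evaluate(&tasks[i], &sel))
			break;

	if (sel.chosen && sel.chosen != (void *)-1UL)
		printf("chosen: %s (%lu points)\n", sel.chosen->name, sel.chosen_points);
	return 0;
}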
@@ -399,9 +403,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
|
||||
static void dump_header(struct oom_control *oc, struct task_struct *p)
|
||||
{
|
||||
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
|
||||
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
|
||||
nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed;
|
||||
|
||||
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
|
||||
current->comm, oc->gfp_mask, &oc->gfp_mask,
|
||||
nodemask_pr_args(nm), oc->order,
|
||||
current->signal->oom_score_adj);
|
||||
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
|
||||
pr_warn("COMPACTION is disabled!!!\n");
|
||||
|
||||
cpuset_print_current_mems_allowed();
|
||||
dump_stack();
|
||||
@@ -419,7 +428,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
|
||||
static atomic_t oom_victims = ATOMIC_INIT(0);
|
||||
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
|
||||
|
||||
bool oom_killer_disabled __read_mostly;
|
||||
static bool oom_killer_disabled __read_mostly;
|
||||
|
||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||
|
||||
@@ -452,12 +461,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
|
||||
static struct task_struct *oom_reaper_list;
|
||||
static DEFINE_SPINLOCK(oom_reaper_lock);
|
||||
|
||||
static bool __oom_reap_task(struct task_struct *tsk)
|
||||
static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_gather tlb;
|
||||
struct vm_area_struct *vma;
|
||||
struct mm_struct *mm = NULL;
|
||||
struct task_struct *p;
|
||||
struct zap_details details = {.check_swap_entries = true,
|
||||
.ignore_dirty = true};
|
||||
bool ret = true;
|
||||
@@ -465,7 +472,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
|
||||
/*
|
||||
* We have to make sure to not race with the victim exit path
|
||||
* and cause premature new oom victim selection:
|
||||
* __oom_reap_task exit_mm
|
||||
* __oom_reap_task_mm exit_mm
|
||||
* mmget_not_zero
|
||||
* mmput
|
||||
* atomic_dec_and_test
|
||||
@@ -478,22 +485,9 @@ static bool __oom_reap_task(struct task_struct *tsk)
|
||||
*/
|
||||
mutex_lock(&oom_lock);
|
||||
|
||||
/*
|
||||
* Make sure we find the associated mm_struct even when the particular
|
||||
* thread has already terminated and cleared its mm.
|
||||
* We might have race with exit path so consider our work done if there
|
||||
* is no mm.
|
||||
*/
|
||||
p = find_lock_task_mm(tsk);
|
||||
if (!p)
|
||||
goto unlock_oom;
|
||||
mm = p->mm;
|
||||
atomic_inc(&mm->mm_count);
|
||||
task_unlock(p);
|
||||
|
||||
if (!down_read_trylock(&mm->mmap_sem)) {
|
||||
ret = false;
|
||||
goto mm_drop;
|
||||
goto unlock_oom;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -503,9 +497,17 @@ static bool __oom_reap_task(struct task_struct *tsk)
|
||||
*/
|
||||
if (!mmget_not_zero(mm)) {
|
||||
up_read(&mm->mmap_sem);
|
||||
goto mm_drop;
|
||||
goto unlock_oom;
|
||||
}
|
||||
|
||||
/*
|
||||
* Tell all users of get_user/copy_from_user etc... that the content
|
||||
* is no longer stable. No barriers really needed because unmapping
|
||||
* should imply barriers already and the reader would hit a page fault
|
||||
* if it stumbled over a reaped memory.
|
||||
*/
|
||||
set_bit(MMF_UNSTABLE, &mm->flags);
|
||||
|
||||
tlb_gather_mmu(&tlb, mm, 0, -1);
|
||||
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
@@ -540,19 +542,12 @@ static bool __oom_reap_task(struct task_struct *tsk)
|
||||
K(get_mm_counter(mm, MM_SHMEMPAGES)));
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
/*
|
||||
* This task can be safely ignored because we cannot do much more
|
||||
* to release its memory.
|
||||
*/
|
||||
set_bit(MMF_OOM_REAPED, &mm->flags);
|
||||
/*
|
||||
* Drop our reference but make sure the mmput slow path is called from a
|
||||
* different context because we shouldn't risk we get stuck there and
|
||||
* put the oom_reaper out of the way.
|
||||
*/
|
||||
mmput_async(mm);
|
||||
mm_drop:
|
||||
mmdrop(mm);
|
||||
unlock_oom:
|
||||
mutex_unlock(&oom_lock);
|
||||
return ret;
|
||||
@@ -562,44 +557,28 @@ unlock_oom:
|
||||
static void oom_reap_task(struct task_struct *tsk)
|
||||
{
|
||||
int attempts = 0;
|
||||
struct mm_struct *mm = tsk->signal->oom_mm;
|
||||
|
||||
/* Retry the down_read_trylock(mmap_sem) a few times */
|
||||
while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
|
||||
while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
|
||||
schedule_timeout_idle(HZ/10);
|
||||
|
||||
if (attempts > MAX_OOM_REAP_RETRIES) {
|
||||
struct task_struct *p;
|
||||
if (attempts <= MAX_OOM_REAP_RETRIES)
|
||||
goto done;
|
||||
|
||||
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
|
||||
task_pid_nr(tsk), tsk->comm);
|
||||
|
||||
/*
|
||||
* If we've already tried to reap this task in the past and
|
||||
* failed it probably doesn't make much sense to try yet again
|
||||
* so hide the mm from the oom killer so that it can move on
|
||||
* to another task with a different mm struct.
|
||||
*/
|
||||
p = find_lock_task_mm(tsk);
|
||||
if (p) {
|
||||
if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
|
||||
pr_info("oom_reaper: giving up pid:%d (%s)\n",
|
||||
task_pid_nr(tsk), tsk->comm);
|
||||
set_bit(MMF_OOM_REAPED, &p->mm->flags);
|
||||
}
|
||||
task_unlock(p);
|
||||
}
|
||||
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
|
||||
task_pid_nr(tsk), tsk->comm);
|
||||
debug_show_all_locks();
|
||||
|
||||
debug_show_all_locks();
|
||||
}
|
||||
done:
|
||||
tsk->oom_reaper_list = NULL;
|
||||
|
||||
/*
|
||||
* Clear TIF_MEMDIE because the task shouldn't be sitting on a
|
||||
* reasonably reclaimable memory anymore or it is not a good candidate
|
||||
* for the oom victim right now because it cannot release its memory
|
||||
* itself nor by the oom reaper.
|
||||
* Hide this mm from OOM killer because it has been either reaped or
|
||||
* somebody can't call up_write(mmap_sem).
|
||||
*/
|
||||
tsk->oom_reaper_list = NULL;
|
||||
exit_oom_victim(tsk);
|
||||
set_bit(MMF_OOM_SKIP, &mm->flags);
|
||||
|
||||
/* Drop a reference taken by wake_oom_reaper */
|
||||
put_task_struct(tsk);
|
||||
@@ -607,8 +586,6 @@ static void oom_reap_task(struct task_struct *tsk)
|
||||
|
||||
static int oom_reaper(void *unused)
|
||||
{
|
||||
set_freezable();
|
||||
|
||||
while (true) {
|
||||
struct task_struct *tsk = NULL;
|
||||
|
||||
@@ -627,7 +604,7 @@ static int oom_reaper(void *unused)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void wake_oom_reaper(struct task_struct *tsk)
|
||||
static void wake_oom_reaper(struct task_struct *tsk)
|
||||
{
|
||||
if (!oom_reaper_th)
|
||||
return;
|
||||
@@ -656,7 +633,11 @@ static int __init oom_init(void)
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(oom_init)
|
||||
#endif
|
||||
#else
|
||||
static inline void wake_oom_reaper(struct task_struct *tsk)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_MMU */
|
||||
|
||||
/**
|
||||
* mark_oom_victim - mark the given task as OOM victim
|
||||
@@ -664,14 +645,23 @@ subsys_initcall(oom_init)
|
||||
*
|
||||
* Has to be called with oom_lock held and never after
|
||||
* oom has been disabled already.
|
||||
*
|
||||
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
|
||||
* under task_lock or operate on the current).
|
||||
*/
|
||||
void mark_oom_victim(struct task_struct *tsk)
|
||||
static void mark_oom_victim(struct task_struct *tsk)
|
||||
{
|
||||
struct mm_struct *mm = tsk->mm;
|
||||
|
||||
WARN_ON(oom_killer_disabled);
|
||||
/* OOM killer might race with memcg OOM */
|
||||
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
|
||||
return;
|
||||
atomic_inc(&tsk->signal->oom_victims);
|
||||
|
||||
/* oom_mm is bound to the signal struct life time. */
|
||||
if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
|
||||
atomic_inc(&tsk->signal->oom_mm->mm_count);
|
||||
|
||||
/*
|
||||
* Make sure that the task is woken up from uninterruptible sleep
|
||||
* if it is frozen because OOM killer wouldn't be able to free
|
||||
@@ -685,21 +675,29 @@ void mark_oom_victim(struct task_struct *tsk)
|
||||
/**
|
||||
* exit_oom_victim - note the exit of an OOM victim
|
||||
*/
|
||||
void exit_oom_victim(struct task_struct *tsk)
|
||||
void exit_oom_victim(void)
|
||||
{
|
||||
if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
|
||||
return;
|
||||
atomic_dec(&tsk->signal->oom_victims);
|
||||
clear_thread_flag(TIF_MEMDIE);
|
||||
|
||||
if (!atomic_dec_return(&oom_victims))
|
||||
wake_up_all(&oom_victims_wait);
|
||||
}
|
||||
|
||||
/**
|
||||
* oom_killer_enable - enable OOM killer
|
||||
*/
|
||||
void oom_killer_enable(void)
|
||||
{
|
||||
oom_killer_disabled = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* oom_killer_disable - disable OOM killer
|
||||
* @timeout: maximum timeout to wait for oom victims in jiffies
|
||||
*
|
||||
* Forces all page allocations to fail rather than trigger OOM killer.
|
||||
* Will block and wait until all OOM victims are killed.
|
||||
* Will block and wait until all OOM victims are killed or the given
|
||||
* timeout expires.
|
||||
*
|
||||
* The function cannot be called when there are runnable user tasks because
|
||||
* the userspace would see unexpected allocation failures as a result. Any
|
||||
@@ -708,8 +706,10 @@ void exit_oom_victim(struct task_struct *tsk)
|
||||
* Returns true if successful and false if the OOM killer cannot be
|
||||
* disabled.
|
||||
*/
|
||||
bool oom_killer_disable(void)
|
||||
bool oom_killer_disable(signed long timeout)
|
||||
{
|
||||
signed long ret;
|
||||
|
||||
/*
|
||||
* Make sure to not race with an ongoing OOM killer. Check that the
|
||||
* current is not killed (possibly due to sharing the victim's memory).
|
||||
@@ -719,19 +719,16 @@ bool oom_killer_disable(void)
|
||||
oom_killer_disabled = true;
|
||||
mutex_unlock(&oom_lock);
|
||||
|
||||
wait_event(oom_victims_wait, !atomic_read(&oom_victims));
|
||||
ret = wait_event_interruptible_timeout(oom_victims_wait,
|
||||
!atomic_read(&oom_victims), timeout);
|
||||
if (ret <= 0) {
|
||||
oom_killer_enable();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* oom_killer_enable - enable OOM killer
|
||||
*/
|
||||
void oom_killer_enable(void)
|
||||
{
|
||||
oom_killer_disabled = false;
|
||||
}
|
||||
|
||||
static inline bool __task_will_free_mem(struct task_struct *task)
|
||||
{
|
||||
struct signal_struct *sig = task->signal;
|
||||
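oom_killer_disable() now takes a timeout and rolls the disable back if outstanding victims do not exit in time. A rough userspace analogue of that wait-with-timeout-and-rollback pattern, using POSIX condition variables (link with -lpthread; all names here are invented):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t victims_gone = PTHREAD_COND_INITIALIZER;
static int victims;			/* analogue of the oom_victims counter */
static bool killer_disabled;

static bool killer_disable(long timeout_sec)
{
	struct timespec deadline;
	int err = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_sec;

	pthread_mutex_lock(&lock);
	killer_disabled = true;
	while (victims > 0 && err == 0)
		err = pthread_cond_timedwait(&victims_gone, &lock, &deadline);
	if (victims > 0) {
		/* Timed out: undo the disable, as the kernel version now does. */
		killer_disabled = false;
		pthread_mutex_unlock(&lock);
		return false;
	}
	pthread_mutex_unlock(&lock);
	return true;
}

int main(void)
{
	/* With no victims outstanding the disable succeeds immediately. */
	return killer_disable(1) ? 0 : 1;
}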
@@ -760,7 +757,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
|
||||
* Caller has to make sure that task->mm is stable (hold task_lock or
|
||||
* it operates on the current).
|
||||
*/
|
||||
bool task_will_free_mem(struct task_struct *task)
|
||||
static bool task_will_free_mem(struct task_struct *task)
|
||||
{
|
||||
struct mm_struct *mm = task->mm;
|
||||
struct task_struct *p;
|
||||
@@ -781,15 +778,16 @@ bool task_will_free_mem(struct task_struct *task)
|
||||
* This task has already been drained by the oom reaper so there are
|
||||
* only small chances it will free some more
|
||||
*/
|
||||
if (test_bit(MMF_OOM_REAPED, &mm->flags))
|
||||
if (test_bit(MMF_OOM_SKIP, &mm->flags))
|
||||
return false;
|
||||
|
||||
if (atomic_read(&mm->mm_users) <= 1)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* This is really pessimistic but we do not have any reliable way
|
||||
* to check that external processes share with our mm
|
||||
* Make sure that all tasks which share the mm with the given tasks
|
||||
* are dying as well to make sure that a) nobody pins its mm and
|
||||
* b) the task is also reapable by the oom reaper.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
for_each_process(p) {
|
||||
@@ -806,14 +804,10 @@ bool task_will_free_mem(struct task_struct *task)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must be called while holding a reference to p, which will be released upon
|
||||
* returning.
|
||||
*/
|
||||
void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
unsigned int points, unsigned long totalpages,
|
||||
const char *message)
|
||||
static void oom_kill_process(struct oom_control *oc, const char *message)
|
||||
{
|
||||
struct task_struct *p = oc->chosen;
|
||||
unsigned int points = oc->chosen_points;
|
||||
struct task_struct *victim = p;
|
||||
struct task_struct *child;
|
||||
struct task_struct *t;
|
||||
@@ -860,7 +854,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
* oom_badness() returns 0 if the thread is unkillable
|
||||
*/
|
||||
child_points = oom_badness(child,
|
||||
oc->memcg, oc->nodemask, totalpages);
|
||||
oc->memcg, oc->nodemask, oc->totalpages);
|
||||
if (child_points > victim_points) {
|
||||
put_task_struct(victim);
|
||||
victim = child;
|
||||
@@ -913,20 +907,20 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
continue;
|
||||
if (same_thread_group(p, victim))
|
||||
continue;
|
||||
if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
|
||||
/*
|
||||
* We cannot use oom_reaper for the mm shared by this
|
||||
* process because it wouldn't get killed and so the
|
||||
* memory might be still used. Hide the mm from the oom
|
||||
* killer to guarantee OOM forward progress.
|
||||
*/
|
||||
if (is_global_init(p)) {
|
||||
can_oom_reap = false;
|
||||
set_bit(MMF_OOM_REAPED, &mm->flags);
|
||||
set_bit(MMF_OOM_SKIP, &mm->flags);
|
||||
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
|
||||
task_pid_nr(victim), victim->comm,
|
||||
task_pid_nr(p), p->comm);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* No use_mm() user needs to read from the userspace so we are
|
||||
* ok to reap it.
|
||||
*/
|
||||
if (unlikely(p->flags & PF_KTHREAD))
|
||||
continue;
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
@@ -942,7 +936,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
/*
|
||||
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
||||
*/
|
||||
void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
|
||||
static void check_panic_on_oom(struct oom_control *oc,
|
||||
enum oom_constraint constraint)
|
||||
{
|
||||
if (likely(!sysctl_panic_on_oom))
|
||||
return;
|
||||
@@ -988,19 +983,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
||||
*/
|
||||
bool out_of_memory(struct oom_control *oc)
|
||||
{
|
||||
struct task_struct *p;
|
||||
unsigned long totalpages;
|
||||
unsigned long freed = 0;
|
||||
unsigned int uninitialized_var(points);
|
||||
enum oom_constraint constraint = CONSTRAINT_NONE;
|
||||
|
||||
if (oom_killer_disabled)
|
||||
return false;
|
||||
|
||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||
if (freed > 0)
|
||||
/* Got some memory back in the last second. */
|
||||
return true;
|
||||
if (!is_memcg_oom(oc)) {
|
||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||
if (freed > 0)
|
||||
/* Got some memory back in the last second. */
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If current has a pending SIGKILL or is exiting, then automatically
|
||||
@@ -1024,37 +1018,38 @@ bool out_of_memory(struct oom_control *oc)
|
||||
|
||||
/*
|
||||
* Check if there were limitations on the allocation (only relevant for
|
||||
* NUMA) that may require different handling.
|
||||
* NUMA and memcg) that may require different handling.
|
||||
*/
|
||||
constraint = constrained_alloc(oc, &totalpages);
|
||||
constraint = constrained_alloc(oc);
|
||||
if (constraint != CONSTRAINT_MEMORY_POLICY)
|
||||
oc->nodemask = NULL;
|
||||
check_panic_on_oom(oc, constraint);
|
||||
|
||||
if (sysctl_oom_kill_allocating_task && current->mm &&
|
||||
!oom_unkillable_task(current, NULL, oc->nodemask) &&
|
||||
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
|
||||
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
|
||||
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
||||
get_task_struct(current);
|
||||
oom_kill_process(oc, current, 0, totalpages,
|
||||
"Out of memory (oom_kill_allocating_task)");
|
||||
oc->chosen = current;
|
||||
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
|
||||
return true;
|
||||
}
|
||||
|
||||
p = select_bad_process(oc, &points, totalpages);
|
||||
select_bad_process(oc);
|
||||
/* Found nothing?!?! Either we hang forever, or we panic. */
|
||||
if (!p && !is_sysrq_oom(oc)) {
|
||||
if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
|
||||
dump_header(oc, NULL);
|
||||
panic("Out of memory and no killable processes...\n");
|
||||
}
|
||||
if (p && p != (void *)-1UL) {
|
||||
oom_kill_process(oc, p, points, totalpages, "Out of memory");
|
||||
if (oc->chosen && oc->chosen != (void *)-1UL) {
|
||||
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
|
||||
"Memory cgroup out of memory");
|
||||
/*
|
||||
* Give the killed process a good chance to exit before trying
|
||||
* to allocate memory again.
|
||||
*/
|
||||
schedule_timeout_killable(1);
|
||||
}
|
||||
return true;
|
||||
return !!oc->chosen;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1077,16 +1072,6 @@ void pagefault_out_of_memory(void)
|
||||
|
||||
if (!mutex_trylock(&oom_lock))
|
||||
return;
|
||||
|
||||
if (!out_of_memory(&oc)) {
|
||||
/*
|
||||
* There shouldn't be any user tasks runnable while the
|
||||
* OOM killer is disabled, so the current task has to
|
||||
* be a racing OOM victim for which oom_killer_disable()
|
||||
* is waiting for.
|
||||
*/
|
||||
WARN_ON(test_thread_flag(TIF_MEMDIE));
|
||||
}
|
||||
|
||||
out_of_memory(&oc);
|
||||
mutex_unlock(&oom_lock);
|
||||
}
|
||||
|
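Across these oom_kill.c hunks, values that used to travel through output parameters (totalpages, the chosen task and its score) now live in struct oom_control. A compact sketch of that style, with invented fields and numbers standing in for the real accounting:

#include <stdio.h>

struct control {
	/* inputs */
	int order;
	/* results filled in by helpers instead of output parameters */
	unsigned long totalpages;
	const char *chosen;
	unsigned long chosen_points;
};

static void constrained_alloc_model(struct control *c)
{
	c->totalpages = 1024;		/* pretend: RAM plus swap */
}

static void select_model(struct control *c)
{
	c->chosen = "worst-task";
	c->chosen_points = 900 * 1000 / c->totalpages;
}

int main(void)
{
	struct control c = { .order = 0 };

	constrained_alloc_model(&c);
	select_model(&c);
	printf("%s scores %lu\n", c.chosen, c.chosen_points);
	return 0;
}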
@@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
|
||||
return false;
|
||||
}
|
||||
|
||||
void throttle_vm_writeout(gfp_t gfp_mask)
|
||||
{
|
||||
unsigned long background_thresh;
|
||||
unsigned long dirty_thresh;
|
||||
|
||||
for ( ; ; ) {
|
||||
global_dirty_limits(&background_thresh, &dirty_thresh);
|
||||
dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
|
||||
|
||||
/*
|
||||
* Boost the allowable dirty threshold a bit for page
|
||||
* allocators so they don't get DoS'ed by heavy writers
|
||||
*/
|
||||
dirty_thresh += dirty_thresh / 10; /* wheeee... */
|
||||
|
||||
if (global_node_page_state(NR_UNSTABLE_NFS) +
|
||||
global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
|
||||
break;
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/10);
|
||||
|
||||
/*
|
||||
* The caller might hold locks which can prevent IO completion
|
||||
* or progress in the filesystem. So we cannot just sit here
|
||||
* waiting for IO to complete.
|
||||
*/
|
||||
if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
|
||||
*/
|
||||
@@ -2746,7 +2716,7 @@ int test_clear_page_writeback(struct page *page)
|
||||
int ret;
|
||||
|
||||
lock_page_memcg(page);
|
||||
if (mapping) {
|
||||
if (mapping && mapping_use_writeback_tags(mapping)) {
|
||||
struct inode *inode = mapping->host;
|
||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
unsigned long flags;
|
||||
@@ -2789,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
|
||||
int ret;
|
||||
|
||||
lock_page_memcg(page);
|
||||
if (mapping) {
|
||||
if (mapping && mapping_use_writeback_tags(mapping)) {
|
||||
struct inode *inode = mapping->host;
|
||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
unsigned long flags;
|
||||
|
275
mm/page_alloc.c
@@ -607,6 +607,9 @@ static bool need_debug_guardpage(void)
|
||||
if (!debug_pagealloc_enabled())
|
||||
return false;
|
||||
|
||||
if (!debug_guardpage_minorder())
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -615,6 +618,9 @@ static void init_debug_guardpage(void)
|
||||
if (!debug_pagealloc_enabled())
|
||||
return;
|
||||
|
||||
if (!debug_guardpage_minorder())
|
||||
return;
|
||||
|
||||
_debug_guardpage_enabled = true;
|
||||
}
|
||||
|
||||
@@ -635,19 +641,22 @@ static int __init debug_guardpage_minorder_setup(char *buf)
|
||||
pr_info("Setting debug_guardpage_minorder to %lu\n", res);
|
||||
return 0;
|
||||
}
|
||||
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
|
||||
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
|
||||
|
||||
static inline void set_page_guard(struct zone *zone, struct page *page,
|
||||
static inline bool set_page_guard(struct zone *zone, struct page *page,
|
||||
unsigned int order, int migratetype)
|
||||
{
|
||||
struct page_ext *page_ext;
|
||||
|
||||
if (!debug_guardpage_enabled())
|
||||
return;
|
||||
return false;
|
||||
|
||||
if (order >= debug_guardpage_minorder())
|
||||
return false;
|
||||
|
||||
page_ext = lookup_page_ext(page);
|
||||
if (unlikely(!page_ext))
|
||||
return;
|
||||
return false;
|
||||
|
||||
__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
|
||||
|
||||
@@ -655,6 +664,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
|
||||
set_page_private(page, order);
|
||||
/* Guard pages are not available for any usage */
|
||||
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void clear_page_guard(struct zone *zone, struct page *page,
|
||||
@@ -676,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
|
||||
__mod_zone_freepage_state(zone, (1 << order), migratetype);
|
||||
}
|
||||
#else
|
||||
struct page_ext_operations debug_guardpage_ops = { NULL, };
|
||||
static inline void set_page_guard(struct zone *zone, struct page *page,
|
||||
unsigned int order, int migratetype) {}
|
||||
struct page_ext_operations debug_guardpage_ops;
|
||||
static inline bool set_page_guard(struct zone *zone, struct page *page,
|
||||
unsigned int order, int migratetype) { return false; }
|
||||
static inline void clear_page_guard(struct zone *zone, struct page *page,
|
||||
unsigned int order, int migratetype) {}
|
||||
#endif
|
||||
@@ -1393,15 +1404,18 @@ static void __init deferred_free_range(struct page *page,
|
||||
return;
|
||||
|
||||
/* Free a large naturally-aligned chunk if possible */
|
||||
if (nr_pages == MAX_ORDER_NR_PAGES &&
|
||||
(pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
|
||||
if (nr_pages == pageblock_nr_pages &&
|
||||
(pfn & (pageblock_nr_pages - 1)) == 0) {
|
||||
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
||||
__free_pages_boot_core(page, MAX_ORDER-1);
|
||||
__free_pages_boot_core(page, pageblock_order);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < nr_pages; i++, page++)
|
||||
for (i = 0; i < nr_pages; i++, page++, pfn++) {
|
||||
if ((pfn & (pageblock_nr_pages - 1)) == 0)
|
||||
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
||||
__free_pages_boot_core(page, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Completion tracking for deferred_init_memmap() threads */
|
||||
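deferred_free_range() now works in pageblock units rather than MAX_ORDER chunks: a whole, naturally aligned pageblock is freed in one go, and otherwise the migratetype is set each time the loop crosses a pageblock boundary. A userspace model of that control flow, assuming a 512-page pageblock purely for illustration:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* assumption for the sketch; arch-dependent in reality */

/* Userspace model only: "free" means print what the kernel path would do. */
static void deferred_free_range_model(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long i;

	/* Whole, naturally aligned pageblock: hand it back as one high-order chunk. */
	if (nr_pages == PAGEBLOCK_NR_PAGES && (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0) {
		printf("free pageblock at pfn %lu\n", pfn);
		return;
	}

	/* Otherwise free page by page, marking each pageblock boundary we cross. */
	for (i = 0; i < nr_pages; i++, pfn++) {
		if ((pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0)
			printf("pageblock boundary at pfn %lu: set MIGRATE_MOVABLE\n", pfn);
		printf("free pfn %lu\n", pfn);
	}
}

int main(void)
{
	deferred_free_range_model(512, 512);	/* aligned: one chunk */
	deferred_free_range_model(1000, 50);	/* crosses a boundary at 1024 */
	return 0;
}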
@@ -1469,9 +1483,9 @@ static int __init deferred_init_memmap(void *data)
|
||||
|
||||
/*
|
||||
* Ensure pfn_valid is checked every
|
||||
* MAX_ORDER_NR_PAGES for memory holes
|
||||
* pageblock_nr_pages for memory holes
|
||||
*/
|
||||
if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
|
||||
if ((pfn & (pageblock_nr_pages - 1)) == 0) {
|
||||
if (!pfn_valid(pfn)) {
|
||||
page = NULL;
|
||||
goto free_range;
|
||||
@@ -1484,7 +1498,7 @@ static int __init deferred_init_memmap(void *data)
|
||||
}
|
||||
|
||||
/* Minimise pfn page lookups and scheduler checks */
|
||||
if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
|
||||
if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
|
||||
page++;
|
||||
} else {
|
||||
nr_pages += nr_to_free;
|
||||
@@ -1520,6 +1534,9 @@ free_range:
|
||||
free_base_page = NULL;
|
||||
free_base_pfn = nr_to_free = 0;
|
||||
}
|
||||
/* Free the last block of pages to allocator */
|
||||
nr_pages += nr_to_free;
|
||||
deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
|
||||
|
||||
first_init_pfn = max(end_pfn, first_init_pfn);
|
||||
}
|
||||
@@ -1616,18 +1633,15 @@ static inline void expand(struct zone *zone, struct page *page,
|
||||
size >>= 1;
|
||||
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
|
||||
|
||||
if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
|
||||
debug_guardpage_enabled() &&
|
||||
high < debug_guardpage_minorder()) {
|
||||
/*
|
||||
* Mark as guard pages (or page), that will allow to
|
||||
* merge back to allocator when buddy will be freed.
|
||||
* Corresponding page table entries will not be touched,
|
||||
* pages will stay not present in virtual address space
|
||||
*/
|
||||
set_page_guard(zone, &page[size], high, migratetype);
|
||||
/*
|
||||
* Mark as guard pages (or page), that will allow to
|
||||
* merge back to allocator when buddy will be freed.
|
||||
* Corresponding page table entries will not be touched,
|
||||
* pages will stay not present in virtual address space
|
||||
*/
|
||||
if (set_page_guard(zone, &page[size], high, migratetype))
|
||||
continue;
|
||||
}
|
||||
|
||||
list_add(&page[size].lru, &area->free_list[migratetype]);
|
||||
area->nr_free++;
|
||||
set_page_order(&page[size], high);
|
||||
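set_page_guard() now reports whether it actually marked the page, so expand() can keep or skip the free-list insertion with a single if/continue instead of repeating the debug_guardpage checks itself. A tiny sketch of that helper-reports-back pattern, with made-up orders and a stand-in for the real preconditions:

#include <stdbool.h>
#include <stdio.h>

/* Returns true only if the guard was applied; the caller acts on the answer. */
static bool mark_guard(unsigned int order, unsigned int minorder)
{
	if (minorder == 0 || order >= minorder)
		return false;		/* guarding disabled or order too large */
	/* ... mark the page range as a guard page here ... */
	return true;
}

int main(void)
{
	for (unsigned int order = 5; order-- > 0; ) {
		if (mark_guard(order, 3)) {
			printf("order %u: guarded, skip the free list\n", order);
			continue;
		}
		printf("order %u: put on the free list\n", order);
	}
	return 0;
}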
@@ -2489,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order)
|
||||
mt = get_pageblock_migratetype(page);
|
||||
|
||||
if (!is_migrate_isolate(mt)) {
|
||||
/* Obey watermarks as if the page was being allocated */
|
||||
watermark = low_wmark_pages(zone) + (1 << order);
|
||||
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
|
||||
/*
|
||||
* Obey watermarks as if the page was being allocated. We can
|
||||
* emulate a high-order watermark check with a raised order-0
|
||||
* watermark, because we already know our high-order page
|
||||
* exists.
|
||||
*/
|
||||
watermark = min_wmark_pages(zone) + (1UL << order);
|
||||
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
|
||||
return 0;
|
||||
|
||||
__mod_zone_freepage_state(zone, -(1UL << order), mt);
|
||||
@@ -2960,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
|
||||
DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
|
||||
void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
|
||||
{
|
||||
unsigned int filter = SHOW_MEM_FILTER_NODES;
|
||||
struct va_format vaf;
|
||||
va_list args;
|
||||
|
||||
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
|
||||
debug_guardpage_minorder() > 0)
|
||||
@@ -2980,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
|
||||
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
|
||||
filter &= ~SHOW_MEM_FILTER_NODES;
|
||||
|
||||
if (fmt) {
|
||||
struct va_format vaf;
|
||||
va_list args;
|
||||
pr_warn("%s: ", current->comm);
|
||||
|
||||
va_start(args, fmt);
|
||||
va_start(args, fmt);
|
||||
vaf.fmt = fmt;
|
||||
vaf.va = &args;
|
||||
pr_cont("%pV", &vaf);
|
||||
va_end(args);
|
||||
|
||||
vaf.fmt = fmt;
|
||||
vaf.va = &args;
|
||||
pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
|
||||
|
||||
pr_warn("%pV", &vaf);
|
||||
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
|
||||
current->comm, order, gfp_mask, &gfp_mask);
|
||||
dump_stack();
|
||||
if (!should_suppress_show_mem())
|
||||
show_mem(filter);
|
||||
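warn_alloc() folds the old warn_alloc_failed() into one variadic helper, so the new stall warning and the allocation-failure message share the same rate limiting and dump path. A userspace sketch of the variadic shape only, with an invented gfp value and message prefix:

#include <stdarg.h>
#include <stdio.h>

static void warn_alloc_model(unsigned int gfp_mask, const char *fmt, ...)
{
	char msg[128];
	va_list args;

	/* Format the caller-supplied part first, then emit one combined line. */
	va_start(args, fmt);
	vsnprintf(msg, sizeof(msg), fmt, args);
	va_end(args);

	fprintf(stderr, "demo: %s, mode:%#x\n", msg, gfp_mask);
}

int main(void)
{
	warn_alloc_model(0x24000c0u, "page allocation stalls for %ums, order:%u", 10000u, 2u);
	warn_alloc_model(0x24000c0u, "page allocation failure: order:%u", 3u);
	return 0;
}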
@@ -3137,6 +3152,65 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
|
||||
enum compact_result compact_result,
|
||||
enum compact_priority *compact_priority,
|
||||
int *compaction_retries)
|
||||
{
|
||||
int max_retries = MAX_COMPACT_RETRIES;
|
||||
int min_priority;
|
||||
|
||||
if (!order)
|
||||
return false;
|
||||
|
||||
if (compaction_made_progress(compact_result))
|
||||
(*compaction_retries)++;
|
||||
|
||||
/*
|
||||
* compaction considers all the zones as desperately out of memory
|
||||
* so it doesn't really make much sense to retry except when the
|
||||
* failure could be caused by insufficient priority
|
||||
*/
|
||||
if (compaction_failed(compact_result))
|
||||
goto check_priority;
|
||||
|
||||
/*
|
||||
* make sure the compaction wasn't deferred or didn't bail out early
|
||||
* due to locks contention before we declare that we should give up.
|
||||
* But do not retry if the given zonelist is not suitable for
|
||||
* compaction.
|
||||
*/
|
||||
if (compaction_withdrawn(compact_result))
|
||||
return compaction_zonelist_suitable(ac, order, alloc_flags);
|
||||
|
||||
/*
|
||||
* !costly requests are much more important than __GFP_REPEAT
|
||||
* costly ones because they are de facto nofail and invoke OOM
|
||||
* killer to move on while costly can fail and users are ready
|
||||
* to cope with that. 1/4 retries is rather arbitrary but we
|
||||
* would need much more detailed feedback from compaction to
|
||||
* make a better decision.
|
||||
*/
|
||||
if (order > PAGE_ALLOC_COSTLY_ORDER)
|
||||
max_retries /= 4;
|
||||
if (*compaction_retries <= max_retries)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Make sure there are attempts at the highest priority if we exhausted
|
||||
* all retries or failed at the lower priorities.
|
||||
*/
|
||||
check_priority:
|
||||
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
|
||||
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
|
||||
if (*compact_priority > min_priority) {
|
||||
(*compact_priority)--;
|
||||
*compaction_retries = 0;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
static inline struct page *
|
||||
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
@@ -3147,13 +3221,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_COMPACTION */
|
||||
|
||||
static inline bool
|
||||
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
|
||||
enum compact_result compact_result,
|
||||
enum compact_priority *compact_priority,
|
||||
int compaction_retries)
|
||||
int *compaction_retries)
|
||||
{
|
||||
struct zone *zone;
|
||||
struct zoneref *z;
|
||||
@@ -3175,6 +3247,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif /* CONFIG_COMPACTION */
|
||||
|
||||
/* Perform direct synchronous page reclaim */
|
||||
static int
|
||||
@@ -3325,16 +3398,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
|
||||
static inline bool
|
||||
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
|
||||
struct alloc_context *ac, int alloc_flags,
|
||||
bool did_some_progress, int no_progress_loops)
|
||||
bool did_some_progress, int *no_progress_loops)
|
||||
{
|
||||
struct zone *zone;
|
||||
struct zoneref *z;
|
||||
|
||||
/*
|
||||
* Costly allocations might have made a progress but this doesn't mean
|
||||
* their order will become available due to high fragmentation so
|
||||
* always increment the no progress counter for them
|
||||
*/
|
||||
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
|
||||
*no_progress_loops = 0;
|
||||
else
|
||||
(*no_progress_loops)++;
|
||||
|
||||
/*
|
||||
* Make sure we converge to OOM if we cannot make any progress
|
||||
* several times in the row.
|
||||
*/
|
||||
if (no_progress_loops > MAX_RECLAIM_RETRIES)
|
||||
if (*no_progress_loops > MAX_RECLAIM_RETRIES)
|
||||
return false;
|
||||
|
||||
/*
|
||||
@@ -3349,7 +3432,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
|
||||
unsigned long reclaimable;
|
||||
|
||||
available = reclaimable = zone_reclaimable_pages(zone);
|
||||
available -= DIV_ROUND_UP(no_progress_loops * available,
|
||||
available -= DIV_ROUND_UP((*no_progress_loops) * available,
|
||||
MAX_RECLAIM_RETRIES);
|
||||
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
|
||||
|
||||
@@ -3410,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
enum compact_result compact_result;
|
||||
int compaction_retries = 0;
|
||||
int no_progress_loops = 0;
|
||||
unsigned long alloc_start = jiffies;
|
||||
unsigned int stall_timeout = 10 * HZ;
|
||||
|
||||
/*
|
||||
* In the slowpath, we sanity check order to avoid ever trying to
|
||||
@@ -3554,9 +3639,6 @@ retry:
|
||||
if (page)
|
||||
goto got_pg;
|
||||
|
||||
if (order && compaction_made_progress(compact_result))
|
||||
compaction_retries++;
|
||||
|
||||
/* Do not loop if specifically requested */
|
||||
if (gfp_mask & __GFP_NORETRY)
|
||||
goto nopage;
|
||||
@@ -3568,18 +3650,16 @@ retry:
|
||||
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
|
||||
goto nopage;
|
||||
|
||||
/*
|
||||
* Costly allocations might have made a progress but this doesn't mean
|
||||
* their order will become available due to high fragmentation so
|
||||
* always increment the no progress counter for them
|
||||
*/
|
||||
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
|
||||
no_progress_loops = 0;
|
||||
else
|
||||
no_progress_loops++;
|
||||
/* Make sure we know about allocations which stall for too long */
|
||||
if (time_after(jiffies, alloc_start + stall_timeout)) {
|
||||
warn_alloc(gfp_mask,
|
||||
"page alloction stalls for %ums, order:%u\n",
|
||||
jiffies_to_msecs(jiffies-alloc_start), order);
|
||||
stall_timeout += 10 * HZ;
|
||||
}
|
||||
|
||||
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
|
||||
did_some_progress > 0, no_progress_loops))
|
||||
did_some_progress > 0, &no_progress_loops))
|
||||
goto retry;
|
||||
|
||||
/*
|
||||
@@ -3591,7 +3671,7 @@ retry:
|
||||
if (did_some_progress > 0 &&
|
||||
should_compact_retry(ac, order, alloc_flags,
|
||||
compact_result, &compact_priority,
|
||||
compaction_retries))
|
||||
&compaction_retries))
|
||||
goto retry;
|
||||
|
||||
/* Reclaim has failed us, start killing things */
|
||||
@@ -3606,7 +3686,8 @@ retry:
|
||||
}
|
||||
|
||||
nopage:
|
||||
warn_alloc_failed(gfp_mask, order, NULL);
|
||||
warn_alloc(gfp_mask,
|
||||
"page allocation failure: order:%u", order);
|
||||
got_pg:
|
||||
return page;
|
||||
}
|
||||
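should_reclaim_retry() and should_compact_retry() now receive their retry counters by pointer and update them internally, so __alloc_pages_slowpath() no longer maintains no_progress_loops and compaction_retries by hand. A small userspace model of handing counter ownership to the helper, with arbitrary limits:

#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES 16

/* The helper both updates the counter and answers "keep retrying?". */
static bool should_retry(bool made_progress, int *no_progress_loops)
{
	if (made_progress)
		*no_progress_loops = 0;
	else
		(*no_progress_loops)++;

	return *no_progress_loops <= MAX_RETRIES;
}

int main(void)
{
	int no_progress = 0;
	bool progress = false;
	int attempt = 0;

	while (should_retry(progress, &no_progress)) {
		attempt++;
		/* pretend reclaim makes progress on the third attempt only */
		progress = (attempt == 3);
		if (attempt > 40)
			break;		/* safety stop for the demo */
	}
	printf("gave up after %d attempts\n", attempt);
	return 0;
}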
@@ -4555,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
|
||||
int j;
|
||||
struct zonelist *zonelist;
|
||||
|
||||
zonelist = &pgdat->node_zonelists[0];
|
||||
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
|
||||
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
|
||||
;
|
||||
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
|
||||
@@ -4571,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
|
||||
int j;
|
||||
struct zonelist *zonelist;
|
||||
|
||||
zonelist = &pgdat->node_zonelists[1];
|
||||
zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
|
||||
j = build_zonelists_node(pgdat, zonelist, 0);
|
||||
zonelist->_zonerefs[j].zone = NULL;
|
||||
zonelist->_zonerefs[j].zone_idx = 0;
|
||||
@@ -4592,7 +4673,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
|
||||
struct zone *z;
|
||||
struct zonelist *zonelist;
|
||||
|
||||
zonelist = &pgdat->node_zonelists[0];
|
||||
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
|
||||
pos = 0;
|
||||
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
|
||||
for (j = 0; j < nr_nodes; j++) {
|
||||
@@ -4727,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat)
|
||||
|
||||
local_node = pgdat->node_id;
|
||||
|
||||
zonelist = &pgdat->node_zonelists[0];
|
||||
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
|
||||
j = build_zonelists_node(pgdat, zonelist, 0);
|
||||
|
||||
/*
|
||||
@@ -4999,15 +5080,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
|
||||
break;
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
/*
|
||||
* If not mirrored_kernelcore and ZONE_MOVABLE exists, range
|
||||
* from zone_movable_pfn[nid] to end of each node should be
|
||||
* ZONE_MOVABLE not ZONE_NORMAL. skip it.
|
||||
*/
|
||||
if (!mirrored_kernelcore && zone_movable_pfn[nid])
|
||||
if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Check given memblock attribute by firmware which can affect
|
||||
* kernel memory layout. If zone==ZONE_MOVABLE but memory is
|
||||
@@ -5451,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
|
||||
*zone_end_pfn = min(node_end_pfn,
|
||||
arch_zone_highest_possible_pfn[movable_zone]);
|
||||
|
||||
/* Adjust for ZONE_MOVABLE starting within this range */
|
||||
} else if (!mirrored_kernelcore &&
|
||||
*zone_start_pfn < zone_movable_pfn[nid] &&
|
||||
*zone_end_pfn > zone_movable_pfn[nid]) {
|
||||
*zone_end_pfn = zone_movable_pfn[nid];
|
||||
|
||||
/* Check if this whole range is within ZONE_MOVABLE */
|
||||
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
|
||||
*zone_start_pfn = *zone_end_pfn;
|
||||
@@ -5554,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
|
||||
* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
|
||||
* and vice versa.
|
||||
*/
|
||||
if (zone_movable_pfn[nid]) {
|
||||
if (mirrored_kernelcore) {
|
||||
unsigned long start_pfn, end_pfn;
|
||||
struct memblock_region *r;
|
||||
if (mirrored_kernelcore && zone_movable_pfn[nid]) {
|
||||
unsigned long start_pfn, end_pfn;
|
||||
struct memblock_region *r;
|
||||
|
||||
for_each_memblock(memory, r) {
|
||||
start_pfn = clamp(memblock_region_memory_base_pfn(r),
|
||||
zone_start_pfn, zone_end_pfn);
|
||||
end_pfn = clamp(memblock_region_memory_end_pfn(r),
|
||||
zone_start_pfn, zone_end_pfn);
|
||||
for_each_memblock(memory, r) {
|
||||
start_pfn = clamp(memblock_region_memory_base_pfn(r),
|
||||
zone_start_pfn, zone_end_pfn);
|
||||
end_pfn = clamp(memblock_region_memory_end_pfn(r),
|
||||
zone_start_pfn, zone_end_pfn);
|
||||
|
||||
if (zone_type == ZONE_MOVABLE &&
|
||||
memblock_is_mirror(r))
|
||||
nr_absent += end_pfn - start_pfn;
|
||||
if (zone_type == ZONE_MOVABLE &&
|
||||
memblock_is_mirror(r))
|
||||
nr_absent += end_pfn - start_pfn;
|
||||
|
||||
if (zone_type == ZONE_NORMAL &&
|
||||
!memblock_is_mirror(r))
|
||||
nr_absent += end_pfn - start_pfn;
|
||||
}
|
||||
} else {
|
||||
if (zone_type == ZONE_NORMAL)
|
||||
nr_absent += node_end_pfn - zone_movable_pfn[nid];
|
||||
if (zone_type == ZONE_NORMAL &&
|
||||
!memblock_is_mirror(r))
|
||||
nr_absent += end_pfn - start_pfn;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6929,6 +7002,17 @@ static int __init set_hashdist(char *str)
|
||||
__setup("hashdist=", set_hashdist);
|
||||
#endif
|
||||
|
||||
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
|
||||
/*
|
||||
* Returns the number of pages that arch has reserved but
|
||||
* is not known to alloc_large_system_hash().
|
||||
*/
|
||||
static unsigned long __init arch_reserved_kernel_pages(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* allocate a large system hash table from bootmem
|
||||
* - it is assumed that the hash table must contain an exact power-of-2
|
||||
@@ -6953,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename,
|
||||
if (!numentries) {
|
||||
/* round applicable memory size up to nearest megabyte */
|
||||
numentries = nr_kernel_pages;
|
||||
numentries -= arch_reserved_kernel_pages();
|
||||
|
||||
/* It isn't necessary when PAGE_SIZE >= 1MB */
|
||||
if (PAGE_SHIFT < 20)
|
||||
|
@@ -42,6 +42,11 @@
|
||||
* and page extension core can skip to allocate memory. As result,
|
||||
* none of memory is wasted.
|
||||
*
|
||||
* When need callback returns true, page_ext checks if there is a request for
|
||||
* extra memory through size in struct page_ext_operations. If it is non-zero,
|
||||
* extra space is allocated for each page_ext entry and offset is returned to
|
||||
* user through offset in struct page_ext_operations.
|
||||
*
|
||||
* The init callback is used to do proper initialization after page extension
|
||||
* is completely initialized. In sparse memory system, extra memory is
|
||||
* allocated some time later than memmap is allocated. In other words, lifetime
|
||||
@@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = {
|
||||
};
|
||||
|
||||
static unsigned long total_usage;
|
||||
static unsigned long extra_mem;
|
||||
|
||||
static bool __init invoke_need_callbacks(void)
|
||||
{
|
||||
int i;
|
||||
int entries = ARRAY_SIZE(page_ext_ops);
|
||||
bool need = false;
|
||||
|
||||
for (i = 0; i < entries; i++) {
|
||||
if (page_ext_ops[i]->need && page_ext_ops[i]->need())
|
||||
return true;
|
||||
if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
|
||||
page_ext_ops[i]->offset = sizeof(struct page_ext) +
|
||||
extra_mem;
|
||||
extra_mem += page_ext_ops[i]->size;
|
||||
need = true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return need;
|
||||
}
|
||||
|
||||
static void __init invoke_init_callbacks(void)
|
||||
@@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void)
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long get_entry_size(void)
|
||||
{
|
||||
return sizeof(struct page_ext) + extra_mem;
|
||||
}
|
||||
|
||||
static inline struct page_ext *get_entry(void *base, unsigned long index)
|
||||
{
|
||||
return base + get_entry_size() * index;
|
||||
}
|
||||
|
||||
#if !defined(CONFIG_SPARSEMEM)
|
||||
|
||||
|
||||
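The page_ext hunks below switch lookups from plain pointer arithmetic on struct page_ext to an explicit stride of sizeof(struct page_ext) plus the space requested by each enabled client, with every client told its offset at init time. A compact userspace model of that layout, with invented client sizes:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ext_base { unsigned long flags; };	/* stands in for struct page_ext */

static size_t extra_mem;			/* sum of the clients' extra sizes */

static size_t client_register(size_t size)	/* returns the client's offset */
{
	size_t offset = sizeof(struct ext_base) + extra_mem;

	extra_mem += size;
	return offset;
}

static size_t entry_size(void)
{
	return sizeof(struct ext_base) + extra_mem;
}

static struct ext_base *get_entry(void *base, unsigned long index)
{
	return (struct ext_base *)((char *)base + entry_size() * index);
}

int main(void)
{
	size_t owner_off = client_register(16);	/* e.g. a page-owner style client */
	unsigned long nr = 4;
	void *table = calloc(nr, entry_size());

	get_entry(table, 2)->flags = 1;
	memset((char *)get_entry(table, 2) + owner_off, 0xab, 16);

	printf("entry size %zu, entry 2 flags %lu\n", entry_size(), get_entry(table, 2)->flags);
	free(table);
	return 0;
}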
@@ -102,7 +123,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
|
||||
struct page_ext *lookup_page_ext(struct page *page)
|
||||
{
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
unsigned long offset;
|
||||
unsigned long index;
|
||||
struct page_ext *base;
|
||||
|
||||
base = NODE_DATA(page_to_nid(page))->node_page_ext;
|
||||
@@ -119,9 +140,9 @@ struct page_ext *lookup_page_ext(struct page *page)
|
||||
if (unlikely(!base))
|
||||
return NULL;
|
||||
#endif
|
||||
offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
|
||||
index = pfn - round_down(node_start_pfn(page_to_nid(page)),
|
||||
MAX_ORDER_NR_PAGES);
|
||||
return base + offset;
|
||||
return get_entry(base, index);
|
||||
}
|
||||
|
||||
static int __init alloc_node_page_ext(int nid)
|
||||
@@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid)
|
||||
!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
|
||||
nr_pages += MAX_ORDER_NR_PAGES;
|
||||
|
||||
table_size = sizeof(struct page_ext) * nr_pages;
|
||||
table_size = get_entry_size() * nr_pages;
|
||||
|
||||
base = memblock_virt_alloc_try_nid_nopanic(
|
||||
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
|
||||
@@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page)
|
||||
if (!section->page_ext)
|
||||
return NULL;
|
||||
#endif
|
||||
return section->page_ext + pfn;
|
||||
return get_entry(section->page_ext, pfn);
|
||||
}
|
||||
|
||||
static void *__meminit alloc_page_ext(size_t size, int nid)
|
||||
@@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
|
||||
if (section->page_ext)
|
||||
return 0;
|
||||
|
||||
table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
|
||||
table_size = get_entry_size() * PAGES_PER_SECTION;
|
||||
base = alloc_page_ext(table_size, nid);
|
||||
|
||||
/*
|
||||
@@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
|
||||
* we need to apply a mask.
|
||||
*/
|
||||
pfn &= PAGE_SECTION_MASK;
|
||||
section->page_ext = base - pfn;
|
||||
section->page_ext = (void *)base - get_entry_size() * pfn;
|
||||
total_usage += table_size;
|
||||
return 0;
|
||||
}
|
||||
@@ -262,7 +283,7 @@ static void free_page_ext(void *addr)
|
||||
struct page *page = virt_to_page(addr);
|
||||
size_t table_size;
|
||||
|
||||
table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
|
||||
table_size = get_entry_size() * PAGES_PER_SECTION;
|
||||
|
||||
BUG_ON(PageReserved(page));
|
||||
free_pages_exact(addr, table_size);
|
||||
@@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn)
|
||||
ms = __pfn_to_section(pfn);
|
||||
if (!ms || !ms->page_ext)
|
||||
return;
|
||||
base = ms->page_ext + pfn;
|
||||
base = get_entry(ms->page_ext, pfn);
|
||||
free_page_ext(base);
|
||||
ms->page_ext = NULL;
|
||||
}
|
||||
|
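
The comment block in this hunk describes the page_ext contract: a client's need() callback says whether it wants space, its size field says how much, and the core hands back an offset behind struct page_ext; every entry then occupies sizeof(struct page_ext) plus the accumulated extra. A stand-alone sketch of that layout under simplified, made-up types (not the kernel structures):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Simplified stand-ins for struct page_ext and one of its clients. */
struct page_ext { unsigned long flags; };
struct owner_data { unsigned int order; int last_migrate_reason; };

struct page_ext_operations {
    size_t size;    /* extra bytes this client wants per entry */
    size_t offset;  /* filled in by the core at init time */
};

static struct page_ext_operations owner_ops = { .size = sizeof(struct owner_data) };
static struct page_ext_operations *clients[] = { &owner_ops };

static size_t extra_mem;

/* Mirror of invoke_need_callbacks(): hand out offsets behind struct page_ext. */
static void assign_offsets(void)
{
    for (size_t i = 0; i < sizeof(clients) / sizeof(clients[0]); i++) {
        clients[i]->offset = sizeof(struct page_ext) + extra_mem;
        extra_mem += clients[i]->size;
    }
}

static size_t entry_size(void)                  /* get_entry_size() analogue */
{
    return sizeof(struct page_ext) + extra_mem;
}

static struct page_ext *entry(void *base, unsigned long index)  /* get_entry() */
{
    return (struct page_ext *)((char *)base + entry_size() * index);
}

static struct owner_data *owner_of(struct page_ext *ext)        /* get_page_owner() */
{
    return (struct owner_data *)((char *)ext + owner_ops.offset);
}

int main(void)
{
    assign_offsets();

    unsigned long nr_pages = 4;
    void *table = calloc(nr_pages, entry_size());   /* one entry per page */

    owner_of(entry(table, 2))->order = 3;
    printf("entry size %zu, entry 2 order %u\n",
           entry_size(), owner_of(entry(table, 2))->order);
    free(table);
    return 0;
}
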
mm/page_io.c
@@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
int ret;
struct swap_info_struct *sis = page_swap_info(page);

BUG_ON(!PageSwapCache(page));
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
if (sis->flags & SWP_FILE) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
@@ -338,7 +338,7 @@ int swap_readpage(struct page *page)
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);

BUG_ON(!PageSwapCache(page));
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
@@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page)

if (sis->flags & SWP_FILE) {
struct address_space *mapping = sis->swap_file->f_mapping;
BUG_ON(!PageSwapCache(page));

VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return mapping->a_ops->set_page_dirty(page);
} else {
return __set_page_dirty_no_writeback(page);
mm/page_isolation.c
@@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page,
ret = 0;

/*
* immobile means "not-on-lru" paes. If immobile is larger than
* immobile means "not-on-lru" pages. If immobile is larger than
* removable-by-driver pages reported by notifier, we'll fail.
*/
156
mm/page_owner.c
@@ -8,6 +8,7 @@
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>

#include "internal.h"

@@ -17,6 +18,13 @@
*/
#define PAGE_OWNER_STACK_DEPTH (16)

struct page_owner {
unsigned int order;
gfp_t gfp_mask;
int last_migrate_reason;
depot_stack_handle_t handle;
};

static bool page_owner_disabled = true;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

@@ -85,10 +93,16 @@ static void init_page_owner(void)
}

struct page_ext_operations page_owner_ops = {
.size = sizeof(struct page_owner),
.need = need_page_owner,
.init = init_page_owner,
};

static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
return (void *)page_ext + page_owner_ops.offset;
}

void __reset_page_owner(struct page *page, unsigned int order)
{
int i;
@@ -155,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
gfp_t gfp_mask)
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;

if (unlikely(!page_ext))
return;

page_ext->handle = save_stack(gfp_mask);
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->last_migrate_reason = -1;
page_owner = get_page_owner(page_ext);
page_owner->handle = save_stack(gfp_mask);
page_owner->order = order;
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;

__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
@@ -170,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;

if (unlikely(!page_ext))
return;

page_ext->last_migrate_reason = reason;
page_owner = get_page_owner(page_ext);
page_owner->last_migrate_reason = reason;
}

void __split_page_owner(struct page *page, unsigned int order)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;

if (unlikely(!page_ext))
return;

page_ext->order = 0;
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
for (i = 1; i < (1 << order); i++)
__copy_page_owner(page, page + i);
}
@@ -193,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
{
struct page_ext *old_ext = lookup_page_ext(oldpage);
struct page_ext *new_ext = lookup_page_ext(newpage);
struct page_owner *old_page_owner, *new_page_owner;

if (unlikely(!old_ext || !new_ext))
return;

new_ext->order = old_ext->order;
new_ext->gfp_mask = old_ext->gfp_mask;
new_ext->last_migrate_reason = old_ext->last_migrate_reason;
new_ext->handle = old_ext->handle;
old_page_owner = get_page_owner(old_ext);
new_page_owner = get_page_owner(new_ext);
new_page_owner->order = old_page_owner->order;
new_page_owner->gfp_mask = old_page_owner->gfp_mask;
new_page_owner->last_migrate_reason =
old_page_owner->last_migrate_reason;
new_page_owner->handle = old_page_owner->handle;

/*
* We don't clear the bit on the oldpage as it's going to be freed
@@ -214,9 +239,88 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
}

void pagetypeinfo_showmixedcount_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone)
{
struct page *page;
struct page_ext *page_ext;
struct page_owner *page_owner;
unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
unsigned long end_pfn = pfn + zone->spanned_pages;
unsigned long count[MIGRATE_TYPES] = { 0, };
int pageblock_mt, page_mt;
int i;

/* Scan block by block. First and last block may be incomplete */
pfn = zone->zone_start_pfn;

/*
* Walk the zone in pageblock_nr_pages steps. If a page block spans
* a zone boundary, it will be double counted between zones. This does
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}

block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);

page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);

for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;

page = pfn_to_page(pfn);

if (page_zone(page) != zone)
continue;

if (PageBuddy(page)) {
pfn += (1UL << page_order(page)) - 1;
continue;
}

if (PageReserved(page))
continue;

page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
continue;

if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;

page_owner = get_page_owner(page_ext);
page_mt = gfpflags_to_migratetype(
page_owner->gfp_mask);
if (pageblock_mt != page_mt) {
if (is_migrate_cma(pageblock_mt))
count[MIGRATE_MOVABLE]++;
else
count[pageblock_mt]++;

pfn = block_end_pfn;
break;
}
pfn += (1UL << page_owner->order) - 1;
}
}

/* Print counts */
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (i = 0; i < MIGRATE_TYPES; i++)
seq_printf(m, "%12lu ", count[i]);
seq_putc(m, '\n');
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext,
struct page *page, struct page_owner *page_owner,
depot_stack_handle_t handle)
{
int ret;
@@ -236,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,

ret = snprintf(kbuf, count,
"Page allocated via order %u, mask %#x(%pGg)\n",
page_ext->order, page_ext->gfp_mask,
&page_ext->gfp_mask);
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask);

if (ret >= count)
goto err;

/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
page_mt  = gfpflags_to_migratetype(page_ext->gfp_mask);
page_mt  = gfpflags_to_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
@@ -261,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;

if (page_ext->last_migrate_reason != -1) {
if (page_owner->last_migrate_reason != -1) {
ret += snprintf(kbuf + ret, count - ret,
"Page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_ext->last_migrate_reason]);
migrate_reason_names[page_owner->last_migrate_reason]);
if (ret >= count)
goto err;
}
@@ -287,6 +391,7 @@ err:
void __dump_page_owner(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
.nr_entries = 0,
@@ -302,7 +407,9 @@ void __dump_page_owner(struct page *page)
pr_alert("There is not page extension available.\n");
return;
}
gfp_mask = page_ext->gfp_mask;

page_owner = get_page_owner(page_ext);
gfp_mask = page_owner->gfp_mask;
mt = gfpflags_to_migratetype(gfp_mask);

if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
@@ -310,7 +417,7 @@ void __dump_page_owner(struct page *page)
return;
}

handle = READ_ONCE(page_ext->handle);
handle = READ_ONCE(page_owner->handle);
if (!handle) {
pr_alert("page_owner info is not active (free page?)\n");
return;
@@ -318,12 +425,12 @@ void __dump_page_owner(struct page *page)

depot_fetch_stack(handle, &trace);
pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
print_stack_trace(&trace, 0);

if (page_ext->last_migrate_reason != -1)
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_ext->last_migrate_reason]);
migrate_reason_names[page_owner->last_migrate_reason]);
}

static ssize_t
@@ -332,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long pfn;
struct page *page;
struct page_ext *page_ext;
struct page_owner *page_owner;
depot_stack_handle_t handle;

if (!static_branch_unlikely(&page_owner_inited))
@@ -381,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;

page_owner = get_page_owner(page_ext);

/*
* Access to page_ext->handle isn't synchronous so we should
* be careful to access it.
*/
handle = READ_ONCE(page_ext->handle);
handle = READ_ONCE(page_owner->handle);
if (!handle)
continue;

@@ -393,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
*ppos = (pfn - min_low_pfn) + 1;

return print_page_owner(buf, count, pfn, page,
page_ext, handle);
page_owner, handle);
}

return 0;
mm/shmem.c
@@ -3965,7 +3965,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);

/* common code */

static struct dentry_operations anon_ops = {
static const struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
locked_pgdat = NULL;
}

if (is_huge_zero_page(page)) {
put_huge_zero_page();
if (is_huge_zero_page(page))
continue;
}

page = compound_head(page);
if (!put_page_testzero(page))
mm/swap_state.c
@@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.i_mmap_writable = ATOMIC_INIT(0),
.a_ops = &swap_aops,
/* swap cache doesn't use writeback related tags */
.flags = 1 << AS_NO_WRITEBACK_TAGS,
}
};

@@ -92,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
error = radix_tree_insert(&address_space->page_tree,
entry.val, page);
swp_offset(entry), page);
if (likely(!error)) {
address_space->nrpages++;
__inc_node_page_state(page, NR_FILE_PAGES);
@@ -143,7 +145,7 @@ void __delete_from_swap_cache(struct page *page)

entry.val = page_private(page);
address_space = swap_address_space(entry);
radix_tree_delete(&address_space->page_tree, page_private(page));
radix_tree_delete(&address_space->page_tree, swp_offset(entry));
set_page_private(page, 0);
ClearPageSwapCache(page);
address_space->nrpages--;
@@ -252,9 +254,7 @@ static inline void free_swap_cache(struct page *page)
void free_page_and_swap_cache(struct page *page)
{
free_swap_cache(page);
if (is_huge_zero_page(page))
put_huge_zero_page();
else
if (!is_huge_zero_page(page))
put_page(page);
}

@@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
{
struct page *page;

page = find_get_page(swap_address_space(entry), entry.val);
page = find_get_page(swap_address_space(entry), swp_offset(entry));

if (page) {
INC_CACHE_INFO(find_success);
@@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
found_page = find_get_page(swapper_space, entry.val);
found_page = find_get_page(swapper_space, swp_offset(entry));
if (found_page)
break;
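
The swap-cache hunks above change the radix-tree key from entry.val to swp_offset(entry): each swap device already has its own address_space in swapper_spaces[], so only the offset within that device is needed as the tree index. The sketch below shows one way a type and an offset can be packed into a single entry word; the actual kernel encoding lives in include/linux/swapops.h and differs in bit layout, so treat the shift here as purely illustrative:

#include <stdio.h>

/* Illustrative packing only: low bits hold the device (type), the rest the offset. */
#define SWP_TYPE_BITS 5

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned int type, unsigned long offset)
{
    swp_entry_t e = { .val = (offset << SWP_TYPE_BITS) | type };
    return e;
}

static unsigned int swp_type(swp_entry_t e)
{
    return e.val & ((1UL << SWP_TYPE_BITS) - 1);
}

static unsigned long swp_offset(swp_entry_t e)
{
    return e.val >> SWP_TYPE_BITS;
}

int main(void)
{
    swp_entry_t e = swp_entry(1, 12345);

    /* swp_type() picks the per-device cache, swp_offset() indexes into it. */
    printf("type %u -> swapper_spaces[%u], key %lu (entry.val was %lu)\n",
           swp_type(e), swp_type(e), swp_offset(e), e.val);
    return 0;
}
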
137
mm/swapfile.c
@@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
struct page *page;
int ret = 0;

page = find_get_page(swap_address_space(entry), entry.val);
page = find_get_page(swap_address_space(entry), swp_offset(entry));
if (!page)
return 0;
/*
@@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}

static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
cluster_set_null(&list->head);
cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
struct swap_cluster_info *ci,
unsigned int idx)
{
if (cluster_list_empty(list)) {
cluster_set_next_flag(&list->head, idx, 0);
cluster_set_next_flag(&list->tail, idx, 0);
} else {
unsigned int tail = cluster_next(&list->tail);

cluster_set_next(&ci[tail], idx);
cluster_set_next_flag(&list->tail, idx, 0);
}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
struct swap_cluster_info *ci)
{
unsigned int idx;

idx = cluster_next(&list->head);
if (cluster_next(&list->tail) == idx) {
cluster_set_null(&list->head);
cluster_set_null(&list->tail);
} else
cluster_set_next_flag(&list->head,
cluster_next(&ci[idx]), 0);

return idx;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
unsigned int idx)
@@ -270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);

if (cluster_is_null(&si->discard_cluster_head)) {
cluster_set_next_flag(&si->discard_cluster_head,
idx, 0);
cluster_set_next_flag(&si->discard_cluster_tail,
idx, 0);
} else {
unsigned int tail = cluster_next(&si->discard_cluster_tail);
cluster_set_next(&si->cluster_info[tail], idx);
cluster_set_next_flag(&si->discard_cluster_tail,
idx, 0);
}
cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

schedule_work(&si->discard_work);
}
@@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)

info = si->cluster_info;

while (!cluster_is_null(&si->discard_cluster_head)) {
idx = cluster_next(&si->discard_cluster_head);

cluster_set_next_flag(&si->discard_cluster_head,
cluster_next(&info[idx]), 0);
if (cluster_next(&si->discard_cluster_tail) == idx) {
cluster_set_null(&si->discard_cluster_head);
cluster_set_null(&si->discard_cluster_tail);
}
while (!cluster_list_empty(&si->discard_clusters)) {
idx = cluster_list_del_first(&si->discard_clusters, info);
spin_unlock(&si->lock);

discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
@@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)

spin_lock(&si->lock);
cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
if (cluster_is_null(&si->free_cluster_head)) {
cluster_set_next_flag(&si->free_cluster_head,
idx, 0);
cluster_set_next_flag(&si->free_cluster_tail,
idx, 0);
} else {
unsigned int tail;

tail = cluster_next(&si->free_cluster_tail);
cluster_set_next(&info[tail], idx);
cluster_set_next_flag(&si->free_cluster_tail,
idx, 0);
}
cluster_list_add_tail(&si->free_clusters, info, idx);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
}
@@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
if (!cluster_info)
return;
if (cluster_is_free(&cluster_info[idx])) {
VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
cluster_set_next_flag(&p->free_cluster_head,
cluster_next(&cluster_info[idx]), 0);
if (cluster_next(&p->free_cluster_tail) == idx) {
cluster_set_null(&p->free_cluster_tail);
cluster_set_null(&p->free_cluster_head);
}
VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
cluster_list_del_first(&p->free_clusters, cluster_info);
cluster_set_count_flag(&cluster_info[idx], 0, 0);
}

@@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
}

cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
if (cluster_is_null(&p->free_cluster_head)) {
cluster_set_next_flag(&p->free_cluster_head, idx, 0);
cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
} else {
unsigned int tail = cluster_next(&p->free_cluster_tail);
cluster_set_next(&cluster_info[tail], idx);
cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
}
cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
}
}

@@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
bool conflict;

offset /= SWAPFILE_CLUSTER;
conflict = !cluster_is_null(&si->free_cluster_head) &&
offset != cluster_next(&si->free_cluster_head) &&
conflict = !cluster_list_empty(&si->free_clusters) &&
offset != cluster_list_first(&si->free_clusters) &&
cluster_is_free(&si->cluster_info[offset]);

if (!conflict)
@@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
new_cluster:
cluster = this_cpu_ptr(si->percpu_cluster);
if (cluster_is_null(&cluster->index)) {
if (!cluster_is_null(&si->free_cluster_head)) {
cluster->index = si->free_cluster_head;
if (!cluster_list_empty(&si->free_clusters)) {
cluster->index = si->free_clusters.head;
cluster->next = cluster_next(&cluster->index) *
SWAPFILE_CLUSTER;
} else if (!cluster_is_null(&si->discard_cluster_head)) {
} else if (!cluster_list_empty(&si->discard_clusters)) {
/*
* we don't have free cluster but have some clusters in
* discarding, do discard now and reclaim them
@@ -999,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry)
if (p) {
if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
page = find_get_page(swap_address_space(entry),
entry.val);
swp_offset(entry));
if (page && !trylock_page(page)) {
put_page(page);
page = NULL;
@@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,

nr_good_pages = maxpages - 1;	/* omit header page */

cluster_set_null(&p->free_cluster_head);
cluster_set_null(&p->free_cluster_tail);
cluster_set_null(&p->discard_cluster_head);
cluster_set_null(&p->discard_cluster_tail);
cluster_list_init(&p->free_clusters);
cluster_list_init(&p->discard_clusters);

for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
@@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
for (i = 0; i < nr_clusters; i++) {
if (!cluster_count(&cluster_info[idx])) {
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
if (cluster_is_null(&p->free_cluster_head)) {
cluster_set_next_flag(&p->free_cluster_head,
idx, 0);
cluster_set_next_flag(&p->free_cluster_tail,
idx, 0);
} else {
unsigned int tail;

tail = cluster_next(&p->free_cluster_tail);
cluster_set_next(&cluster_info[tail], idx);
cluster_set_next_flag(&p->free_cluster_tail,
idx, 0);
}
cluster_list_add_tail(&p->free_clusters, cluster_info,
idx);
}
idx++;
if (idx == nr_clusters)
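
The new cluster_list_* helpers replace four copies of open-coded head/tail manipulation with one small queue abstraction: the list is threaded through the swap_cluster_info array itself, and head/tail store cluster indexes rather than pointers. A stand-alone sketch of such an index-linked queue (simplified fields, no flag bits, names chosen for illustration):

#include <stdio.h>

#define NIL (~0u)

/* Each cluster records the index of the next cluster in its list. */
struct cluster_info { unsigned int next; };
struct cluster_list { unsigned int head, tail; };

static int list_empty(struct cluster_list *l) { return l->head == NIL; }

static void list_init(struct cluster_list *l) { l->head = l->tail = NIL; }

static void list_add_tail(struct cluster_list *l, struct cluster_info *ci,
                          unsigned int idx)
{
    if (list_empty(l)) {
        l->head = l->tail = idx;
    } else {
        ci[l->tail].next = idx;     /* link the old tail to the new one */
        l->tail = idx;
    }
    ci[idx].next = NIL;
}

static unsigned int list_del_first(struct cluster_list *l, struct cluster_info *ci)
{
    unsigned int idx = l->head;

    if (l->tail == idx)             /* removing the only element */
        list_init(l);
    else
        l->head = ci[idx].next;
    return idx;
}

int main(void)
{
    struct cluster_info ci[8];
    struct cluster_list free_clusters;

    list_init(&free_clusters);
    list_add_tail(&free_clusters, ci, 3);
    list_add_tail(&free_clusters, ci, 5);
    list_add_tail(&free_clusters, ci, 1);

    while (!list_empty(&free_clusters))
        printf("freed cluster %u\n", list_del_first(&free_clusters, ci));
    return 0;
}
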
mm/vmacache.c
@@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
int i;

count_vm_vmacache_event(VMACACHE_FIND_CALLS);

if (!vmacache_valid(mm))
return NULL;

count_vm_vmacache_event(VMACACHE_FIND_CALLS);

for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache[i];

@@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
{
int i;

count_vm_vmacache_event(VMACACHE_FIND_CALLS);

if (!vmacache_valid(mm))
return NULL;

count_vm_vmacache_event(VMACACHE_FIND_CALLS);

for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache[i];
22
mm/vmalloc.c
@@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
struct vm_struct *area;

BUG_ON(in_interrupt());
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, fls_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);

size = PAGE_ALIGN(size);
if (unlikely(!size))
return NULL;

if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, get_count_order_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);

area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
@@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;

if (node == NUMA_NO_NODE)
page = alloc_pages(alloc_mask, order);
page = alloc_page(alloc_mask);
else
page = alloc_pages_node(node, alloc_mask, order);
page = alloc_pages_node(node, alloc_mask, 0);

if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;

fail:
warn_alloc_failed(gfp_mask, order,
"vmalloc: allocation failure, allocated %ld of %ld bytes\n",
warn_alloc(gfp_mask,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
@@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;

fail:
warn_alloc_failed(gfp_mask, 0,
"vmalloc: allocation failure: %lu bytes\n",
real_size);
warn_alloc(gfp_mask,
"vmalloc: allocation failure: %lu bytes", real_size);
return NULL;
}
57
mm/vmscan.c
@@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
if (inactive_list_is_low(lruvec, false, sc))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);

throttle_vm_writeout(sc->gfp_mask);
}

/* Use reclaim/compaction for costly allocs or under memory pressure */
@@ -2480,7 +2478,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_nr_swap_pages() > 0)
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
@@ -2495,7 +2493,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
continue;

switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
case COMPACT_PARTIAL:
case COMPACT_SUCCESS:
case COMPACT_CONTINUE:
return false;
default:
@@ -2598,38 +2596,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}

/*
* Returns true if compaction should go ahead for a high-order request, or
* the high-order allocation would succeed without compaction.
* Returns true if compaction should go ahead for a costly-order request, or
* the allocation would already succeed without compaction. Return false if we
* should reclaim first.
*/
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
bool watermark_ok;
enum compact_result suitable;

/*
* Compaction takes time to run and there are potentially other
* callers using the pages just freed. Continue reclaiming until
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
watermark = high_wmark_pages(zone) + (2UL << sc->order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);

/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
if (compaction_deferred(zone, sc->order))
return watermark_ok;

/*
* If compaction is not ready to start and allocation is not likely
* to succeed without it, then keep reclaiming.
*/
if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
if (suitable == COMPACT_SUCCESS)
/* Allocation should succeed already. Don't reclaim. */
return true;
if (suitable == COMPACT_SKIPPED)
/* Compaction cannot yet proceed. Do reclaim. */
return false;

return watermark_ok;
/*
* Compaction is already possible, but it takes time to run and there
* are potentially other callers using the pages just freed. So proceed
* with reclaim to make a buffer of free pages available to give
* compaction a reasonable chance of completing and allocating the page.
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
watermark = high_wmark_pages(zone) + compact_gap(sc->order);

return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}

/*
@@ -3041,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
*/
nid = mem_cgroup_select_victim_node(memcg);

zonelist = NODE_DATA(nid)->node_zonelists;
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];

trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
@@ -3169,7 +3164,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
* excessive reclaim. Assume that a process requested a high-order
* can direct reclaim/compact.
*/
if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
sc->order = 0;

return sc->nr_scanned >= sc->nr_to_reclaim;
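
The rewritten compaction_ready() comments spell out a three-way decision: skip reclaim when compaction_suitable() reports the allocation would already succeed, keep reclaiming when compaction cannot even start, and otherwise reclaim only until free pages clear the high watermark plus compact_gap(order). A compressed sketch of that decision under stand-in values (not the kernel API; in this series compact_gap(order) is 2UL << order):

#include <stdio.h>
#include <stdbool.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_CONTINUE, COMPACT_SUCCESS };

/* In this patch series the gap is twice the requested allocation, in pages. */
static unsigned long compact_gap(unsigned int order)
{
    return 2UL << order;
}

/* Returns true if compaction should go ahead, false if we should reclaim first. */
static bool compaction_ready(enum compact_result suitable,
                             unsigned long free_pages,
                             unsigned long high_wmark,
                             unsigned int order)
{
    if (suitable == COMPACT_SUCCESS)
        return true;                /* allocation would succeed already */
    if (suitable == COMPACT_SKIPPED)
        return false;               /* not enough memory to even compact */

    /*
     * Compaction is possible: stop reclaiming once a buffer of
     * high watermark + compact_gap(order) free pages is available.
     */
    return free_pages >= high_wmark + compact_gap(order);
}

int main(void)
{
    printf("%d\n", compaction_ready(COMPACT_CONTINUE, 1000, 990, 3)); /* 0: keep reclaiming, 990+16 > 1000 */
    printf("%d\n", compaction_ready(COMPACT_CONTINUE, 1000, 990, 2)); /* 1: buffer is there, 990+8 <= 1000 */
    return 0;
}
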
95
mm/vmstat.c
@@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
return 0;
}

#ifdef CONFIG_PAGE_OWNER
static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
pg_data_t *pgdat,
struct zone *zone)
{
struct page *page;
struct page_ext *page_ext;
unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
unsigned long end_pfn = pfn + zone->spanned_pages;
unsigned long count[MIGRATE_TYPES] = { 0, };
int pageblock_mt, page_mt;
int i;

/* Scan block by block. First and last block may be incomplete */
pfn = zone->zone_start_pfn;

/*
* Walk the zone in pageblock_nr_pages steps. If a page block spans
* a zone boundary, it will be double counted between zones. This does
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}

block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);

page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);

for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;

page = pfn_to_page(pfn);

if (page_zone(page) != zone)
continue;

if (PageBuddy(page)) {
pfn += (1UL << page_order(page)) - 1;
continue;
}

if (PageReserved(page))
continue;

page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
continue;

if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;

page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
if (pageblock_mt != page_mt) {
if (is_migrate_cma(pageblock_mt))
count[MIGRATE_MOVABLE]++;
else
count[pageblock_mt]++;

pfn = block_end_pfn;
break;
}
pfn += (1UL << page_ext->order) - 1;
}
}

/* Print counts */
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (i = 0; i < MIGRATE_TYPES; i++)
seq_printf(m, "%12lu ", count[i]);
seq_putc(m, '\n');
}
#endif /* CONFIG_PAGE_OWNER */

/*
* Print out the number of pageblocks for each migratetype that contain pages
* of other types. This gives an indication of how well fallbacks are being
@@ -1592,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
seq_printf(m, "%s %lu\n", vmstat_text[off], *l);

seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
return 0;
}

@@ -1794,6 +1718,16 @@ static void __init start_shepherd_timer(void)
round_jiffies_relative(sysctl_stat_interval));
}

static void __init init_cpu_node_state(void)
{
int cpu;

get_online_cpus();
for_each_online_cpu(cpu)
node_set_state(cpu_to_node(cpu), N_CPU);
put_online_cpus();
}

static void vmstat_cpu_dead(int node)
{
int cpu;
@@ -1851,6 +1785,7 @@ static int __init setup_vmstat(void)
#ifdef CONFIG_SMP
cpu_notifier_register_begin();
__register_cpu_notifier(&vmstat_notifier);
init_cpu_node_state();

start_shepherd_timer();
cpu_notifier_register_done();