Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - the rest of MM
 - procfs updates
 - various misc things
 - more y2038 fixes
 - get_maintainer updates
 - lib/ updates
 - checkpatch updates
 - various epoll updates
 - autofs updates
 - hfsplus
 - some reiserfs work
 - fatfs updates
 - signal.c cleanups
 - ipc/ updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (166 commits)
  ipc/util.c: update return value of ipc_getref from int to bool
  ipc/util.c: further variable name cleanups
  ipc: simplify ipc initialization
  ipc: get rid of ids->tables_initialized hack
  lib/rhashtable: guarantee initial hashtable allocation
  lib/rhashtable: simplify bucket_table_alloc()
  ipc: drop ipc_lock()
  ipc/util.c: correct comment in ipc_obtain_object_check
  ipc: rename ipcctl_pre_down_nolock()
  ipc/util.c: use ipc_rcu_putref() for failues in ipc_addid()
  ipc: reorganize initialization of kern_ipc_perm.seq
  ipc: compute kern_ipc_perm.id under the ipc lock
  init/Kconfig: remove EXPERT from CHECKPOINT_RESTORE
  fs/sysv/inode.c: use ktime_get_real_seconds() for superblock stamp
  adfs: use timespec64 for time conversion
  kernel/sysctl.c: fix typos in comments
  drivers/rapidio/devices/rio_mport_cdev.c: remove redundant pointer md
  fork: don't copy inconsistent signal handler state to child
  signal: make get_signal() return bool
  signal: make sigkill_pending() return bool
  ...
@@ -46,7 +46,8 @@ config PAGE_POISONING
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
reduce the risk of information leaks from freed data. This does
have a potential performance impact.
have a potential performance impact if enabled with the
"page_poison=1" kernel boot option.

Note that "poison" here is not the same thing as the "HWPoison"
for CONFIG_MEMORY_FAILURE. This is software poisoning only.
@@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY
say N.

config PAGE_POISONING_ZERO
bool "Use zero for poisoning instead of random data"
bool "Use zero for poisoning instead of debugging value"
depends on PAGE_POISONING
---help---
Instead of using the existing poison value, fill the pages with
@@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO
allocation.

If unsure, say N
bool

config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"

@@ -438,10 +438,10 @@ retry:
if (new_congested) {
/* !found and storage for new one already allocated, insert */
congested = new_congested;
new_congested = NULL;
rb_link_node(&congested->rb_node, parent, node);
rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
goto found;
spin_unlock_irqrestore(&cgwb_lock, flags);
return congested;
}

spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -451,13 +451,13 @@ retry:
if (!new_congested)
return NULL;

atomic_set(&new_congested->refcnt, 0);
refcount_set(&new_congested->refcnt, 1);
new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;

found:
atomic_inc(&congested->refcnt);
refcount_inc(&congested->refcnt);
spin_unlock_irqrestore(&cgwb_lock, flags);
kfree(new_congested);
return congested;
@@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
{
unsigned long flags;

local_irq_save(flags);
if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
local_irq_restore(flags);
if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
return;
}

/* bdi might already have been destroyed leaving @congested unlinked */
if (congested->__bdi) {
@@ -804,7 +801,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!bdi->wb_congested)
return -ENOMEM;

atomic_set(&bdi->wb_congested->refcnt, 1);
refcount_set(&bdi->wb_congested->refcnt, 1);

err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
mm/hmm.c | 7
@@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
up_write(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
unsigned long end,
bool blockable)
{
struct hmm *hmm = mm->hmm;

VM_BUG_ON(!hmm);

atomic_inc(&hmm->sequence);

return 0;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter,
return iter + 1;
}

/*
* FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
* so all functions starting at paging_init should be marked __init
* in those cases. SPARSEMEM, however, allows for memory hotplug,
* and alloc_bootmem_node is not used.
*/
#ifdef CONFIG_SPARSEMEM
#define __paginginit __meminit
#else
#define __paginginit __init
#endif

/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
mm/ksm.c | 4
@@ -703,7 +703,7 @@ again:
* We cannot do anything with the page while its refcount is 0.
* Usually 0 means free, or tail of a higher-order page: in which
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_freeze_refs().
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
* but if page is swapcache in migrate_page_move_mapping(), it might
* still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ again:
* work here too. We have chosen the !PageSwapCache test to
* optimize the common case, when the page is or is about to
* be freed: PageSwapCache is cleared (under spin_lock_irq)
* in the freeze_refs section of __remove_mapping(); but Anon
* in the ref_freeze section of __remove_mapping(); but Anon
* page->mapping reset to NULL later, in free_pages_prepare().
*/
if (!PageSwapCache(page))
mm/memcontrol.c | 243
@@ -1776,6 +1776,62 @@ cleanup:
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
|
||||
* @victim: task to be killed by the OOM killer
|
||||
* @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
|
||||
*
|
||||
* Returns a pointer to a memory cgroup, which has to be cleaned up
|
||||
* by killing all belonging OOM-killable tasks.
|
||||
*
|
||||
* Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
|
||||
*/
|
||||
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
|
||||
struct mem_cgroup *oom_domain)
|
||||
{
|
||||
struct mem_cgroup *oom_group = NULL;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
return NULL;
|
||||
|
||||
if (!oom_domain)
|
||||
oom_domain = root_mem_cgroup;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
memcg = mem_cgroup_from_task(victim);
|
||||
if (memcg == root_mem_cgroup)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Traverse the memory cgroup hierarchy from the victim task's
|
||||
* cgroup up to the OOMing cgroup (or root) to find the
|
||||
* highest-level memory cgroup with oom.group set.
|
||||
*/
|
||||
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
||||
if (memcg->oom_group)
|
||||
oom_group = memcg;
|
||||
|
||||
if (memcg == oom_domain)
|
||||
break;
|
||||
}
|
||||
|
||||
if (oom_group)
|
||||
css_get(&oom_group->css);
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
return oom_group;
|
||||
}
|
||||
|
||||
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
|
||||
{
|
||||
pr_info("Tasks in ");
|
||||
pr_cont_cgroup_path(memcg->css.cgroup);
|
||||
pr_cont(" are going to be killed due to memory.oom.group set\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* lock_page_memcg - lock a page->mem_cgroup binding
|
||||
* @page: the page
|
||||
@@ -2899,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
|
||||
return retval;
|
||||
}
|
||||
|
||||
static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
|
||||
struct accumulated_stats {
|
||||
unsigned long stat[MEMCG_NR_STAT];
|
||||
unsigned long events[NR_VM_EVENT_ITEMS];
|
||||
unsigned long lru_pages[NR_LRU_LISTS];
|
||||
const unsigned int *stats_array;
|
||||
const unsigned int *events_array;
|
||||
int stats_size;
|
||||
int events_size;
|
||||
};
|
||||
|
||||
static void accumulate_memcg_tree(struct mem_cgroup *memcg,
|
||||
struct accumulated_stats *acc)
|
||||
{
|
||||
struct mem_cgroup *iter;
|
||||
struct mem_cgroup *mi;
|
||||
int i;
|
||||
|
||||
memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
|
||||
for_each_mem_cgroup_tree(mi, memcg) {
|
||||
for (i = 0; i < acc->stats_size; i++)
|
||||
acc->stat[i] += memcg_page_state(mi,
|
||||
acc->stats_array ? acc->stats_array[i] : i);
|
||||
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
for (i = 0; i < MEMCG_NR_STAT; i++)
|
||||
stat[i] += memcg_page_state(iter, i);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < acc->events_size; i++)
|
||||
acc->events[i] += memcg_sum_events(mi,
|
||||
acc->events_array ? acc->events_array[i] : i);
|
||||
|
||||
static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
|
||||
{
|
||||
struct mem_cgroup *iter;
|
||||
int i;
|
||||
|
||||
memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
|
||||
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
|
||||
events[i] += memcg_sum_events(iter, i);
|
||||
for (i = 0; i < NR_LRU_LISTS; i++)
|
||||
acc->lru_pages[i] +=
|
||||
mem_cgroup_nr_lru_pages(mi, BIT(i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3332,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
|
||||
unsigned long memory, memsw;
|
||||
struct mem_cgroup *mi;
|
||||
unsigned int i;
|
||||
struct accumulated_stats acc;
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
|
||||
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
|
||||
@@ -3364,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
|
||||
seq_printf(m, "hierarchical_memsw_limit %llu\n",
|
||||
(u64)memsw * PAGE_SIZE);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
|
||||
unsigned long long val = 0;
|
||||
memset(&acc, 0, sizeof(acc));
|
||||
acc.stats_size = ARRAY_SIZE(memcg1_stats);
|
||||
acc.stats_array = memcg1_stats;
|
||||
acc.events_size = ARRAY_SIZE(memcg1_events);
|
||||
acc.events_array = memcg1_events;
|
||||
accumulate_memcg_tree(memcg, &acc);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
|
||||
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
|
||||
continue;
|
||||
for_each_mem_cgroup_tree(mi, memcg)
|
||||
val += memcg_page_state(mi, memcg1_stats[i]) *
|
||||
PAGE_SIZE;
|
||||
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
|
||||
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
|
||||
(u64)acc.stat[i] * PAGE_SIZE);
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
|
||||
unsigned long long val = 0;
|
||||
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
|
||||
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
|
||||
(u64)acc.events[i]);
|
||||
|
||||
for_each_mem_cgroup_tree(mi, memcg)
|
||||
val += memcg_sum_events(mi, memcg1_events[i]);
|
||||
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
|
||||
}
|
||||
|
||||
for (i = 0; i < NR_LRU_LISTS; i++) {
|
||||
unsigned long long val = 0;
|
||||
|
||||
for_each_mem_cgroup_tree(mi, memcg)
|
||||
val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
|
||||
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
|
||||
}
|
||||
for (i = 0; i < NR_LRU_LISTS; i++)
|
||||
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
|
||||
(u64)acc.lru_pages[i] * PAGE_SIZE);
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
{
|
||||
@@ -5486,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v)
|
||||
static int memory_stat_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
|
||||
unsigned long stat[MEMCG_NR_STAT];
|
||||
unsigned long events[NR_VM_EVENT_ITEMS];
|
||||
struct accumulated_stats acc;
|
||||
int i;
|
||||
|
||||
/*
|
||||
@@ -5501,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v)
|
||||
* Current memory state:
|
||||
*/
|
||||
|
||||
tree_stat(memcg, stat);
|
||||
tree_events(memcg, events);
|
||||
memset(&acc, 0, sizeof(acc));
|
||||
acc.stats_size = MEMCG_NR_STAT;
|
||||
acc.events_size = NR_VM_EVENT_ITEMS;
|
||||
accumulate_memcg_tree(memcg, &acc);
|
||||
|
||||
seq_printf(m, "anon %llu\n",
|
||||
(u64)stat[MEMCG_RSS] * PAGE_SIZE);
|
||||
(u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
|
||||
seq_printf(m, "file %llu\n",
|
||||
(u64)stat[MEMCG_CACHE] * PAGE_SIZE);
|
||||
(u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
|
||||
seq_printf(m, "kernel_stack %llu\n",
|
||||
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
|
||||
(u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
|
||||
seq_printf(m, "slab %llu\n",
|
||||
(u64)(stat[NR_SLAB_RECLAIMABLE] +
|
||||
stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
|
||||
(u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
|
||||
acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
|
||||
seq_printf(m, "sock %llu\n",
|
||||
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
|
||||
(u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
|
||||
|
||||
seq_printf(m, "shmem %llu\n",
|
||||
(u64)stat[NR_SHMEM] * PAGE_SIZE);
|
||||
(u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
|
||||
seq_printf(m, "file_mapped %llu\n",
|
||||
(u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
|
||||
(u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
|
||||
seq_printf(m, "file_dirty %llu\n",
|
||||
(u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
|
||||
(u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
|
||||
seq_printf(m, "file_writeback %llu\n",
|
||||
(u64)stat[NR_WRITEBACK] * PAGE_SIZE);
|
||||
(u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
|
||||
|
||||
for (i = 0; i < NR_LRU_LISTS; i++) {
|
||||
struct mem_cgroup *mi;
|
||||
unsigned long val = 0;
|
||||
|
||||
for_each_mem_cgroup_tree(mi, memcg)
|
||||
val += mem_cgroup_nr_lru_pages(mi, BIT(i));
|
||||
seq_printf(m, "%s %llu\n",
|
||||
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
|
||||
}
|
||||
for (i = 0; i < NR_LRU_LISTS; i++)
|
||||
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
|
||||
(u64)acc.lru_pages[i] * PAGE_SIZE);
|
||||
|
||||
seq_printf(m, "slab_reclaimable %llu\n",
|
||||
(u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
|
||||
(u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
|
||||
seq_printf(m, "slab_unreclaimable %llu\n",
|
||||
(u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
|
||||
(u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
|
||||
|
||||
/* Accumulated memory events */
|
||||
|
||||
seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
|
||||
seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
|
||||
seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
|
||||
seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
|
||||
|
||||
seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
|
||||
seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
|
||||
events[PGSCAN_DIRECT]);
|
||||
seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
|
||||
events[PGSTEAL_DIRECT]);
|
||||
seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
|
||||
seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
|
||||
seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
|
||||
seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
|
||||
seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
|
||||
seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
|
||||
acc.events[PGSCAN_DIRECT]);
|
||||
seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
|
||||
acc.events[PGSTEAL_DIRECT]);
|
||||
seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
|
||||
seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
|
||||
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
|
||||
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
|
||||
|
||||
seq_printf(m, "workingset_refault %lu\n",
|
||||
stat[WORKINGSET_REFAULT]);
|
||||
acc.stat[WORKINGSET_REFAULT]);
|
||||
seq_printf(m, "workingset_activate %lu\n",
|
||||
stat[WORKINGSET_ACTIVATE]);
|
||||
acc.stat[WORKINGSET_ACTIVATE]);
|
||||
seq_printf(m, "workingset_nodereclaim %lu\n",
|
||||
stat[WORKINGSET_NODERECLAIM]);
|
||||
acc.stat[WORKINGSET_NODERECLAIM]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int memory_oom_group_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
|
||||
|
||||
seq_printf(m, "%d\n", memcg->oom_group);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes, loff_t off)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
||||
int ret, oom_group;
|
||||
|
||||
buf = strstrip(buf);
|
||||
if (!buf)
|
||||
return -EINVAL;
|
||||
|
||||
ret = kstrtoint(buf, 0, &oom_group);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (oom_group != 0 && oom_group != 1)
|
||||
return -EINVAL;
|
||||
|
||||
memcg->oom_group = oom_group;
|
||||
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
static struct cftype memory_files[] = {
|
||||
{
|
||||
.name = "current",
|
||||
@@ -5606,6 +5689,12 @@ static struct cftype memory_files[] = {
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.seq_show = memory_stat_show,
|
||||
},
|
||||
{
|
||||
.name = "oom.group",
|
||||
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
|
||||
.seq_show = memory_oom_group_show,
|
||||
.write = memory_oom_group_write,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
|
@@ -1167,7 +1167,7 @@ int memory_failure(unsigned long pfn, int flags)
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
@@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat)
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
unsigned long zholes_size[MAX_NR_ZONES] = {0};
unsigned long start_pfn = PFN_DOWN(start);

pgdat = NODE_DATA(nid);
@@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)

/* we can use NODE_DATA(nid) from here */

pgdat->node_id = nid;
pgdat->node_start_pfn = start_pfn;

/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_node(nid, zones_size, start_pfn, zholes_size);
free_area_init_core_hotplug(nid);
pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

/*
@@ -1016,19 +1017,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
*/
build_all_zonelists(pgdat);

/*
* zone->managed_pages is set to an approximate value in
* free_area_init_core(), which will cause
* /sys/device/system/node/nodeX/meminfo has wrong data.
* So reset it to 0 before any memory is onlined.
*/
reset_node_managed_pages(pgdat);

/*
* When memory is hot-added, all the memory is in offline state. So
* clear all zones' present_pages because they will be updated in
* online_pages() and offline_pages().
*/
reset_node_managed_pages(pgdat);
reset_node_present_pages(pgdat);

return pgdat;
@@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void)
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
return z->zone ? z->zone->node : node;
return z->zone ? zone_to_nid(z->zone) : node;
}

default:
@@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes);
polnid = z->zone->node;
polnid = zone_to_nid(z->zone);
break;

default:
@@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
goto put_new;

/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
@@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node);

/**
* mempool_init - initialize a memory pool
* @pool: pointer to the memory pool that should be initialized
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
@@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void)
zone->name);

/* Iterate the zonelist */
for_each_zone_zonelist(zone, z, zonelist, zoneid) {
#ifdef CONFIG_NUMA
pr_cont("%d:%s ", zone->node, zone->name);
#else
pr_cont("0:%s ", zone->name);
#endif /* CONFIG_NUMA */
}
for_each_zone_zonelist(zone, z, zonelist, zoneid)
pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
pr_cont("\n");
}
}
@@ -3063,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm)
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
mutex_lock(&oom_lock);
__oom_reap_task_mm(mm);
mutex_unlock(&oom_lock);
(void)__oom_reap_task_mm(mm);

set_bit(MMF_OOM_SKIP, &mm->flags);
down_write(&mm->mmap_sem);
@@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}

void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end,
bool blockable)
{
struct mmu_notifier *mn;
int ret = 0;
int id;

id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
mn->ops->invalidate_range_start(mn, mm, start, end);
if (mn->ops->invalidate_range_start) {
int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
mn->ops->invalidate_range_start, _ret,
!blockable ? "non-" : "");
ret = _ret;
}
}
}
srcu_read_unlock(&srcu, id);

return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
mm/oom_kill.c | 219
@@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
struct task_struct *p;
|
||||
struct task_struct *task;
|
||||
|
||||
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
|
||||
pr_info("Tasks state (memory values in pages):\n");
|
||||
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
|
||||
rcu_read_lock();
|
||||
for_each_process(p) {
|
||||
if (oom_unkillable_task(p, memcg, nodemask))
|
||||
@@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
continue;
|
||||
}
|
||||
|
||||
pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
|
||||
pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
|
||||
task->pid, from_kuid(&init_user_ns, task_uid(task)),
|
||||
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
|
||||
mm_pgtables_bytes(task->mm),
|
||||
@@ -487,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
|
||||
static struct task_struct *oom_reaper_list;
|
||||
static DEFINE_SPINLOCK(oom_reaper_lock);
|
||||
|
||||
void __oom_reap_task_mm(struct mm_struct *mm)
|
||||
bool __oom_reap_task_mm(struct mm_struct *mm)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
bool ret = true;
|
||||
|
||||
/*
|
||||
* Tell all users of get_user/copy_from_user etc... that the content
|
||||
@@ -519,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm)
|
||||
struct mmu_gather tlb;
|
||||
|
||||
tlb_gather_mmu(&tlb, mm, start, end);
|
||||
mmu_notifier_invalidate_range_start(mm, start, end);
|
||||
if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
|
||||
ret = false;
|
||||
continue;
|
||||
}
|
||||
unmap_page_range(&tlb, vma, start, end, NULL);
|
||||
mmu_notifier_invalidate_range_end(mm, start, end);
|
||||
tlb_finish_mmu(&tlb, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reaps the address space of the give task.
|
||||
*
|
||||
* Returns true on success and false if none or part of the address space
|
||||
* has been reclaimed and the caller should retry later.
|
||||
*/
|
||||
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
||||
{
|
||||
bool ret = true;
|
||||
|
||||
/*
|
||||
* We have to make sure to not race with the victim exit path
|
||||
* and cause premature new oom victim selection:
|
||||
* oom_reap_task_mm exit_mm
|
||||
* mmget_not_zero
|
||||
* mmput
|
||||
* atomic_dec_and_test
|
||||
* exit_oom_victim
|
||||
* [...]
|
||||
* out_of_memory
|
||||
* select_bad_process
|
||||
* # no TIF_MEMDIE task selects new victim
|
||||
* unmap_page_range # frees some memory
|
||||
*/
|
||||
mutex_lock(&oom_lock);
|
||||
|
||||
if (!down_read_trylock(&mm->mmap_sem)) {
|
||||
ret = false;
|
||||
trace_skip_task_reaping(tsk->pid);
|
||||
goto unlock_oom;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the mm has invalidate_{start,end}() notifiers that could block,
|
||||
* sleep to give the oom victim some more time.
|
||||
* TODO: we really want to get rid of this ugly hack and make sure that
|
||||
* notifiers cannot block for unbounded amount of time
|
||||
*/
|
||||
if (mm_has_blockable_invalidate_notifiers(mm)) {
|
||||
up_read(&mm->mmap_sem);
|
||||
schedule_timeout_idle(HZ);
|
||||
goto unlock_oom;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -572,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
||||
* down_write();up_write() cycle in exit_mmap().
|
||||
*/
|
||||
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
|
||||
up_read(&mm->mmap_sem);
|
||||
trace_skip_task_reaping(tsk->pid);
|
||||
goto unlock_oom;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
trace_start_task_reaping(tsk->pid);
|
||||
|
||||
__oom_reap_task_mm(mm);
|
||||
/* failed to reap part of the address space. Try again later */
|
||||
ret = __oom_reap_task_mm(mm);
|
||||
if (!ret)
|
||||
goto out_finish;
|
||||
|
||||
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
||||
task_pid_nr(tsk), tsk->comm,
|
||||
K(get_mm_counter(mm, MM_ANONPAGES)),
|
||||
K(get_mm_counter(mm, MM_FILEPAGES)),
|
||||
K(get_mm_counter(mm, MM_SHMEMPAGES)));
|
||||
out_finish:
|
||||
trace_finish_task_reaping(tsk->pid);
|
||||
out_unlock:
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
trace_finish_task_reaping(tsk->pid);
|
||||
unlock_oom:
|
||||
mutex_unlock(&oom_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -843,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void oom_kill_process(struct oom_control *oc, const char *message)
|
||||
static void __oom_kill_process(struct task_struct *victim)
|
||||
{
|
||||
struct task_struct *p = oc->chosen;
|
||||
unsigned int points = oc->chosen_points;
|
||||
struct task_struct *victim = p;
|
||||
struct task_struct *child;
|
||||
struct task_struct *t;
|
||||
struct task_struct *p;
|
||||
struct mm_struct *mm;
|
||||
unsigned int victim_points = 0;
|
||||
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
bool can_oom_reap = true;
|
||||
|
||||
/*
|
||||
* If the task is already exiting, don't alarm the sysadmin or kill
|
||||
* its children or threads, just give it access to memory reserves
|
||||
* so it can die quickly
|
||||
*/
|
||||
task_lock(p);
|
||||
if (task_will_free_mem(p)) {
|
||||
mark_oom_victim(p);
|
||||
wake_oom_reaper(p);
|
||||
task_unlock(p);
|
||||
put_task_struct(p);
|
||||
return;
|
||||
}
|
||||
task_unlock(p);
|
||||
|
||||
if (__ratelimit(&oom_rs))
|
||||
dump_header(oc, p);
|
||||
|
||||
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
||||
message, task_pid_nr(p), p->comm, points);
|
||||
|
||||
/*
|
||||
* If any of p's children has a different mm and is eligible for kill,
|
||||
* the one with the highest oom_badness() score is sacrificed for its
|
||||
* parent. This attempts to lose the minimal amount of work done while
|
||||
* still freeing memory.
|
||||
*/
|
||||
read_lock(&tasklist_lock);
|
||||
for_each_thread(p, t) {
|
||||
list_for_each_entry(child, &t->children, sibling) {
|
||||
unsigned int child_points;
|
||||
|
||||
if (process_shares_mm(child, p->mm))
|
||||
continue;
|
||||
/*
|
||||
* oom_badness() returns 0 if the thread is unkillable
|
||||
*/
|
||||
child_points = oom_badness(child,
|
||||
oc->memcg, oc->nodemask, oc->totalpages);
|
||||
if (child_points > victim_points) {
|
||||
put_task_struct(victim);
|
||||
victim = child;
|
||||
victim_points = child_points;
|
||||
get_task_struct(victim);
|
||||
}
|
||||
}
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
p = find_lock_task_mm(victim);
|
||||
if (!p) {
|
||||
put_task_struct(victim);
|
||||
@@ -978,6 +908,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
|
||||
}
|
||||
#undef K
|
||||
|
||||
/*
|
||||
* Kill provided task unless it's secured by setting
|
||||
* oom_score_adj to OOM_SCORE_ADJ_MIN.
|
||||
*/
|
||||
static int oom_kill_memcg_member(struct task_struct *task, void *unused)
|
||||
{
|
||||
if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
||||
get_task_struct(task);
|
||||
__oom_kill_process(task);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void oom_kill_process(struct oom_control *oc, const char *message)
|
||||
{
|
||||
struct task_struct *p = oc->chosen;
|
||||
unsigned int points = oc->chosen_points;
|
||||
struct task_struct *victim = p;
|
||||
struct task_struct *child;
|
||||
struct task_struct *t;
|
||||
struct mem_cgroup *oom_group;
|
||||
unsigned int victim_points = 0;
|
||||
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
/*
|
||||
* If the task is already exiting, don't alarm the sysadmin or kill
|
||||
* its children or threads, just give it access to memory reserves
|
||||
* so it can die quickly
|
||||
*/
|
||||
task_lock(p);
|
||||
if (task_will_free_mem(p)) {
|
||||
mark_oom_victim(p);
|
||||
wake_oom_reaper(p);
|
||||
task_unlock(p);
|
||||
put_task_struct(p);
|
||||
return;
|
||||
}
|
||||
task_unlock(p);
|
||||
|
||||
if (__ratelimit(&oom_rs))
|
||||
dump_header(oc, p);
|
||||
|
||||
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
||||
message, task_pid_nr(p), p->comm, points);
|
||||
|
||||
/*
|
||||
* If any of p's children has a different mm and is eligible for kill,
|
||||
* the one with the highest oom_badness() score is sacrificed for its
|
||||
* parent. This attempts to lose the minimal amount of work done while
|
||||
* still freeing memory.
|
||||
*/
|
||||
read_lock(&tasklist_lock);
|
||||
for_each_thread(p, t) {
|
||||
list_for_each_entry(child, &t->children, sibling) {
|
||||
unsigned int child_points;
|
||||
|
||||
if (process_shares_mm(child, p->mm))
|
||||
continue;
|
||||
/*
|
||||
* oom_badness() returns 0 if the thread is unkillable
|
||||
*/
|
||||
child_points = oom_badness(child,
|
||||
oc->memcg, oc->nodemask, oc->totalpages);
|
||||
if (child_points > victim_points) {
|
||||
put_task_struct(victim);
|
||||
victim = child;
|
||||
victim_points = child_points;
|
||||
get_task_struct(victim);
|
||||
}
|
||||
}
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
/*
|
||||
* Do we need to kill the entire memory cgroup?
|
||||
* Or even one of the ancestor memory cgroups?
|
||||
* Check this out before killing the victim task.
|
||||
*/
|
||||
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
|
||||
|
||||
__oom_kill_process(victim);
|
||||
|
||||
/*
|
||||
* If necessary, kill all tasks in the selected memory cgroup.
|
||||
*/
|
||||
if (oom_group) {
|
||||
mem_cgroup_print_oom_group(oom_group);
|
||||
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
|
||||
mem_cgroup_put(oom_group);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
||||
*/
|
||||
|
mm/page_alloc.c | 160
@@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
|
||||
if (!static_branch_likely(&vm_numa_stat_key))
|
||||
return;
|
||||
|
||||
if (z->node != numa_node_id())
|
||||
if (zone_to_nid(z) != numa_node_id())
|
||||
local_stat = NUMA_OTHER;
|
||||
|
||||
if (z->node == preferred_zone->node)
|
||||
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
|
||||
__inc_numa_state(z, NUMA_HIT);
|
||||
else {
|
||||
__inc_numa_state(z, NUMA_MISS);
|
||||
@@ -5278,7 +5278,7 @@ int local_memory_node(int node)
|
||||
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
|
||||
gfp_zone(GFP_KERNEL),
|
||||
NULL);
|
||||
return z->zone->node;
|
||||
return zone_to_nid(z->zone);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -6120,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
|
||||
return usemapsize / 8;
|
||||
}
|
||||
|
||||
static void __init setup_usemap(struct pglist_data *pgdat,
|
||||
static void __ref setup_usemap(struct pglist_data *pgdat,
|
||||
struct zone *zone,
|
||||
unsigned long zone_start_pfn,
|
||||
unsigned long zonesize)
|
||||
@@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
|
||||
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
|
||||
|
||||
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
|
||||
void __paginginit set_pageblock_order(void)
|
||||
void __init set_pageblock_order(void)
|
||||
{
|
||||
unsigned int order;
|
||||
|
||||
@@ -6168,14 +6168,14 @@ void __paginginit set_pageblock_order(void)
|
||||
* include/linux/pageblock-flags.h for the values of pageblock_order based on
|
||||
* the kernel config
|
||||
*/
|
||||
void __paginginit set_pageblock_order(void)
|
||||
void __init set_pageblock_order(void)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
|
||||
|
||||
static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
|
||||
unsigned long present_pages)
|
||||
static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
|
||||
unsigned long present_pages)
|
||||
{
|
||||
unsigned long pages = spanned_pages;
|
||||
|
||||
@@ -6194,6 +6194,84 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
|
||||
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static void pgdat_init_numabalancing(struct pglist_data *pgdat)
|
||||
{
|
||||
spin_lock_init(&pgdat->numabalancing_migrate_lock);
|
||||
pgdat->numabalancing_migrate_nr_pages = 0;
|
||||
pgdat->numabalancing_migrate_next_window = jiffies;
|
||||
}
|
||||
#else
|
||||
static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
static void pgdat_init_split_queue(struct pglist_data *pgdat)
|
||||
{
|
||||
spin_lock_init(&pgdat->split_queue_lock);
|
||||
INIT_LIST_HEAD(&pgdat->split_queue);
|
||||
pgdat->split_queue_len = 0;
|
||||
}
|
||||
#else
|
||||
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
|
||||
{
|
||||
init_waitqueue_head(&pgdat->kcompactd_wait);
|
||||
}
|
||||
#else
|
||||
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
|
||||
#endif
|
||||
|
||||
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
|
||||
{
|
||||
pgdat_resize_init(pgdat);
|
||||
|
||||
pgdat_init_numabalancing(pgdat);
|
||||
pgdat_init_split_queue(pgdat);
|
||||
pgdat_init_kcompactd(pgdat);
|
||||
|
||||
init_waitqueue_head(&pgdat->kswapd_wait);
|
||||
init_waitqueue_head(&pgdat->pfmemalloc_wait);
|
||||
|
||||
pgdat_page_ext_init(pgdat);
|
||||
spin_lock_init(&pgdat->lru_lock);
|
||||
lruvec_init(node_lruvec(pgdat));
|
||||
}
|
||||
|
||||
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
|
||||
unsigned long remaining_pages)
|
||||
{
|
||||
zone->managed_pages = remaining_pages;
|
||||
zone_set_nid(zone, nid);
|
||||
zone->name = zone_names[idx];
|
||||
zone->zone_pgdat = NODE_DATA(nid);
|
||||
spin_lock_init(&zone->lock);
|
||||
zone_seqlock_init(zone);
|
||||
zone_pcp_init(zone);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up the zone data structures
|
||||
* - init pgdat internals
|
||||
* - init all zones belonging to this node
|
||||
*
|
||||
* NOTE: this function is only called during memory hotplug
|
||||
*/
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
void __ref free_area_init_core_hotplug(int nid)
|
||||
{
|
||||
enum zone_type z;
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
|
||||
pgdat_init_internals(pgdat);
|
||||
for (z = 0; z < MAX_NR_ZONES; z++)
|
||||
zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Set up the zone data structures:
|
||||
* - mark all pages reserved
|
||||
@@ -6201,32 +6279,14 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
|
||||
* - clear the memory bitmaps
|
||||
*
|
||||
* NOTE: pgdat should get zeroed by caller.
|
||||
* NOTE: this function is only called during early init.
|
||||
*/
|
||||
static void __paginginit free_area_init_core(struct pglist_data *pgdat)
|
||||
static void __init free_area_init_core(struct pglist_data *pgdat)
|
||||
{
|
||||
enum zone_type j;
|
||||
int nid = pgdat->node_id;
|
||||
|
||||
pgdat_resize_init(pgdat);
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
spin_lock_init(&pgdat->numabalancing_migrate_lock);
|
||||
pgdat->numabalancing_migrate_nr_pages = 0;
|
||||
pgdat->numabalancing_migrate_next_window = jiffies;
|
||||
#endif
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
spin_lock_init(&pgdat->split_queue_lock);
|
||||
INIT_LIST_HEAD(&pgdat->split_queue);
|
||||
pgdat->split_queue_len = 0;
|
||||
#endif
|
||||
init_waitqueue_head(&pgdat->kswapd_wait);
|
||||
init_waitqueue_head(&pgdat->pfmemalloc_wait);
|
||||
#ifdef CONFIG_COMPACTION
|
||||
init_waitqueue_head(&pgdat->kcompactd_wait);
|
||||
#endif
|
||||
pgdat_page_ext_init(pgdat);
|
||||
spin_lock_init(&pgdat->lru_lock);
|
||||
lruvec_init(node_lruvec(pgdat));
|
||||
|
||||
pgdat_init_internals(pgdat);
|
||||
pgdat->per_cpu_nodestats = &boot_nodestats;
|
||||
|
||||
for (j = 0; j < MAX_NR_ZONES; j++) {
|
||||
@@ -6274,15 +6334,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
|
||||
* when the bootmem allocator frees pages into the buddy system.
|
||||
* And all highmem pages will be managed by the buddy system.
|
||||
*/
|
||||
zone->managed_pages = freesize;
|
||||
#ifdef CONFIG_NUMA
|
||||
zone->node = nid;
|
||||
#endif
|
||||
zone->name = zone_names[j];
|
||||
zone->zone_pgdat = pgdat;
|
||||
spin_lock_init(&zone->lock);
|
||||
zone_seqlock_init(zone);
|
||||
zone_pcp_init(zone);
|
||||
zone_init_internals(zone, j, nid, freesize);
|
||||
|
||||
if (!size)
|
||||
continue;
|
||||
@@ -6342,8 +6394,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
|
||||
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
|
||||
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
|
||||
|
||||
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
unsigned long node_start_pfn, unsigned long *zholes_size)
|
||||
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
|
||||
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
|
||||
{
|
||||
/*
|
||||
* We start only with one section of pages, more pages are added as
|
||||
* needed until the rest of deferred pages are initialized.
|
||||
*/
|
||||
pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
|
||||
pgdat->node_spanned_pages);
|
||||
pgdat->first_deferred_pfn = ULONG_MAX;
|
||||
}
|
||||
#else
|
||||
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
|
||||
#endif
|
||||
|
||||
void __init free_area_init_node(int nid, unsigned long *zones_size,
|
||||
unsigned long node_start_pfn,
|
||||
unsigned long *zholes_size)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
unsigned long start_pfn = 0;
|
||||
@@ -6367,16 +6435,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
zones_size, zholes_size);
|
||||
|
||||
alloc_node_mem_map(pgdat);
|
||||
pgdat_set_deferred_range(pgdat);
|
||||
|
||||
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
|
||||
/*
|
||||
* We start only with one section of pages, more pages are added as
|
||||
* needed until the rest of deferred pages are initialized.
|
||||
*/
|
||||
pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
|
||||
pgdat->node_spanned_pages);
|
||||
pgdat->first_deferred_pfn = ULONG_MAX;
|
||||
#endif
|
||||
free_area_init_core(pgdat);
|
||||
}
|
||||
|
||||
@@ -6388,7 +6448,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
* may be accessed (for example page_to_pfn() on some configuration accesses
|
||||
* flags). We must explicitly zero those struct pages.
|
||||
*/
|
||||
void __paginginit zero_resv_unavail(void)
|
||||
void __init zero_resv_unavail(void)
|
||||
{
|
||||
phys_addr_t start, end;
|
||||
unsigned long pfn;
|
||||
|
mm/percpu.c | 29
@@ -169,6 +169,14 @@ static LIST_HEAD(pcpu_map_extend_chunks);
*/
int pcpu_nr_empty_pop_pages;

/*
* The number of populated pages in use by the allocator, protected by
* pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets
* allocated/deallocated, it is allocated/deallocated in all units of a chunk
* and increments/decrements this count by 1).
*/
static unsigned long pcpu_nr_populated;

/*
* Balance work is used to populate or destroy chunks asynchronously. We
* try to keep the number of populated free pages between
@@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,

bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
pcpu_nr_populated += nr;

if (!for_alloc) {
chunk->nr_empty_pop_pages += nr;
@@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
chunk->nr_populated -= nr;
chunk->nr_empty_pop_pages -= nr;
pcpu_nr_empty_pop_pages -= nr;
pcpu_nr_populated -= nr;
}

/*
@@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);

/* include all regions of the first chunk */
pcpu_nr_populated += PFN_DOWN(size_sum);

pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(base_addr);

@@ -2745,6 +2758,22 @@ void __init setup_per_cpu_areas(void)

#endif /* CONFIG_SMP */

/*
* pcpu_nr_pages - calculate total number of populated backing pages
*
* This reflects the number of pages populated to back chunks. Metadata is
* excluded in the number exposed in meminfo as the number of backing pages
* scales with the number of cpus and can quickly outweigh the memory used for
* metadata. It also keeps this calculation nice and simple.
*
* RETURNS:
* Total number of populated backing pages in use by the allocator.
*/
unsigned long pcpu_nr_pages(void)
{
return pcpu_nr_populated * pcpu_nr_units;
}

/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
@@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
struct shmem_inode_info *info, pgoff_t index)
{
/* Create a pseudo vma that just contains the policy */
memset(vma, 0, sizeof(*vma));
vma_init(vma, NULL);
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)

cache->cur = 0;
if (swap_slot_cache_active)
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
cache->slots);
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
cache->slots, 1);

return cache->nr;
}
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)

if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
get_swap_pages(1, true, &entry);
get_swap_pages(1, &entry, HPAGE_PMD_NR);
goto out;
}

@@ -350,7 +350,7 @@ repeat:
goto out;
}

get_swap_pages(1, false, &entry);
get_swap_pages(1, &entry, 1);
out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
mm/swapfile.c | 195
@@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
|
||||
|
||||
#define swap_entry_size(size) (size)
|
||||
#else
|
||||
#define SWAPFILE_CLUSTER 256
|
||||
|
||||
/*
|
||||
* Define swap_entry_size() as constant to let compiler to optimize
|
||||
* out some code if !CONFIG_THP_SWAP
|
||||
*/
|
||||
#define swap_entry_size(size) 1
|
||||
#endif
|
||||
#define LATENCY_LIMIT 256
|
||||
|
||||
@@ -269,7 +277,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
|
||||
|
||||
static inline bool cluster_is_huge(struct swap_cluster_info *info)
|
||||
{
|
||||
return info->flags & CLUSTER_FLAG_HUGE;
|
||||
if (IS_ENABLED(CONFIG_THP_SWAP))
|
||||
return info->flags & CLUSTER_FLAG_HUGE;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void cluster_clear_huge(struct swap_cluster_info *info)
|
||||
@@ -296,13 +306,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
|
||||
spin_unlock(&ci->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the locking method in use for this device. Return
|
||||
* swap_cluster_info if SSD-style cluster-based locking is in place.
|
||||
*/
|
||||
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
|
||||
struct swap_info_struct *si,
|
||||
unsigned long offset)
|
||||
struct swap_info_struct *si, unsigned long offset)
|
||||
{
|
||||
struct swap_cluster_info *ci;
|
||||
|
||||
/* Try to use fine-grained SSD-style locking if available: */
|
||||
ci = lock_cluster(si, offset);
|
||||
/* Otherwise, fall back to traditional, coarse locking: */
|
||||
if (!ci)
|
||||
spin_lock(&si->lock);
|
||||
|
||||
@@ -863,7 +878,6 @@ no_page:
|
||||
return n_ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
|
||||
{
|
||||
unsigned long idx;
|
||||
@@ -871,6 +885,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
|
||||
unsigned long offset, i;
|
||||
unsigned char *map;
|
||||
|
||||
/*
|
||||
* Should not even be attempting cluster allocations when huge
|
||||
* page swap is disabled. Warn and fail the allocation.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_THP_SWAP)) {
|
||||
VM_WARN_ON_ONCE(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (cluster_list_empty(&si->free_clusters))
|
||||
return 0;
|
||||
|
||||
@@ -901,13 +924,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
|
||||
unlock_cluster(ci);
|
||||
swap_range_free(si, offset, SWAPFILE_CLUSTER);
|
||||
}
|
||||
#else
|
||||
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
|
||||
{
|
||||
VM_WARN_ON_ONCE(1);
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_THP_SWAP */
|
||||
|
||||
static unsigned long scan_swap_map(struct swap_info_struct *si,
|
||||
unsigned char usage)
|
||||
@@ -924,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
|
||||
|
||||
}
|
||||
|
||||
int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
|
||||
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
|
||||
{
|
||||
unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
|
||||
unsigned long size = swap_entry_size(entry_size);
|
||||
struct swap_info_struct *si, *next;
|
||||
long avail_pgs;
|
||||
int n_ret = 0;
|
||||
int node;
|
||||
|
||||
/* Only single cluster request supported */
|
||||
WARN_ON_ONCE(n_goal > 1 && cluster);
|
||||
WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
|
||||
|
||||
avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
|
||||
avail_pgs = atomic_long_read(&nr_swap_pages) / size;
|
||||
if (avail_pgs <= 0)
|
||||
goto noswap;
|
||||
|
||||
@@ -945,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
|
||||
if (n_goal > avail_pgs)
|
||||
n_goal = avail_pgs;
|
||||
|
||||
atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
|
||||
atomic_long_sub(n_goal * size, &nr_swap_pages);
|
||||
|
||||
spin_lock(&swap_avail_lock);
|
||||
|
||||
@@ -972,14 +988,14 @@ start_over:
|
||||
spin_unlock(&si->lock);
|
||||
goto nextsi;
|
||||
}
|
||||
if (cluster) {
|
||||
if (size == SWAPFILE_CLUSTER) {
|
||||
if (!(si->flags & SWP_FILE))
|
||||
n_ret = swap_alloc_cluster(si, swp_entries);
|
||||
} else
|
||||
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
|
||||
n_goal, swp_entries);
|
||||
spin_unlock(&si->lock);
|
||||
if (n_ret || cluster)
|
||||
if (n_ret || size == SWAPFILE_CLUSTER)
|
||||
goto check_out;
|
||||
pr_debug("scan_swap_map of si %d failed to find offset\n",
|
||||
si->type);
|
||||
@@ -1005,7 +1021,7 @@ nextsi:
|
||||
|
||||
check_out:
|
||||
if (n_ret < n_goal)
|
||||
atomic_long_add((long)(n_goal - n_ret) * nr_pages,
|
||||
atomic_long_add((long)(n_goal - n_ret) * size,
|
||||
&nr_swap_pages);
|
||||
noswap:
|
||||
return n_ret;
|
||||
@@ -1107,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
|
||||
return p;
|
||||
}
|
||||
|
||||
static unsigned char __swap_entry_free(struct swap_info_struct *p,
|
||||
swp_entry_t entry, unsigned char usage)
|
||||
static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
|
||||
unsigned long offset,
|
||||
unsigned char usage)
|
||||
{
|
||||
struct swap_cluster_info *ci;
|
||||
unsigned long offset = swp_offset(entry);
|
||||
unsigned char count;
|
||||
unsigned char has_cache;
|
||||
|
||||
ci = lock_cluster_or_swap_info(p, offset);
|
||||
|
||||
count = p->swap_map[offset];
|
||||
|
||||
has_cache = count & SWAP_HAS_CACHE;
|
||||
@@ -1144,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
|
||||
usage = count | has_cache;
|
||||
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
|
||||
|
||||
return usage;
|
||||
}
|
||||
|
||||
static unsigned char __swap_entry_free(struct swap_info_struct *p,
|
||||
swp_entry_t entry, unsigned char usage)
|
||||
{
|
||||
struct swap_cluster_info *ci;
|
||||
unsigned long offset = swp_offset(entry);
|
||||
|
||||
ci = lock_cluster_or_swap_info(p, offset);
|
||||
usage = __swap_entry_free_locked(p, offset, usage);
|
||||
unlock_cluster_or_swap_info(p, ci);
|
||||
|
||||
return usage;
|
||||
@@ -1184,19 +1208,7 @@ void swap_free(swp_entry_t entry)
|
||||
/*
|
||||
* Called after dropping swapcache to decrease refcnt to swap entries.
|
||||
*/
|
||||
static void swapcache_free(swp_entry_t entry)
|
||||
{
|
||||
struct swap_info_struct *p;
|
||||
|
||||
p = _swap_info_get(entry);
|
||||
if (p) {
|
||||
if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
|
||||
free_swap_slot(entry);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
static void swapcache_free_cluster(swp_entry_t entry)
|
||||
void put_swap_page(struct page *page, swp_entry_t entry)
|
||||
{
|
||||
unsigned long offset = swp_offset(entry);
|
||||
unsigned long idx = offset / SWAPFILE_CLUSTER;
|
||||
@@ -1205,42 +1217,48 @@ static void swapcache_free_cluster(swp_entry_t entry)
|
||||
unsigned char *map;
|
||||
unsigned int i, free_entries = 0;
|
||||
unsigned char val;
|
||||
int size = swap_entry_size(hpage_nr_pages(page));
|
||||
|
||||
si = _swap_info_get(entry);
|
||||
if (!si)
|
||||
return;
|
||||
|
||||
ci = lock_cluster(si, offset);
|
||||
VM_BUG_ON(!cluster_is_huge(ci));
|
||||
map = si->swap_map + offset;
|
||||
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
|
||||
val = map[i];
|
||||
VM_BUG_ON(!(val & SWAP_HAS_CACHE));
|
||||
if (val == SWAP_HAS_CACHE)
|
||||
free_entries++;
|
||||
}
|
||||
if (!free_entries) {
|
||||
for (i = 0; i < SWAPFILE_CLUSTER; i++)
|
||||
map[i] &= ~SWAP_HAS_CACHE;
|
||||
}
|
||||
cluster_clear_huge(ci);
|
||||
unlock_cluster(ci);
|
||||
if (free_entries == SWAPFILE_CLUSTER) {
|
||||
spin_lock(&si->lock);
|
||||
ci = lock_cluster(si, offset);
|
||||
memset(map, 0, SWAPFILE_CLUSTER);
|
||||
unlock_cluster(ci);
|
||||
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
|
||||
swap_free_cluster(si, idx);
|
||||
spin_unlock(&si->lock);
|
||||
} else if (free_entries) {
|
||||
for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
|
||||
if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
|
||||
free_swap_slot(entry);
|
||||
ci = lock_cluster_or_swap_info(si, offset);
|
||||
if (size == SWAPFILE_CLUSTER) {
|
||||
VM_BUG_ON(!cluster_is_huge(ci));
|
||||
map = si->swap_map + offset;
|
||||
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
|
||||
val = map[i];
|
||||
VM_BUG_ON(!(val & SWAP_HAS_CACHE));
|
||||
if (val == SWAP_HAS_CACHE)
|
||||
free_entries++;
|
||||
}
|
||||
cluster_clear_huge(ci);
|
||||
if (free_entries == SWAPFILE_CLUSTER) {
|
||||
unlock_cluster_or_swap_info(si, ci);
|
||||
spin_lock(&si->lock);
|
||||
ci = lock_cluster(si, offset);
|
||||
memset(map, 0, SWAPFILE_CLUSTER);
|
||||
unlock_cluster(ci);
|
||||
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
|
||||
swap_free_cluster(si, idx);
|
||||
spin_unlock(&si->lock);
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < size; i++, entry.val++) {
|
||||
if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
|
||||
unlock_cluster_or_swap_info(si, ci);
|
||||
free_swap_slot(entry);
|
||||
if (i == size - 1)
|
||||
return;
|
||||
lock_cluster_or_swap_info(si, offset);
|
||||
}
|
||||
}
|
||||
unlock_cluster_or_swap_info(si, ci);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
int split_swap_cluster(swp_entry_t entry)
|
||||
{
|
||||
struct swap_info_struct *si;
|
||||
@@ -1255,19 +1273,7 @@ int split_swap_cluster(swp_entry_t entry)
|
||||
unlock_cluster(ci);
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline void swapcache_free_cluster(swp_entry_t entry)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_THP_SWAP */
|
||||
|
||||
void put_swap_page(struct page *page, swp_entry_t entry)
|
||||
{
|
||||
if (!PageTransHuge(page))
|
||||
swapcache_free(entry);
|
||||
else
|
||||
swapcache_free_cluster(entry);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int swp_entry_cmp(const void *ent1, const void *ent2)
|
||||
{
|
||||
@@ -1409,7 +1415,6 @@ out:
|
||||
return count;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
|
||||
swp_entry_t entry)
|
||||
{
|
||||
@@ -1422,12 +1427,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
|
||||
|
||||
ci = lock_cluster_or_swap_info(si, offset);
|
||||
if (!ci || !cluster_is_huge(ci)) {
|
||||
if (map[roffset] != SWAP_HAS_CACHE)
|
||||
if (swap_count(map[roffset]))
|
||||
ret = true;
|
||||
goto unlock_out;
|
||||
}
|
||||
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
|
||||
if (map[offset + i] != SWAP_HAS_CACHE) {
|
||||
if (swap_count(map[offset + i])) {
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
@@ -1442,7 +1447,7 @@ static bool page_swapped(struct page *page)
|
||||
swp_entry_t entry;
|
||||
struct swap_info_struct *si;
|
||||
|
||||
if (likely(!PageTransCompound(page)))
|
||||
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
|
||||
return page_swapcount(page) != 0;
|
||||
|
||||
page = compound_head(page);
|
||||
@@ -1466,10 +1471,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
/* hugetlbfs shouldn't call it */
|
||||
VM_BUG_ON_PAGE(PageHuge(page), page);
|
||||
|
||||
if (likely(!PageTransCompound(page))) {
|
||||
mapcount = atomic_read(&page->_mapcount) + 1;
|
||||
if (total_mapcount)
|
||||
*total_mapcount = mapcount;
|
||||
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
|
||||
mapcount = page_trans_huge_mapcount(page, total_mapcount);
|
||||
if (PageSwapCache(page))
|
||||
swapcount = page_swapcount(page);
|
||||
if (total_swapcount)
|
||||
@@ -1516,26 +1519,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
|
||||
return map_swapcount;
|
||||
}
|
||||
#else
|
||||
#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
|
||||
#define page_swapped(page) (page_swapcount(page) != 0)
|
||||
|
||||
static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
int *total_swapcount)
|
||||
{
|
||||
int mapcount, swapcount = 0;
|
||||
|
||||
/* hugetlbfs shouldn't call it */
|
||||
VM_BUG_ON_PAGE(PageHuge(page), page);
|
||||
|
||||
mapcount = page_trans_huge_mapcount(page, total_mapcount);
|
||||
if (PageSwapCache(page))
|
||||
swapcount = page_swapcount(page);
|
||||
if (total_swapcount)
|
||||
*total_swapcount = swapcount;
|
||||
return mapcount + swapcount;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We can write to an anon page without COW if there are no other references
|
||||
|
@@ -408,7 +408,8 @@ void register_shrinker_prepared(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
#ifdef CONFIG_MEMCG_KMEM
idr_replace(&shrinker_idr, shrinker, shrinker->id);
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
up_write(&shrinker_rwsem);
}
@@ -902,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;