Merge branch 'akpm' (patches from Andrew Morton)

Merge first patch-bomb from Andrew Morton:
 "Quite a lot of other stuff is banked up awaiting further
  next->mainline merging, but this batch contains:

   - Lots of random misc patches
   - OCFS2
   - Most of MM
   - backlight updates
   - lib/ updates
   - printk updates
   - checkpatch updates
   - epoll tweaking
   - rtc updates
   - hfs
   - hfsplus
   - documentation
   - procfs
   - update gcov to gcc-4.7 format
   - IPC"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (269 commits)
  ipc, msg: fix message length check for negative values
  ipc/util.c: remove unnecessary work pending test
  devpts: plug the memory leak in kill_sb
  ./Makefile: export initial ramdisk compression config option
  init/Kconfig: add option to disable kernel compression
  drivers: w1: make w1_slave::flags long to avoid memory corruption
  drivers/w1/masters/ds1wm.cuse dev_get_platdata()
  drivers/memstick/core/ms_block.c: fix unreachable state in h_msb_read_page()
  drivers/memstick/core/mspro_block.c: fix attributes array allocation
  drivers/pps/clients/pps-gpio.c: remove redundant of_match_ptr
  kernel/panic.c: reduce 1 byte usage for print tainted buffer
  gcov: reuse kbasename helper
  kernel/gcov/fs.c: use pr_warn()
  kernel/module.c: use pr_foo()
  gcov: compile specific gcov implementation based on gcc version
  gcov: add support for gcc 4.7 gcov format
  gcov: move gcov structs definitions to a gcc version specific file
  kernel/taskstats.c: return -ENOMEM when alloc memory fails in add_del_listener()
  kernel/taskstats.c: add nla_nest_cancel() for failure processing between nla_nest_start() and nla_nest_end()
  kernel/sysctl_binary.c: use scnprintf() instead of snprintf()
  ...
This commit is contained in:
Linus Torvalds
2013-11-13 15:45:43 +09:00
287 changed files with 5004 additions and 2378 deletions

View File

@@ -153,11 +153,18 @@ config MOVABLE_NODE
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
memory device cannot be hotplugged. This option allows users to
online all the memory of a node as movable memory so that the whole
node can be hotplugged. Users who don't use the memory hotplug
feature are fine with this option on since they don't online memory
as movable.
memory device cannot be hotplugged. This option allows the following
two things:
- When the system is booting, node full of hotpluggable memory can
be arranged to have only movable memory so that the whole node can
be hot-removed. (need movable_node boot option specified).
- After the system is up, the option allows users to online all the
memory of a node as movable memory so that the whole node can be
hot-removed.
Users who don't use the memory hotplug feature are fine with this
option on since they don't specify movable_node boot option or they
don't online memory as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.

View File

@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
unsigned long start, end, pages, count = 0;
unsigned long *map, start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
map = bdata->node_bootmem_map;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
bdata - bootmem_node_data, start, end);
while (start < end) {
unsigned long *map, idx, vec;
unsigned long idx, vec;
unsigned shift;
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* update goal according ...MAX_DMA32_PFN */
end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
end_pfn = pgdat_end_pfn(pgdat);
if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
(goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {

View File

@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page)
}
/*
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
* pages inside of the pageblock (even though it may still end up isolating
* some pages).
* Isolate free pages onto a private freelist. If @strict is true, will abort
* returning 0 on any invalid PFNs or non-free pages inside of the pageblock
* (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long blockpfn,

View File

@@ -27,11 +27,12 @@
#include "internal.h"
/*
* By default transparent hugepage support is enabled for all mappings
* and khugepaged scans all mappings. Defrag is only invoked by
* khugepaged hugepage allocations and by page faults inside
* MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
* allocations.
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
* benefit. When transparent hugepage support is enabled, is for all mappings,
* and khugepaged scans all mappings.
* Defrag is invoked by khugepaged hugepage allocations and by page faults
* for all hugepage allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -758,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
HPAGE_PMD_ORDER);
}
#endif
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -2198,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
static int khugepaged_node_load[MAX_NUMNODES];
#ifdef CONFIG_NUMA
static int khugepaged_find_target_node(void)
{
static int last_khugepaged_target_node = NUMA_NO_NODE;
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
if (khugepaged_node_load[nid] > max_value) {
max_value = khugepaged_node_load[nid];
target_node = nid;
}
/* do some balance if several nodes have the same hit record */
if (target_node <= last_khugepaged_target_node)
for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
nid++)
if (max_value == khugepaged_node_load[nid]) {
target_node = nid;
break;
}
last_khugepaged_target_node = target_node;
return target_node;
}
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2232,9 +2252,8 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
node, __GFP_OTHER_NODE);
*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2250,6 +2269,17 @@ static struct page
return *hpage;
}
#else
static int khugepaged_find_target_node(void)
{
return 0;
}
static inline struct page *alloc_hugepage(int defrag)
{
return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
HPAGE_PMD_ORDER);
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2456,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2472,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
* Chose the node of the first page. This could
* be more sophisticated and look at more pages,
* but isn't for now.
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
* Khupaged will allocate hugepage from the node has the max
* hit record.
*/
if (node == NUMA_NO_NODE)
node = page_to_nid(page);
node = page_to_nid(page);
khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2492,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret)
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
return ret;
}

View File

@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}
spin_lock_irqsave(&object->lock, flags);
if (ptr + size > object->pointer + object->size) {
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);

View File

@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
* Allocate stable and unstable together:
* MAXSMP NODES_SHIFT 10 will use 16kB.
*/
buf = kcalloc(nr_node_ids + nr_node_ids,
sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
GFP_KERNEL);
/* Let us assume that RB_ROOT is NULL is zero */
if (!buf)
err = -ENOMEM;

View File

@@ -20,6 +20,8 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <asm-generic/sections.h>
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
@@ -82,33 +85,57 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
return (i < type->cnt) ? i : -1;
}
/**
* memblock_find_in_range_node - find free area in given range and node
/*
* __memblock_find_range_bottom_up - find free area utility in bottom-up
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %MAX_NUMNODES for any node
*
* Find @size free area aligned to @align in the specified range and node.
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
* RETURNS:
* Found address on success, %0 on failure.
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid)
{
phys_addr_t this_start, this_end, cand;
u64 i;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
end = memblock.current_limit;
for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
cand = round_up(this_start, align);
if (cand < this_end && this_end - cand >= size)
return cand;
}
return 0;
}
/**
* __memblock_find_range_top_down - find free area utility, in top-down
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %MAX_NUMNODES for any node
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
* RETURNS:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid)
{
phys_addr_t this_start, this_end, cand;
u64 i;
for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
@@ -121,9 +148,80 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
if (cand >= this_start)
return cand;
}
return 0;
}
/**
* memblock_find_in_range_node - find free area in given range and node
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %MAX_NUMNODES for any node
*
* Find @size free area aligned to @align in the specified range and node.
*
* When allocation direction is bottom-up, the @start should be greater
* than the end of the kernel image. Otherwise, it will be trimmed. The
* reason is that we want the bottom-up allocation just near the kernel
* image so it is highly likely that the allocated memory and the kernel
* will reside in the same node.
*
* If bottom-up allocation failed, will try to allocate memory top-down.
*
* RETURNS:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
{
int ret;
phys_addr_t kernel_end;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
end = memblock.current_limit;
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
kernel_end = __pa_symbol(_end);
/*
* try bottom-up allocation only when bottom-up mode
* is set and @end is above the kernel image.
*/
if (memblock_bottom_up() && end > kernel_end) {
phys_addr_t bottom_up_start;
/* make sure we will allocate above the kernel */
bottom_up_start = max(start, kernel_end);
/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
size, align, nid);
if (ret)
return ret;
/*
* we always limit bottom-up allocation above the kernel,
* but top-down allocation doesn't have the limit, so
* retrying top-down allocation may succeed when bottom-up
* allocation failed.
*
* bottom-up allocation is expected to be fail very rarely,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
WARN_ONCE(1, "memblock: bottom-up allocation failed, "
"memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid);
}
/**
* memblock_find_in_range - find free area in given range
* @start: start of candidate range
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
* Find @size free area aligned to @align in the specified range.
*
* RETURNS:
* Found address on success, %0 on failure.
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,

View File

@@ -59,6 +59,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"
#include <asm/uaccess.h>
@@ -2968,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
#ifdef CONFIG_SLABINFO
@@ -2997,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
/*
* Conditions under which we can wait for the oom_killer. Those are
* the same conditions tested by the core page allocator
*/
may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
&_memcg, may_oom);
&_memcg, oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
@@ -3151,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
VM_BUG_ON(!is_root_cache(s));
if (num_groups > memcg_limited_groups_array_size) {
int i;
@@ -3412,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
new_cachep = cachep->memcg_params->memcg_caches[idx];
new_cachep = cache_from_memcg_idx(cachep, idx);
if (new_cachep) {
css_put(&memcg->css);
goto out;
@@ -3458,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
for (i = 0; i < memcg_limited_groups_array_size; i++) {
c = s->memcg_params->memcg_caches[i];
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
continue;
@@ -3592,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
if (likely(cachep->memcg_params->memcg_caches[idx])) {
cachep = cachep->memcg_params->memcg_caches[idx];
if (likely(cache_from_memcg_idx(cachep, idx))) {
cachep = cache_from_memcg_idx(cachep, idx);
goto out;
}
@@ -5389,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
{
struct numa_stat {
const char *name;
unsigned int lru_mask;
};
static const struct numa_stat stats[] = {
{ "total", LRU_ALL },
{ "file", LRU_ALL_FILE },
{ "anon", LRU_ALL_ANON },
{ "unevictable", BIT(LRU_UNEVICTABLE) },
};
const struct numa_stat *stat;
int nid;
unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
unsigned long node_nr;
unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
seq_printf(m, "total=%lu", total_nr);
for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
seq_printf(m, " N%d=%lu", nid, node_nr);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
seq_printf(m, "%s=%lu", stat->name, nr);
for_each_node_state(nid, N_MEMORY) {
nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
stat->lru_mask);
seq_printf(m, " N%d=%lu", nid, nr);
}
seq_putc(m, '\n');
}
seq_putc(m, '\n');
file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
seq_printf(m, "file=%lu", file_nr);
for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_FILE);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
struct mem_cgroup *iter;
anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
seq_printf(m, "anon=%lu", anon_nr);
for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_ANON);
seq_printf(m, " N%d=%lu", nid, node_nr);
nr = 0;
for_each_mem_cgroup_tree(iter, memcg)
nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
for_each_node_state(nid, N_MEMORY) {
nr = 0;
for_each_mem_cgroup_tree(iter, memcg)
nr += mem_cgroup_node_nr_lru_pages(
iter, nid, stat->lru_mask);
seq_printf(m, " N%d=%lu", nid, nr);
}
seq_putc(m, '\n');
}
seq_putc(m, '\n');
unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
seq_printf(m, "unevictable=%lu", unevictable_nr);
for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
BIT(LRU_UNEVICTABLE));
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */

View File

@@ -1422,19 +1422,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
if (flags & MF_COUNT_INCREASED)
return 1;
/*
* The lock_memory_hotplug prevents a race with memory hotplug.
* This is a big hammer, a better would be nicer.
*/
lock_memory_hotplug();
/*
* Isolate the page, so that it doesn't get reallocated if it
* was free. This flag should be kept set until the source page
* is freed and PG_hwpoison on it is set.
*/
if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
set_migratetype_isolate(p, true);
/*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
@@ -1455,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
unlock_memory_hotplug();
return ret;
}
@@ -1654,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags)
}
}
/*
* The lock_memory_hotplug prevents a race with memory hotplug.
* This is a big hammer, a better would be nicer.
*/
lock_memory_hotplug();
/*
* Isolate the page, so that it doesn't get reallocated if it
* was free. This flag should be kept set until the source page
* is freed and PG_hwpoison on it is set.
*/
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
set_migratetype_isolate(page, true);
ret = get_any_page(page, pfn, flags);
if (ret < 0)
goto unset;
if (ret) { /* for in-use pages */
unlock_memory_hotplug();
if (ret > 0) { /* for in-use pages */
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
} else { /* for free pages */
} else if (ret == 0) { /* for free pages */
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
@@ -1673,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
unset:
unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}

View File

@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
/*
* This function frees user-level page tables of a process.
*
* Must be called with pagetable lock held.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,

View File

@@ -31,6 +31,7 @@
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <asm/tlbflush.h>
@@ -365,8 +366,7 @@ out_fail:
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long old_pgdat_end_pfn =
pgdat->node_start_pfn + pgdat->node_spanned_pages;
unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
static int __meminit __add_section(int nid, struct zone *zone,
unsigned long phys_start_pfn)
{
int nr_pages = PAGES_PER_SECTION;
int ret;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
ret = sparse_add_one_section(zone, phys_start_pfn);
if (ret < 0)
return ret;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
static void shrink_pgdat_span(struct pglist_data *pgdat,
unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
unsigned long pgdat_end_pfn =
pgdat->node_start_pfn + pgdat->node_spanned_pages;
unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
unsigned long pgdat_end_pfn = p;
unsigned long pfn;
struct mem_section *ms;
int nid = pgdat->node_id;
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
nid = page_to_nid(pfn_to_page(pfn));
nid = pfn_to_nid(pfn);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
/*
/**
* try_online_node - online a node if offlined
*
* called by cpu_up() to online a node without onlined memory.
*/
int mem_online_node(int nid)
int try_online_node(int nid)
{
pg_data_t *pgdat;
int ret;
if (node_online(nid))
return 0;
lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid)
ret = register_one_node(nid);
BUG_ON(ret);
if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
out:
unlock_memory_hotplug();
return ret;
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
}
#endif /* CONFIG_MOVABLE_NODE */
static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
/*
* Memory used by the kernel cannot be hot-removed because Linux
* cannot migrate the kernel pages. When memory hotplug is
* enabled, we should prevent memblock from allocating memory
* for the kernel.
*
* ACPI SRAT records all hotpluggable memory ranges. But before
* SRAT is parsed, we don't know about it.
*
* The kernel image is loaded into memory at very early time. We
* cannot prevent this anyway. So on NUMA system, we set any
* node the kernel resides in as un-hotpluggable.
*
* Since on modern servers, one node could have double-digit
* gigabytes memory, we can assume the memory around the kernel
* image is also un-hotpluggable. So before SRAT is parsed, just
* allocate memory near the kernel image to try the best to keep
* the kernel away from hotpluggable memory.
*/
memblock_set_bottom_up(true);
#else
pr_warn("movable_node option not supported\n");
#endif
return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
* if this is not the case.
*/
ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
is_memblock_offlined_cb);
check_memblock_offlined_cb);
if (ret) {
unlock_memory_hotplug();
BUG();

View File

@@ -1125,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
int source = -1;
int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
@@ -1160,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (!node_isset(dest, tmp))
break;
}
if (source == -1)
if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
@@ -1835,7 +1835,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
int c;
int nid = -1;
int nid = NUMA_NO_NODE;
if (!nnodes)
return numa_node_id();
@@ -1872,11 +1872,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
/*
* Return the bit number of a random bit set in the nodemask.
* (returns -1 if nodemask is empty)
* (returns NUMA_NO_NODE if nodemask is empty)
*/
int node_random(const nodemask_t *maskp)
{
int w, bit = -1;
int w, bit = NUMA_NO_NODE;
w = nodes_weight(*maskp);
if (w)
@@ -2914,62 +2914,45 @@ out:
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
* Convert @pol into a string. If @buffer is too short, truncate the string.
* Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
* longest flag, "relative", and to display at least a few node ids.
*/
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
int l;
nodemask_t nodes;
unsigned short mode;
unsigned short flags = pol ? pol->flags : 0;
nodemask_t nodes = NODE_MASK_NONE;
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
/*
* Sanity check: room for longest mode, flag and some nodes
*/
VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
if (!pol || pol == &default_policy)
mode = MPOL_DEFAULT;
else
if (pol && pol != &default_policy) {
mode = pol->mode;
flags = pol->flags;
}
switch (mode) {
case MPOL_DEFAULT:
nodes_clear(nodes);
break;
case MPOL_PREFERRED:
nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
default:
return -EINVAL;
WARN_ON_ONCE(1);
snprintf(p, maxlen, "unknown");
return;
}
l = strlen(policy_modes[mode]);
if (buffer + maxlen < p + l + 1)
return -ENOSPC;
strcpy(p, policy_modes[mode]);
p += l;
p += snprintf(p, maxlen, policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
if (buffer + maxlen < p + 2)
return -ENOSPC;
*p++ = '=';
p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
}
if (!nodes_empty(nodes)) {
if (buffer + maxlen < p + 2)
return -ENOSPC;
*p++ = ':';
p += snprintf(p, buffer + maxlen - p, ":");
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
return p - buffer;
}

View File

@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
allowed = (totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100;
allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE)
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);

View File

@@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
pmd = pmd_offset(pud, addr);
do {
@@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
newprot, prot_numa);
if (nr_ptes) {
if (nr_ptes == HPAGE_PMD_NR)
pages++;
if (nr_ptes == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
continue;
}
}
@@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pages += this_pages;
} while (pmd++, addr = next, addr != end);
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
}

View File

@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
unsigned long i, start_aligned, end_aligned;
int order = ilog2(BITS_PER_LONG);
int order;
start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
end_aligned = end & ~(BITS_PER_LONG - 1);
while (start < end) {
order = min(MAX_ORDER - 1UL, __ffs(start));
if (end_aligned <= start_aligned) {
for (i = start; i < end; i++)
__free_pages_bootmem(pfn_to_page(i), 0);
while (start + (1UL << order) > end)
order--;
return;
__free_pages_bootmem(pfn_to_page(start), order);
start += (1UL << order);
}
for (i = start; i < start_aligned; i++)
__free_pages_bootmem(pfn_to_page(i), 0);
for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
__free_pages_bootmem(pfn_to_page(i), order);
for (i = end_aligned; i < end; i++)
__free_pages_bootmem(pfn_to_page(i), 0);
}
static unsigned long __init __free_memory_core(phys_addr_t start,

View File

@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
allowed = totalram_pages * sysctl_overcommit_ratio / 100;
allowed = vm_commit_limit();
/*
* Reserve some 3% for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover

View File

@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly;
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled))
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
{
int current_order = page_order(page);
/*
* When borrowing from MIGRATE_CMA, we need to release the excess
* buddy pages to CMA itself.
*/
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
list_del(&page->lru);
rmv_page_order(page);
/*
* Borrow the excess buddy pages as well, irrespective
* of whether we stole freepages, or took ownership of
* the pageblock or not.
*
* Exception: When borrowing from MIGRATE_CMA, release
* the excess buddy pages to CMA itself.
*/
expand(zone, page, order, current_order, area,
is_migrate_cma(migratetype)
? migratetype : start_migratetype);
new_type);
trace_mm_page_alloc_extfrag(page, order,
current_order, start_migratetype, migratetype,
new_type == start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype, new_type);
return page;
}
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
* If the zonelist cache is present in the passed in zonelist, then
* If the zonelist cache is present in the passed zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_MEMORY].)
*
@@ -2593,7 +2587,7 @@ rebalance:
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_gfp_allowed(gfp_mask)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
return ffz(~size);
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
* Check if a pageblock contains reserved pages
*/
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
*/
zone->pageset = &boot_pageset;
if (zone->present_pages)
if (populated_zone(zone))
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
zone->name, zone->present_pages,
zone_batchsize(zone));
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (zone->present_pages) {
if (populated_zone(zone)) {
node_set_state(nid, N_HIGH_MEMORY);
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
zone_type <= ZONE_NORMAL)

View File

@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
pgoff_t prev_offset;
/*
* start of file
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
* trivial case: (offset - prev_offset) == 1
* unaligned reads: (offset - prev_offset) == 0
*/
if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
if (offset - prev_offset <= 1UL)
goto initial_readahead;
/*
@@ -569,7 +573,7 @@ static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
pgoff_t index, unsigned long nr)
{
if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
if (!mapping || !mapping->a_ops)
return -EINVAL;
force_page_cache_readahead(mapping, filp, index, nr);

View File

@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
VM_BUG_ON(!mutex_is_locked(&slab_mutex));
for_each_memcg_cache_index(i) {
c = cache_from_memcg(cachep, i);
c = cache_from_memcg_idx(cachep, i);
if (c)
/* return value determined by the parent cache only */
__do_tune_cpucache(c, limit, batchcount, shared, gfp);

View File

@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
if (!s->memcg_params)
return NULL;
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
return NULL;
}

View File

@@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
return;
for_each_memcg_cache_index(i) {
c = cache_from_memcg(s, i);
c = cache_from_memcg_idx(s, i);
if (!c)
continue;

View File

@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* through the descendants with best-effort propagation.
*/
for_each_memcg_cache_index(i) {
struct kmem_cache *c = cache_from_memcg(s, i);
struct kmem_cache *c = cache_from_memcg_idx(s, i);
if (c)
attribute->store(c, buf, len);
}

View File

@@ -590,33 +590,32 @@ void __init sparse_init(void)
#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
unsigned long nr_pages)
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
/* This will make the necessary allocations eventually. */
return sparse_mem_map_populate(pnum, nid);
}
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
static void __kfree_section_memmap(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + nr_pages);
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
vmemmap_free(start, end);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + nr_pages);
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
vmemmap_free(start, end);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
static struct page *__kmalloc_section_memmap(void)
{
struct page *page, *ret;
unsigned long memmap_size = sizeof(struct page) * nr_pages;
unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
@@ -634,28 +633,30 @@ got_map_ptr:
return ret;
}
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
unsigned long nr_pages)
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
return __kmalloc_section_memmap(nr_pages);
return __kmalloc_section_memmap();
}
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
static void __kfree_section_memmap(struct page *memmap)
{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
get_order(sizeof(struct page) * nr_pages));
get_order(sizeof(struct page) * PAGES_PER_SECTION));
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
unsigned long magic;
unsigned long magic, nr_pages;
struct page *page = virt_to_page(memmap);
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages)
int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
if (!memmap)
return -ENOMEM;
usemap = __kmalloc_section_usemap();
if (!usemap) {
__kfree_section_memmap(memmap, nr_pages);
__kfree_section_memmap(memmap);
return -ENOMEM;
}
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
memset(memmap, 0, sizeof(struct page) * nr_pages);
memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
ms->section_mem_map |= SECTION_MARKED_PRESENT;
@@ -729,7 +729,7 @@ out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0) {
kfree(usemap);
__kfree_section_memmap(memmap, nr_pages);
__kfree_section_memmap(memmap);
}
return ret;
}
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
static void free_section_usemap(struct page *memmap, unsigned long *usemap)
{
struct page *usemap_page;
unsigned long nr_pages;
if (!usemap)
return;
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
kfree(usemap);
if (memmap)
__kfree_section_memmap(memmap, PAGES_PER_SECTION);
__kfree_section_memmap(memmap);
return;
}
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
* on the section which has pgdat at boot time. Just keep it as is now.
*/
if (memmap) {
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
free_map_bootmem(memmap, nr_pages);
}
if (memmap)
free_map_bootmem(memmap);
}
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)

View File

@@ -707,7 +707,7 @@ noswap:
return (swp_entry_t) {0};
}
/* The only caller of this function is now susupend routine */
/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
}
/*
* Caller has made sure that the swapdevice corresponding to entry
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
*/
void swap_free(swp_entry_t entry)
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page)
* original page might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
* Hibration suspends storage while it is writing the image
* Hibernation suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* some architectures (e.g. x86_32 with PAE) we might catch a glimpse
* of unmatched parts which look like swp_pte, so unuse_pte must
* recheck under pte lock. Scanning without pte lock lets it be
* preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
* preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
*/
pte = pte_offset_map(pmd, addr);
do {
@@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->cluster_info = NULL;
p->flags = 0;
frontswap_map = frontswap_map_get(p);
frontswap_map_set(p, NULL);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
frontswap_invalidate_area(type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
vfree(swap_map);
vfree(cluster_info);
vfree(frontswap_map);
/* Destroy swap account informatin */
/* Destroy swap account information */
swap_cgroup_swapoff(type);
inode = mapping->host;
@@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
/*
* We are fortunate that although vmalloc_to_page uses pte_offset_map,
* no architecture is using highmem pages for kernel pagetables: so it
* will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
* no architecture is using highmem pages for kernel page tables: so it
* will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
*/
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;

View File

@@ -7,6 +7,9 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
return mapping;
}
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
unsigned long vm_commit_limit(void)
{
return ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;
}
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);

View File

@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
/*
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
retry:
spin_lock(&vmap_area_lock);
/*
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, const void *caller)
pgprot_t prot, int node)
{
const int order = 0;
struct page **pages;
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, caller);
PAGE_KERNEL, node, area->caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
area->caller = caller;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
if (node < 0)
if (node == NUMA_NO_NODE)
page = alloc_page(tmp_mask);
else
page = alloc_pages_node(node, tmp_mask, order);
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!area)
goto fail;
addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
addr = __vmalloc_area_node(area, gfp_mask, prot, node);
if (!addr)
goto fail;
return NULL;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
clear_vm_uninitialized_flag(area);
/*
* A ref_count = 3 is needed because the vm_struct and vmap_area
* structures allocated in the __get_vm_area_node() function contain
* references to the virtual address of the vmalloc'ed block.
* A ref_count = 2 is needed because vm_struct allocated in
* __get_vm_area_node() contains a reference to the virtual address of
* the vmalloc'ed block.
*/
kmemleak_alloc(addr, real_size, 3, gfp_mask);
kmemleak_alloc(addr, real_size, 2, gfp_mask);
return addr;
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
if (v->flags & VM_UNINITIALIZED)
return;
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p)
struct vmap_area *va = p;
struct vm_struct *v;
if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
/*
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
* behalf of vmap area is being tear down or vm_map_ram allocation.
*/
if (!(va->flags & VM_VM_AREA))
return 0;
if (!(va->flags & VM_VM_AREA)) {
seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
return 0;
}
v = va->vm;
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
if (v->flags & VM_UNINITIALIZED)
return 0;
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);

View File

@@ -812,6 +812,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
"numa_huge_pte_updates",
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu)
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
static void vmstat_cpu_dead(int node)
{
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
if (cpu_to_node(cpu) == node)
goto end;
node_clear_state(node, N_CPU);
end:
put_online_cpus();
}
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
refresh_zone_stat_thresholds();
vmstat_cpu_dead(cpu_to_node(cpu));
break;
default:
break;
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void)
register_cpu_notifier(&vmstat_notifier);
for_each_online_cpu(cpu)
get_online_cpus();
for_each_online_cpu(cpu) {
start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
}
put_online_cpus();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);

View File

@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
if (!entry)
return NULL;
entry->refcount = 1;
RB_CLEAR_NODE(&entry->rbnode);
return entry;
}
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
kmem_cache_free(zswap_entry_cache, entry);
}
/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
entry->refcount++;
}
/* caller must hold the tree lock */
static int zswap_entry_put(struct zswap_entry *entry)
{
entry->refcount--;
return entry->refcount;
}
/*********************************
* rbtree functions
**********************************/
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}
static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
if (!RB_EMPTY_NODE(&entry->rbnode)) {
rb_erase(&entry->rbnode, root);
RB_CLEAR_NODE(&entry->rbnode);
}
}
/*
* Carries out the common pattern of freeing and entry's zsmalloc allocation,
* freeing the entry itself, and decrementing the number of stored pages.
*/
static void zswap_free_entry(struct zswap_tree *tree,
struct zswap_entry *entry)
{
zbud_free(tree->pool, entry->handle);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
zswap_pool_pages = zbud_get_pool_size(tree->pool);
}
/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
entry->refcount++;
}
/* caller must hold the tree lock
* remove from the tree and free it, if nobody reference the entry
*/
static void zswap_entry_put(struct zswap_tree *tree,
struct zswap_entry *entry)
{
int refcount = --entry->refcount;
BUG_ON(refcount < 0);
if (refcount == 0) {
zswap_rb_erase(&tree->rbroot, entry);
zswap_free_entry(tree, entry);
}
}
/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
pgoff_t offset)
{
struct zswap_entry *entry = NULL;
entry = zswap_rb_search(root, offset);
if (entry)
zswap_entry_get(entry);
return entry;
}
/*********************************
* per-cpu code
**********************************/
@@ -368,18 +411,6 @@ static bool zswap_is_full(void)
zswap_pool_pages);
}
/*
* Carries out the common pattern of freeing and entry's zsmalloc allocation,
* freeing the entry itself, and decrementing the number of stored pages.
*/
static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
{
zbud_free(tree->pool, entry->handle);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
zswap_pool_pages = zbud_get_pool_size(tree->pool);
}
/*********************************
* writeback code
**********************************/
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
enum zswap_get_swap_ret {
ZSWAP_SWAPCACHE_NEW,
ZSWAP_SWAPCACHE_EXIST,
ZSWAP_SWAPCACHE_NOMEM
ZSWAP_SWAPCACHE_FAIL,
};
/*
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret {
* added to the swap cache, and returned in retpage.
*
* If success, the swap cache page is returned in retpage
* Returns 0 if page was already in the swap cache, page is not locked
* Returns 1 if the new page needs to be populated, page is locked
* Returns <0 on error
* Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
* Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
* the new page is added to swapcache and locked
* Returns ZSWAP_SWAPCACHE_FAIL on error
*/
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
if (new_page)
page_cache_release(new_page);
if (!found_page)
return ZSWAP_SWAPCACHE_NOMEM;
return ZSWAP_SWAPCACHE_FAIL;
*retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
struct page *page;
u8 *src, *dst;
unsigned int dlen;
int ret, refcount;
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
/* find and ref zswap entry */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
entry = zswap_entry_find_get(&tree->rbroot, offset);
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
return 0;
}
zswap_entry_get(entry);
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
ret = -ENOMEM;
goto fail;
case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
case ZSWAP_SWAPCACHE_EXIST:
/* page is already in the swap cache, ignore for now */
page_cache_release(page);
ret = -EEXIST;
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
SetPageUptodate(page);
}
/* move it to the tail of the inactive list after end_writeback */
SetPageReclaim(page);
/* start writeback */
__swap_writepage(page, &wbc, end_swap_bio_write);
page_cache_release(page);
zswap_written_back_pages++;
spin_lock(&tree->lock);
/* drop local reference */
zswap_entry_put(entry);
/* drop the initial reference from entry creation */
refcount = zswap_entry_put(entry);
zswap_entry_put(tree, entry);
/*
* There are three possible values for refcount here:
* (1) refcount is 1, load is in progress, unlink from rbtree,
* load will free
* (2) refcount is 0, (normal case) entry is valid,
* remove from rbtree and free entry
* (3) refcount is -1, invalidate happened during writeback,
* free entry
*/
if (refcount >= 0) {
/* no invalidate yet, remove from rbtree */
rb_erase(&entry->rbnode, &tree->rbroot);
}
* There are two possible situations for entry here:
* (1) refcount is 1(normal case), entry is valid and on the tree
* (2) refcount is 0, entry is freed and not on the tree
* because invalidate happened during writeback
* search the tree and free the entry if find entry
*/
if (entry == zswap_rb_search(&tree->rbroot, offset))
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
if (refcount <= 0) {
/* free the entry */
zswap_free_entry(tree, entry);
return 0;
}
return -EAGAIN;
goto end;
/*
* if we get here due to ZSWAP_SWAPCACHE_EXIST
* a load may happening concurrently
* it is safe and okay to not free the entry
* if we free the entry in the following put
* it it either okay to return !0
*/
fail:
spin_lock(&tree->lock);
zswap_entry_put(entry);
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
end:
return ret;
}
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
if (ret == -EEXIST) {
zswap_duplicate_entry++;
/* remove from rbtree */
rb_erase(&dupentry->rbnode, &tree->rbroot);
if (!zswap_entry_put(dupentry)) {
/* free */
zswap_free_entry(tree, dupentry);
}
zswap_rb_erase(&tree->rbroot, dupentry);
zswap_entry_put(tree, dupentry);
}
} while (ret == -EEXIST);
spin_unlock(&tree->lock);
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
struct zswap_entry *entry;
u8 *src, *dst;
unsigned int dlen;
int refcount, ret;
int ret;
/* find */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
entry = zswap_entry_find_get(&tree->rbroot, offset);
if (!entry) {
/* entry was written back */
spin_unlock(&tree->lock);
return -1;
}
zswap_entry_get(entry);
spin_unlock(&tree->lock);
/* decompress */
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
BUG_ON(ret);
spin_lock(&tree->lock);
refcount = zswap_entry_put(entry);
if (likely(refcount)) {
spin_unlock(&tree->lock);
return 0;
}
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
/*
* We don't have to unlink from the rbtree because
* zswap_writeback_entry() or zswap_frontswap_invalidate page()
* has already done this for us if we are the last reference.
*/
/* free */
zswap_free_entry(tree, entry);
return 0;
}
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
int refcount;
/* find */
spin_lock(&tree->lock);
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
}
/* remove from rbtree */
rb_erase(&entry->rbnode, &tree->rbroot);
zswap_rb_erase(&tree->rbroot, entry);
/* drop the initial reference from entry creation */
refcount = zswap_entry_put(entry);
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
if (refcount) {
/* writeback in progress, writeback will free */
return;
}
/* free */
zswap_free_entry(tree, entry);
}
/* frees all zswap entries for the given swap type */
@@ -797,11 +803,8 @@ static void zswap_frontswap_invalidate_area(unsigned type)
/* walk the tree and free everything */
spin_lock(&tree->lock);
rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
zbud_free(tree->pool, entry->handle);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
}
rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
zswap_free_entry(tree, entry);
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);