Merge branch 'akpm' (more incoming from Andrew)

Merge second patch-bomb from Andrew Morton:

 - A little DM fix

 - the MM queue

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (154 commits)
  ksm: allocate roots when needed
  mm: cleanup "swapcache" in do_swap_page
  mm,ksm: swapoff might need to copy
  mm,ksm: FOLL_MIGRATION do migration_entry_wait
  ksm: shrink 32-bit rmap_item back to 32 bytes
  ksm: treat unstable nid like in stable tree
  ksm: add some comments
  tmpfs: fix mempolicy object leaks
  tmpfs: fix use-after-free of mempolicy object
  mm/fadvise.c: drain all pagevecs if POSIX_FADV_DONTNEED fails to discard all pages
  mm: export mmu notifier invalidates
  mm: accelerate mm_populate() treatment of THP pages
  mm: use long type for page counts in mm_populate() and get_user_pages()
  mm: accurately document nr_free_*_pages functions with code comments
  HWPOISON: change order of error_states[]'s elements
  HWPOISON: fix misjudgement of page_action() for errors on mlocked pages
  memcg: stop warning on memcg_propagate_kmem
  net: change type of virtio_chan->p9_max_pages
  vmscan: change type of vm_total_pages to unsigned long
  fs/nfsd: change type of max_delegations, nfsd_drc_max_mem and nfsd_drc_mem_used
  ...
This commit is contained in:
Linus Torvalds
2013-02-23 17:50:35 -08:00
113 changed files with 4443 additions and 1667 deletions

View File

@@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Remove the physical range [start, start + size) from the zone that owns
 * its first page frame.
 *
 * Both values are converted to page-frame units by shifting with PAGE_SHIFT.
 * Returns whatever __remove_pages() returns.
 */
int arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long npages = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(first_pfn));

	return __remove_pages(zone, first_pfn, npages);
}
#endif
#endif
/*

View File

@@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
#define PAGE_INUSE 0xFD
/*
 * Release the 2^order pages starting at @page.
 *
 * Pages that carry the reserved flag are treated as bootmem pages: the flag
 * is cleared and the pages are released either one by one through
 * put_page_bootmem() (when page->lru.next holds the SECTION_INFO or
 * MIX_SECTION_INFO magic) or in one go through __free_pages_bootmem().
 * All other pages go back through free_pages().
 *
 * For bootmem pages the owning zone's present_pages and the global
 * totalram_pages are increased by the number of pages released.
 */
static void __meminit free_pagetable(struct page *page, int order)
{
	struct zone *zone;
	struct page *cursor = page;
	bool bootmem = false;
	unsigned long magic;
	unsigned int nr_pages = 1 << order;
	unsigned int i;

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		__ClearPageReserved(page);
		bootmem = true;

		magic = (unsigned long)page->lru.next;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			/*
			 * Walk with a private cursor and counter.  Looping
			 * with "while (nr_pages--) ...(page++)" would leave
			 * nr_pages wrapped around to UINT_MAX and page
			 * pointing past the range, corrupting the accounting
			 * done below.
			 */
			for (i = 0; i < nr_pages; i++)
				put_page_bootmem(cursor++);
		} else
			__free_pages_bootmem(page, order);
	} else
		free_pages((unsigned long)page_address(page), order);

	/*
	 * SECTION_INFO pages and MIX_SECTION_INFO pages
	 * are all allocated by bootmem.
	 */
	if (bootmem) {
		/* page and nr_pages still describe the original range here. */
		zone = page_zone(page);
		zone_span_writelock(zone);
		zone->present_pages += nr_pages;
		zone_span_writeunlock(zone);
		totalram_pages += nr_pages;
	}
}
/*
 * Free the PTE table referenced by @pmd if none of its PTRS_PER_PTE slots
 * holds a non-zero entry any more; otherwise do nothing.
 *
 * @pte_start: first entry of the PTE table to inspect.
 * @pmd:       the PMD entry pointing at that table; it is cleared under
 *             init_mm.page_table_lock after the table page is released.
 */
static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	int idx = 0;

	/* Any entry that is still populated keeps the whole table alive. */
	while (idx < PTRS_PER_PTE) {
		if (pte_val(pte_start[idx]))
			return;
		idx++;
	}

	/* free a pte table */
	free_pagetable(pmd_page(*pmd), 0);

	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}
/*
 * Free the PMD table referenced by @pud if none of its PTRS_PER_PMD slots
 * holds a non-zero entry any more; otherwise do nothing.
 *
 * @pmd_start: first entry of the PMD table to inspect.
 * @pud:       the PUD entry pointing at that table; it is cleared under
 *             init_mm.page_table_lock after the table page is released.
 */
static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	int idx = 0;

	/* Any entry that is still populated keeps the whole table alive. */
	while (idx < PTRS_PER_PMD) {
		if (pmd_val(pmd_start[idx]))
			return;
		idx++;
	}

	/* free a pmd table */
	free_pagetable(pud_page(*pud), 0);

	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}
/*
 * Free the PUD table referenced by @pgd if none of its PTRS_PER_PUD slots
 * holds a non-zero entry any more.
 *
 * @pud_start: first entry of the PUD table to inspect.
 * @pgd:       the PGD entry pointing at that table; it is cleared under
 *             init_mm.page_table_lock after the table page is released.
 *
 * Returns true if the pgd entry was changed (table freed and entry cleared),
 * false if the table is still in use and nothing was touched.
 */
static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
{
	int idx = 0;

	/* Any entry that is still populated keeps the whole table alive. */
	while (idx < PTRS_PER_PUD) {
		if (pud_val(pud_start[idx]))
			return false;
		idx++;
	}

	/* free a pud table */
	free_pagetable(pgd_page(*pgd), 0);

	spin_lock(&init_mm.page_table_lock);
	pgd_clear(pgd);
	spin_unlock(&init_mm.page_table_lock);

	return true;
}
/*
* Tear down the PTE-level mappings covering the virtual range [addr, end).
*
* pte_start: first entry of the PTE table that maps addr.
* addr/end:  virtual range to unmap; it may start or end inside a page.
* direct:    true when called for the direct mapping (the mapped pages are
*            not freed and the PG_LEVEL_4K page count is adjusted), false
*            when called for vmemmap (the mapped pages are freed).
*
* The emptied PTE table itself is not freed here; see the note before
* flush_tlb_all() below.
*/
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
bool direct)
{
unsigned long next, pages = 0;
pte_t *pte;
void *page_addr;
phys_addr_t phys_addr;
pte = pte_start + pte_index(addr);
for (; addr < end; addr = next, pte++) {
/* Step to the next page boundary, but never beyond end. */
next = (addr + PAGE_SIZE) & PAGE_MASK;
if (next > end)
next = end;
if (!pte_present(*pte))
continue;
/*
* We mapped [0,1G) memory as identity mapping when
* initializing, in arch/x86/kernel/head_64.S. These
* pagetables cannot be removed.
*
* NOTE(review): this early return also skips the flush_tlb_all()
* and update_page_count() at the end of the function, even if
* earlier iterations already cleared entries - confirm intended.
*/
phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
if (phys_addr < (phys_addr_t)0x40000000)
return;
if (IS_ALIGNED(addr, PAGE_SIZE) &&
IS_ALIGNED(next, PAGE_SIZE)) {
/*
* Do not free direct mapping pages since they were
* freed when offlining, or simply not in use.
*/
if (!direct)
free_pagetable(pte_page(*pte), 0);
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, pte);
spin_unlock(&init_mm.page_table_lock);
/* For non-direct mapping, pages means nothing. */
pages++;
} else {
/*
* If we are here, we are freeing vmemmap pages since
* direct mapped memory ranges to be freed are aligned.
*
* If we are not removing the whole page, it means
* other page structs in this page are being used and
* we cannot remove them. So fill the unused page_structs
* with 0xFD, and remove the page when it is wholly
* filled with 0xFD.
*/
memset((void *)addr, PAGE_INUSE, next - addr);
page_addr = page_address(pte_page(*pte));
if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
free_pagetable(pte_page(*pte), 0);
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, pte);
spin_unlock(&init_mm.page_table_lock);
}
}
}
/* Call free_pte_table() in remove_pmd_table(). */
flush_tlb_all();
/* Only the direct mapping is tracked in the per-level page counts. */
if (direct)
update_page_count(PG_LEVEL_4K, -pages);
}
/*
* Tear down the PMD-level mappings covering the virtual range [addr, end).
*
* pmd_start: first entry of the PMD table that maps addr.
* addr/end:  virtual range to unmap.
* direct:    true for the direct mapping (mapped pages are not freed and
*            the PG_LEVEL_2M page count is adjusted), false for vmemmap.
*
* Large (pmd_large) entries are handled here directly; for all other present
* entries the walk descends into remove_pte_table() and then lets
* free_pte_table() release the PTE table if it has become empty.
*/
static void __meminit
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
bool direct)
{
unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
void *page_addr;
pmd = pmd_start + pmd_index(addr);
for (; addr < end; addr = next, pmd++) {
next = pmd_addr_end(addr, end);
if (!pmd_present(*pmd))
continue;
if (pmd_large(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE)) {
/* Whole large page: free it unless it is direct-mapped. */
if (!direct)
free_pagetable(pmd_page(*pmd),
get_order(PMD_SIZE));
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
pages++;
} else {
/* If here, we are freeing vmemmap pages. */
/*
* Mark the removed part with PAGE_INUSE (0xFD) and free the
* large page only once all PMD_SIZE bytes carry that marker.
*/
memset((void *)addr, PAGE_INUSE, next - addr);
page_addr = page_address(pmd_page(*pmd));
if (!memchr_inv(page_addr, PAGE_INUSE,
PMD_SIZE)) {
free_pagetable(pmd_page(*pmd),
get_order(PMD_SIZE));
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
}
}
continue;
}
/* Not a large page: descend into the PTE table below this entry. */
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
remove_pte_table(pte_base, addr, next, direct);
free_pte_table(pte_base, pmd);
}
/* Call free_pmd_table() in remove_pud_table(). */
if (direct)
update_page_count(PG_LEVEL_2M, -pages);
}
/*
* Tear down the PUD-level mappings covering the virtual range [addr, end).
*
* pud_start: first entry of the PUD table that maps addr.
* addr/end:  virtual range to unmap.
* direct:    true for the direct mapping (mapped pages are not freed and
*            the PG_LEVEL_1G page count is adjusted), false for vmemmap.
*
* Large (pud_large) entries are handled here directly; for all other present
* entries the walk descends into remove_pmd_table() and then lets
* free_pmd_table() release the PMD table if it has become empty.
*/
static void __meminit
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
bool direct)
{
unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
void *page_addr;
pud = pud_start + pud_index(addr);
for (; addr < end; addr = next, pud++) {
next = pud_addr_end(addr, end);
if (!pud_present(*pud))
continue;
if (pud_large(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE)) {
/* Whole large page: free it unless it is direct-mapped. */
if (!direct)
free_pagetable(pud_page(*pud),
get_order(PUD_SIZE));
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
spin_unlock(&init_mm.page_table_lock);
pages++;
} else {
/* If here, we are freeing vmemmap pages. */
/*
* Mark the removed part with PAGE_INUSE (0xFD) and free the
* large page only once all PUD_SIZE bytes carry that marker.
*/
memset((void *)addr, PAGE_INUSE, next - addr);
page_addr = page_address(pud_page(*pud));
if (!memchr_inv(page_addr, PAGE_INUSE,
PUD_SIZE)) {
free_pagetable(pud_page(*pud),
get_order(PUD_SIZE));
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
spin_unlock(&init_mm.page_table_lock);
}
}
continue;
}
/* Not a large page: descend into the PMD table below this entry. */
pmd_base = (pmd_t *)pud_page_vaddr(*pud);
remove_pmd_table(pmd_base, addr, next, direct);
free_pmd_table(pmd_base, pud);
}
if (direct)
update_page_count(PG_LEVEL_1G, -pages);
}
/*
 * Remove the kernel page tables that map the virtual range [start, end).
 *
 * @start:  first virtual address of the range.
 * @end:    virtual address one past the end of the range.
 * @direct: passed down to remove_pud_table(); true for the direct mapping,
 *          false for vmemmap.
 *
 * Walks the range one PGD entry at a time, removes the lower levels through
 * remove_pud_table() and frees PUD tables that became empty.  If any pgd
 * entry was cleared, the change is propagated with sync_global_pgds() over
 * the whole original range, and the TLB is flushed at the end.
 */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	unsigned long addr, next;
	pgd_t *pgd;
	pud_t *pud;
	bool pgd_changed = false;

	/*
	 * Iterate with a separate cursor so that start keeps its original
	 * value: it is still needed for sync_global_pgds() after the loop.
	 * Advancing start itself would leave it equal to end and make the
	 * synced range [end, end - 1].
	 */
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		pud = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud, addr, next, direct);
		if (free_pud_table(pud, pgd))
			pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(start, end - 1);

	flush_tlb_all();
}
/*
 * Remove the page tables backing the nr_pages struct page entries that
 * start at @memmap.  The range is handed to remove_pagetable() as a
 * non-direct (vmemmap) mapping.
 */
void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	struct page *memmap_end = memmap + nr_pages;

	remove_pagetable((unsigned long)memmap, (unsigned long)memmap_end,
			 false);
}
/*
 * Remove the direct mapping of the physical range [start, end).
 *
 * Both bounds are translated with __va() and the resulting virtual range is
 * handed to remove_pagetable() as a direct mapping.
 */
static void __meminit
kernel_physical_mapping_remove(unsigned long start, unsigned long end)
{
	unsigned long vstart = (unsigned long)__va(start);
	unsigned long vend = (unsigned long)__va(end);

	remove_pagetable(vstart, vend, true);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Remove the physical range [start, start + size): first drop its direct
 * mapping, then remove its pages from the zone that owns the first page
 * frame.
 *
 * Returns the result of __remove_pages(); a non-zero result additionally
 * triggers a one-time warning.
 */
int __ref arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long npages = size >> PAGE_SHIFT;
	/* The zone is looked up before the direct mapping is torn down. */
	struct zone *zone = page_zone(pfn_to_page(first_pfn));
	int err;

	kernel_physical_mapping_remove(start, start + size);

	err = __remove_pages(zone, first_pfn, npages);
	WARN_ON_ONCE(err);

	return err;
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
@@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
return 0;
}
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
/*
 * Register with get_page_bootmem() every page involved in mapping the
 * memmap range of @size struct page entries starting at @start_page.
 *
 * @section_nr: section number passed through to get_page_bootmem().
 * @start_page: first struct page of the memmap range.
 * @size:       number of struct page entries in the range.
 *
 * Pages referenced by pgd/pud (and pmd, on the small-page path) entries are
 * registered as MIX_SECTION_INFO; the pages that finally back the memmap
 * are registered as SECTION_INFO.
 */
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long size)
{
	unsigned long cur = (unsigned long)start_page;
	unsigned long limit = (unsigned long)(start_page + size);
	unsigned long next;

	for (; cur < limit; cur = next) {
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;

		/* Default step is one page; only the PSE path overrides it. */
		next = (cur + PAGE_SIZE) & PAGE_MASK;

		pgd = pgd_offset_k(cur);
		if (pgd_none(*pgd))
			continue;
		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);

		pud = pud_offset(pgd, cur);
		if (pud_none(*pud))
			continue;
		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);

		if (cpu_has_pse) {
			unsigned int remaining;
			struct page *page;

			/* Advance by a whole PMD range, capped at limit. */
			next = pmd_addr_end(cur, limit);

			pmd = pmd_offset(pud, cur);
			if (pmd_none(*pmd))
				continue;

			/* Register every page covered by this PMD entry. */
			page = pmd_page(*pmd);
			for (remaining = 1 << (get_order(PMD_SIZE));
			     remaining; remaining--)
				get_page_bootmem(section_nr, page++,
						 SECTION_INFO);
		} else {
			pte_t *pte;

			pmd = pmd_offset(pud, cur);
			if (pmd_none(*pmd))
				continue;
			get_page_bootmem(section_nr, pmd_page(*pmd),
					 MIX_SECTION_INFO);

			pte = pte_offset_kernel(pmd, cur);
			if (pte_none(*pte))
				continue;
			get_page_bootmem(section_nr, pte_page(*pte),
					 SECTION_INFO);
		}
	}
}
#endif
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {

View File

@@ -56,7 +56,7 @@ early_param("numa", numa_setup);
/*
* apicid, cpu, node mappings
*/
s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
void __cpuinit numa_set_node(int cpu, int node)
void numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node)
set_cpu_numa_node(cpu, node);
}
void __cpuinit numa_clear_node(int cpu)
void numa_clear_node(int cpu)
{
numa_set_node(cpu, NUMA_NO_NODE);
}
@@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
pr_err("Cannot find %zu bytes in node %d\n",
nd_size, nid);
pr_err("Cannot find %zu bytes in any node\n", nd_size);
return;
}
nd = __va(nd_pa);
@@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE);
nodes_clear(numa_nodes_parsed);
/*
* Do not clear numa_nodes_parsed or zero numa_meminfo here, because
* SRAT was parsed earlier in early_parse_srat().
*/
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
numa_reset_distance();

View File

@@ -529,21 +529,13 @@ out_unlock:
return do_split;
}
static int split_large_page(pte_t *kpte, unsigned long address)
int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
{
unsigned long pfn, pfninc = 1;
unsigned int i, level;
pte_t *pbase, *tmp;
pte_t *tmp;
pgprot_t ref_prot;
struct page *base;
if (!debug_pagealloc)
spin_unlock(&cpa_lock);
base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
if (!debug_pagealloc)
spin_lock(&cpa_lock);
if (!base)
return -ENOMEM;
struct page *base = virt_to_page(pbase);
spin_lock(&pgd_lock);
/*
@@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* up for us already:
*/
tmp = lookup_address(address, &level);
if (tmp != kpte)
goto out_unlock;
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
}
pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
/*
@@ -601,21 +594,31 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* going on.
*/
__flush_tlb_all();
base = NULL;
out_unlock:
/*
* If we dropped out via the lookup_address check under
* pgd_lock then stick the page back into the pool:
*/
if (base)
__free_page(base);
spin_unlock(&pgd_lock);
return 0;
}
/*
 * Allocate one page to serve as the new page table and let
 * __split_large_page() split the large mapping at @address.
 *
 * Returns -ENOMEM if the page cannot be allocated, 0 otherwise.
 */
static int split_large_page(pte_t *kpte, unsigned long address)
{
	struct page *pt_page;

	/* cpa_lock is dropped around the allocation unless debug_pagealloc. */
	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);
	pt_page = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
	if (!debug_pagealloc)
		spin_lock(&cpa_lock);

	if (pt_page == NULL)
		return -ENOMEM;

	/* A non-zero result means the page was not used; hand it back. */
	if (__split_large_page(kpte, address, (pte_t *)page_address(pt_page)))
		__free_page(pt_page);

	return 0;
}
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
int primary)
{

View File

@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
static inline int save_add_info(void) {return 0;}
#endif
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
 * Decide whether the SRAT memory range [start, end) of @node has to be
 * recorded in movablemem_map, and record it if so.
 *
 * @node:         NUMA node the range belongs to.
 * @start, @end:  physical bounds of the range; converted to page frames
 *                with PFN_DOWN() and PFN_UP() respectively.
 * @hotpluggable: non-zero if SRAT marks the range as hot-pluggable.
 */
static void __init
handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);
	int overlap, i;

	/*
	 * For movablemem_map=acpi:
	 *
	 * SRAT: |_____| |_____| |_________| |_________| ......
	 * node id: 0 1 1 2
	 * hotpluggable: n y y n
	 * movablemem_map: |_____| |_________|
	 *
	 * Using movablemem_map, we can prevent memblock from allocating memory
	 * on ZONE_MOVABLE at boot time.
	 *
	 * Before parsing SRAT, memblock has already reserved some memory
	 * ranges for other purposes, such as for the kernel image. We cannot
	 * prevent the kernel from using this memory, so we need to exclude it
	 * even if it is hotpluggable.
	 * Furthermore, to ensure the kernel has enough memory to boot, we make
	 * all the memory on the node which the kernel resides in
	 * un-hotpluggable.
	 */
	if (hotpluggable && movablemem_map.acpi) {
		/* Exclude ranges reserved by memblock. */
		struct memblock_type *rgn = &memblock.reserved;

		for (i = 0; i < rgn->cnt; i++) {
			/*
			 * If the memory range overlaps the memory reserved by
			 * memblock, then the kernel resides in this node.
			 */
			if (end > rgn->regions[i].base &&
			    start < rgn->regions[i].base +
				    rgn->regions[i].size) {
				node_set(node,
					 movablemem_map.numa_nodes_kernel);
				return;
			}
		}

		/*
		 * If the kernel resides in this node, then the whole node
		 * should not be hotpluggable.
		 */
		if (!node_isset(node, movablemem_map.numa_nodes_kernel)) {
			insert_movablemem_map(start_pfn, end_pfn);

			/*
			 * numa_nodes_hotplug nodemask represents which nodes
			 * are put into movablemem_map.map[].
			 */
			node_set(node, movablemem_map.numa_nodes_hotplug);
		}
		return;
	}

	/*
	 * For movablemem_map=nn[KMG]@ss[KMG]:
	 *
	 * SRAT: |_____| |_____| |_________| |_________| ......
	 * node id: 0 1 1 2
	 * user specified: |__| |___|
	 * movablemem_map: |___| |_________| |______| ......
	 *
	 * Using movablemem_map, we can prevent memblock from allocating memory
	 * on ZONE_MOVABLE at boot time.
	 *
	 * NOTE: In this case, SRAT info will be ignored.
	 */
	overlap = movablemem_map_overlap(start_pfn, end_pfn);
	if (overlap >= 0) {
		/*
		 * If part of this range is in movablemem_map, we need to
		 * add the range after it to extend the range to the end
		 * of the node, because from the min address specified to
		 * the end of the node will be ZONE_MOVABLE.
		 */
		start_pfn = max(start_pfn,
				movablemem_map.map[overlap].start_pfn);
		insert_movablemem_map(start_pfn, end_pfn);

		/*
		 * Set the nodemask, so that if the address range on one node
		 * is not contiguous, we can add the subsequent ranges on the
		 * same node into movablemem_map.
		 */
		node_set(node, movablemem_map.numa_nodes_hotplug);
	} else if (node_isset(node, movablemem_map.numa_nodes_hotplug)) {
		/*
		 * Insert the range if we already have movable ranges
		 * on the same node.
		 */
		insert_movablemem_map(start_pfn, end_pfn);
	}
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/*
* No-op variant of handle_movablemem(), built when
* CONFIG_HAVE_MEMBLOCK_NODE_MAP is not defined: all arguments are ignored.
*/
static inline void
handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
{
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
int __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
u64 start, end;
u32 hotpluggable;
int node, pxm;
if (srat_disabled())
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
if (hotpluggable && !save_add_info())
goto out_err;
start = ma->base_address;
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
node, pxm,
(unsigned long long) start, (unsigned long long) end - 1);
(unsigned long long) start, (unsigned long long) end - 1,
hotpluggable ? "Hot Pluggable": "");
handle_movablemem(node, start, end, hotpluggable);
return 0;
out_err_bad_srat: