Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - various misc bits

 - most of MM (quite a lot of MM material is awaiting the merge of
   linux-next dependencies)

 - kasan

 - printk updates

 - procfs updates

 - MAINTAINERS

 - /lib updates

 - checkpatch updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (123 commits)
  init: reduce rootwait polling interval time to 5ms
  binfmt_elf: use vmalloc() for allocation of vma_filesz
  checkpatch: don't emit unified-diff error for rename-only patches
  checkpatch: don't check c99 types like uint8_t under tools
  checkpatch: avoid multiple line dereferences
  checkpatch: don't check .pl files, improve absolute path commit log test
  scripts/checkpatch.pl: fix spelling
  checkpatch: don't try to get maintained status when --no-tree is given
  lib/ida: document locking requirements a bit better
  lib/rbtree.c: fix typo in comment of ____rb_erase_color
  lib/Kconfig.debug: make CONFIG_STRICT_DEVMEM depend on CONFIG_DEVMEM
  MAINTAINERS: add drm and drm/i915 irc channels
  MAINTAINERS: add "C:" for URI for chat where developers hang out
  MAINTAINERS: add drm and drm/i915 bug filing info
  MAINTAINERS: add "B:" for URI where to file bugs
  get_maintainer: look for arbitrary letter prefixes in sections
  printk: add Kconfig option to set default console loglevel
  printk/sound: handle more message headers
  printk/btrfs: handle more message headers
  printk/kdb: handle more message headers
  ...
@@ -153,7 +153,7 @@ config MOVABLE_NODE
 	bool "Enable to assign a node which has only movable memory"
 	depends on HAVE_MEMBLOCK
 	depends on NO_BOOTMEM
-	depends on X86_64
+	depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
 	depends on NUMA
 	default n
 	help
@@ -447,13 +447,9 @@ choice
 	  benefit.
 endchoice
 
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
 config TRANSPARENT_HUGE_PAGECACHE
 	def_bool y
-	depends on TRANSPARENT_HUGEPAGE && !PPC
+	depends on TRANSPARENT_HUGEPAGE
 
 #
 # UP and nommu archs use km based percpu allocator
@@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc,
|
||||
return pfn;
|
||||
}
|
||||
|
||||
/* Update the number of anon and file isolated pages in the zone */
|
||||
static void acct_isolated(struct zone *zone, struct compact_control *cc)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned int count[2] = { 0, };
|
||||
|
||||
if (list_empty(&cc->migratepages))
|
||||
return;
|
||||
|
||||
list_for_each_entry(page, &cc->migratepages, lru)
|
||||
count[!!page_is_file_cache(page)]++;
|
||||
|
||||
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
|
||||
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
|
||||
}
|
||||
|
||||
/* Similar to reclaim, but different enough that they don't share logic */
|
||||
static bool too_many_isolated(struct zone *zone)
|
||||
{
|
||||
@@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
|
||||
/* Successfully isolated */
|
||||
del_page_from_lru_list(page, lruvec, page_lru(page));
|
||||
inc_node_page_state(page,
|
||||
NR_ISOLATED_ANON + page_is_file_cache(page));
|
||||
|
||||
isolate_success:
|
||||
list_add(&page->lru, &cc->migratepages);
|
||||
@@ -902,7 +888,6 @@ isolate_fail:
|
||||
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
|
||||
locked = false;
|
||||
}
|
||||
acct_isolated(zone, cc);
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
cc->nr_migratepages = 0;
|
||||
cc->last_migrated_pfn = 0;
|
||||
@@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
|
||||
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
|
||||
break;
|
||||
}
|
||||
acct_isolated(cc->zone, cc);
|
||||
|
||||
return pfn;
|
||||
}
|
||||
@@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
low_pfn = isolate_migratepages_block(cc, low_pfn,
|
||||
block_end_pfn, isolate_mode);
|
||||
|
||||
if (!low_pfn || cc->contended) {
|
||||
acct_isolated(zone, cc);
|
||||
if (!low_pfn || cc->contended)
|
||||
return ISOLATE_ABORT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Either we isolated something and proceed with migration. Or
|
||||
@@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
break;
|
||||
}
|
||||
|
||||
acct_isolated(zone, cc);
|
||||
/* Record where migration scanner will be restarted. */
|
||||
cc->migrate_pfn = low_pfn;
|
||||
|
||||
|
@@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason)
 
 	pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
 
+	print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
+			sizeof(unsigned long), page,
+			sizeof(struct page), false);
+
 	if (reason)
 		pr_alert("page dumped because: %s\n", reason);
 
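The print_hex_dump() call added above dumps the raw words of the struct page being reported. A rough userspace illustration of the same idea follows; struct fake_page and dump_raw() are invented for the example and are not kernel interfaces.

#include <stdio.h>
#include <stddef.h>

struct fake_page { unsigned long flags, counters, mapping, index; };

/* Print an object as rows of unsigned longs, similar in spirit to the
 * print_hex_dump(..., sizeof(unsigned long), page, sizeof(struct page), ...)
 * call in the hunk above. */
static void dump_raw(const void *obj, size_t size)
{
	const unsigned long *w = obj;
	size_t i, n = size / sizeof(unsigned long);

	for (i = 0; i < n; i++)
		printf("raw: %016lx%c", w[i], (i % 4 == 3) ? '\n' : ' ');
	if (n % 4)
		printf("\n");
}

int main(void)
{
	struct fake_page p = { 0x400, 2, 0, 42 };

	dump_raw(&p, sizeof(p));
	return 0;
}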
 mm/filemap.c | 68
@@ -132,44 +132,29 @@ static int page_cache_tree_insert(struct address_space *mapping,
|
||||
if (!dax_mapping(mapping)) {
|
||||
if (shadowp)
|
||||
*shadowp = p;
|
||||
if (node)
|
||||
workingset_node_shadows_dec(node);
|
||||
} else {
|
||||
/* DAX can replace empty locked entry with a hole */
|
||||
WARN_ON_ONCE(p !=
|
||||
(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
|
||||
RADIX_DAX_ENTRY_LOCK));
|
||||
/* DAX accounts exceptional entries as normal pages */
|
||||
if (node)
|
||||
workingset_node_pages_dec(node);
|
||||
/* Wakeup waiters for exceptional entry lock */
|
||||
dax_wake_mapping_entry_waiter(mapping, page->index,
|
||||
false);
|
||||
}
|
||||
}
|
||||
radix_tree_replace_slot(slot, page);
|
||||
__radix_tree_replace(&mapping->page_tree, node, slot, page,
|
||||
workingset_update_node, mapping);
|
||||
mapping->nrpages++;
|
||||
if (node) {
|
||||
workingset_node_pages_inc(node);
|
||||
/*
|
||||
* Don't track node that contains actual pages.
|
||||
*
|
||||
* Avoid acquiring the list_lru lock if already
|
||||
* untracked. The list_empty() test is safe as
|
||||
* node->private_list is protected by
|
||||
* mapping->tree_lock.
|
||||
*/
|
||||
if (!list_empty(&node->private_list))
|
||||
list_lru_del(&workingset_shadow_nodes,
|
||||
&node->private_list);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void page_cache_tree_delete(struct address_space *mapping,
|
||||
struct page *page, void *shadow)
|
||||
{
|
||||
int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
|
||||
int i, nr;
|
||||
|
||||
/* hugetlb pages are represented by one entry in the radix tree */
|
||||
nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
|
||||
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
@@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
|
||||
__radix_tree_lookup(&mapping->page_tree, page->index + i,
|
||||
&node, &slot);
|
||||
|
||||
VM_BUG_ON_PAGE(!node && nr != 1, page);
|
||||
|
||||
radix_tree_clear_tags(&mapping->page_tree, node, slot);
|
||||
|
||||
if (!node) {
|
||||
VM_BUG_ON_PAGE(nr != 1, page);
|
||||
/*
|
||||
* We need a node to properly account shadow
|
||||
* entries. Don't plant any without. XXX
|
||||
*/
|
||||
shadow = NULL;
|
||||
}
|
||||
|
||||
radix_tree_replace_slot(slot, shadow);
|
||||
|
||||
if (!node)
|
||||
break;
|
||||
|
||||
workingset_node_pages_dec(node);
|
||||
if (shadow)
|
||||
workingset_node_shadows_inc(node);
|
||||
else
|
||||
if (__radix_tree_delete_node(&mapping->page_tree, node))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Track node that only contains shadow entries. DAX mappings
|
||||
* contain no shadow entries and may contain other exceptional
|
||||
* entries so skip those.
|
||||
*
|
||||
* Avoid acquiring the list_lru lock if already tracked.
|
||||
* The list_empty() test is safe as node->private_list is
|
||||
* protected by mapping->tree_lock.
|
||||
*/
|
||||
if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
|
||||
list_empty(&node->private_list)) {
|
||||
node->private_data = mapping;
|
||||
list_lru_add(&workingset_shadow_nodes,
|
||||
&node->private_list);
|
||||
}
|
||||
__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
|
||||
workingset_update_node, mapping);
|
||||
}
|
||||
|
||||
if (shadow) {
|
||||
|
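The filemap hunks above fold the hand-rolled workingset_node_* accounting into __radix_tree_replace(), which takes an update callback (workingset_update_node) so node bookkeeping happens in one place. A minimal standalone sketch of that shape follows; the radix-tree internals are omitted and replace_slot(), update_node() and struct node are invented for illustration.

#include <stdio.h>

struct node { int pages; int shadows; };

typedef void (*update_fn)(struct node *node, void *private);

/* Replace *slot and let one callback keep per-node stats coherent,
 * instead of every caller adjusting counters by hand. */
static void replace_slot(void **slot, void *new_entry, struct node *node,
			 update_fn update, void *private)
{
	*slot = new_entry;
	if (node && update)
		update(node, private);
}

static void update_node(struct node *node, void *private)
{
	(void)private;
	printf("node: %d pages, %d shadows\n", node->pages, node->shadows);
}

int main(void)
{
	struct node n = { .pages = 0, .shadows = 1 };
	void *slot = NULL;
	int page = 42;

	n.pages++;		/* the new entry is a real page ... */
	n.shadows--;		/* ... replacing a shadow entry */
	replace_slot(&slot, &page, &n, update_node, NULL);
	return 0;
}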
 mm/gup.c | 19
@@ -632,7 +632,8 @@ next_page:
|
||||
return i;
|
||||
}
|
||||
|
||||
bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
|
||||
static bool vma_permits_fault(struct vm_area_struct *vma,
|
||||
unsigned int fault_flags)
|
||||
{
|
||||
bool write = !!(fault_flags & FAULT_FLAG_WRITE);
|
||||
bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
|
||||
@@ -857,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
|
||||
EXPORT_SYMBOL(get_user_pages_locked);
|
||||
|
||||
/*
|
||||
* Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
|
||||
* pass additional gup_flags as last parameter (like FOLL_HWPOISON).
|
||||
* Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
|
||||
* tsk, mm to be specified.
|
||||
*
|
||||
* NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
|
||||
* caller if required (just like with __get_user_pages). "FOLL_GET",
|
||||
* "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
|
||||
* according to the parameters "pages", "write", "force"
|
||||
* respectively.
|
||||
* caller if required (just like with __get_user_pages). "FOLL_GET"
|
||||
* is set implicitly if "pages" is non-NULL.
|
||||
*/
|
||||
__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
@@ -894,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
|
||||
* get_user_pages_unlocked(tsk, mm, ..., pages);
|
||||
*
|
||||
* It is functionally equivalent to get_user_pages_fast so
|
||||
* get_user_pages_fast should be used instead, if the two parameters
|
||||
* "tsk" and "mm" are respectively equal to current and current->mm,
|
||||
* or if "force" shall be set to 1 (get_user_pages_fast misses the
|
||||
* "force" parameter).
|
||||
* get_user_pages_fast should be used instead if specific gup_flags
|
||||
* (e.g. FOLL_FORCE) are not required.
|
||||
*/
|
||||
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
|
||||
struct page **pages, unsigned int gup_flags)
|
||||
|
@@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
|
||||
}
|
||||
static struct kobj_attribute use_zero_page_attr =
|
||||
__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
|
||||
|
||||
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
|
||||
}
|
||||
static struct kobj_attribute hpage_pmd_size_attr =
|
||||
__ATTR_RO(hpage_pmd_size);
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
static ssize_t debug_cow_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
@@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = {
|
||||
&enabled_attr.attr,
|
||||
&defrag_attr.attr,
|
||||
&use_zero_page_attr.attr,
|
||||
&hpage_pmd_size_attr.attr,
|
||||
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
|
||||
&shmem_enabled_attr.attr,
|
||||
#endif
|
||||
@@ -1323,6 +1333,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
struct mm_struct *mm = tlb->mm;
|
||||
bool ret = false;
|
||||
|
||||
tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
|
||||
|
||||
ptl = pmd_trans_huge_lock(pmd, vma);
|
||||
if (!ptl)
|
||||
goto out_unlocked;
|
||||
@@ -1378,12 +1390,23 @@ out_unlocked:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
|
||||
{
|
||||
pgtable_t pgtable;
|
||||
|
||||
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
|
||||
pte_free(mm, pgtable);
|
||||
atomic_long_dec(&mm->nr_ptes);
|
||||
}
|
||||
|
||||
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
pmd_t *pmd, unsigned long addr)
|
||||
{
|
||||
pmd_t orig_pmd;
|
||||
spinlock_t *ptl;
|
||||
|
||||
tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
|
||||
|
||||
ptl = __pmd_trans_huge_lock(pmd, vma);
|
||||
if (!ptl)
|
||||
return 0;
|
||||
@@ -1399,12 +1422,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
if (vma_is_dax(vma)) {
|
||||
spin_unlock(ptl);
|
||||
if (is_huge_zero_pmd(orig_pmd))
|
||||
tlb_remove_page(tlb, pmd_page(orig_pmd));
|
||||
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
|
||||
} else if (is_huge_zero_pmd(orig_pmd)) {
|
||||
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
|
||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||
spin_unlock(ptl);
|
||||
tlb_remove_page(tlb, pmd_page(orig_pmd));
|
||||
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
|
||||
} else {
|
||||
struct page *page = pmd_page(orig_pmd);
|
||||
page_remove_rmap(page, true);
|
||||
@@ -1417,6 +1440,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
|
||||
} else {
|
||||
if (arch_needs_pgtable_deposit())
|
||||
zap_deposited_table(tlb->mm, pmd);
|
||||
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
@@ -1425,6 +1450,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef pmd_move_must_withdraw
|
||||
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
|
||||
spinlock_t *old_pmd_ptl,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
/*
|
||||
* With split pmd lock we also need to move preallocated
|
||||
* PTE page table if new_pmd is on different PMD page table.
|
||||
*
|
||||
* We also don't deposit and withdraw tables for file pages.
|
||||
*/
|
||||
return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
|
||||
}
|
||||
#endif
|
||||
|
||||
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
unsigned long new_addr, unsigned long old_end,
|
||||
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
|
||||
@@ -1462,8 +1502,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
force_flush = true;
|
||||
VM_BUG_ON(!pmd_none(*new_pmd));
|
||||
|
||||
if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
|
||||
vma_is_anonymous(vma)) {
|
||||
if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
|
||||
pgtable_t pgtable;
|
||||
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
|
||||
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
|
||||
@@ -1589,6 +1628,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
|
||||
if (!vma_is_anonymous(vma)) {
|
||||
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
|
||||
/*
|
||||
* We are going to unmap this huge page. So
|
||||
* just go ahead and zap it
|
||||
*/
|
||||
if (arch_needs_pgtable_deposit())
|
||||
zap_deposited_table(mm, pmd);
|
||||
if (vma_is_dax(vma))
|
||||
return;
|
||||
page = pmd_page(_pmd);
|
||||
|
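pmd_move_must_withdraw() above now receives the vma and gets a generic fallback wrapped in #ifndef, so an architecture can supply its own rule. The guard pattern on its own, with the mm specifics stripped out and must_withdraw() invented for the example:

#include <stdio.h>
#include <stdbool.h>

/* Generic default, compiled only if no "arch" header already provided
 * (and #define'd) its own must_withdraw(). */
#ifndef must_withdraw
static inline bool must_withdraw(bool locks_differ, bool is_anonymous)
{
	return locks_differ && is_anonymous;
}
#define must_withdraw must_withdraw
#endif

int main(void)
{
	printf("%d %d\n", must_withdraw(true, true), must_withdraw(true, false));
	return 0;
}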
 mm/hugetlb.c | 25
@@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
BUG_ON(start & ~huge_page_mask(h));
|
||||
BUG_ON(end & ~huge_page_mask(h));
|
||||
|
||||
/*
|
||||
* This is a hugetlb vma, all the pte entries should point
|
||||
* to huge page.
|
||||
*/
|
||||
tlb_remove_check_page_size_change(tlb, sz);
|
||||
tlb_start_vma(tlb, vma);
|
||||
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
|
||||
address = start;
|
||||
@@ -3336,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
pte = huge_ptep_get_and_clear(mm, address, ptep);
|
||||
tlb_remove_tlb_entry(tlb, ptep, address);
|
||||
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
|
||||
if (huge_pte_dirty(pte))
|
||||
set_page_dirty(page);
|
||||
|
||||
@@ -3450,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* Keep the pte_same checks anyway to make transition from the mutex easier.
|
||||
*/
|
||||
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep, pte_t pte,
|
||||
struct page *pagecache_page, spinlock_t *ptl)
|
||||
unsigned long address, pte_t *ptep,
|
||||
struct page *pagecache_page, spinlock_t *ptl)
|
||||
{
|
||||
pte_t pte;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct page *old_page, *new_page;
|
||||
int ret = 0, outside_reserve = 0;
|
||||
unsigned long mmun_start; /* For mmu_notifiers */
|
||||
unsigned long mmun_end; /* For mmu_notifiers */
|
||||
|
||||
pte = huge_ptep_get(ptep);
|
||||
old_page = pte_page(pte);
|
||||
|
||||
retry_avoidcopy:
|
||||
@@ -3711,8 +3718,7 @@ retry:
|
||||
vma_end_reservation(h, vma, address);
|
||||
}
|
||||
|
||||
ptl = huge_pte_lockptr(h, mm, ptep);
|
||||
spin_lock(ptl);
|
||||
ptl = huge_pte_lock(h, mm, ptep);
|
||||
size = i_size_read(mapping->host) >> huge_page_shift(h);
|
||||
if (idx >= size)
|
||||
goto backout;
|
||||
@@ -3733,7 +3739,7 @@ retry:
|
||||
hugetlb_count_add(pages_per_huge_page(h), mm);
|
||||
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
|
||||
/* Optimization, do the COW without a second fault */
|
||||
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
|
||||
ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
|
||||
}
|
||||
|
||||
spin_unlock(ptl);
|
||||
@@ -3888,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
|
||||
if (flags & FAULT_FLAG_WRITE) {
|
||||
if (!huge_pte_write(entry)) {
|
||||
ret = hugetlb_cow(mm, vma, address, ptep, entry,
|
||||
pagecache_page, ptl);
|
||||
ret = hugetlb_cow(mm, vma, address, ptep,
|
||||
pagecache_page, ptl);
|
||||
goto out_put_page;
|
||||
}
|
||||
entry = huge_pte_mkdirty(entry);
|
||||
@@ -4330,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
|
||||
if (!spte)
|
||||
goto out;
|
||||
|
||||
ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
|
||||
spin_lock(ptl);
|
||||
ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
|
||||
if (pud_none(*pud)) {
|
||||
pud_populate(mm, pud,
|
||||
(pmd_t *)((unsigned long)spte & PAGE_MASK));
|
||||
|
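Two patterns recur in the hugetlb hunks above: huge_pte_lockptr() followed by spin_lock() collapses into huge_pte_lock(), and hugetlb_cow() re-reads the pte with huge_ptep_get() under the lock instead of trusting a value captured before locking. A userspace sketch of the same shape using a plain mutex; struct entry and its helpers are invented for the example.

#include <pthread.h>
#include <stdio.h>

struct entry {
	pthread_mutex_t lock;
	int value;
};

/* Combined "find the lock and take it" helper, like huge_pte_lock(). */
static pthread_mutex_t *entry_lock(struct entry *e)
{
	pthread_mutex_t *ptl = &e->lock;

	pthread_mutex_lock(ptl);
	return ptl;
}

static void update(struct entry *e)
{
	pthread_mutex_t *ptl = entry_lock(e);
	/* Re-read under the lock; a copy made before locking could be stale. */
	int value = e->value;

	e->value = value + 1;
	pthread_mutex_unlock(ptl);
}

int main(void)
{
	struct entry e = { PTHREAD_MUTEX_INITIALIZER, 0 };

	update(&e);
	printf("%d\n", e.value);
	return 0;
}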
@@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
|
||||
qlist_init(from);
|
||||
}
|
||||
|
||||
static void qlist_move(struct qlist_head *from, struct qlist_node *last,
|
||||
struct qlist_head *to, size_t size)
|
||||
{
|
||||
if (unlikely(last == from->tail)) {
|
||||
qlist_move_all(from, to);
|
||||
return;
|
||||
}
|
||||
if (qlist_empty(to))
|
||||
to->head = from->head;
|
||||
else
|
||||
to->tail->next = from->head;
|
||||
to->tail = last;
|
||||
from->head = last->next;
|
||||
last->next = NULL;
|
||||
from->bytes -= size;
|
||||
to->bytes += size;
|
||||
}
|
||||
|
||||
#define QUARANTINE_PERCPU_SIZE (1 << 20)
|
||||
#define QUARANTINE_BATCHES \
|
||||
(1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
|
||||
|
||||
/*
|
||||
* The object quarantine consists of per-cpu queues and a global queue,
|
||||
@@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
|
||||
*/
|
||||
static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
|
||||
|
||||
static struct qlist_head global_quarantine;
|
||||
/* Round-robin FIFO array of batches. */
|
||||
static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
|
||||
static int quarantine_head;
|
||||
static int quarantine_tail;
|
||||
/* Total size of all objects in global_quarantine across all batches. */
|
||||
static unsigned long quarantine_size;
|
||||
static DEFINE_SPINLOCK(quarantine_lock);
|
||||
|
||||
/* Maximum size of the global queue. */
|
||||
static unsigned long quarantine_size;
|
||||
static unsigned long quarantine_max_size;
|
||||
|
||||
/*
|
||||
* Target size of a batch in global_quarantine.
|
||||
* Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
|
||||
*/
|
||||
static unsigned long quarantine_batch_size;
|
||||
|
||||
/*
|
||||
* The fraction of physical memory the quarantine is allowed to occupy.
|
||||
@@ -124,9 +120,6 @@ static unsigned long quarantine_size;
|
||||
*/
|
||||
#define QUARANTINE_FRACTION 32
|
||||
|
||||
#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
|
||||
#define QUARANTINE_PERCPU_SIZE (1 << 20)
|
||||
|
||||
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
|
||||
{
|
||||
return virt_to_head_page(qlink)->slab_cache;
|
||||
@@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
|
||||
|
||||
if (unlikely(!qlist_empty(&temp))) {
|
||||
spin_lock_irqsave(&quarantine_lock, flags);
|
||||
qlist_move_all(&temp, &global_quarantine);
|
||||
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
|
||||
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
|
||||
if (global_quarantine[quarantine_tail].bytes >=
|
||||
READ_ONCE(quarantine_batch_size)) {
|
||||
int new_tail;
|
||||
|
||||
new_tail = quarantine_tail + 1;
|
||||
if (new_tail == QUARANTINE_BATCHES)
|
||||
new_tail = 0;
|
||||
if (new_tail != quarantine_head)
|
||||
quarantine_tail = new_tail;
|
||||
}
|
||||
spin_unlock_irqrestore(&quarantine_lock, flags);
|
||||
}
|
||||
}
|
||||
|
||||
void quarantine_reduce(void)
|
||||
{
|
||||
size_t new_quarantine_size, percpu_quarantines;
|
||||
size_t total_size, new_quarantine_size, percpu_quarantines;
|
||||
unsigned long flags;
|
||||
struct qlist_head to_free = QLIST_INIT;
|
||||
size_t size_to_free = 0;
|
||||
struct qlist_node *last;
|
||||
|
||||
if (likely(READ_ONCE(global_quarantine.bytes) <=
|
||||
READ_ONCE(quarantine_size)))
|
||||
if (likely(READ_ONCE(quarantine_size) <=
|
||||
READ_ONCE(quarantine_max_size)))
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&quarantine_lock, flags);
|
||||
@@ -214,24 +216,23 @@ void quarantine_reduce(void)
|
||||
* Update quarantine size in case of hotplug. Allocate a fraction of
|
||||
* the installed memory to quarantine minus per-cpu queue limits.
|
||||
*/
|
||||
new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
|
||||
total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
|
||||
QUARANTINE_FRACTION;
|
||||
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
|
||||
new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
|
||||
0 : new_quarantine_size - percpu_quarantines;
|
||||
WRITE_ONCE(quarantine_size, new_quarantine_size);
|
||||
new_quarantine_size = (total_size < percpu_quarantines) ?
|
||||
0 : total_size - percpu_quarantines;
|
||||
WRITE_ONCE(quarantine_max_size, new_quarantine_size);
|
||||
/* Aim at consuming at most 1/2 of slots in quarantine. */
|
||||
WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
|
||||
2 * total_size / QUARANTINE_BATCHES));
|
||||
|
||||
last = global_quarantine.head;
|
||||
while (last) {
|
||||
struct kmem_cache *cache = qlink_to_cache(last);
|
||||
|
||||
size_to_free += cache->size;
|
||||
if (!last->next || size_to_free >
|
||||
global_quarantine.bytes - QUARANTINE_LOW_SIZE)
|
||||
break;
|
||||
last = last->next;
|
||||
if (likely(quarantine_size > quarantine_max_size)) {
|
||||
qlist_move_all(&global_quarantine[quarantine_head], &to_free);
|
||||
WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
|
||||
quarantine_head++;
|
||||
if (quarantine_head == QUARANTINE_BATCHES)
|
||||
quarantine_head = 0;
|
||||
}
|
||||
qlist_move(&global_quarantine, last, &to_free, size_to_free);
|
||||
|
||||
spin_unlock_irqrestore(&quarantine_lock, flags);
|
||||
|
||||
@@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg)
|
||||
|
||||
void quarantine_remove_cache(struct kmem_cache *cache)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned long flags, i;
|
||||
struct qlist_head to_free = QLIST_INIT;
|
||||
|
||||
on_each_cpu(per_cpu_remove_cache, cache, 1);
|
||||
|
||||
spin_lock_irqsave(&quarantine_lock, flags);
|
||||
qlist_move_cache(&global_quarantine, &to_free, cache);
|
||||
for (i = 0; i < QUARANTINE_BATCHES; i++)
|
||||
qlist_move_cache(&global_quarantine[i], &to_free, cache);
|
||||
spin_unlock_irqrestore(&quarantine_lock, flags);
|
||||
|
||||
qlist_free_all(&to_free, cache);
|
||||
|
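The quarantine above becomes a fixed array of batches used as a round-robin FIFO: quarantine_put() fills the tail batch and advances the tail once it reaches quarantine_batch_size (never colliding with the head), while quarantine_reduce() drops whole batches from the head when the total exceeds quarantine_max_size. A condensed userspace model of that bookkeeping, with sizes and the one-batch-per-call policy simplified:

#include <stdio.h>

#define BATCHES 8
#define BATCH_TARGET 4		/* stand-in for quarantine_batch_size */

static unsigned long batch_bytes[BATCHES];
static int head, tail;
static unsigned long total;	/* stand-in for quarantine_size */

static void put(unsigned long bytes)
{
	batch_bytes[tail] += bytes;
	total += bytes;
	if (batch_bytes[tail] >= BATCH_TARGET) {
		int new_tail = (tail + 1) % BATCHES;

		if (new_tail != head)	/* don't run into the head */
			tail = new_tail;
	}
}

static void reduce(unsigned long max_total)
{
	while (total > max_total && head != tail) {
		total -= batch_bytes[head];	/* "free" the oldest batch */
		batch_bytes[head] = 0;
		head = (head + 1) % BATCHES;
	}
}

int main(void)
{
	for (int i = 0; i < 20; i++)
		put(1);
	reduce(8);
	printf("total=%lu head=%d tail=%d\n", total, head, tail);
	return 0;
}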
@@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags)
 	pr_err("==================================================================\n");
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 	spin_unlock_irqrestore(&report_lock, *flags);
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
 	kasan_enable_current();
 }
 
@@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long addr;
|
||||
pmd_t *pmd, _pmd;
|
||||
bool deposited = false;
|
||||
|
||||
i_mmap_lock_write(mapping);
|
||||
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
|
||||
@@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
|
||||
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
|
||||
/* assume page table is clear */
|
||||
_pmd = pmdp_collapse_flush(vma, addr, pmd);
|
||||
/*
|
||||
* now deposit the pgtable for arch that need it
|
||||
* otherwise free it.
|
||||
*/
|
||||
if (arch_needs_pgtable_deposit()) {
|
||||
/*
|
||||
* The deposit should be visibile only after
|
||||
* collapse is seen by others.
|
||||
*/
|
||||
smp_wmb();
|
||||
pgtable_trans_huge_deposit(vma->vm_mm, pmd,
|
||||
pmd_pgtable(_pmd));
|
||||
deposited = true;
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
up_write(&vma->vm_mm->mmap_sem);
|
||||
atomic_long_dec(&vma->vm_mm->nr_ptes);
|
||||
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
|
||||
if (!deposited) {
|
||||
atomic_long_dec(&vma->vm_mm->nr_ptes);
|
||||
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
|
||||
}
|
||||
}
|
||||
}
|
||||
i_mmap_unlock_write(mapping);
|
||||
@@ -1403,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm,
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
|
||||
slot = radix_tree_lookup_slot(&mapping->page_tree, index);
|
||||
VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
|
||||
&mapping->tree_lock), page);
|
||||
VM_BUG_ON_PAGE(page_mapped(page), page);
|
||||
|
||||
/*
|
||||
@@ -1423,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm,
|
||||
list_add_tail(&page->lru, &pagelist);
|
||||
|
||||
/* Finally, replace with the new page. */
|
||||
radix_tree_replace_slot(slot,
|
||||
radix_tree_replace_slot(&mapping->page_tree, slot,
|
||||
new_page + (index % HPAGE_PMD_NR));
|
||||
|
||||
slot = radix_tree_iter_next(&iter);
|
||||
index++;
|
||||
continue;
|
||||
out_lru:
|
||||
@@ -1521,9 +1542,11 @@ tree_unlocked:
|
||||
if (!page || iter.index < page->index) {
|
||||
if (!nr_none)
|
||||
break;
|
||||
/* Put holes back where they were */
|
||||
radix_tree_replace_slot(slot, NULL);
|
||||
nr_none--;
|
||||
/* Put holes back where they were */
|
||||
radix_tree_delete(&mapping->page_tree,
|
||||
iter.index);
|
||||
slot = radix_tree_iter_next(&iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1532,11 +1555,13 @@ tree_unlocked:
|
||||
/* Unfreeze the page. */
|
||||
list_del(&page->lru);
|
||||
page_ref_unfreeze(page, 2);
|
||||
radix_tree_replace_slot(slot, page);
|
||||
radix_tree_replace_slot(&mapping->page_tree,
|
||||
slot, page);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
putback_lru_page(page);
|
||||
unlock_page(page);
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
slot = radix_tree_iter_next(&iter);
|
||||
}
|
||||
VM_BUG_ON(nr_none);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
|
@@ -19,7 +19,7 @@
  *
  *
  * For more information on the algorithm and kmemleak usage, please see
- * Documentation/kmemleak.txt.
+ * Documentation/dev-tools/kmemleak.rst.
  *
  * Notes on locking
  * ----------------
@@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
+	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -2145,6 +2145,8 @@ struct memcg_kmem_cache_create_work {
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
static struct workqueue_struct *memcg_kmem_cache_create_wq;
|
||||
|
||||
static void memcg_kmem_cache_create_func(struct work_struct *w)
|
||||
{
|
||||
struct memcg_kmem_cache_create_work *cw =
|
||||
@@ -2176,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
|
||||
cw->cachep = cachep;
|
||||
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
|
||||
|
||||
schedule_work(&cw->work);
|
||||
queue_work(memcg_kmem_cache_create_wq, &cw->work);
|
||||
}
|
||||
|
||||
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
|
||||
@@ -5774,6 +5776,17 @@ static int __init mem_cgroup_init(void)
|
||||
{
|
||||
int cpu, node;
|
||||
|
||||
#ifndef CONFIG_SLOB
|
||||
/*
|
||||
* Kmem cache creation is mostly done with the slab_mutex held,
|
||||
* so use a special workqueue to avoid stalling all worker
|
||||
* threads in case lots of cgroups are created simultaneously.
|
||||
*/
|
||||
memcg_kmem_cache_create_wq =
|
||||
alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
|
||||
BUG_ON(!memcg_kmem_cache_create_wq);
|
||||
#endif
|
||||
|
||||
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
|
||||
memcg_hotplug_cpu_dead);
|
||||
|
||||
|
 mm/memory.c | 92
@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
|
||||
struct mmu_gather_batch *batch;
|
||||
|
||||
VM_BUG_ON(!tlb->end);
|
||||
|
||||
if (!tlb->page_size)
|
||||
tlb->page_size = page_size;
|
||||
else {
|
||||
if (page_size != tlb->page_size)
|
||||
return true;
|
||||
}
|
||||
VM_WARN_ON(tlb->page_size != page_size);
|
||||
|
||||
batch = tlb->active;
|
||||
/*
|
||||
* Add the page and check if we are full. If so
|
||||
* force a flush.
|
||||
*/
|
||||
batch->pages[batch->nr++] = page;
|
||||
if (batch->nr == batch->max) {
|
||||
if (!tlb_next_batch(tlb))
|
||||
return true;
|
||||
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
|
||||
}
|
||||
VM_BUG_ON_PAGE(batch->nr > batch->max, page);
|
||||
|
||||
batch->pages[batch->nr++] = page;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
|
||||
end -= PMD_SIZE;
|
||||
if (addr > end - 1)
|
||||
return;
|
||||
|
||||
/*
|
||||
* We add page table cache pages with PAGE_SIZE,
|
||||
* (see pte_free_tlb()), flush the tlb if we need
|
||||
*/
|
||||
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
|
||||
pgd = pgd_offset(tlb->mm, addr);
|
||||
do {
|
||||
next = pgd_addr_end(addr, end);
|
||||
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
pte_t *start_pte;
|
||||
pte_t *pte;
|
||||
swp_entry_t entry;
|
||||
struct page *pending_page = NULL;
|
||||
|
||||
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
|
||||
again:
|
||||
init_rss_vec(rss);
|
||||
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
@@ -1172,7 +1174,6 @@ again:
|
||||
print_bad_pte(vma, addr, ptent, page);
|
||||
if (unlikely(__tlb_remove_page(tlb, page))) {
|
||||
force_flush = 1;
|
||||
pending_page = page;
|
||||
addr += PAGE_SIZE;
|
||||
break;
|
||||
}
|
||||
@@ -1213,11 +1214,6 @@ again:
|
||||
if (force_flush) {
|
||||
force_flush = 0;
|
||||
tlb_flush_mmu_free(tlb);
|
||||
if (pending_page) {
|
||||
/* remove the page with new size */
|
||||
__tlb_remove_pte_page(tlb, pending_page);
|
||||
pending_page = NULL;
|
||||
}
|
||||
if (addr != end)
|
||||
goto again;
|
||||
}
|
||||
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||||
if (next - addr != HPAGE_PMD_SIZE) {
|
||||
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
|
||||
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
|
||||
split_huge_pmd(vma, pmd, addr);
|
||||
__split_huge_pmd(vma, pmd, addr, false, NULL);
|
||||
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
|
||||
goto next;
|
||||
/* fall through */
|
||||
@@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
|
||||
return true;
|
||||
}
|
||||
|
||||
static void deposit_prealloc_pte(struct fault_env *fe)
|
||||
{
|
||||
struct vm_area_struct *vma = fe->vma;
|
||||
|
||||
pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
|
||||
/*
|
||||
* We are going to consume the prealloc table,
|
||||
* count that as nr_ptes.
|
||||
*/
|
||||
atomic_long_inc(&vma->vm_mm->nr_ptes);
|
||||
fe->prealloc_pte = 0;
|
||||
}
|
||||
|
||||
static int do_set_pmd(struct fault_env *fe, struct page *page)
|
||||
{
|
||||
struct vm_area_struct *vma = fe->vma;
|
||||
@@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
|
||||
ret = VM_FAULT_FALLBACK;
|
||||
page = compound_head(page);
|
||||
|
||||
/*
|
||||
* Archs like ppc64 need additonal space to store information
|
||||
* related to pte entry. Use the preallocated table for that.
|
||||
*/
|
||||
if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
|
||||
fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
|
||||
if (!fe->prealloc_pte)
|
||||
return VM_FAULT_OOM;
|
||||
smp_wmb(); /* See comment in __pte_alloc() */
|
||||
}
|
||||
|
||||
fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
|
||||
if (unlikely(!pmd_none(*fe->pmd)))
|
||||
goto out;
|
||||
@@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
|
||||
|
||||
add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
|
||||
page_add_file_rmap(page, true);
|
||||
/*
|
||||
* deposit and withdraw with pmd lock held
|
||||
*/
|
||||
if (arch_needs_pgtable_deposit())
|
||||
deposit_prealloc_pte(fe);
|
||||
|
||||
set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
|
||||
|
||||
@@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
|
||||
ret = 0;
|
||||
count_vm_event(THP_FILE_MAPPED);
|
||||
out:
|
||||
/*
|
||||
* If we are going to fallback to pte mapping, do a
|
||||
* withdraw with pmd lock held.
|
||||
*/
|
||||
if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
|
||||
fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
|
||||
fe->pmd);
|
||||
spin_unlock(fe->ptl);
|
||||
return ret;
|
||||
}
|
||||
@@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
|
||||
|
||||
ret = do_set_pmd(fe, page);
|
||||
if (ret != VM_FAULT_FALLBACK)
|
||||
return ret;
|
||||
goto fault_handled;
|
||||
}
|
||||
|
||||
if (!fe->pte) {
|
||||
ret = pte_alloc_one_map(fe);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto fault_handled;
|
||||
}
|
||||
|
||||
/* Re-check under ptl */
|
||||
if (unlikely(!pte_none(*fe->pte)))
|
||||
return VM_FAULT_NOPAGE;
|
||||
if (unlikely(!pte_none(*fe->pte))) {
|
||||
ret = VM_FAULT_NOPAGE;
|
||||
goto fault_handled;
|
||||
}
|
||||
|
||||
flush_icache_page(vma, page);
|
||||
entry = mk_pte(page, vma->vm_page_prot);
|
||||
@@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
|
||||
|
||||
/* no need to invalidate: a not-present page won't be cached */
|
||||
update_mmu_cache(vma, fe->address, fe->pte);
|
||||
ret = 0;
|
||||
|
||||
return 0;
|
||||
fault_handled:
|
||||
/* preallocated pagetable is unused: free it */
|
||||
if (fe->prealloc_pte) {
|
||||
pte_free(fe->vma->vm_mm, fe->prealloc_pte);
|
||||
fe->prealloc_pte = 0;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned long fault_around_bytes __read_mostly =
|
||||
@@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
|
||||
|
||||
fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
|
||||
|
||||
/* preallocated pagetable is unused: free it */
|
||||
if (fe->prealloc_pte) {
|
||||
pte_free(fe->vma->vm_mm, fe->prealloc_pte);
|
||||
fe->prealloc_pte = 0;
|
||||
}
|
||||
/* Huge page is mapped? Page fault is solved */
|
||||
if (pmd_trans_huge(*fe->pmd)) {
|
||||
ret = VM_FAULT_NOPAGE;
|
||||
@@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
|
||||
|
||||
/* COW handled on pte level: split pmd */
|
||||
VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
|
||||
split_huge_pmd(fe->vma, fe->pmd, fe->address);
|
||||
__split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
|
||||
|
||||
return VM_FAULT_FALLBACK;
|
||||
}
|
||||
|
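alloc_set_pte() above now funnels every exit through the fault_handled label so the preallocated page table is released in exactly one place (the equivalent block disappears from do_fault_around()). The control-flow shape, reduced to standalone C with invented names:

#include <stdio.h>
#include <stdlib.h>

static int handle(int want_prealloc, int fail_early)
{
	void *prealloc = NULL;
	int ret = 0;

	if (want_prealloc) {
		prealloc = malloc(64);
		if (!prealloc)
			return -1;	/* nothing to clean up yet */
	}

	if (fail_early) {
		ret = -2;
		goto handled;		/* every later exit shares the cleanup */
	}

	/* ... success path would consume prealloc and set it to NULL ... */

handled:
	free(prealloc);			/* leftover preallocation freed once */
	return ret;
}

int main(void)
{
	printf("%d %d\n", handle(1, 1), handle(1, 0));
	return 0;
}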
@@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
|
||||
static int __init cmdline_parse_movable_node(char *p)
|
||||
{
|
||||
#ifdef CONFIG_MOVABLE_NODE
|
||||
/*
|
||||
* Memory used by the kernel cannot be hot-removed because Linux
|
||||
* cannot migrate the kernel pages. When memory hotplug is
|
||||
* enabled, we should prevent memblock from allocating memory
|
||||
* for the kernel.
|
||||
*
|
||||
* ACPI SRAT records all hotpluggable memory ranges. But before
|
||||
* SRAT is parsed, we don't know about it.
|
||||
*
|
||||
* The kernel image is loaded into memory at very early time. We
|
||||
* cannot prevent this anyway. So on NUMA system, we set any
|
||||
* node the kernel resides in as un-hotpluggable.
|
||||
*
|
||||
* Since on modern servers, one node could have double-digit
|
||||
* gigabytes memory, we can assume the memory around the kernel
|
||||
* image is also un-hotpluggable. So before SRAT is parsed, just
|
||||
* allocate memory near the kernel image to try the best to keep
|
||||
* the kernel away from hotpluggable memory.
|
||||
*/
|
||||
memblock_set_bottom_up(true);
|
||||
movable_node_enabled = true;
|
||||
#else
|
||||
pr_warn("movable_node option not supported\n");
|
||||
|
@@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
} else if (mode == MPOL_LOCAL) {
|
||||
if (!nodes_empty(*nodes))
|
||||
if (!nodes_empty(*nodes) ||
|
||||
(flags & MPOL_F_STATIC_NODES) ||
|
||||
(flags & MPOL_F_RELATIVE_NODES))
|
||||
return ERR_PTR(-EINVAL);
|
||||
mode = MPOL_PREFERRED;
|
||||
} else if (nodes_empty(*nodes))
|
||||
@@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
|
||||
page = pmd_page(*pmd);
|
||||
if (is_huge_zero_page(page)) {
|
||||
spin_unlock(ptl);
|
||||
split_huge_pmd(vma, pmd, addr);
|
||||
__split_huge_pmd(vma, pmd, addr, false, NULL);
|
||||
} else {
|
||||
get_page(page);
|
||||
spin_unlock(ptl);
|
||||
@@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
|
||||
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
|
||||
int nd)
|
||||
{
|
||||
switch (policy->mode) {
|
||||
case MPOL_PREFERRED:
|
||||
if (!(policy->flags & MPOL_F_LOCAL))
|
||||
nd = policy->v.preferred_node;
|
||||
break;
|
||||
case MPOL_BIND:
|
||||
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
|
||||
nd = policy->v.preferred_node;
|
||||
else {
|
||||
/*
|
||||
* Normally, MPOL_BIND allocations are node-local within the
|
||||
* allowed nodemask. However, if __GFP_THISNODE is set and the
|
||||
* current node isn't part of the mask, we use the zonelist for
|
||||
* the first node in the mask instead.
|
||||
* __GFP_THISNODE shouldn't even be used with the bind policy
|
||||
* because we might easily break the expectation to stay on the
|
||||
* requested node and not break the policy.
|
||||
*/
|
||||
if (unlikely(gfp & __GFP_THISNODE) &&
|
||||
unlikely(!node_isset(nd, policy->v.nodes)))
|
||||
nd = first_node(policy->v.nodes);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
|
||||
}
|
||||
|
||||
return node_zonelist(nd, gfp);
|
||||
}
|
||||
|
||||
|
 mm/migrate.c | 19
@@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l)
|
||||
continue;
|
||||
}
|
||||
list_del(&page->lru);
|
||||
dec_node_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
/*
|
||||
* We isolated non-lru movable page so here we can use
|
||||
* __PageMovable because LRU page's mapping cannot have
|
||||
@@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l)
|
||||
put_page(page);
|
||||
} else {
|
||||
putback_lru_page(page);
|
||||
dec_node_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
|
||||
SetPageDirty(newpage);
|
||||
}
|
||||
|
||||
radix_tree_replace_slot(pslot, newpage);
|
||||
radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
|
||||
|
||||
/*
|
||||
* Drop cache reference from old page by unfreezing
|
||||
@@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
|
||||
get_page(newpage);
|
||||
|
||||
radix_tree_replace_slot(pslot, newpage);
|
||||
radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
|
||||
|
||||
page_ref_unfreeze(page, expected_count - 1);
|
||||
|
||||
@@ -1121,8 +1121,15 @@ out:
|
||||
* restored.
|
||||
*/
|
||||
list_del(&page->lru);
|
||||
dec_node_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
|
||||
/*
|
||||
* Compaction can migrate also non-LRU pages which are
|
||||
* not accounted to NR_ISOLATED_*. They can be recognized
|
||||
* as __PageMovable
|
||||
*/
|
||||
if (likely(!__PageMovable(page)))
|
||||
dec_node_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
pte_t *pte, oldpte;
|
||||
spinlock_t *ptl;
|
||||
unsigned long pages = 0;
|
||||
int target_node = NUMA_NO_NODE;
|
||||
|
||||
pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
|
||||
if (!pte)
|
||||
return 0;
|
||||
|
||||
/* Get target node for single threaded private VMAs */
|
||||
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
|
||||
atomic_read(&vma->vm_mm->mm_users) == 1)
|
||||
target_node = numa_node_id();
|
||||
|
||||
arch_enter_lazy_mmu_mode();
|
||||
do {
|
||||
oldpte = *pte;
|
||||
@@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
/* Avoid TLB flush if possible */
|
||||
if (pte_protnone(oldpte))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Don't mess with PTEs if page is already on the node
|
||||
* a single-threaded process is running on.
|
||||
*/
|
||||
if (target_node == page_to_nid(page))
|
||||
continue;
|
||||
}
|
||||
|
||||
ptent = ptep_modify_prot_start(mm, addr, pte);
|
||||
@@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
|
||||
|
||||
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
|
||||
if (next - addr != HPAGE_PMD_SIZE) {
|
||||
split_huge_pmd(vma, pmd, addr);
|
||||
__split_huge_pmd(vma, pmd, addr, false, NULL);
|
||||
if (pmd_trans_unstable(pmd))
|
||||
continue;
|
||||
} else {
|
||||
@@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
|
||||
return do_mprotect_pkey(start, len, prot, -1);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_PKEYS
|
||||
|
||||
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
|
||||
unsigned long, prot, int, pkey)
|
||||
{
|
||||
@@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
|
||||
*/
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_ARCH_HAS_PKEYS */
|
||||
|
@@ -2058,8 +2058,12 @@ out_unlock:
|
||||
* potentially hurts the reliability of high-order allocations when under
|
||||
* intense memory pressure but failed atomic allocations should be easier
|
||||
* to recover from than an OOM.
|
||||
*
|
||||
* If @force is true, try to unreserve a pageblock even though highatomic
|
||||
* pageblock is exhausted.
|
||||
*/
|
||||
static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
|
||||
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
|
||||
bool force)
|
||||
{
|
||||
struct zonelist *zonelist = ac->zonelist;
|
||||
unsigned long flags;
|
||||
@@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
|
||||
struct zone *zone;
|
||||
struct page *page;
|
||||
int order;
|
||||
bool ret;
|
||||
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
|
||||
ac->nodemask) {
|
||||
/* Preserve at least one pageblock */
|
||||
if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
|
||||
/*
|
||||
* Preserve at least one pageblock unless memory pressure
|
||||
* is really high.
|
||||
*/
|
||||
if (!force && zone->nr_reserved_highatomic <=
|
||||
pageblock_nr_pages)
|
||||
continue;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
@@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* It should never happen but changes to locking could
|
||||
* inadvertently allow a per-cpu drain to add pages
|
||||
* to MIGRATE_HIGHATOMIC while unreserving so be safe
|
||||
* and watch for underflows.
|
||||
* In page freeing path, migratetype change is racy so
|
||||
* we can counter several free pages in a pageblock
|
||||
* in this loop althoug we changed the pageblock type
|
||||
* from highatomic to ac->migratetype. So we should
|
||||
* adjust the count once.
|
||||
*/
|
||||
zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
|
||||
zone->nr_reserved_highatomic);
|
||||
if (get_pageblock_migratetype(page) ==
|
||||
MIGRATE_HIGHATOMIC) {
|
||||
/*
|
||||
* It should never happen but changes to
|
||||
* locking could inadvertently allow a per-cpu
|
||||
* drain to add pages to MIGRATE_HIGHATOMIC
|
||||
* while unreserving so be safe and watch for
|
||||
* underflows.
|
||||
*/
|
||||
zone->nr_reserved_highatomic -= min(
|
||||
pageblock_nr_pages,
|
||||
zone->nr_reserved_highatomic);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert to ac->migratetype and avoid the normal
|
||||
@@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
|
||||
* may increase.
|
||||
*/
|
||||
set_pageblock_migratetype(page, ac->migratetype);
|
||||
move_freepages_block(zone, page, ac->migratetype);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
return;
|
||||
ret = move_freepages_block(zone, page, ac->migratetype);
|
||||
if (ret) {
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Remove an element from the buddy allocator from the fallback list */
|
||||
@@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
||||
|
||||
page = list_first_entry(&area->free_list[fallback_mt],
|
||||
struct page, lru);
|
||||
if (can_steal)
|
||||
if (can_steal &&
|
||||
get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
|
||||
steal_suitable_fallback(zone, page, start_migratetype);
|
||||
|
||||
/* Remove the page from the freelists */
|
||||
@@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
||||
unsigned long count, struct list_head *list,
|
||||
int migratetype, bool cold)
|
||||
{
|
||||
int i;
|
||||
int i, alloced = 0;
|
||||
|
||||
spin_lock(&zone->lock);
|
||||
for (i = 0; i < count; ++i) {
|
||||
@@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
||||
else
|
||||
list_add_tail(&page->lru, list);
|
||||
list = &page->lru;
|
||||
alloced++;
|
||||
if (is_migrate_cma(get_pcppage_migratetype(page)))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
|
||||
-(1 << order));
|
||||
}
|
||||
|
||||
/*
|
||||
* i pages were removed from the buddy list even if some leak due
|
||||
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
|
||||
* on i. Do not confuse with 'alloced' which is the number of
|
||||
* pages added to the pcp list.
|
||||
*/
|
||||
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
|
||||
spin_unlock(&zone->lock);
|
||||
return i;
|
||||
return alloced;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
@@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
|
||||
struct page *endpage = page + (1 << order) - 1;
|
||||
for (; page < endpage; page += pageblock_nr_pages) {
|
||||
int mt = get_pageblock_migratetype(page);
|
||||
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
|
||||
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
|
||||
&& mt != MIGRATE_HIGHATOMIC)
|
||||
set_pageblock_migratetype(page,
|
||||
MIGRATE_MOVABLE);
|
||||
}
|
||||
@@ -3305,7 +3340,7 @@ retry:
|
||||
* Shrink them them and try again
|
||||
*/
|
||||
if (!page && !drained) {
|
||||
unreserve_highatomic_pageblock(ac);
|
||||
unreserve_highatomic_pageblock(ac, false);
|
||||
drain_all_pages(NULL);
|
||||
drained = true;
|
||||
goto retry;
|
||||
@@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
|
||||
* Make sure we converge to OOM if we cannot make any progress
|
||||
* several times in the row.
|
||||
*/
|
||||
if (*no_progress_loops > MAX_RECLAIM_RETRIES)
|
||||
return false;
|
||||
if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
|
||||
/* Before OOM, exhaust highatomic_reserve */
|
||||
return unreserve_highatomic_pageblock(ac, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Keep reclaiming pages while there is a chance this will lead
|
||||
|
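unreserve_highatomic_pageblock() above gains a force flag and reports whether it actually released anything, and should_reclaim_retry() uses that as a final attempt before declaring OOM. The retry shape as a standalone sketch; release_reserve() and should_retry() are invented stand-ins.

#include <stdio.h>
#include <stdbool.h>

#define MAX_RETRIES 16

/* Report whether any reserve was actually released, so the caller
 * knows whether one more retry can make progress. */
static bool release_reserve(bool force, int *reserve)
{
	if (!force && *reserve <= 1)	/* normally keep at least one block */
		return false;
	if (*reserve == 0)
		return false;
	(*reserve)--;
	return true;
}

static bool should_retry(int loops, int *reserve)
{
	if (loops > MAX_RETRIES)
		/* Before giving up entirely, exhaust the reserve. */
		return release_reserve(true, reserve);
	return true;
}

int main(void)
{
	int reserve = 1;

	printf("%d\n", should_retry(MAX_RETRIES + 1, &reserve));	/* 1: drained */
	printf("%d\n", should_retry(MAX_RETRIES + 1, &reserve));	/* 0: go OOM */
	return 0;
}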
 mm/percpu.c | 16
@@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
size_t pages_size;
|
||||
struct page **pages;
|
||||
int unit, i, j, rc;
|
||||
int upa;
|
||||
int nr_g0_units;
|
||||
|
||||
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
|
||||
|
||||
@@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
if (IS_ERR(ai))
|
||||
return PTR_ERR(ai);
|
||||
BUG_ON(ai->nr_groups != 1);
|
||||
BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
|
||||
upa = ai->alloc_size/ai->unit_size;
|
||||
nr_g0_units = roundup(num_possible_cpus(), upa);
|
||||
if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
|
||||
pcpu_free_alloc_info(ai);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
unit_pages = ai->unit_size >> PAGE_SHIFT;
|
||||
|
||||
@@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
|
||||
/* allocate pages */
|
||||
j = 0;
|
||||
for (unit = 0; unit < num_possible_cpus(); unit++)
|
||||
for (unit = 0; unit < num_possible_cpus(); unit++) {
|
||||
unsigned int cpu = ai->groups[0].cpu_map[unit];
|
||||
for (i = 0; i < unit_pages; i++) {
|
||||
unsigned int cpu = ai->groups[0].cpu_map[unit];
|
||||
void *ptr;
|
||||
|
||||
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
|
||||
if (!ptr) {
|
||||
pr_warn("failed to allocate %s page for cpu%u\n",
|
||||
psize_str, cpu);
|
||||
psize_str, cpu);
|
||||
goto enomem;
|
||||
}
|
||||
/* kmemleak tracks the percpu allocations separately */
|
||||
kmemleak_free(ptr);
|
||||
pages[j++] = virt_to_page(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* allocate vm area, map the pages and copy static data */
|
||||
vm.flags = VM_ALLOC;
|
||||
|
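The percpu hunk above replaces a BUG_ON with an explicit check that the first group carries num_possible_cpus() rounded up to whole allocation units (upa = alloc_size / unit_size). The roundup arithmetic by itself, with hypothetical numbers:

#include <stdio.h>

/* Round n up to the next multiple of step, as the kernel's roundup() does. */
static unsigned int roundup_to(unsigned int n, unsigned int step)
{
	return ((n + step - 1) / step) * step;
}

int main(void)
{
	unsigned int possible_cpus = 6;	/* hypothetical machine */
	unsigned int upa = 4;		/* units per allocation */

	/* 6 CPUs packed 4 units per allocation -> the group carries 8 units. */
	printf("nr_g0_units = %u\n", roundup_to(possible_cpus, upa));
	return 0;
}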
@@ -207,12 +207,21 @@ out:
|
||||
* memory at once.
|
||||
*/
|
||||
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
||||
pgoff_t offset, unsigned long nr_to_read)
|
||||
pgoff_t offset, unsigned long nr_to_read)
|
||||
{
|
||||
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
|
||||
struct file_ra_state *ra = &filp->f_ra;
|
||||
unsigned long max_pages;
|
||||
|
||||
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
|
||||
return -EINVAL;
|
||||
|
||||
nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
|
||||
/*
|
||||
* If the request exceeds the readahead window, allow the read to
|
||||
* be up to the optimal hardware IO size
|
||||
*/
|
||||
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
|
||||
nr_to_read = min(nr_to_read, max_pages);
|
||||
while (nr_to_read) {
|
||||
int err;
|
||||
|
||||
@@ -369,9 +378,17 @@ ondemand_readahead(struct address_space *mapping,
|
||||
bool hit_readahead_marker, pgoff_t offset,
|
||||
unsigned long req_size)
|
||||
{
|
||||
unsigned long max = ra->ra_pages;
|
||||
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
|
||||
unsigned long max_pages = ra->ra_pages;
|
||||
pgoff_t prev_offset;
|
||||
|
||||
/*
|
||||
* If the request exceeds the readahead window, allow the read to
|
||||
* be up to the optimal hardware IO size
|
||||
*/
|
||||
if (req_size > max_pages && bdi->io_pages > max_pages)
|
||||
max_pages = min(req_size, bdi->io_pages);
|
||||
|
||||
/*
|
||||
* start of file
|
||||
*/
|
||||
@@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
|
||||
if ((offset == (ra->start + ra->size - ra->async_size) ||
|
||||
offset == (ra->start + ra->size))) {
|
||||
ra->start += ra->size;
|
||||
ra->size = get_next_ra_size(ra, max);
|
||||
ra->size = get_next_ra_size(ra, max_pages);
|
||||
ra->async_size = ra->size;
|
||||
goto readit;
|
||||
}
|
||||
@@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
|
||||
pgoff_t start;
|
||||
|
||||
rcu_read_lock();
|
||||
start = page_cache_next_hole(mapping, offset + 1, max);
|
||||
start = page_cache_next_hole(mapping, offset + 1, max_pages);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!start || start - offset > max)
|
||||
if (!start || start - offset > max_pages)
|
||||
return 0;
|
||||
|
||||
ra->start = start;
|
||||
ra->size = start - offset; /* old async_size */
|
||||
ra->size += req_size;
|
||||
ra->size = get_next_ra_size(ra, max);
|
||||
ra->size = get_next_ra_size(ra, max_pages);
|
||||
ra->async_size = ra->size;
|
||||
goto readit;
|
||||
}
|
||||
@@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
|
||||
/*
|
||||
* oversize read
|
||||
*/
|
||||
if (req_size > max)
|
||||
if (req_size > max_pages)
|
||||
goto initial_readahead;
|
||||
|
||||
/*
|
||||
@@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
|
||||
* Query the page cache and look for the traces(cached history pages)
|
||||
* that a sequential stream would leave behind.
|
||||
*/
|
||||
if (try_context_readahead(mapping, ra, offset, req_size, max))
|
||||
if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
|
||||
goto readit;
|
||||
|
||||
/*
|
||||
@@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
|
||||
|
||||
initial_readahead:
|
||||
ra->start = offset;
|
||||
ra->size = get_init_ra_size(req_size, max);
|
||||
ra->size = get_init_ra_size(req_size, max_pages);
|
||||
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
|
||||
|
||||
readit:
|
||||
@@ -454,7 +471,7 @@ readit:
|
||||
* the resulted next readahead window into the current one.
|
||||
*/
|
||||
if (offset == ra->start && ra->size == ra->async_size) {
|
||||
ra->async_size = get_next_ra_size(ra, max);
|
||||
ra->async_size = get_next_ra_size(ra, max_pages);
|
||||
ra->size += ra->async_size;
|
||||
}
|
||||
|
||||
|
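force_page_cache_readahead() above now clamps against max(bdi->io_pages, ra->ra_pages), and ondemand_readahead() lets a request larger than the default window grow it up to the device's optimal IO size. The ondemand rule in isolation, with illustrative values:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Window sizing as in ondemand_readahead(): default to ra_pages, but a
 * bigger request may use up to the optimal hardware IO size. */
static unsigned long readahead_window(unsigned long req_size,
				      unsigned long ra_pages,
				      unsigned long io_pages)
{
	unsigned long max_pages = ra_pages;

	if (req_size > max_pages && io_pages > max_pages)
		max_pages = min_ul(req_size, io_pages);
	return max_pages;
}

int main(void)
{
	/* 32-page default window, device prefers 256-page IOs. */
	printf("%lu\n", readahead_window(16, 32, 256));		/* small read: 32 */
	printf("%lu\n", readahead_window(512, 32, 256));	/* large read: 256 */
	return 0;
}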
 mm/rmap.c | 73
@@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
/**
|
||||
* anon_vma_prepare - attach an anon_vma to a memory region
|
||||
* __anon_vma_prepare - attach an anon_vma to a memory region
|
||||
* @vma: the memory region in question
|
||||
*
|
||||
* This makes sure the memory mapping described by 'vma' has
|
||||
* an 'anon_vma' attached to it, so that we can associate the
|
||||
* anonymous pages mapped into it with that anon_vma.
|
||||
*
|
||||
* The common case will be that we already have one, but if
|
||||
* The common case will be that we already have one, which
|
||||
* is handled inline by anon_vma_prepare(). But if
|
||||
* not we either need to find an adjacent mapping that we
|
||||
* can re-use the anon_vma from (very common when the only
|
||||
* reason for splitting a vma has been mprotect()), or we
|
||||
@@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
|
||||
*
|
||||
* This must be called with the mmap_sem held for reading.
|
||||
*/
|
||||
int anon_vma_prepare(struct vm_area_struct *vma)
|
||||
int __anon_vma_prepare(struct vm_area_struct *vma)
|
||||
{
|
||||
struct anon_vma *anon_vma = vma->anon_vma;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct anon_vma *anon_vma, *allocated;
|
||||
struct anon_vma_chain *avc;
|
||||
|
||||
might_sleep();
|
||||
if (unlikely(!anon_vma)) {
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct anon_vma *allocated;
|
||||
|
||||
avc = anon_vma_chain_alloc(GFP_KERNEL);
|
||||
if (!avc)
|
||||
goto out_enomem;
|
||||
avc = anon_vma_chain_alloc(GFP_KERNEL);
|
||||
if (!avc)
|
||||
goto out_enomem;
|
||||
|
||||
anon_vma = find_mergeable_anon_vma(vma);
|
||||
allocated = NULL;
|
||||
if (!anon_vma) {
|
||||
anon_vma = anon_vma_alloc();
|
||||
if (unlikely(!anon_vma))
|
||||
goto out_enomem_free_avc;
|
||||
allocated = anon_vma;
|
||||
}
|
||||
|
||||
anon_vma_lock_write(anon_vma);
|
||||
/* page_table_lock to protect against threads */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (likely(!vma->anon_vma)) {
|
||||
vma->anon_vma = anon_vma;
|
||||
anon_vma_chain_link(vma, avc, anon_vma);
|
||||
/* vma reference or self-parent link for new root */
|
||||
anon_vma->degree++;
|
||||
allocated = NULL;
|
||||
avc = NULL;
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
|
||||
if (unlikely(allocated))
|
||||
put_anon_vma(allocated);
|
||||
if (unlikely(avc))
|
||||
anon_vma_chain_free(avc);
|
||||
anon_vma = find_mergeable_anon_vma(vma);
|
||||
allocated = NULL;
|
||||
if (!anon_vma) {
|
||||
anon_vma = anon_vma_alloc();
|
||||
if (unlikely(!anon_vma))
|
||||
goto out_enomem_free_avc;
|
||||
allocated = anon_vma;
|
||||
}
|
||||
|
||||
anon_vma_lock_write(anon_vma);
|
||||
/* page_table_lock to protect against threads */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (likely(!vma->anon_vma)) {
|
||||
vma->anon_vma = anon_vma;
|
||||
anon_vma_chain_link(vma, avc, anon_vma);
|
||||
/* vma reference or self-parent link for new root */
|
||||
anon_vma->degree++;
|
||||
allocated = NULL;
|
||||
avc = NULL;
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
|
||||
if (unlikely(allocated))
|
||||
put_anon_vma(allocated);
|
||||
if (unlikely(avc))
|
||||
anon_vma_chain_free(avc);
|
||||
|
||||
return 0;
|
||||
|
||||
out_enomem_free_avc:
|
||||
|
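With the rename to __anon_vma_prepare(), mm/rmap.c keeps only the slow path; per the updated comment, the common "anon_vma already attached" case is handled inline by an anon_vma_prepare() wrapper, presumably in include/linux/rmap.h (that header hunk is not part of this excerpt). Reconstructed from the comment, the wrapper would look roughly like:

/* Fast path stays inline; only the slow path calls into mm/rmap.c. */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}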
mm/shmem.c (15 changed lines)
@@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages)
|
||||
static int shmem_radix_tree_replace(struct address_space *mapping,
|
||||
pgoff_t index, void *expected, void *replacement)
|
||||
{
|
||||
struct radix_tree_node *node;
|
||||
void **pslot;
|
||||
void *item;
|
||||
|
||||
VM_BUG_ON(!expected);
|
||||
VM_BUG_ON(!replacement);
|
||||
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
|
||||
if (!pslot)
|
||||
item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
|
||||
if (!item)
|
||||
return -ENOENT;
|
||||
item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
|
||||
if (item != expected)
|
||||
return -ENOENT;
|
||||
radix_tree_replace_slot(pslot, replacement);
|
||||
__radix_tree_replace(&mapping->page_tree, node, pslot,
|
||||
replacement, NULL, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
|
||||
|
||||
int shmem_huge __read_mostly;
|
||||
|
||||
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
|
||||
static int shmem_parse_huge(const char *str)
|
||||
{
|
||||
if (!strcmp(str, "never"))
|
||||
@@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge)
|
||||
return "bad_val";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
|
||||
struct shrink_control *sc, unsigned long nr_to_split)
|
||||
@@ -1539,7 +1542,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
|
||||
struct mm_struct *fault_mm, int *fault_type)
|
||||
{
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
struct shmem_inode_info *info;
|
||||
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||
struct shmem_sb_info *sbinfo;
|
||||
struct mm_struct *charge_mm;
|
||||
struct mem_cgroup *memcg;
|
||||
@@ -1589,7 +1592,6 @@ repeat:
|
||||
* Fast cache lookup did not find it:
|
||||
* bring it back from swap or allocate.
|
||||
*/
|
||||
info = SHMEM_I(inode);
|
||||
sbinfo = SHMEM_SB(inode->i_sb);
|
||||
charge_mm = fault_mm ? : current->mm;
|
||||
|
||||
@@ -1837,7 +1839,6 @@ unlock:
|
||||
put_page(page);
|
||||
}
|
||||
if (error == -ENOSPC && !once++) {
|
||||
info = SHMEM_I(inode);
|
||||
spin_lock_irq(&info->lock);
|
||||
shmem_recalc_inode(inode);
|
||||
spin_unlock_irq(&info->lock);
|
||||
|
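Earlier in the mm/shmem.c hunks, shmem_radix_tree_replace() switches to __radix_tree_lookup(), which also hands back the containing radix_tree_node, and then swaps the slot through __radix_tree_replace() so the tree's internal node accounting stays correct; shmem passes NULL for the update callback because shadow-node tracking only applies to regular page cache. A condensed restatement of the new pattern:

/* Condensed sketch of the replacement pattern used above. */
item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
if (!item || item != expected)
	return -ENOENT;
__radix_tree_replace(&mapping->page_tree, node, pslot,
		     replacement, NULL, NULL);	/* no node-update callback */
return 0;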
mm/slab.c (129 changed lines)
@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
|
||||
INIT_LIST_HEAD(&parent->slabs_full);
|
||||
INIT_LIST_HEAD(&parent->slabs_partial);
|
||||
INIT_LIST_HEAD(&parent->slabs_free);
|
||||
parent->total_slabs = 0;
|
||||
parent->free_slabs = 0;
|
||||
parent->shared = NULL;
|
||||
parent->alien = NULL;
|
||||
parent->colour_next = 0;
|
||||
spin_lock_init(&parent->list_lock);
|
||||
parent->free_objects = 0;
|
||||
parent->free_touched = 0;
|
||||
parent->num_slabs = 0;
|
||||
}
|
||||
|
||||
#define MAKE_LIST(cachep, listp, slab, nodeid) \
|
||||
@@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
|
||||
{
|
||||
#if DEBUG
|
||||
struct kmem_cache_node *n;
|
||||
struct page *page;
|
||||
unsigned long flags;
|
||||
int node;
|
||||
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
@@ -1381,32 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
|
||||
cachep->name, cachep->size, cachep->gfporder);
|
||||
|
||||
for_each_kmem_cache_node(cachep, node, n) {
|
||||
unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
|
||||
unsigned long active_slabs = 0, num_slabs = 0;
|
||||
unsigned long num_slabs_partial = 0, num_slabs_free = 0;
|
||||
unsigned long num_slabs_full;
|
||||
unsigned long total_slabs, free_slabs, free_objs;
|
||||
|
||||
spin_lock_irqsave(&n->list_lock, flags);
|
||||
num_slabs = n->num_slabs;
|
||||
list_for_each_entry(page, &n->slabs_partial, lru) {
|
||||
active_objs += page->active;
|
||||
num_slabs_partial++;
|
||||
}
|
||||
list_for_each_entry(page, &n->slabs_free, lru)
|
||||
num_slabs_free++;
|
||||
|
||||
free_objects += n->free_objects;
|
||||
total_slabs = n->total_slabs;
|
||||
free_slabs = n->free_slabs;
|
||||
free_objs = n->free_objects;
|
||||
spin_unlock_irqrestore(&n->list_lock, flags);
|
||||
|
||||
num_objs = num_slabs * cachep->num;
|
||||
active_slabs = num_slabs - num_slabs_free;
|
||||
num_slabs_full = num_slabs -
|
||||
(num_slabs_partial + num_slabs_free);
|
||||
active_objs += (num_slabs_full * cachep->num);
|
||||
|
||||
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
|
||||
node, active_slabs, num_slabs, active_objs, num_objs,
|
||||
free_objects);
|
||||
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
|
||||
node, total_slabs - free_slabs, total_slabs,
|
||||
(total_slabs * cachep->num) - free_objs,
|
||||
total_slabs * cachep->num);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -2318,7 +2304,8 @@ static int drain_freelist(struct kmem_cache *cache,
|
||||
|
||||
page = list_entry(p, struct page, lru);
|
||||
list_del(&page->lru);
|
||||
n->num_slabs--;
|
||||
n->free_slabs--;
|
||||
n->total_slabs--;
|
||||
/*
|
||||
* Safe to drop the lock. The slab is no longer linked
|
||||
* to the cache.
|
||||
@@ -2332,7 +2319,7 @@ out:
|
||||
return nr_freed;
|
||||
}
|
||||
|
||||
int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
|
||||
int __kmem_cache_shrink(struct kmem_cache *cachep)
|
||||
{
|
||||
int ret = 0;
|
||||
int node;
|
||||
@@ -2352,7 +2339,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
|
||||
|
||||
int __kmem_cache_shutdown(struct kmem_cache *cachep)
|
||||
{
|
||||
return __kmem_cache_shrink(cachep, false);
|
||||
return __kmem_cache_shrink(cachep);
|
||||
}
|
||||
|
||||
void __kmem_cache_release(struct kmem_cache *cachep)
|
||||
@@ -2753,12 +2740,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
|
||||
n = get_node(cachep, page_to_nid(page));
|
||||
|
||||
spin_lock(&n->list_lock);
|
||||
if (!page->active)
|
||||
n->total_slabs++;
|
||||
if (!page->active) {
|
||||
list_add_tail(&page->lru, &(n->slabs_free));
|
||||
else
|
||||
n->free_slabs++;
|
||||
} else
|
||||
fixup_slab_list(cachep, n, page, &list);
|
||||
|
||||
n->num_slabs++;
|
||||
STATS_INC_GROWN(cachep);
|
||||
n->free_objects += cachep->num - page->active;
|
||||
spin_unlock(&n->list_lock);
|
||||
@@ -2903,9 +2891,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
|
||||
|
||||
/* Move pfmemalloc slab to the end of list to speed up next search */
|
||||
list_del(&page->lru);
|
||||
if (!page->active)
|
||||
if (!page->active) {
|
||||
list_add_tail(&page->lru, &n->slabs_free);
|
||||
else
|
||||
n->free_slabs++;
|
||||
} else
|
||||
list_add_tail(&page->lru, &n->slabs_partial);
|
||||
|
||||
list_for_each_entry(page, &n->slabs_partial, lru) {
|
||||
@@ -2913,9 +2902,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
|
||||
return page;
|
||||
}
|
||||
|
||||
n->free_touched = 1;
|
||||
list_for_each_entry(page, &n->slabs_free, lru) {
|
||||
if (!PageSlabPfmemalloc(page))
|
||||
if (!PageSlabPfmemalloc(page)) {
|
||||
n->free_slabs--;
|
||||
return page;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@@ -2925,16 +2917,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
page = list_first_entry_or_null(&n->slabs_partial,
|
||||
struct page, lru);
|
||||
assert_spin_locked(&n->list_lock);
|
||||
page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
|
||||
if (!page) {
|
||||
n->free_touched = 1;
|
||||
page = list_first_entry_or_null(&n->slabs_free,
|
||||
struct page, lru);
|
||||
page = list_first_entry_or_null(&n->slabs_free, struct page,
|
||||
lru);
|
||||
if (page)
|
||||
n->free_slabs--;
|
||||
}
|
||||
|
||||
if (sk_memalloc_socks())
|
||||
return get_valid_first_slab(n, page, pfmemalloc);
|
||||
page = get_valid_first_slab(n, page, pfmemalloc);
|
||||
|
||||
return page;
|
||||
}
|
||||
@@ -3434,9 +3428,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
|
||||
STATS_DEC_ACTIVE(cachep);
|
||||
|
||||
/* fixup slab chains */
|
||||
if (page->active == 0)
|
||||
if (page->active == 0) {
|
||||
list_add(&page->lru, &n->slabs_free);
|
||||
else {
|
||||
n->free_slabs++;
|
||||
} else {
|
||||
/* Unconditionally move a slab to the end of the
|
||||
* partial list on free - maximum time for the
|
||||
* other objects to be freed, too.
|
||||
@@ -3450,7 +3445,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
|
||||
|
||||
page = list_last_entry(&n->slabs_free, struct page, lru);
|
||||
list_move(&page->lru, list);
|
||||
n->num_slabs--;
|
||||
n->free_slabs--;
|
||||
n->total_slabs--;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4102,64 +4098,33 @@ out:
|
||||
#ifdef CONFIG_SLABINFO
|
||||
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned long active_objs;
|
||||
unsigned long num_objs;
|
||||
unsigned long active_slabs = 0;
|
||||
unsigned long num_slabs, free_objects = 0, shared_avail = 0;
|
||||
unsigned long num_slabs_partial = 0, num_slabs_free = 0;
|
||||
unsigned long num_slabs_full = 0;
|
||||
const char *name;
|
||||
char *error = NULL;
|
||||
unsigned long active_objs, num_objs, active_slabs;
|
||||
unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
|
||||
unsigned long free_slabs = 0;
|
||||
int node;
|
||||
struct kmem_cache_node *n;
|
||||
|
||||
active_objs = 0;
|
||||
num_slabs = 0;
|
||||
for_each_kmem_cache_node(cachep, node, n) {
|
||||
|
||||
check_irq_on();
|
||||
spin_lock_irq(&n->list_lock);
|
||||
|
||||
num_slabs += n->num_slabs;
|
||||
total_slabs += n->total_slabs;
|
||||
free_slabs += n->free_slabs;
|
||||
free_objs += n->free_objects;
|
||||
|
||||
list_for_each_entry(page, &n->slabs_partial, lru) {
|
||||
if (page->active == cachep->num && !error)
|
||||
error = "slabs_partial accounting error";
|
||||
if (!page->active && !error)
|
||||
error = "slabs_partial accounting error";
|
||||
active_objs += page->active;
|
||||
num_slabs_partial++;
|
||||
}
|
||||
|
||||
list_for_each_entry(page, &n->slabs_free, lru) {
|
||||
if (page->active && !error)
|
||||
error = "slabs_free accounting error";
|
||||
num_slabs_free++;
|
||||
}
|
||||
|
||||
free_objects += n->free_objects;
|
||||
if (n->shared)
|
||||
shared_avail += n->shared->avail;
|
||||
|
||||
spin_unlock_irq(&n->list_lock);
|
||||
}
|
||||
num_objs = num_slabs * cachep->num;
|
||||
active_slabs = num_slabs - num_slabs_free;
|
||||
num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
|
||||
active_objs += (num_slabs_full * cachep->num);
|
||||
|
||||
if (num_objs - active_objs != free_objects && !error)
|
||||
error = "free_objects accounting error";
|
||||
|
||||
name = cachep->name;
|
||||
if (error)
|
||||
pr_err("slab: cache %s error: %s\n", name, error);
|
||||
num_objs = total_slabs * cachep->num;
|
||||
active_slabs = total_slabs - free_slabs;
|
||||
active_objs = num_objs - free_objs;
|
||||
|
||||
sinfo->active_objs = active_objs;
|
||||
sinfo->num_objs = num_objs;
|
||||
sinfo->active_slabs = active_slabs;
|
||||
sinfo->num_slabs = num_slabs;
|
||||
sinfo->num_slabs = total_slabs;
|
||||
sinfo->shared_avail = shared_avail;
|
||||
sinfo->limit = cachep->limit;
|
||||
sinfo->batchcount = cachep->batchcount;
|
||||
|
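With num_slabs replaced by total_slabs plus a dedicated free_slabs counter, neither the OOM report nor /proc/slabinfo has to walk the partial and free lists to count pages any more; everything is derived from the two counters and the free object count, as in the hunks above:

/* Derivation now shared by slab_out_of_memory() and get_slabinfo(). */
active_slabs = total_slabs - free_slabs;
num_objs     = total_slabs * cachep->num;
active_objs  = num_objs - free_objs;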
mm/slab.h (20 changed lines)
@@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
|
||||
#define SLAB_CACHE_FLAGS (0)
|
||||
#endif
|
||||
|
||||
/* Common flags available with current configuration */
|
||||
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
|
||||
|
||||
/* Common flags permitted for kmem_cache_create */
|
||||
#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
|
||||
SLAB_RED_ZONE | \
|
||||
SLAB_POISON | \
|
||||
SLAB_STORE_USER | \
|
||||
SLAB_TRACE | \
|
||||
SLAB_CONSISTENCY_CHECKS | \
|
||||
SLAB_MEM_SPREAD | \
|
||||
SLAB_NOLEAKTRACE | \
|
||||
SLAB_RECLAIM_ACCOUNT | \
|
||||
SLAB_TEMPORARY | \
|
||||
SLAB_NOTRACK | \
|
||||
SLAB_ACCOUNT)
|
||||
|
||||
int __kmem_cache_shutdown(struct kmem_cache *);
|
||||
void __kmem_cache_release(struct kmem_cache *);
|
||||
int __kmem_cache_shrink(struct kmem_cache *, bool);
|
||||
int __kmem_cache_shrink(struct kmem_cache *);
|
||||
void slab_kmem_cache_release(struct kmem_cache *);
|
||||
|
||||
struct seq_file;
|
||||
@@ -432,7 +447,8 @@ struct kmem_cache_node {
|
||||
struct list_head slabs_partial; /* partial list first, better asm code */
|
||||
struct list_head slabs_full;
|
||||
struct list_head slabs_free;
|
||||
unsigned long num_slabs;
|
||||
unsigned long total_slabs; /* length of all slab lists */
|
||||
unsigned long free_slabs; /* length of free slab list only */
|
||||
unsigned long free_objects;
|
||||
unsigned int free_limit;
|
||||
unsigned int colour_next; /* Per-node cache coloring */
|
||||
|
@@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/* Refuse requests with allocator specific flags */
|
||||
if (flags & ~SLAB_FLAGS_PERMITTED) {
|
||||
err = -EINVAL;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* Some allocators will constraint the set of valid flags to a subset
|
||||
* of all flags. We expect them to define CACHE_CREATE_MASK in this
|
||||
@@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
|
||||
get_online_cpus();
|
||||
get_online_mems();
|
||||
|
||||
#ifdef CONFIG_SLUB
|
||||
/*
|
||||
* In case of SLUB, we need to disable empty slab caching to
|
||||
* avoid pinning the offline memory cgroup by freeable kmem
|
||||
* pages charged to it. SLAB doesn't need this, as it
|
||||
* periodically purges unused slabs.
|
||||
*/
|
||||
mutex_lock(&slab_mutex);
|
||||
list_for_each_entry(s, &slab_caches, list) {
|
||||
c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
|
||||
if (c) {
|
||||
c->cpu_partial = 0;
|
||||
c->min_partial = 0;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&slab_mutex);
|
||||
/*
|
||||
* kmem_cache->cpu_partial is checked locklessly (see
|
||||
* put_cpu_partial()). Make sure the change is visible.
|
||||
*/
|
||||
synchronize_sched();
|
||||
#endif
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
list_for_each_entry(s, &slab_caches, list) {
|
||||
if (!is_root_cache(s))
|
||||
@@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
|
||||
if (!c)
|
||||
continue;
|
||||
|
||||
__kmem_cache_shrink(c, true);
|
||||
__kmem_cache_shrink(c);
|
||||
arr->entries[idx] = NULL;
|
||||
}
|
||||
mutex_unlock(&slab_mutex);
|
||||
@@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
|
||||
get_online_cpus();
|
||||
get_online_mems();
|
||||
kasan_cache_shrink(cachep);
|
||||
ret = __kmem_cache_shrink(cachep, false);
|
||||
ret = __kmem_cache_shrink(cachep);
|
||||
put_online_mems();
|
||||
put_online_cpus();
|
||||
return ret;
|
||||
|
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
|
||||
{
|
||||
}
|
||||
|
||||
int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
|
||||
int __kmem_cache_shrink(struct kmem_cache *d)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
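The mm/slab.h, mm/slab_common.c and SLOB hunks above, and the mm/slub.c hunks below, all follow from one interface change: the deactivate argument is dropped from __kmem_cache_shrink(), and the SLUB-only "disable empty-slab caching" step now happens directly in memcg_deactivate_kmem_caches() under slab_mutex. The prototype before and after (parameter names added here for readability):

/* old */
int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate);

/* new: plain shrink; cgroup deactivation is the caller's job */
int __kmem_cache_shrink(struct kmem_cache *s);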
mm/slub.c (21 changed lines)
@@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
|
||||
struct detached_freelist df;
|
||||
|
||||
size = build_detached_freelist(s, size, p, &df);
|
||||
if (unlikely(!df.page))
|
||||
if (!df.page)
|
||||
continue;
|
||||
|
||||
slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
|
||||
@@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree);
|
||||
* being allocated from last increasing the chance that the last objects
|
||||
* are freed in them.
|
||||
*/
|
||||
int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
|
||||
int __kmem_cache_shrink(struct kmem_cache *s)
|
||||
{
|
||||
int node;
|
||||
int i;
|
||||
@@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
|
||||
if (deactivate) {
|
||||
/*
|
||||
* Disable empty slabs caching. Used to avoid pinning offline
|
||||
* memory cgroups by kmem pages that can be freed.
|
||||
*/
|
||||
s->cpu_partial = 0;
|
||||
s->min_partial = 0;
|
||||
|
||||
/*
|
||||
* s->cpu_partial is checked locklessly (see put_cpu_partial),
|
||||
* so we have to make sure the change is visible.
|
||||
*/
|
||||
synchronize_sched();
|
||||
}
|
||||
|
||||
flush_all(s);
|
||||
for_each_kmem_cache_node(s, node, n) {
|
||||
INIT_LIST_HEAD(&discard);
|
||||
@@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg)
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
list_for_each_entry(s, &slab_caches, list)
|
||||
__kmem_cache_shrink(s, false);
|
||||
__kmem_cache_shrink(s);
|
||||
mutex_unlock(&slab_mutex);
|
||||
|
||||
return 0;
|
||||
|
@@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
cond_resched();
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||||
continue;
|
||||
@@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm,
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
|
||||
break;
|
||||
cond_resched();
|
||||
}
|
||||
up_read(&mm->mmap_sem);
|
||||
return (ret < 0)? ret: 0;
|
||||
@@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
|
||||
prev = 0;
|
||||
i = 1;
|
||||
}
|
||||
if (frontswap) {
|
||||
if (frontswap_test(si, i))
|
||||
break;
|
||||
else
|
||||
continue;
|
||||
}
|
||||
count = READ_ONCE(si->swap_map[i]);
|
||||
if (count && swap_count(count) != SWAP_MAP_BAD)
|
||||
break;
|
||||
if (!frontswap || frontswap_test(si, i))
|
||||
break;
|
||||
if ((i % LATENCY_LIMIT) == 0)
|
||||
cond_resched();
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
@@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping,
|
||||
* without the tree itself locked. These unlocked entries
|
||||
* need verification under the tree lock.
|
||||
*/
|
||||
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
|
||||
&slot))
|
||||
if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
|
||||
goto unlock;
|
||||
if (*slot != entry)
|
||||
goto unlock;
|
||||
radix_tree_replace_slot(slot, NULL);
|
||||
__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
|
||||
workingset_update_node, mapping);
|
||||
mapping->nrexceptional--;
|
||||
if (!node)
|
||||
goto unlock;
|
||||
workingset_node_shadows_dec(node);
|
||||
/*
|
||||
* Don't track node without shadow entries.
|
||||
*
|
||||
* Avoid acquiring the list_lru lock if already untracked.
|
||||
* The list_empty() test is safe as node->private_list is
|
||||
* protected by mapping->tree_lock.
|
||||
*/
|
||||
if (!workingset_node_shadows(node) &&
|
||||
!list_empty(&node->private_list))
|
||||
list_lru_del(&workingset_shadow_nodes,
|
||||
&node->private_list);
|
||||
__radix_tree_delete_node(&mapping->page_tree, node);
|
||||
unlock:
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
}
|
||||
|
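The clear_exceptional_entry() hunk above (mm/truncate.c) drops its hand-rolled shadow-node bookkeeping: clearing the slot through __radix_tree_replace() with the workingset_update_node callback (added in the mm/workingset.c hunks further down) lets the radix tree core keep the shadow-node LRU up to date as a side effect. Condensed:

/* Condensed sketch: shadow accounting now rides on the replace call. */
__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
		     workingset_update_node, mapping);
mapping->nrexceptional--;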
mm/vmalloc.c (198 changed lines)
@@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
|
||||
BUG_ON(offset_in_page(size));
|
||||
BUG_ON(!is_power_of_2(align));
|
||||
|
||||
might_sleep_if(gfpflags_allow_blocking(gfp_mask));
|
||||
might_sleep();
|
||||
|
||||
va = kmalloc_node(sizeof(struct vmap_area),
|
||||
gfp_mask & GFP_RECLAIM_MASK, node);
|
||||
@@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void)
|
||||
|
||||
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
|
||||
|
||||
/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);
|
||||
|
||||
/* for per-CPU blocks */
|
||||
static void purge_fragmented_blocks_allcpus(void);
|
||||
|
||||
@@ -615,59 +622,40 @@ void set_iounmap_nonlazy(void)
|
||||
|
||||
/*
|
||||
* Purges all lazily-freed vmap areas.
|
||||
*
|
||||
* If sync is 0 then don't purge if there is already a purge in progress.
|
||||
* If force_flush is 1, then flush kernel TLBs between *start and *end even
|
||||
* if we found no lazy vmap areas to unmap (callers can use this to optimise
|
||||
* their own TLB flushing).
|
||||
* Returns with *start = min(*start, lowest purged address)
|
||||
* *end = max(*end, highest purged address)
|
||||
*/
|
||||
static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
|
||||
int sync, int force_flush)
|
||||
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
|
||||
{
|
||||
static DEFINE_SPINLOCK(purge_lock);
|
||||
struct llist_node *valist;
|
||||
struct vmap_area *va;
|
||||
struct vmap_area *n_va;
|
||||
int nr = 0;
|
||||
bool do_free = false;
|
||||
|
||||
/*
|
||||
* If sync is 0 but force_flush is 1, we'll go sync anyway but callers
|
||||
* should not expect such behaviour. This just simplifies locking for
|
||||
* the case that isn't actually used at the moment anyway.
|
||||
*/
|
||||
if (!sync && !force_flush) {
|
||||
if (!spin_trylock(&purge_lock))
|
||||
return;
|
||||
} else
|
||||
spin_lock(&purge_lock);
|
||||
|
||||
if (sync)
|
||||
purge_fragmented_blocks_allcpus();
|
||||
lockdep_assert_held(&vmap_purge_lock);
|
||||
|
||||
valist = llist_del_all(&vmap_purge_list);
|
||||
llist_for_each_entry(va, valist, purge_list) {
|
||||
if (va->va_start < *start)
|
||||
*start = va->va_start;
|
||||
if (va->va_end > *end)
|
||||
*end = va->va_end;
|
||||
nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
|
||||
if (va->va_start < start)
|
||||
start = va->va_start;
|
||||
if (va->va_end > end)
|
||||
end = va->va_end;
|
||||
do_free = true;
|
||||
}
|
||||
|
||||
if (nr)
|
||||
if (!do_free)
|
||||
return false;
|
||||
|
||||
flush_tlb_kernel_range(start, end);
|
||||
|
||||
spin_lock(&vmap_area_lock);
|
||||
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
|
||||
int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
|
||||
|
||||
__free_vmap_area(va);
|
||||
atomic_sub(nr, &vmap_lazy_nr);
|
||||
|
||||
if (nr || force_flush)
|
||||
flush_tlb_kernel_range(*start, *end);
|
||||
|
||||
if (nr) {
|
||||
spin_lock(&vmap_area_lock);
|
||||
llist_for_each_entry_safe(va, n_va, valist, purge_list)
|
||||
__free_vmap_area(va);
|
||||
spin_unlock(&vmap_area_lock);
|
||||
cond_resched_lock(&vmap_area_lock);
|
||||
}
|
||||
spin_unlock(&purge_lock);
|
||||
spin_unlock(&vmap_area_lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -676,9 +664,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
|
||||
*/
|
||||
static void try_purge_vmap_area_lazy(void)
|
||||
{
|
||||
unsigned long start = ULONG_MAX, end = 0;
|
||||
|
||||
__purge_vmap_area_lazy(&start, &end, 0, 0);
|
||||
if (mutex_trylock(&vmap_purge_lock)) {
|
||||
__purge_vmap_area_lazy(ULONG_MAX, 0);
|
||||
mutex_unlock(&vmap_purge_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -686,9 +675,10 @@ static void try_purge_vmap_area_lazy(void)
|
||||
*/
|
||||
static void purge_vmap_area_lazy(void)
|
||||
{
|
||||
unsigned long start = ULONG_MAX, end = 0;
|
||||
|
||||
__purge_vmap_area_lazy(&start, &end, 1, 0);
|
||||
mutex_lock(&vmap_purge_lock);
|
||||
purge_fragmented_blocks_allcpus();
|
||||
__purge_vmap_area_lazy(ULONG_MAX, 0);
|
||||
mutex_unlock(&vmap_purge_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -710,23 +700,14 @@ static void free_vmap_area_noflush(struct vmap_area *va)
|
||||
try_purge_vmap_area_lazy();
|
||||
}
|
||||
|
||||
/*
|
||||
* Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
|
||||
* called for the correct range previously.
|
||||
*/
|
||||
static void free_unmap_vmap_area_noflush(struct vmap_area *va)
|
||||
{
|
||||
unmap_vmap_area(va);
|
||||
free_vmap_area_noflush(va);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free and unmap a vmap area
|
||||
*/
|
||||
static void free_unmap_vmap_area(struct vmap_area *va)
|
||||
{
|
||||
flush_cache_vunmap(va->va_start, va->va_end);
|
||||
free_unmap_vmap_area_noflush(va);
|
||||
unmap_vmap_area(va);
|
||||
free_vmap_area_noflush(va);
|
||||
}
|
||||
|
||||
static struct vmap_area *find_vmap_area(unsigned long addr)
|
||||
@@ -740,16 +721,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
|
||||
return va;
|
||||
}
|
||||
|
||||
static void free_unmap_vmap_area_addr(unsigned long addr)
|
||||
{
|
||||
struct vmap_area *va;
|
||||
|
||||
va = find_vmap_area(addr);
|
||||
BUG_ON(!va);
|
||||
free_unmap_vmap_area(va);
|
||||
}
|
||||
|
||||
|
||||
/*** Per cpu kva allocator ***/
|
||||
|
||||
/*
|
||||
@@ -1070,6 +1041,8 @@ void vm_unmap_aliases(void)
|
||||
if (unlikely(!vmap_initialized))
|
||||
return;
|
||||
|
||||
might_sleep();
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
|
||||
struct vmap_block *vb;
|
||||
@@ -1094,7 +1067,11 @@ void vm_unmap_aliases(void)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
__purge_vmap_area_lazy(&start, &end, 1, flush);
|
||||
mutex_lock(&vmap_purge_lock);
|
||||
purge_fragmented_blocks_allcpus();
|
||||
if (!__purge_vmap_area_lazy(start, end) && flush)
|
||||
flush_tlb_kernel_range(start, end);
|
||||
mutex_unlock(&vmap_purge_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
|
||||
|
||||
@@ -1107,7 +1084,9 @@ void vm_unmap_ram(const void *mem, unsigned int count)
|
||||
{
|
||||
unsigned long size = (unsigned long)count << PAGE_SHIFT;
|
||||
unsigned long addr = (unsigned long)mem;
|
||||
struct vmap_area *va;
|
||||
|
||||
might_sleep();
|
||||
BUG_ON(!addr);
|
||||
BUG_ON(addr < VMALLOC_START);
|
||||
BUG_ON(addr > VMALLOC_END);
|
||||
@@ -1116,10 +1095,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
|
||||
debug_check_no_locks_freed(mem, size);
|
||||
vmap_debug_free_range(addr, addr+size);
|
||||
|
||||
if (likely(count <= VMAP_MAX_ALLOC))
|
||||
if (likely(count <= VMAP_MAX_ALLOC)) {
|
||||
vb_free(mem, size);
|
||||
else
|
||||
free_unmap_vmap_area_addr(addr);
|
||||
return;
|
||||
}
|
||||
|
||||
va = find_vmap_area(addr);
|
||||
BUG_ON(!va);
|
||||
free_unmap_vmap_area(va);
|
||||
}
|
||||
EXPORT_SYMBOL(vm_unmap_ram);
|
||||
|
||||
@@ -1455,6 +1438,8 @@ struct vm_struct *remove_vm_area(const void *addr)
|
||||
{
|
||||
struct vmap_area *va;
|
||||
|
||||
might_sleep();
|
||||
|
||||
va = find_vmap_area((unsigned long)addr);
|
||||
if (va && va->flags & VM_VM_AREA) {
|
||||
struct vm_struct *vm = va->vm;
|
||||
@@ -1510,7 +1495,39 @@ static void __vunmap(const void *addr, int deallocate_pages)
|
||||
kfree(area);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
static inline void __vfree_deferred(const void *addr)
|
||||
{
|
||||
/*
 * Use raw_cpu_ptr() because this can be called from preemptible
 * context. Preemption is absolutely fine here, because the llist_add()
 * implementation is lockless, so it works even if we are adding to
 * another cpu's list. schedule_work() should be fine with this too.
 */
|
||||
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
|
||||
|
||||
if (llist_add((struct llist_node *)addr, &p->list))
|
||||
schedule_work(&p->wq);
|
||||
}
|
||||
|
||||
/**
|
||||
* vfree_atomic - release memory allocated by vmalloc()
|
||||
* @addr: memory base address
|
||||
*
|
||||
* This one is just like vfree() but can be called in any atomic context
|
||||
* except NMIs.
|
||||
*/
|
||||
void vfree_atomic(const void *addr)
|
||||
{
|
||||
BUG_ON(in_nmi());
|
||||
|
||||
kmemleak_free(addr);
|
||||
|
||||
if (!addr)
|
||||
return;
|
||||
__vfree_deferred(addr);
|
||||
}
|
||||
|
||||
/**
|
||||
* vfree - release memory allocated by vmalloc()
|
||||
* @addr: memory base address
|
||||
@@ -1533,11 +1550,9 @@ void vfree(const void *addr)
|
||||
|
||||
if (!addr)
|
||||
return;
|
||||
if (unlikely(in_interrupt())) {
|
||||
struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
|
||||
if (llist_add((struct llist_node *)addr, &p->list))
|
||||
schedule_work(&p->wq);
|
||||
} else
|
||||
if (unlikely(in_interrupt()))
|
||||
__vfree_deferred(addr);
|
||||
else
|
||||
__vunmap(addr, 1);
|
||||
}
|
||||
EXPORT_SYMBOL(vfree);
|
||||
@@ -2574,32 +2589,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
|
||||
static void *s_start(struct seq_file *m, loff_t *pos)
|
||||
__acquires(&vmap_area_lock)
|
||||
{
|
||||
loff_t n = *pos;
|
||||
struct vmap_area *va;
|
||||
|
||||
spin_lock(&vmap_area_lock);
|
||||
va = list_first_entry(&vmap_area_list, typeof(*va), list);
|
||||
while (n > 0 && &va->list != &vmap_area_list) {
|
||||
n--;
|
||||
va = list_next_entry(va, list);
|
||||
}
|
||||
if (!n && &va->list != &vmap_area_list)
|
||||
return va;
|
||||
|
||||
return NULL;
|
||||
|
||||
return seq_list_start(&vmap_area_list, *pos);
|
||||
}
|
||||
|
||||
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
struct vmap_area *va = p, *next;
|
||||
|
||||
++*pos;
|
||||
next = list_next_entry(va, list);
|
||||
if (&next->list != &vmap_area_list)
|
||||
return next;
|
||||
|
||||
return NULL;
|
||||
return seq_list_next(p, &vmap_area_list, pos);
|
||||
}
|
||||
|
||||
static void s_stop(struct seq_file *m, void *p)
|
||||
@@ -2634,9 +2630,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
|
||||
|
||||
static int s_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct vmap_area *va = p;
|
||||
struct vmap_area *va;
|
||||
struct vm_struct *v;
|
||||
|
||||
va = list_entry(p, struct vmap_area, list);
|
||||
|
||||
/*
|
||||
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
|
||||
* behalf of vmap area is being tear down or vm_map_ram allocation.
|
||||
|
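vfree_atomic() is the entry point callers in atomic context are now expected to use, rather than leaning on the in_interrupt() fallback inside vfree(); both funnel into __vfree_deferred(), which queues the address on a per-cpu llist and lets a workqueue do the real unmap. An illustrative usage sketch; dev, buf and flags are made up for the example:

/* Illustrative only: releasing a vmalloc'ed buffer from atomic context. */
spin_lock_irqsave(&dev->lock, flags);
buf = dev->scratch;
dev->scratch = NULL;
if (buf)
	vfree_atomic(buf);	/* deferred free; allowed anywhere except NMI */
spin_unlock_irqrestore(&dev->lock, flags);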
mm/vmscan.c (14 changed lines)
@@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
|
||||
int nid = shrinkctl->nid;
|
||||
long batch_size = shrinker->batch ? shrinker->batch
|
||||
: SHRINK_BATCH;
|
||||
long scanned = 0, next_deferred;
|
||||
|
||||
freeable = shrinker->count_objects(shrinker, shrinkctl);
|
||||
if (freeable == 0)
|
||||
@@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
|
||||
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
|
||||
shrinker->scan_objects, total_scan);
|
||||
total_scan = freeable;
|
||||
}
|
||||
next_deferred = nr;
|
||||
} else
|
||||
next_deferred = total_scan;
|
||||
|
||||
/*
|
||||
* We need to avoid excessive windup on filesystem shrinkers
|
||||
@@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
|
||||
|
||||
count_vm_events(SLABS_SCANNED, nr_to_scan);
|
||||
total_scan -= nr_to_scan;
|
||||
scanned += nr_to_scan;
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
if (next_deferred >= scanned)
|
||||
next_deferred -= scanned;
|
||||
else
|
||||
next_deferred = 0;
|
||||
/*
|
||||
* move the unused scan count back into the shrinker in a
|
||||
* manner that handles concurrent updates. If we exhausted the
|
||||
* scan, there is no need to do an update.
|
||||
*/
|
||||
if (total_scan > 0)
|
||||
new_nr = atomic_long_add_return(total_scan,
|
||||
if (next_deferred > 0)
|
||||
new_nr = atomic_long_add_return(next_deferred,
|
||||
&shrinker->nr_deferred[nid]);
|
||||
else
|
||||
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
|
||||
|
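The do_shrink_slab() hunks above introduce next_deferred so that only work which was genuinely deferred is handed back to shrinker->nr_deferred[]; whatever the shrinker actually scanned is subtracted first, which stops nr_deferred from winding up across calls. The change in what gets added back, condensed:

/* old: the whole remaining total_scan was carried over */
new_nr = atomic_long_add_return(total_scan, &shrinker->nr_deferred[nid]);

/* new: only the unscanned, still-deferred remainder is carried over */
next_deferred = next_deferred >= scanned ? next_deferred - scanned : 0;
if (next_deferred > 0)
	new_nr = atomic_long_add_return(next_deferred,
					&shrinker->nr_deferred[nid]);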
mm/workingset.c (114 changed lines)
@@ -10,6 +10,7 @@
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
@@ -334,48 +335,81 @@ out:
|
||||
* point where they would still be useful.
|
||||
*/
|
||||
|
||||
struct list_lru workingset_shadow_nodes;
|
||||
static struct list_lru shadow_nodes;
|
||||
|
||||
void workingset_update_node(struct radix_tree_node *node, void *private)
|
||||
{
|
||||
struct address_space *mapping = private;
|
||||
|
||||
/* Only regular page cache has shadow entries */
|
||||
if (dax_mapping(mapping) || shmem_mapping(mapping))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Track non-empty nodes that contain only shadow entries;
|
||||
* unlink those that contain pages or are being freed.
|
||||
*
|
||||
* Avoid acquiring the list_lru lock when the nodes are
|
||||
* already where they should be. The list_empty() test is safe
|
||||
* as node->private_list is protected by &mapping->tree_lock.
|
||||
*/
|
||||
if (node->count && node->count == node->exceptional) {
|
||||
if (list_empty(&node->private_list)) {
|
||||
node->private_data = mapping;
|
||||
list_lru_add(&shadow_nodes, &node->private_list);
|
||||
}
|
||||
} else {
|
||||
if (!list_empty(&node->private_list))
|
||||
list_lru_del(&shadow_nodes, &node->private_list);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
unsigned long shadow_nodes;
|
||||
unsigned long max_nodes;
|
||||
unsigned long pages;
|
||||
unsigned long nodes;
|
||||
unsigned long cache;
|
||||
|
||||
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
|
||||
local_irq_disable();
|
||||
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
|
||||
nodes = list_lru_shrink_count(&shadow_nodes, sc);
|
||||
local_irq_enable();
|
||||
|
||||
if (sc->memcg) {
|
||||
pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
|
||||
LRU_ALL_FILE);
|
||||
} else {
|
||||
pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
|
||||
node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Active cache pages are limited to 50% of memory, and shadow
|
||||
* entries that represent a refault distance bigger than that
|
||||
* do not have any effect. Limit the number of shadow nodes
|
||||
* such that shadow entries do not exceed the number of active
|
||||
* cache pages, assuming a worst-case node population density
|
||||
* of 1/8th on average.
|
||||
* Approximate a reasonable limit for the radix tree nodes
|
||||
* containing shadow entries. We don't need to keep more
|
||||
* shadow entries than possible pages on the active list,
|
||||
* since refault distances bigger than that are dismissed.
|
||||
*
|
||||
* The size of the active list converges toward 100% of
|
||||
* overall page cache as memory grows, with only a tiny
|
||||
* inactive list. Assume the total cache size for that.
|
||||
*
|
||||
* Nodes might be sparsely populated, with only one shadow
|
||||
* entry in the extreme case. Obviously, we cannot keep one
|
||||
* node for every eligible shadow entry, so compromise on a
|
||||
* worst-case density of 1/8th. Below that, not all eligible
|
||||
* refaults can be detected anymore.
|
||||
*
|
||||
* On 64-bit with 7 radix_tree_nodes per page and 64 slots
|
||||
* each, this will reclaim shadow entries when they consume
|
||||
* ~2% of available memory:
|
||||
* ~1.8% of available memory:
|
||||
*
|
||||
* PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
|
||||
* PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
|
||||
*/
|
||||
max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
|
||||
if (sc->memcg) {
|
||||
cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
|
||||
LRU_ALL_FILE);
|
||||
} else {
|
||||
cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
|
||||
node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
|
||||
}
|
||||
max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
|
||||
|
||||
if (shadow_nodes <= max_nodes)
|
||||
if (nodes <= max_nodes)
|
||||
return 0;
|
||||
|
||||
return shadow_nodes - max_nodes;
|
||||
return nodes - max_nodes;
|
||||
}
|
||||
|
||||
static enum lru_status shadow_lru_isolate(struct list_head *item,
|
||||
@@ -418,23 +452,30 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
|
||||
* no pages, so we expect to be able to remove them all and
|
||||
* delete and free the empty node afterwards.
|
||||
*/
|
||||
BUG_ON(!workingset_node_shadows(node));
|
||||
BUG_ON(workingset_node_pages(node));
|
||||
|
||||
if (WARN_ON_ONCE(!node->exceptional))
|
||||
goto out_invalid;
|
||||
if (WARN_ON_ONCE(node->count != node->exceptional))
|
||||
goto out_invalid;
|
||||
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
|
||||
if (node->slots[i]) {
|
||||
BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
|
||||
if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
|
||||
goto out_invalid;
|
||||
if (WARN_ON_ONCE(!node->exceptional))
|
||||
goto out_invalid;
|
||||
if (WARN_ON_ONCE(!mapping->nrexceptional))
|
||||
goto out_invalid;
|
||||
node->slots[i] = NULL;
|
||||
workingset_node_shadows_dec(node);
|
||||
BUG_ON(!mapping->nrexceptional);
|
||||
node->exceptional--;
|
||||
node->count--;
|
||||
mapping->nrexceptional--;
|
||||
}
|
||||
}
|
||||
BUG_ON(workingset_node_shadows(node));
|
||||
if (WARN_ON_ONCE(node->exceptional))
|
||||
goto out_invalid;
|
||||
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
|
||||
if (!__radix_tree_delete_node(&mapping->page_tree, node))
|
||||
BUG();
|
||||
__radix_tree_delete_node(&mapping->page_tree, node);
|
||||
|
||||
out_invalid:
|
||||
spin_unlock(&mapping->tree_lock);
|
||||
ret = LRU_REMOVED_RETRY;
|
||||
out:
|
||||
@@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
|
||||
|
||||
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
|
||||
local_irq_disable();
|
||||
ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
|
||||
shadow_lru_isolate, NULL);
|
||||
ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
|
||||
local_irq_enable();
|
||||
return ret;
|
||||
}
|
||||
@@ -492,7 +532,7 @@ static int __init workingset_init(void)
|
||||
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
|
||||
timestamp_bits, max_order, bucket_order);
|
||||
|
||||
ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
|
||||
ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
|
||||
if (ret)
|
||||
goto err;
|
||||
ret = register_shrinker(&workingset_shadow_shrinker);
|
||||
@@ -500,7 +540,7 @@ static int __init workingset_init(void)
|
||||
goto err_list_lru;
|
||||
return 0;
|
||||
err_list_lru:
|
||||
list_lru_destroy(&workingset_shadow_nodes);
|
||||
list_lru_destroy(&shadow_nodes);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
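The new shadow-node limit in count_shadow_nodes(), max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3), matches the "~1.8% of available memory" figure in the updated comment. A worked check, assuming 64-bit and RADIX_TREE_MAP_SHIFT == 6 (64 slots per node), as the comment does:

nodes allowed          = cache_pages >> (6 - 3) = cache_pages / 8
                         (64 slots at the assumed worst-case 1/8th density = 8 entries per node)
memory for those nodes = (cache_pages / 8) * (PAGE_SIZE / 7)     (~7 radix_tree_nodes per page)
                       = cache_pages * PAGE_SIZE / 56  ~= 1.8% of the cache being tracked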