
The rmap locks (i_mmap_rwsem and anon_vma->root->rwsem) can be contended under memory pressure if processes keep working on their vmas (e.g., fork, mmap, munmap). That makes the reclaim path get stuck. In our real workload traces, we see kswapd waiting on the lock for 300ms+ (worst case, a second), which makes other processes enter direct reclaim, where they also get stuck on the lock. This patch makes the LRU aging path use try_lock mode, like shrink_page_list, so the reclaim context keeps working on the next LRU pages without getting stuck. If it finds the rmap lock contended, it rotates the page back to the head of the LRU in both the active and inactive LRUs to make their behavior consistent, which is a basic starting point rather than adding more heuristics. Since this patch introduces a new "contended" field as an out-param along with the try_lock in-param in rmap_walk_control, the structure is no longer immutable when try_lock is set, so remove the const keywords on rmap related functions. Since rmap walking is already an expensive operation, I doubt the const would provide a sizable benefit (and we didn't have it until 5.17). In a heavy app workload on Android, the trace shows the following statistics. It almost removes rmap lock contention from the reclaim path.
Martin Liu reported:

Before:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count  blocked_function
          1632            0            1631   151.542173        31672    209  page_lock_anon_vma_read
           601            0             601   145.544681        28817    198  rmap_walk_file

After:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count  blocked_function
           NaN          NaN             NaN          NaN          NaN    0.0  NaN
             0            0               0     0.127645            1     12  rmap_walk_file

[minchan@kernel.org: add comment, per Matthew]
Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com
Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: John Dias <joaodias@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Martin Liu <liumartin@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Conflicts: folio->page

(cherry picked from commit 6d4675e601357834dadd2ba1d803f6484596015c)
Bug: 239681156
Bug: 252333201
Signed-off-by: Minchan Kim <minchan@google.com>
Change-Id: I0c63e0291120c8a1b5f2d83b8a7b210cb56c27a2
Signed-off-by: chenxin <chenxinxin@xiaomi.corp-partner.google.com>
333 lines
9.8 KiB
C
333 lines
9.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_RMAP_H
|
|
#define _LINUX_RMAP_H
|
|
/*
|
|
* Declarations for Reverse Mapping functions in mm/rmap.c
|
|
*/
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/highmem.h>
|
|
#ifndef __GENKSYMS__
|
|
#define PROTECT_TRACE_INCLUDE_PATH
|
|
#include <trace/hooks/mm.h>
|
|
#endif
|
|
|
|
/*
|
|
* The anon_vma heads a list of private "related" vmas, to scan if
|
|
* an anonymous page pointing to this anon_vma needs to be unmapped:
|
|
* the vmas on the list will be related by forking, or by splitting.
|
|
*
|
|
* Since vmas come and go as they are split and merged (particularly
|
|
* in mprotect), the mapping field of an anonymous page cannot point
|
|
* directly to a vma: instead it points to an anon_vma, on whose list
|
|
* the related vmas can be easily linked or unlinked.
|
|
*
|
|
* After unlinking the last vma on the list, we must garbage collect
|
|
* the anon_vma object itself: we're guaranteed no page can be
|
|
* pointing to this anon_vma once its vma list is empty.
|
|
*/
|
|
struct anon_vma {
|
|
struct anon_vma *root; /* Root of this anon_vma tree */
|
|
struct rw_semaphore rwsem; /* W: modification, R: walking the list */
|
|
/*
|
|
* The refcount is taken on an anon_vma when there is no
|
|
* guarantee that the vma of page tables will exist for
|
|
* the duration of the operation. A caller that takes
|
|
* the reference is responsible for clearing up the
|
|
* anon_vma if they are the last user on release
|
|
*/
|
|
atomic_t refcount;
|
|
|
|
/*
|
|
* Count of child anon_vmas and VMAs which points to this anon_vma.
|
|
*
|
|
* This counter is used for making decision about reusing anon_vma
|
|
* instead of forking new one. See comments in function anon_vma_clone.
|
|
*/
|
|
unsigned degree;
|
|
|
|
struct anon_vma *parent; /* Parent of this anon_vma */
|
|
|
|
/*
|
|
* NOTE: the LSB of the rb_root.rb_node is set by
|
|
* mm_take_all_locks() _after_ taking the above lock. So the
|
|
* rb_root must only be read/written after taking the above lock
|
|
* to be sure to see a valid next pointer. The LSB bit itself
|
|
* is serialized by a system wide lock only visible to
|
|
* mm_take_all_locks() (mm_all_locks_mutex).
|
|
*/
|
|
|
|
/* Interval tree of private "related" vmas */
|
|
struct rb_root_cached rb_root;
|
|
};
|
|
|
|
/*
|
|
* The copy-on-write semantics of fork mean that an anon_vma
|
|
* can become associated with multiple processes. Furthermore,
|
|
* each child process will have its own anon_vma, where new
|
|
* pages for that process are instantiated.
|
|
*
|
|
* This structure allows us to find the anon_vmas associated
|
|
* with a VMA, or the VMAs associated with an anon_vma.
|
|
* The "same_vma" list contains the anon_vma_chains linking
|
|
* all the anon_vmas associated with this VMA.
|
|
* The "rb" field indexes on an interval tree the anon_vma_chains
|
|
* which link all the VMAs associated with this anon_vma.
|
|
*/
|
|
struct anon_vma_chain {
|
|
struct vm_area_struct *vma;
|
|
struct anon_vma *anon_vma;
|
|
struct list_head same_vma; /* locked by mmap_lock & page_table_lock */
|
|
struct rb_node rb; /* locked by anon_vma->rwsem */
|
|
unsigned long rb_subtree_last;
|
|
#ifdef CONFIG_DEBUG_VM_RB
|
|
unsigned long cached_vma_start, cached_vma_last;
|
|
#endif
|
|
};
|
|
|
|
/* Bit flags accepted by try_to_unmap(); each enumerator is a distinct bit. */
enum ttu_flags {
	TTU_MIGRATION		= 0x1,	/* migration mode */
	TTU_MUNLOCK		= 0x2,	/* munlock mode */

	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_IGNORE_HWPOISON	= 0x20,	/* corrupted page is recoverable */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
	TTU_SPLIT_FREEZE	= 0x100,	/* freeze pte under splitting thp */
};
|
|
|
|
#ifdef CONFIG_MMU
|
|
static inline void get_anon_vma(struct anon_vma *anon_vma)
|
|
{
|
|
atomic_inc(&anon_vma->refcount);
|
|
}
|
|
|
|
void __put_anon_vma(struct anon_vma *anon_vma);
|
|
|
|
static inline void put_anon_vma(struct anon_vma *anon_vma)
|
|
{
|
|
if (atomic_dec_and_test(&anon_vma->refcount))
|
|
__put_anon_vma(anon_vma);
|
|
}
|
|
|
|
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
|
|
{
|
|
down_write(&anon_vma->root->rwsem);
|
|
}
|
|
|
|
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
|
|
{
|
|
up_write(&anon_vma->root->rwsem);
|
|
}
|
|
|
|
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
|
|
{
|
|
down_read(&anon_vma->root->rwsem);
|
|
}
|
|
|
|
static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
|
|
{
|
|
return down_read_trylock(&anon_vma->root->rwsem);
|
|
}
|
|
|
|
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
|
|
{
|
|
up_read(&anon_vma->root->rwsem);
|
|
}
|
|
|
|
|
|
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
|
|
|
|
static inline int anon_vma_prepare(struct vm_area_struct *vma)
|
|
{
|
|
if (likely(vma->anon_vma))
|
|
return 0;
|
|
|
|
return __anon_vma_prepare(vma);
|
|
}
|
|
|
|
static inline void anon_vma_merge(struct vm_area_struct *vma,
|
|
struct vm_area_struct *next)
|
|
{
|
|
VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
|
|
unlink_anon_vmas(next);
|
|
}
|
|
|
|
/*
 * Return the anon_vma associated with @page; implemented outside this header.
 * NOTE(review): whether the returned anon_vma carries a reference that the
 * caller must drop is not visible here - confirm in mm/rmap.c.
 */
struct anon_vma *page_get_anon_vma(struct page *page);
|
|
|
|
/* bitflags for do_page_add_anon_rmap() */
#define RMAP_EXCLUSIVE 0x01
#define RMAP_COMPOUND 0x02
|
|
|
|
/*
|
|
* rmap interfaces called when adding or removing pte of page
|
|
*/
|
|
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
|
|
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
|
|
unsigned long, bool);
|
|
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
|
|
unsigned long, int);
|
|
void __page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma,
|
|
unsigned long address, bool compound);
|
|
/*
 * Thin wrapper around __page_add_new_anon_rmap().
 *
 * Asserts (via VM_BUG_ON_VMA) that @address lies inside
 * [vma->vm_start, vma->vm_end) and then forwards all arguments unchanged.
 */
static inline void page_add_new_anon_rmap(struct page *page,
					  struct vm_area_struct *vma,
					  unsigned long address, bool compound)
{
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	__page_add_new_anon_rmap(page, vma, address, compound);
}
|
|
|
|
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
			    unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
				unsigned long);
|
|
|
|
/*
 * Account one more mapping of @page.
 *
 * For a non-compound mapping the Android vendor hook
 * trace_android_vh_update_page_mapcount() is invoked first and may report,
 * through @success, that it already handled the update. When it did not
 * (or for a compound mapping, where the hook is not called), the relevant
 * mapcount is incremented here: the compound mapcount when @compound is
 * true, otherwise page->_mapcount.
 */
static inline void page_dup_rmap(struct page *page, bool compound)
{
	bool success = false;

	if (!compound)
		trace_android_vh_update_page_mapcount(page, true, compound, NULL, &success);
	if (!success)
		atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}
|
|
|
|
/*
|
|
* Called from mm/vmscan.c to handle paging out
|
|
*/
|
|
int page_referenced(struct page *, int is_locked,
|
|
struct mem_cgroup *memcg, unsigned long *vm_flags);
|
|
|
|
bool try_to_unmap(struct page *, enum ttu_flags flags);
|
|
|
|
/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)
|
|
|
|
struct page_vma_mapped_walk {
|
|
struct page *page;
|
|
struct vm_area_struct *vma;
|
|
unsigned long address;
|
|
pmd_t *pmd;
|
|
pte_t *pte;
|
|
spinlock_t *ptl;
|
|
unsigned int flags;
|
|
};
|
|
|
|
static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
|
|
{
|
|
/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
|
|
if (pvmw->pte && !PageHuge(pvmw->page))
|
|
pte_unmap(pvmw->pte);
|
|
if (pvmw->ptl)
|
|
spin_unlock(pvmw->ptl);
|
|
}
|
|
|
|
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int page_mkclean(struct page *);

/*
 * called in munlock()/munmap() path to check for other vmas holding
 * the page mlocked.
 */
void try_to_munlock(struct page *);

void remove_migration_ptes(struct page *old, struct page *new, bool locked);

int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
|
|
|
|
/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninterested vma
 *
 * try_lock is an input set by the caller; contended is an output written
 * back by the walk, which is why the structure is passed non-const.
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct page *page);
	struct anon_vma *(*anon_lock)(struct page *page,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
|
|
|
|
/*
 * Walk the reverse mappings of @page under the control of @rwc.
 * @rwc is not const: the walk may write rwc->contended.
 */
void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);

/*
 * Called by memory-failure.c to kill processes.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page,
					  struct rmap_walk_control *rwc);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
|
|
|
|
#else /* !CONFIG_MMU */
|
|
|
|
/* !CONFIG_MMU: the anon_vma helpers below compile away to no-ops. */
#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)
#define anon_vma_link(vma)	do {} while (0)
|
|
|
|
/*
 * !CONFIG_MMU stub: report the page as unreferenced.
 *
 * Clears *@vm_flags and returns 0; all other arguments are ignored.
 * @vm_flags must be a valid pointer because it is written unconditionally.
 */
static inline int page_referenced(struct page *page, int is_locked,
				  struct mem_cgroup *memcg,
				  unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}
|
|
|
|
/* !CONFIG_MMU stub: expands to false and ignores both arguments. */
#define try_to_unmap(page, refs) false
|
|
|
|
/* !CONFIG_MMU stub: always returns 0 and ignores @page. */
static inline int page_mkclean(struct page *page)
{
	return 0;
}
|
|
|
|
|
|
#endif /* CONFIG_MMU */
|
|
|
|
#endif /* _LINUX_RMAP_H */
|