Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:
 "Am experimenting with splitting MM up into identifiable subsystems
  perhaps with a view to gitifying it in complex ways. Also with more
  verbose "incoming" emails.

  Most of MM is here and a few other trees.

  Subsystems affected by this patch series:
   - hotfixes
   - iommu
   - scripts
   - arch/sh
   - ocfs2
   - mm:slab-generic
   - mm:slub
   - mm:kmemleak
   - mm:kasan
   - mm:cleanups
   - mm:debug
   - mm:pagecache
   - mm:swap
   - mm:memcg
   - mm:gup
   - mm:pagemap
   - mm:infrastructure
   - mm:vmalloc
   - mm:initialization
   - mm:pagealloc
   - mm:vmscan
   - mm:tools
   - mm:proc
   - mm:ras
   - mm:oom-kill

  hotfixes:
      mm: vmscan: scan anonymous pages on file refaults
      mm/nvdimm: add is_ioremap_addr and use that to check ioremap address
      mm/memcontrol: fix wrong statistics in memory.stat
      mm/z3fold.c: lock z3fold page before  __SetPageMovable()
      nilfs2: do not use unexported cpu_to_le32()/le32_to_cpu() in uapi header
      MAINTAINERS: nilfs2: update email address

  iommu:
      include/linux/dmar.h: replace single-char identifiers in macros

  scripts:
      scripts/decode_stacktrace: match basepath using shell prefix operator, not regex
      scripts/decode_stacktrace: look for modules with .ko.debug extension
      scripts/spelling.txt: drop "sepc" from the misspelling list
      scripts/spelling.txt: add spelling fix for prohibited
      scripts/decode_stacktrace: Accept dash/underscore in modules
      scripts/spelling.txt: add more spellings to spelling.txt

  arch/sh:
      arch/sh/configs/sdk7786_defconfig: remove CONFIG_LOGFS
      sh: config: remove left-over BACKLIGHT_LCD_SUPPORT
      sh: prevent warnings when using iounmap

  ocfs2:
      fs: ocfs: fix spelling mistake "hearbeating" -> "heartbeat"
      ocfs2/dlm: use struct_size() helper
      ocfs2: add last unlock times in locking_state
      ocfs2: add locking filter debugfs file
      ocfs2: add first lock wait time in locking_state
      ocfs: no need to check return value of debugfs_create functions
      fs/ocfs2/dlmglue.c: unneeded variable: "status"
      ocfs2: use kmemdup rather than duplicating its implementation

  mm:slab-generic:
    Patch series "mm/slab: Improved sanity checking":
      mm/slab: validate cache membership under freelist hardening
      mm/slab: sanity-check page type when looking up cache
      lkdtm/heap: add tests for freelist hardening

  mm:slub:
      mm/slub.c: avoid double string traverse in kmem_cache_flags()
      slub: don't panic for memcg kmem cache creation failure

  mm:kmemleak:
      mm/kmemleak.c: fix check for softirq context
      mm/kmemleak.c: change error at _write when kmemleak is disabled
      docs: kmemleak: add more documentation details

  mm:kasan:
      mm/kasan: print frame description for stack bugs
      Patch series "Bitops instrumentation for KASAN", v5:
        lib/test_kasan: add bitops tests
        x86: use static_cpu_has in uaccess region to avoid instrumentation
        asm-generic, x86: add bitops instrumentation for KASAN
      Patch series "mm/kasan: Add object validation in ksize()", v3:
        mm/kasan: introduce __kasan_check_{read,write}
        mm/kasan: change kasan_check_{read,write} to return boolean
        lib/test_kasan: Add test for double-kzfree detection
        mm/slab: refactor common ksize KASAN logic into slab_common.c
        mm/kasan: add object validation in ksize()

  mm:cleanups:
      include/linux/pfn_t.h: remove pfn_t_to_virt()
      Patch series "remove ARCH_SELECT_MEMORY_MODEL where it has no effect":
        arm: remove ARCH_SELECT_MEMORY_MODEL
        s390: remove ARCH_SELECT_MEMORY_MODEL
        sparc: remove ARCH_SELECT_MEMORY_MODEL
      mm/gup.c: make follow_page_mask() static
      mm/memory.c: trivial clean up in insert_page()
      mm: make !CONFIG_HUGE_PAGE wrappers into static inlines
      include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info
      mm: remove the account_page_dirtied export
      mm/page_isolation.c: change the prototype of undo_isolate_page_range()
      include/linux/vmpressure.h: use spinlock_t instead of struct spinlock
      mm: remove the exporting of totalram_pages
      include/linux/pagemap.h: document trylock_page() return value

  mm:debug:
      mm/failslab.c: by default, do not fail allocations with direct reclaim only
      Patch series "debug_pagealloc improvements":
        mm, debug_pagelloc: use static keys to enable debugging
        mm, page_alloc: more extensive free page checking with debug_pagealloc
        mm, debug_pagealloc: use a page type instead of page_ext flag

  mm:pagecache:
      Patch series "fix filler_t callback type mismatches", v2:
        mm/filemap.c: fix an overly long line in read_cache_page
        mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page
        jffs2: pass the correct prototype to read_cache_page
        9p: pass the correct prototype to read_cache_page
      mm/filemap.c: correct the comment about VM_FAULT_RETRY

  mm:swap:
      mm, swap: fix race between swapoff and some swap operations
      mm/swap_state.c: simplify total_swapcache_pages() with get_swap_device()
      mm, swap: use rbtree for swap_extent
      mm/mincore.c: fix race between swapoff and mincore

  mm:memcg:
      memcg, oom: no oom-kill for __GFP_RETRY_MAYFAIL
      memcg, fsnotify: no oom-kill for remote memcg charging
      mm, memcg: introduce memory.events.local
      mm: memcontrol: dump memory.stat during cgroup OOM
      Patch series "mm: reparent slab memory on cgroup removal", v7:
        mm: memcg/slab: postpone kmem_cache memcg pointer initialization to memcg_link_cache()
        mm: memcg/slab: rename slab delayed deactivation functions and fields
        mm: memcg/slab: generalize postponed non-root kmem_cache deactivation
        mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg()
        mm: memcg/slab: unify SLAB and SLUB page accounting
        mm: memcg/slab: don't check the dying flag on kmem_cache creation
        mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock
        mm: memcg/slab: rework non-root kmem_cache lifecycle management
        mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages
        mm: memcg/slab: reparent memcg kmem_caches on cgroup removal
      mm, memcg: add a memcg_slabinfo debugfs file

  mm:gup:
      Patch series "switch the remaining architectures to use generic GUP", v4:
        mm: use untagged_addr() for get_user_pages_fast addresses
        mm: simplify gup_fast_permitted
        mm: lift the x86_32 PAE version of gup_get_pte to common code
        MIPS: use the generic get_user_pages_fast code
        sh: add the missing pud_page definition
        sh: use the generic get_user_pages_fast code
        sparc64: add the missing pgd_page definition
        sparc64: define untagged_addr()
        sparc64: use the generic get_user_pages_fast code
        mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP
        mm: reorder code blocks in gup.c
        mm: consolidate the get_user_pages* implementations
        mm: validate get_user_pages_fast flags
        mm: move the powerpc hugepd code to mm/gup.c
        mm: switch gup_hugepte to use try_get_compound_head
        mm: mark the page referenced in gup_hugepte
      mm/gup: speed up check_and_migrate_cma_pages() on huge page
      mm/gup.c: remove some BUG_ONs from get_gate_page()
      mm/gup.c: mark undo_dev_pagemap as __maybe_unused

  mm:pagemap:
      asm-generic, x86: introduce generic pte_{alloc,free}_one[_kernel]
      alpha: switch to generic version of pte allocation
      arm: switch to generic version of pte allocation
      arm64: switch to generic version of pte allocation
      csky: switch to generic version of pte allocation
      m68k: sun3: switch to generic version of pte allocation
      mips: switch to generic version of pte allocation
      nds32: switch to generic version of pte allocation
      nios2: switch to generic version of pte allocation
      parisc: switch to generic version of pte allocation
      riscv: switch to generic version of pte allocation
      um: switch to generic version of pte allocation
      unicore32: switch to generic version of pte allocation
      mm/pgtable: drop pgtable_t variable from pte_fn_t functions
      mm/memory.c: fail when offset == num in first check of __vm_map_pages()

  mm:infrastructure:
      mm/mmu_notifier: use hlist_add_head_rcu()

  mm:vmalloc:
      Patch series "Some cleanups for the KVA/vmalloc", v5:
        mm/vmalloc.c: remove "node" argument
        mm/vmalloc.c: preload a CPU with one object for split purpose
        mm/vmalloc.c: get rid of one single unlink_va() when merge
        mm/vmalloc.c: switch to WARN_ON() and move it under unlink_va()
      mm/vmalloc.c: spelling> s/informaion/information/

  mm:initialization:
      mm/large system hash: use vmalloc for size > MAX_ORDER when !hashdist
      mm/large system hash: clear hashdist when only one node with memory is booted

  mm:pagealloc:
      arm64: move jump_label_init() before parse_early_param()
      Patch series "add init_on_alloc/init_on_free boot options", v10:
        mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options
        mm: init: report memory auto-initialization features at boot time

  mm:vmscan:
      mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned
      mm: vmscan: correct some vmscan counters for THP swapout

  mm:tools:
      tools/vm/slabinfo: order command line options
      tools/vm/slabinfo: add partial slab listing to -X
      tools/vm/slabinfo: add option to sort by partial slabs
      tools/vm/slabinfo: add sorting info to help menu

  mm:proc:
      proc: use down_read_killable mmap_sem for /proc/pid/maps
      proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup
      proc: use down_read_killable mmap_sem for /proc/pid/pagemap
      proc: use down_read_killable mmap_sem for /proc/pid/clear_refs
      proc: use down_read_killable mmap_sem for /proc/pid/map_files
      mm: use down_read_killable for locking mmap_sem in access_remote_vm
      mm: smaps: split PSS into components
      mm: vmalloc: show number of vmalloc pages in /proc/meminfo

  mm:ras:
      mm/memory-failure.c: clarify error message

  mm:oom-kill:
      mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks()
      mm, oom: refactor dump_tasks for memcg OOMs
      mm, oom: remove redundant task_in_mem_cgroup() check
      oom: decouple mems_allowed from oom_unkillable_task
      mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()"

* akpm: (147 commits)
  mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()
  oom: decouple mems_allowed from oom_unkillable_task
  mm, oom: remove redundant task_in_mem_cgroup() check
  mm, oom: refactor dump_tasks for memcg OOMs
  mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks()
  mm/memory-failure.c: clarify error message
  mm: vmalloc: show number of vmalloc pages in /proc/meminfo
  mm: smaps: split PSS into components
  mm: use down_read_killable for locking mmap_sem in access_remote_vm
  proc: use down_read_killable mmap_sem for /proc/pid/map_files
  proc: use down_read_killable mmap_sem for /proc/pid/clear_refs
  proc: use down_read_killable mmap_sem for /proc/pid/pagemap
  proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup
  proc: use down_read_killable mmap_sem for /proc/pid/maps
  tools/vm/slabinfo: add sorting info to help menu
  tools/vm/slabinfo: add option to sort by partial slabs
  tools/vm/slabinfo: add partial slab listing to -X
  tools/vm/slabinfo: order command line options
  mm: vmscan: correct some vmscan counters for THP swapout
  mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned
  ...
Este commit está contenido en:
Linus Torvalds
2019-07-12 11:40:28 -07:00
Se han modificado 173 ficheros con 3710 adiciones y 3505 borrados

Ver fichero

@@ -132,7 +132,8 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
config HAVE_GENERIC_GUP
config HAVE_FAST_GUP
depends on MMU
bool
config ARCH_KEEP_MEMBLOCK
@@ -762,7 +763,20 @@ config GUP_BENCHMARK
See tools/testing/selftests/vm/gup_benchmark.c
config GUP_GET_PTE_LOW_HIGH
bool
config ARCH_HAS_PTE_SPECIAL
bool
#
# Some architectures require a special hugepage directory format that is
# required to support multiple hugepage sizes. For example a4fe3ce76
# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
# introduced it on powerpc. This allows for more flexible hugepage
# pagetable layouts.
#
config ARCH_HAS_HUGEPD
bool
endmenu

Ver fichero

@@ -12,19 +12,23 @@ config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
Depending on runtime enablement, this results in a small or large
slowdown, but helps to find certain types of memory corruption.
Also, the state of page tracking structures is checked more often as
pages are being allocated and freed, as unexpected state changes
often happen for same reasons as memory corruption (e.g. double free,
use-after-free).
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages(). Additionally,
this option cannot be enabled in combination with hibernation as
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
the patterns before alloc_pages(). Additionally, this option cannot
be enabled in combination with hibernation as that would result in
incorrect warnings of memory corruption after a resume because free
pages are not saved to the suspend image.
By default this option will have a small overhead, e.g. by not
allowing the kernel mapping to be backed by large pages on some

Ver fichero

@@ -22,7 +22,7 @@ KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
pgtable-generic.o rmap.o vmalloc.o
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
debug.o gup.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o

Ver fichero

@@ -378,7 +378,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
#endif
spin_unlock_irqrestore(&pool->lock, flags);
if (mem_flags & __GFP_ZERO)
if (want_init_on_alloc(mem_flags))
memset(retval, 0, pool->size);
return retval;
@@ -428,6 +428,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
offset = vaddr - page->vaddr;
if (want_init_on_free())
memset(vaddr, 0, pool->size);
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
spin_unlock_irqrestore(&pool->lock, flags);

Ver fichero

@@ -23,7 +23,8 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
if (gfpflags & __GFP_NOFAIL)
return false;
if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
if (failslab.ignore_gfp_reclaim &&
(gfpflags & __GFP_DIRECT_RECLAIM))
return false;
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))

Ver fichero

@@ -2504,10 +2504,8 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
*
* vma->vm_mm->mmap_sem must be held on entry.
*
* If our return value has VM_FAULT_RETRY set, it's because
* lock_page_or_retry() returned 0.
* The mmap_sem has usually been released in this case.
* See __lock_page_or_retry() for the exception.
* If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
* may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_sem
* has not been released.
@@ -2825,7 +2823,11 @@ repeat:
}
filler:
err = filler(data, page);
if (filler)
err = filler(data, page);
else
err = mapping->a_ops->readpage(data, page);
if (err < 0) {
put_page(page);
return ERR_PTR(err);
@@ -2915,7 +2917,8 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
return do_read_cache_page(mapping, index, filler, data,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
@@ -2936,9 +2939,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
gfp_t gfp)
{
filler_t *filler = (filler_t *)mapping->a_ops->readpage;
return do_read_cache_page(mapping, index, filler, NULL, gfp);
return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

902
mm/gup.c

La diferencia del archivo ha sido suprimido porque es demasiado grande Cargar Diff

Ver fichero

@@ -14,8 +14,6 @@
*
*/
#define __KASAN_INTERNAL
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -89,17 +87,17 @@ void kasan_disable_current(void)
current->kasan_depth--;
}
void kasan_check_read(const volatile void *p, unsigned int size)
bool __kasan_check_read(const volatile void *p, unsigned int size)
{
check_memory_region((unsigned long)p, size, false, _RET_IP_);
return check_memory_region((unsigned long)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(kasan_check_read);
EXPORT_SYMBOL(__kasan_check_read);
void kasan_check_write(const volatile void *p, unsigned int size)
bool __kasan_check_write(const volatile void *p, unsigned int size)
{
check_memory_region((unsigned long)p, size, true, _RET_IP_);
return check_memory_region((unsigned long)p, size, true, _RET_IP_);
}
EXPORT_SYMBOL(kasan_check_write);
EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)

Ver fichero

@@ -166,29 +166,30 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
static __always_inline void check_memory_region_inline(unsigned long addr,
static __always_inline bool check_memory_region_inline(unsigned long addr,
size_t size, bool write,
unsigned long ret_ip)
{
if (unlikely(size == 0))
return;
return true;
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
kasan_report(addr, size, write, ret_ip);
return;
return false;
}
if (likely(!memory_is_poisoned(addr, size)))
return;
return true;
kasan_report(addr, size, write, ret_ip);
return false;
}
void check_memory_region(unsigned long addr, size_t size, bool write,
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
{
check_memory_region_inline(addr, size, write, ret_ip);
return check_memory_region_inline(addr, size, write, ret_ip);
}
void kasan_cache_shrink(struct kmem_cache *cache)

Ver fichero

@@ -43,6 +43,11 @@
#define KASAN_ALLOCA_REDZONE_SIZE 32
/*
* Stack frame marker (compiler ABI).
*/
#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3
/* Don't break randconfig/all*config builds */
#ifndef KASAN_ABI_VERSION
#define KASAN_ABI_VERSION 1
@@ -123,7 +128,15 @@ static inline bool addr_has_shadow(const void *addr)
void kasan_poison_shadow(const void *address, size_t size, u8 value);
void check_memory_region(unsigned long addr, size_t size, bool write,
/**
* check_memory_region - Check memory region, and report if invalid access.
* @addr: the accessed address
* @size: the accessed size
* @write: true if access is a write access
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip);
void *find_first_bad_addr(void *addr, size_t size);

Ver fichero

@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/sched/task_stack.h>
#include <asm/sections.h>
@@ -181,6 +182,168 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
/*
 * tokenize_frame_descr - consume the next space-separated token of a frame
 * description string.
 * @frame_descr: in/out cursor into the description. It is advanced past the
 *               consumed token, unless the token did not fit into @token.
 * @token: buffer receiving a NUL-terminated copy of the token, or NULL to
 *         skip the token without copying it.
 * @max_tok_len: size of @token in bytes, including the terminating NUL.
 * @value: if non-NULL, receives @token parsed as a base-10 unsigned long.
 *         Callers that pass @value must also pass a @token buffer, because
 *         the number is parsed from the copied token.
 *
 * Returns true on success. Returns false, after logging an error, when the
 * token is too long for @token or when @value is requested and the token is
 * not a valid number.
 */
static bool __must_check tokenize_frame_descr(const char **frame_descr,
					      char *token, size_t max_tok_len,
					      unsigned long *value)
{
	const char *sep = strchr(*frame_descr, ' ');

	/* No separator left: the token runs up to the terminating NUL. */
	if (sep == NULL)
		sep = *frame_descr + strlen(*frame_descr);

	if (token != NULL) {
		const size_t tok_len = sep - *frame_descr;

		if (tok_len + 1 > max_tok_len) {
			pr_err("KASAN internal error: frame description too long: %s\n",
			       *frame_descr);
			return false;
		}

		/* Copy token (+ 1 byte for '\0'). */
		strlcpy(token, *frame_descr, tok_len + 1);
	}

	/*
	 * Advance frame_descr past the separator. If this was the last token,
	 * sep points at the terminating NUL: stay on it so that a subsequent
	 * call sees an empty string instead of reading past the end of the
	 * description.
	 */
	*frame_descr = (*sep != '\0') ? sep + 1 : sep;

	if (value != NULL && kstrtoul(token, 10, value)) {
		pr_err("KASAN internal error: not a valid number: %s\n", token);
		return false;
	}

	return true;
}
/*
 * Decode and print a stack frame description.
 *
 * The expected input is the string
 *   "n alloc_1 alloc_2 ... alloc_n"
 * where every alloc_i has the form
 *   "offset size len name"
 * or
 *   "offset size len name:line".
 *
 * Printing stops silently as soon as any token fails to tokenize (the
 * tokenizer logs the reason itself).
 */
static void print_decoded_frame_descr(const char *frame_descr)
{
	char tok[64];
	unsigned long nr_objs;
	unsigned long idx;

	/* Leading token: how many objects the frame holds. */
	if (!tokenize_frame_descr(&frame_descr, tok, sizeof(tok), &nr_objs))
		return;

	pr_err("\n");
	pr_err("this frame has %lu %s:\n", nr_objs,
	       nr_objs == 1 ? "object" : "objects");

	for (idx = 0; idx < nr_objs; idx++) {
		unsigned long start;
		unsigned long len;
		bool parsed;

		/*
		 * Four tokens per object, consumed strictly in this order:
		 * offset, size, name length (skipped, unused) and name.
		 * Short-circuit evaluation stops at the first failure.
		 */
		parsed = tokenize_frame_descr(&frame_descr, tok, sizeof(tok),
					      &start) &&
			 tokenize_frame_descr(&frame_descr, tok, sizeof(tok),
					      &len) &&
			 tokenize_frame_descr(&frame_descr, NULL, 0, NULL) &&
			 tokenize_frame_descr(&frame_descr, tok, sizeof(tok),
					      NULL);
		if (!parsed)
			return;

		/* Drop the ":line" suffix; without a filename it adds little. */
		strreplace(tok, ':', '\0');

		/* Emit the half-open byte range and the object's name. */
		pr_err(" [%lu, %lu) '%s'", start, start + len, tok);
	}
}
/*
 * get_address_stack_frame_info - locate the frame record covering @addr.
 * @addr: address to look up.
 * @offset: out; distance in bytes from the start of the frame record to @addr.
 * @frame_descr: out; second word of the frame record, as a string pointer.
 * @frame_pc: out; third word of the frame record, as a code pointer.
 *
 * Returns true and fills the out parameters when a frame record carrying
 * KASAN_CURRENT_STACK_FRAME_MAGIC in its first word is found. Returns false
 * when @addr fails object_is_on_stack(), when the shadow walk runs below the
 * shadow of end_of_stack(current), or when the magic does not match (the
 * latter is logged).
 */
static bool __must_check get_address_stack_frame_info(const void *addr,
						      unsigned long *offset,
						      const char **frame_descr,
						      const void **frame_pc)
{
	const unsigned long *record;
	const u8 *shadow_floor;
	const u8 *shadow_cur;
	unsigned long word_addr;
	unsigned long granule_addr;

	BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));

	/*
	 * NOTE: Frame information is currently only printed for accesses to
	 * the task's own stack.
	 */
	if (!object_is_on_stack(addr))
		return false;

	word_addr = round_down((unsigned long)addr, sizeof(long));
	granule_addr = round_down(word_addr, KASAN_SHADOW_SCALE_SIZE);
	shadow_cur = kasan_mem_to_shadow((void *)word_addr);
	shadow_floor = kasan_mem_to_shadow(end_of_stack(current));

	/* Step down until a KASAN_STACK_LEFT shadow byte is reached... */
	for (; shadow_cur >= shadow_floor && *shadow_cur != KASAN_STACK_LEFT;
	     shadow_cur--)
		granule_addr -= KASAN_SHADOW_SCALE_SIZE;

	/* ...then step down across the whole run of KASAN_STACK_LEFT bytes. */
	for (; shadow_cur >= shadow_floor && *shadow_cur == KASAN_STACK_LEFT;
	     shadow_cur--)
		granule_addr -= KASAN_SHADOW_SCALE_SIZE;

	/* Walked off the bottom without finding the end of the run. */
	if (shadow_cur < shadow_floor)
		return false;

	/* The record begins one granule above where the walk stopped. */
	record = (const unsigned long *)(granule_addr + KASAN_SHADOW_SCALE_SIZE);
	if (record[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
		pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
		       record[0]);
		return false;
	}

	*offset = (unsigned long)addr - (unsigned long)record;
	*frame_descr = (const char *)record[1];
	*frame_pc = (void *)record[2];

	return true;
}
/*
 * Print which stack frame of the current task @addr belongs to: the address,
 * the task's comm and pid, the offset within the frame, the frame's recorded
 * PC and, if a frame description string is available, its decoded objects.
 *
 * Prints nothing when CONFIG_KASAN_SW_TAGS is enabled or when no frame info
 * can be obtained for @addr.
 */
static void print_address_stack_frame(const void *addr)
{
	const char *descr;
	const void *pc;
	unsigned long off;

	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) ||
	    !get_address_stack_frame_info(addr, &off, &descr, &pc))
		return;

	/*
	 * get_address_stack_frame_info only returns true if the given addr is
	 * on the current task's stack, so reporting "current" here is correct.
	 */
	pr_err("\n");
	pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
	       addr, current->comm, task_pid_nr(current), off);
	pr_err(" %pS\n", pc);

	if (descr)
		print_decoded_frame_descr(descr);
}
static void print_address_description(void *addr)
{
struct page *page = addr_to_page(addr);
@@ -204,6 +367,8 @@ static void print_address_description(void *addr)
pr_err("The buggy address belongs to the page:\n");
dump_page(page, "kasan: bad access detected");
}
print_address_stack_frame(addr);
}
static bool row_is_guilty(const void *row, const void *guilty)

Ver fichero

@@ -76,7 +76,7 @@ void *kasan_reset_tag(const void *addr)
return reset_tag(addr);
}
void check_memory_region(unsigned long addr, size_t size, bool write,
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
{
u8 tag;
@@ -84,7 +84,7 @@ void check_memory_region(unsigned long addr, size_t size, bool write,
void *untagged_addr;
if (unlikely(size == 0))
return;
return true;
tag = get_tag((const void *)addr);
@@ -106,22 +106,24 @@ void check_memory_region(unsigned long addr, size_t size, bool write,
* set to KASAN_TAG_KERNEL (0xFF)).
*/
if (tag == KASAN_TAG_KERNEL)
return;
return true;
untagged_addr = reset_tag((const void *)addr);
if (unlikely(untagged_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
kasan_report(addr, size, write, ret_ip);
return;
return false;
}
shadow_first = kasan_mem_to_shadow(untagged_addr);
shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
if (*shadow != tag) {
kasan_report(addr, size, write, ret_ip);
return;
return false;
}
}
return true;
}
#define DEFINE_HWASAN_LOAD_STORE(size) \

Ver fichero

@@ -575,7 +575,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
if (in_irq()) {
object->pid = 0;
strncpy(object->comm, "hardirq", sizeof(object->comm));
} else if (in_softirq()) {
} else if (in_serving_softirq()) {
object->pid = 0;
strncpy(object->comm, "softirq", sizeof(object->comm));
} else {
@@ -1866,7 +1866,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
}
if (!kmemleak_enabled) {
ret = -EBUSY;
ret = -EPERM;
goto out;
}

Ver fichero

@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
@@ -63,7 +64,7 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
if (!memcg_kmem_enabled())
return NULL;
page = virt_to_head_page(ptr);
return page->mem_cgroup;
return memcg_from_slab_page(page);
}
static inline struct list_lru_one *

Ver fichero

@@ -57,6 +57,7 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
memcg = READ_ONCE(page->mem_cgroup);
if (PageHead(page) && PageSlab(page))
memcg = memcg_from_slab_page(page);
else
memcg = READ_ONCE(page->mem_cgroup);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -1163,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&iter->css, 0, &it);
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
@@ -1255,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
*lru_size += nr_pages;
}
/*
 * task_in_mem_cgroup - test whether @task's memory cgroup lies under @memcg.
 * @task: task to test.
 * @memcg: memory cgroup to test against.
 *
 * Resolves a memory cgroup for @task, holds a css reference on it while
 * calling mem_cgroup_is_descendant(), and returns that call's result.
 */
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct task_struct *locked = find_lock_task_mm(task);
	struct mem_cgroup *owner;
	bool is_descendant;

	if (!locked) {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		owner = mem_cgroup_from_task(task);
		css_get(&owner->css);
		rcu_read_unlock();
	} else {
		/* Resolve the cgroup through the mm of the task that was locked. */
		owner = get_mem_cgroup_from_mm(locked->mm);
		task_unlock(locked);
	}

	is_descendant = mem_cgroup_is_descendant(owner, memcg);
	/* Release the reference held across the descendant check. */
	css_put(&owner->css);

	return is_descendant;
}
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1356,27 +1334,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
static const unsigned int memcg1_stats[] = {
MEMCG_CACHE,
MEMCG_RSS,
MEMCG_RSS_HUGE,
NR_SHMEM,
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
MEMCG_SWAP,
};
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
int i;
static const char *const memcg1_stat_names[] = {
"cache",
"rss",
"rss_huge",
"shmem",
"mapped_file",
"dirty",
"writeback",
"swap",
};
seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
if (!s.buffer)
return NULL;
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
*
* This list is ordered following a combination of these gradients:
* 1) generic big picture -> specifics and details
* 2) reflecting userspace activity -> reflecting kernel heuristics
*
* Current memory state:
*/
seq_buf_printf(&s, "anon %llu\n",
(u64)memcg_page_state(memcg, MEMCG_RSS) *
PAGE_SIZE);
seq_buf_printf(&s, "file %llu\n",
(u64)memcg_page_state(memcg, MEMCG_CACHE) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
(u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
PAGE_SIZE);
seq_buf_printf(&s, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
PAGE_SIZE);
seq_buf_printf(&s, "shmem %llu\n",
(u64)memcg_page_state(memcg, NR_SHMEM) *
PAGE_SIZE);
seq_buf_printf(&s, "file_mapped %llu\n",
(u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
PAGE_SIZE);
seq_buf_printf(&s, "file_dirty %llu\n",
(u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
PAGE_SIZE);
seq_buf_printf(&s, "file_writeback %llu\n",
(u64)memcg_page_state(memcg, NR_WRITEBACK) *
PAGE_SIZE);
/*
* TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
* with the NR_ANON_THP vm counter, but right now it's a pain in the
* arse because it requires migrating the work out of rmap to a place
* where the page->mem_cgroup is set up and stable.
*/
seq_buf_printf(&s, "anon_thp %llu\n",
(u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
seq_buf_printf(&s, "slab_reclaimable %llu\n",
(u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
PAGE_SIZE);
seq_buf_printf(&s, "slab_unreclaimable %llu\n",
(u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
PAGE_SIZE);
/* Accumulated memory events */
seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
seq_buf_printf(&s, "workingset_refault %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_buf_printf(&s, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "thp_fault_alloc %lu\n",
memcg_events(memcg, THP_FAULT_ALLOC));
seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/* The above should easily fit into one page */
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
return s.buffer;
}
/* Convert a page count to KiB by shifting left by (PAGE_SHIFT - 10). */
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
@@ -1411,39 +1476,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
unsigned int i;
char *buf;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
K((u64)memcg->memory.max), memcg->memory.failcnt);
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
K((u64)memcg->memsw.max), memcg->memsw.failcnt);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->kmem)),
K((u64)memcg->kmem.max), memcg->kmem.failcnt);
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(iter->css.cgroup);
pr_cont(":");
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", memcg1_stat_names[i],
K(memcg_page_state_local(iter,
memcg1_stats[i])));
}
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
K(memcg_page_state_local(iter,
NR_LRU_BASE + i)));
pr_cont("\n");
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
K((u64)memcg->swap.max), memcg->swap.failcnt);
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
K((u64)memcg->memsw.max), memcg->memsw.failcnt);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->kmem)),
K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
buf = memory_stat_format(memcg);
if (!buf)
return;
pr_info("%s", buf);
kfree(buf);
}
/*
@@ -2279,7 +2337,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
bool oomed = false;
enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
@@ -2366,7 +2423,7 @@ retry:
if (nr_retries--)
goto retry;
if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
@@ -2385,7 +2442,6 @@ retry:
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
oomed = true;
goto retry;
case OOM_FAILED:
goto force;
@@ -2588,12 +2644,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
{
struct memcg_kmem_cache_create_work *cw;
if (!css_tryget_online(&memcg->css))
return;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw)
return;
css_get(&memcg->css);
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2628,6 +2685,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
struct memcg_cache_array *arr;
int kmemcg_id;
VM_BUG_ON(!is_root_cache(cachep));
@@ -2635,14 +2693,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (memcg_kmem_bypass())
return cachep;
memcg = get_mem_cgroup_from_current();
rcu_read_lock();
if (unlikely(current->active_memcg))
memcg = current->active_memcg;
else
memcg = mem_cgroup_from_task(current);
if (!memcg || memcg == root_mem_cgroup)
goto out_unlock;
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
goto out_unlock;
memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
if (likely(memcg_cachep))
return memcg_cachep;
arr = rcu_dereference(cachep->memcg_params.memcg_caches);
/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match the data dependency
* barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
*/
memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2655,10 +2727,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
* memcg_create_kmem_cache, this means no further allocation
* could happen with the slab_mutex held. So it's better to
* defer everything.
*
* If the memcg is dying or memcg_cache is about to be released,
* don't bother creating new kmem_caches. Because memcg_cachep
* is ZEROed as the first step of kmem offlining, we don't need
* percpu_ref_tryget_live() here. css_tryget_online() check in
* memcg_schedule_kmem_cache_create() will prevent us from
* creation of a new kmem_cache.
*/
memcg_schedule_kmem_cache_create(memcg, cachep);
out:
css_put(&memcg->css);
if (unlikely(!memcg_cachep))
memcg_schedule_kmem_cache_create(memcg, cachep);
else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
cachep = memcg_cachep;
out_unlock:
rcu_read_unlock();
return cachep;
}
@@ -2669,7 +2751,7 @@ out:
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
css_put(&cachep->memcg_params.memcg->css);
percpu_ref_put(&cachep->memcg_params.refcnt);
}
/**
@@ -2697,9 +2779,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
page->mem_cgroup = memcg;
return 0;
}
@@ -2722,12 +2801,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
}
}
css_put(&memcg->css);
return ret;
}
/**
* __memcg_kmem_uncharge_memcg: uncharge a kmem page
* @memcg: memcg to uncharge
* @nr_pages: number of pages to uncharge
*/
void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
unsigned int nr_pages)
{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
}
/**
* __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
@@ -2742,14 +2839,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
__memcg_kmem_uncharge_memcg(memcg, nr_pages);
page->mem_cgroup = NULL;
/* slab pages do not have PageKmemcg flag set */
@@ -3168,15 +3258,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
*/
memcg->kmem_state = KMEM_ALLOCATED;
memcg_deactivate_kmem_caches(memcg);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
memcg_deactivate_kmem_caches(memcg, parent);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
/*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
@@ -3207,9 +3297,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
memcg_offline_kmem(memcg);
if (memcg->kmem_state == KMEM_ALLOCATED) {
memcg_destroy_kmem_caches(memcg);
WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
WARN_ON(page_counter_read(&memcg->kmem));
}
}
#else
@@ -3472,6 +3561,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
}
#endif /* CONFIG_NUMA */
/*
 * Stat items reported for cgroup1. This table is indexed in parallel with
 * memcg1_stat_names[]: the item at index i is printed under the name at
 * index i, so both arrays must keep the same length and order.
 */
static const unsigned int memcg1_stats[] = {
MEMCG_CACHE,
MEMCG_RSS,
MEMCG_RSS_HUGE,
NR_SHMEM,
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
MEMCG_SWAP,
};
/* Display names for the entries of memcg1_stats[], in the same order. */
static const char *const memcg1_stat_names[] = {
"cache",
"rss",
"rss_huge",
"shmem",
"mapped_file",
"dirty",
"writeback",
"swap",
};
/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
PGPGIN,
@@ -3530,12 +3641,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
(u64)memcg_page_state(memcg, i) * PAGE_SIZE);
(u64)memcg_page_state(memcg, memcg1_stats[i]) *
PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
(u64)memcg_events(memcg, i));
(u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
@@ -4634,6 +4746,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */
if (!parent) {
#ifdef CONFIG_MEMCG_KMEM
INIT_LIST_HEAD(&memcg->kmem_caches);
#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -5625,112 +5740,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
/*
 * Print the low/high/max/oom/oom_kill counters from @events into @m, one
 * "name value" pair per line. @events is indexed by the MEMCG_* event ids.
 */
static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
{
	atomic_long_t *const low = events + MEMCG_LOW;
	atomic_long_t *const high = events + MEMCG_HIGH;
	atomic_long_t *const max = events + MEMCG_MAX;
	atomic_long_t *const oom = events + MEMCG_OOM;
	atomic_long_t *const oom_kill = events + MEMCG_OOM_KILL;

	/* Each counter is read at the moment its line is emitted. */
	seq_printf(m, "low %lu\n", atomic_long_read(low));
	seq_printf(m, "high %lu\n", atomic_long_read(high));
	seq_printf(m, "max %lu\n", atomic_long_read(max));
	seq_printf(m, "oom %lu\n", atomic_long_read(oom));
	seq_printf(m, "oom_kill %lu\n", atomic_long_read(oom_kill));
}
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "low %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
seq_printf(m, "high %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
seq_printf(m, "oom %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
seq_printf(m, "oom_kill %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
__memory_events_show(m, memcg->memory_events);
return 0;
}
static int memory_events_local_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
__memory_events_show(m, memcg->memory_events_local);
return 0;
}
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
int i;
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
*
* This list is ordered following a combination of these gradients:
* 1) generic big picture -> specifics and details
* 2) reflecting userspace activity -> reflecting kernel heuristics
*
* Current memory state:
*/
seq_printf(m, "anon %llu\n",
(u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
seq_printf(m, "file %llu\n",
(u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
(u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
seq_printf(m, "slab %llu\n",
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
PAGE_SIZE);
seq_printf(m, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
seq_printf(m, "shmem %llu\n",
(u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
(u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
(u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
(u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
/*
* TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
* with the NR_ANON_THP vm counter, but right now it's a pain in the
* arse because it requires migrating the work out of rmap to a place
* where the page->mem_cgroup is set up and stable.
*/
seq_printf(m, "anon_thp %llu\n",
(u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n",
(u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
(u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
PAGE_SIZE);
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
seq_printf(m, "workingset_refault %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_printf(m, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
seq_printf(m, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_printf(m, "thp_fault_alloc %lu\n",
memcg_events(memcg, THP_FAULT_ALLOC));
seq_printf(m, "thp_collapse_alloc %lu\n",
memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
char *buf;
buf = memory_stat_format(memcg);
if (!buf)
return -ENOMEM;
seq_puts(m, buf);
kfree(buf);
return 0;
}
@@ -5801,6 +5846,12 @@ static struct cftype memory_files[] = {
.file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
{
.name = "events.local",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_local_file),
.seq_show = memory_events_local_show,
},
{
.name = "stat",
.flags = CFTYPE_NOT_ON_ROOT,

Ver fichero

@@ -213,7 +213,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret;
pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {

Ver fichero

@@ -1475,8 +1475,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at(mm, addr, pte, mk_pte(page, prot));
retval = 0;
pte_unmap_unlock(pte, ptl);
return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
out:
@@ -1547,7 +1545,7 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
int ret, i;
/* Fail if the user requested offset is beyond the end of the object */
if (offset > num)
if (offset >= num)
return -ENXIO;
/* Fail if the user requested size exceeds available object size */
@@ -2038,7 +2036,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
int err;
pgtable_t token;
spinlock_t *uninitialized_var(ptl);
pte = (mm == &init_mm) ?
@@ -2051,10 +2048,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
token = pmd_pgtable(*pmd);
do {
err = fn(pte++, token, addr, data);
err = fn(pte++, addr, data);
if (err)
break;
} while (addr += PAGE_SIZE, addr != end);
@@ -2807,7 +2802,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
__swap_count(si, entry) == 1) {
__swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -4349,7 +4344,9 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
down_read(&mm->mmap_sem);
if (down_read_killable(&mm->mmap_sem))
return 0;
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;

Ver fichero

@@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
*/
if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
struct swap_info_struct *si;
/* Prevent the swap device from being swapped off under us */
si = get_swap_device(swp);
if (si) {
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
put_swap_device(si);
} else
page = NULL;
}
} else
page = find_get_page(mapping, pgoff);

Ver fichero

@@ -274,7 +274,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
* thanks to mm_take_all_locks().
*/
spin_lock(&mm->mmu_notifier_mm->lock);
hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);

Ver fichero

@@ -111,94 +111,6 @@ unsigned int kobjsize(const void *objp)
return PAGE_SIZE << compound_order(page);
}
/*
 * Walk @nr_pages pages starting at @start in @mm, optionally recording the
 * backing page (with a reference taken) in @pages and the covering VMA in
 * @vmas.
 *
 * Returns the number of pages processed. If the walk stops on the very first
 * page (no VMA, an I/O or PFN mapping, or missing permissions) it returns
 * -EFAULT; if it stops later it returns the count reached so far.
 * @tsk and @nonblocking are not used by this implementation.
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
			     unsigned long start, unsigned long nr_pages,
			     unsigned int foll_flags, struct page **pages,
			     struct vm_area_struct **vmas, int *nonblocking)
{
	struct vm_area_struct *vma;
	unsigned long wanted;
	int i;

	/*
	 * Work out the required read or write permissions.
	 * With FOLL_FORCE only the "MAY" flags are required.
	 */
	if (foll_flags & FOLL_WRITE)
		wanted = VM_WRITE | VM_MAYWRITE;
	else
		wanted = VM_READ | VM_MAYREAD;

	if (foll_flags & FOLL_FORCE)
		wanted &= VM_MAYREAD | VM_MAYWRITE;
	else
		wanted &= VM_READ | VM_WRITE;

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);

		/* protect what we can, including chardevs */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(wanted & vma->vm_flags))
			return i ? i : -EFAULT;

		if (pages) {
			pages[i] = virt_to_page(start);
			if (pages[i])
				get_page(pages[i]);
		}

		if (vmas)
			vmas[i] = vma;

		/* Advance to the start of the next page. */
		start = (start + PAGE_SIZE) & PAGE_MASK;
	}

	return i;
}
/*
 * Collect the pages backing an address range of the current process and
 * report the VMA that covers each of them.
 * - potentially dodgy: the page count that gets incremented may belong to a
 *   slab page or to a secondary page of a compound page
 * - VMAs that don't support this kind of access, such as I/O mappings, are
 *   refused
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages,
		    struct vm_area_struct **vmas)
{
	long nr;

	nr = __get_user_pages(current, current->mm, start, nr_pages,
			      gup_flags, pages, vmas, NULL);
	return nr;
}
EXPORT_SYMBOL(get_user_pages);
/*
 * Variant of get_user_pages() that accepts a @locked pointer. Here @locked
 * is not consulted: the request is forwarded to get_user_pages() with no
 * VMA array.
 */
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	long nr;

	nr = get_user_pages(start, nr_pages, gup_flags, pages, NULL);
	return nr;
}
EXPORT_SYMBOL(get_user_pages_locked);
/*
 * Run __get_user_pages() on @mm while holding mm->mmap_sem for reading.
 * The semaphore is taken and released here, so the caller must not hold it.
 * Returns whatever __get_user_pages() returns.
 */
static long __get_user_pages_unlocked(struct task_struct *tsk,
				      struct mm_struct *mm, unsigned long start,
				      unsigned long nr_pages, struct page **pages,
				      unsigned int gup_flags)
{
	long nr;

	down_read(&mm->mmap_sem);
	nr = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
			      NULL, NULL);
	up_read(&mm->mmap_sem);

	return nr;
}
/*
 * get_user_pages() for callers that do not hold mmap_sem: operates on the
 * current task and its mm through __get_user_pages_unlocked().
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	long nr;

	nr = __get_user_pages_unlocked(current, current->mm, start, nr_pages,
				       pages, gup_flags);
	return nr;
}
EXPORT_SYMBOL(get_user_pages_unlocked);
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -1792,7 +1704,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
down_read(&mm->mmap_sem);
if (down_read_killable(&mm->mmap_sem))
return 0;
/* the access must start within one of the target process's mappings */
vma = find_vma(mm, addr);

Ver fichero

@@ -64,21 +64,33 @@ int sysctl_oom_dump_tasks = 1;
*/
DEFINE_MUTEX(oom_lock);
static inline bool is_memcg_oom(struct oom_control *oc)
{
return oc->memcg != NULL;
}
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligibility for kill
* oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*
* This function is assuming oom-killer context and 'current' has triggered
* the oom-killer.
*/
static bool has_intersects_mems_allowed(struct task_struct *start,
const nodemask_t *mask)
static bool oom_cpuset_eligible(struct task_struct *start,
struct oom_control *oc)
{
struct task_struct *tsk;
bool ret = false;
const nodemask_t *mask = oc->nodemask;
if (is_memcg_oom(oc))
return true;
rcu_read_lock();
for_each_thread(start, tsk) {
@@ -105,8 +117,7 @@ static bool has_intersects_mems_allowed(struct task_struct *start,
return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
const nodemask_t *mask)
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
return true;
}
@@ -146,28 +157,13 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
return oc->order == -1;
}
static inline bool is_memcg_oom(struct oom_control *oc)
{
return oc->memcg != NULL;
}
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask)
static bool oom_unkillable_task(struct task_struct *p)
{
if (is_global_init(p))
return true;
if (p->flags & PF_KTHREAD)
return true;
/* When mem_cgroup_out_of_memory() and p is not member of the group */
if (memcg && !task_in_mem_cgroup(p, memcg))
return true;
/* p may not have freeable memory in nodemask */
if (!has_intersects_mems_allowed(p, nodemask))
return true;
return false;
}
@@ -194,20 +190,17 @@ static bool is_dump_unreclaim_slabs(void)
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
* @totalpages: total present RAM allowed for page allocation
* @memcg: task's memory controller, if constrained
* @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* The heuristic for determining which task to kill is made to be as simple and
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
if (oom_unkillable_task(p, memcg, nodemask))
if (oom_unkillable_task(p))
return 0;
p = find_lock_task_mm(p);
@@ -318,7 +311,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
struct oom_control *oc = arg;
unsigned long points;
if (oom_unkillable_task(task, NULL, oc->nodemask))
if (oom_unkillable_task(task))
goto next;
/* p may not have freeable memory in nodemask */
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
goto next;
/*
@@ -342,13 +339,10 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto select;
}
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
points = oom_badness(task, oc->totalpages);
if (!points || points < oc->chosen_points)
goto next;
/* Prefer thread group leaders for display purposes */
if (points == oc->chosen_points && thread_group_leader(oc->chosen))
goto next;
select:
if (oc->chosen)
put_task_struct(oc->chosen);
@@ -381,14 +375,44 @@ static void select_bad_process(struct oom_control *oc)
break;
rcu_read_unlock();
}
}
oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
/*
 * Print one line of memory state for task @p: pid, uid, tgid, total_vm, rss,
 * page table bytes, swap entries, oom_score_adj and command name.
 *
 * @arg is the struct oom_control of the current OOM invocation. Nothing is
 * printed for tasks rejected by oom_unkillable_task(), for tasks that fail
 * the cpuset eligibility check on a non-memcg OOM, or for tasks with no mm
 * left to lock. Always returns 0.
 */
static int dump_task(struct task_struct *p, void *arg)
{
	struct oom_control *oc = arg;
	struct task_struct *task;

	if (oom_unkillable_task(p))
		return 0;

	/* p may not have freeable memory in nodemask */
	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
		return 0;

	/*
	 * A NULL result means this is a kthread or all of p's threads have
	 * already detached their mm's. There's no need to report them; they
	 * can't be oom killed anyway.
	 */
	task = find_lock_task_mm(p);
	if (task) {
		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			mm_pgtables_bytes(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}

	return 0;
}
/**
* dump_tasks - dump current memory state of all system tasks
* @memcg: current's memory controller, if constrained
* @nodemask: nodemask passed to page allocator for mempolicy ooms
* @oc: pointer to struct oom_control
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
@@ -396,37 +420,21 @@ static void select_bad_process(struct oom_control *oc)
* State information includes task's pid, uid, tgid, vm size, rss,
* pgtables_bytes, swapents, oom_score_adj value, and name.
*/
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_tasks(struct oom_control *oc)
{
struct task_struct *p;
struct task_struct *task;
pr_info("Tasks state (memory values in pages):\n");
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
continue;
task = find_lock_task_mm(p);
if (!task) {
/*
* This is a kthread or all of p's threads have already
* detached their mm's. There's no need to report
* them; they can't be oom killed anyway.
*/
continue;
}
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
else {
struct task_struct *p;
pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
mm_pgtables_bytes(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
rcu_read_lock();
for_each_process(p)
dump_task(p, oc);
rcu_read_unlock();
}
rcu_read_unlock();
}
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -458,7 +466,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc->memcg, oc->nodemask);
dump_tasks(oc);
if (p)
dump_oom_summary(oc, p);
}
@@ -1075,7 +1083,8 @@ bool out_of_memory(struct oom_control *oc)
check_panic_on_oom(oc);
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->mm && !oom_unkillable_task(current) &&
oom_cpuset_eligible(current, oc) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
oc->chosen = current;

Ver fichero

@@ -2429,7 +2429,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
this_cpu_inc(bdp_ratelimits);
}
}
EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for deaccounting dirty page without writeback.

Ver fichero

@@ -50,7 +50,6 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -136,6 +135,55 @@ unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
DEFINE_STATIC_KEY_TRUE(init_on_alloc);
#else
DEFINE_STATIC_KEY_FALSE(init_on_alloc);
#endif
EXPORT_SYMBOL(init_on_alloc);
#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
DEFINE_STATIC_KEY_TRUE(init_on_free);
#else
DEFINE_STATIC_KEY_FALSE(init_on_free);
#endif
EXPORT_SYMBOL(init_on_free);
/*
 * Handle the "init_on_alloc=" early boot parameter.
 *
 * @buf: value given on the command line, or NULL when none was supplied.
 *
 * Enables or disables the init_on_alloc static key according to the parsed
 * boolean and logs a note when page poisoning is enabled as well.
 *
 * Returns -EINVAL when @buf is NULL, the kstrtobool() error when the value
 * cannot be parsed (the static key then keeps its build-time default), and
 * 0 on success.
 */
static int __init early_init_on_alloc(char *buf)
{
	bool bool_result;
	int ret;

	if (!buf)
		return -EINVAL;

	ret = kstrtobool(buf, &bool_result);
	/*
	 * bool_result is only meaningful after a successful parse; acting on
	 * it otherwise would flip the key based on an uninitialized value.
	 */
	if (ret)
		return ret;

	if (bool_result && page_poisoning_enabled())
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");

	if (bool_result)
		static_branch_enable(&init_on_alloc);
	else
		static_branch_disable(&init_on_alloc);

	return 0;
}
early_param("init_on_alloc", early_init_on_alloc);
/*
 * Handle the "init_on_free=" early boot parameter.
 *
 * @buf: value given on the command line, or NULL when none was supplied.
 *
 * Enables or disables the init_on_free static key according to the parsed
 * boolean and logs a note when page poisoning is enabled as well.
 *
 * Returns -EINVAL when @buf is NULL, the kstrtobool() error when the value
 * cannot be parsed (the static key then keeps its build-time default), and
 * 0 on success.
 */
static int __init early_init_on_free(char *buf)
{
	bool bool_result;
	int ret;

	if (!buf)
		return -EINVAL;

	ret = kstrtobool(buf, &bool_result);
	/*
	 * bool_result is only meaningful after a successful parse; acting on
	 * it otherwise would flip the key based on an uninitialized value.
	 */
	if (ret)
		return ret;

	if (bool_result && page_poisoning_enabled())
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");

	if (bool_result)
		static_branch_enable(&init_on_free);
	else
		static_branch_disable(&init_on_free);

	return 0;
}
early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -224,8 +272,6 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
[ZONE_MOVABLE] = 0,
};
EXPORT_SYMBOL(totalram_pages);
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
@@ -646,31 +692,30 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly
= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
#else
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
#endif
EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
if (!buf)
bool enable = false;
if (kstrtobool(buf, &enable))
return -EINVAL;
return kstrtobool(buf, &_debug_pagealloc_enabled);
if (enable)
static_branch_enable(&_debug_pagealloc_enabled);
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
/*
 * Guard pages are needed only when debug_pagealloc is in use and a non-zero
 * guard page minorder has been configured.
 */
static bool need_debug_guardpage(void)
{
	return debug_pagealloc_enabled() && debug_guardpage_minorder();
}
static void init_debug_guardpage(void)
{
if (!debug_pagealloc_enabled())
@@ -679,14 +724,9 @@ static void init_debug_guardpage(void)
if (!debug_guardpage_minorder())
return;
_debug_guardpage_enabled = true;
static_branch_enable(&_debug_guardpage_enabled);
}
/*
 * page_ext operations table for debug guard pages: installs
 * need_debug_guardpage() as the .need callback and init_debug_guardpage()
 * as the .init callback.
 */
struct page_ext_operations debug_guardpage_ops = {
.need = need_debug_guardpage,
.init = init_debug_guardpage,
};
static int __init debug_guardpage_minorder_setup(char *buf)
{
unsigned long res;
@@ -704,20 +744,13 @@ early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
struct page_ext *page_ext;
if (!debug_guardpage_enabled())
return false;
if (order >= debug_guardpage_minorder())
return false;
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return false;
__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
__SetPageGuard(page);
INIT_LIST_HEAD(&page->lru);
set_page_private(page, order);
/* Guard pages are not available for any usage */
@@ -729,23 +762,16 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
struct page_ext *page_ext;
if (!debug_guardpage_enabled())
return;
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
__ClearPageGuard(page);
set_page_private(page, 0);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
struct page_ext_operations debug_guardpage_ops;
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -1090,6 +1116,14 @@ out:
return ret;
}
/*
 * Zero @numpages consecutive pages starting at @page, one page at a time,
 * using clear_highpage().
 */
static void kernel_init_free_pages(struct page *page, int numpages)
{
	int idx = 0;

	while (idx < numpages) {
		clear_highpage(&page[idx]);
		idx++;
	}
}
static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free)
{
@@ -1141,6 +1175,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
PAGE_SIZE << order);
}
arch_free_page(page, order);
if (want_init_on_free())
kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order, 0);
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
@@ -1151,19 +1188,36 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
#ifdef CONFIG_DEBUG_VM
static inline bool free_pcp_prepare(struct page *page)
/*
* With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
* to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
* moved from pcp lists to free lists.
*/
static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, true);
}
static inline bool bulkfree_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
return false;
if (debug_pagealloc_enabled())
return free_pages_check(page);
else
return false;
}
#else
/*
* With DEBUG_VM disabled, order-0 pages being freed are checked only when
* moving from pcp lists to free list in order to reduce overhead. With
* debug_pagealloc enabled, they are checked also immediately when being freed
* to the pcp lists.
*/
static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, false);
if (debug_pagealloc_enabled())
return free_pages_prepare(page, 0, true);
else
return free_pages_prepare(page, 0, false);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1904,6 +1958,10 @@ void __init page_alloc_init_late(void)
for_each_populated_zone(zone)
set_zone_contiguous(zone);
#ifdef CONFIG_DEBUG_PAGEALLOC
init_debug_guardpage();
#endif
}
#ifdef CONFIG_CMA
@@ -2021,28 +2079,44 @@ static inline int check_new_page(struct page *page)
static inline bool free_pages_prezeroed(void)
{
return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
page_poisoning_enabled();
return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
page_poisoning_enabled()) || want_init_on_free();
}
#ifdef CONFIG_DEBUG_VM
static bool check_pcp_refill(struct page *page)
/*
* With DEBUG_VM enabled, order-0 pages are checked for expected state when
* being allocated from pcp lists. With debug_pagealloc also enabled, they are
* also checked when pcp lists are refilled from the free lists.
*/
static inline bool check_pcp_refill(struct page *page)
{
return false;
if (debug_pagealloc_enabled())
return check_new_page(page);
else
return false;
}
static bool check_new_pcp(struct page *page)
static inline bool check_new_pcp(struct page *page)
{
return check_new_page(page);
}
#else
static bool check_pcp_refill(struct page *page)
/*
* With DEBUG_VM disabled, free order-0 pages are checked for expected state
* when pcp lists are being refilled from the free lists. With debug_pagealloc
* enabled, they are also checked when being allocated from the pcp lists.
*/
static inline bool check_pcp_refill(struct page *page)
{
return check_new_page(page);
}
static bool check_new_pcp(struct page *page)
static inline bool check_new_pcp(struct page *page)
{
return false;
if (debug_pagealloc_enabled())
return check_new_page(page);
else
return false;
}
#endif /* CONFIG_DEBUG_VM */
@@ -2076,13 +2150,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
int i;
post_alloc_hook(page, order, gfp_flags);
if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
kernel_init_free_pages(page, 1 << order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -7520,10 +7591,28 @@ static int page_alloc_cpu_dead(unsigned int cpu)
return 0;
}
#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;
/*
 * Handler for the "hashdist=" boot parameter: parse the numeric value
 * (base auto-detected) into hashdist.
 *
 * Returns 1 when a value was consumed, 0 when no string was supplied.
 */
static int __init set_hashdist(char *str)
{
	if (str) {
		hashdist = simple_strtoul(str, &str, 0);
		return 1;
	}

	return 0;
}
__setup("hashdist=", set_hashdist);
#endif
void __init page_alloc_init(void)
{
int ret;
#ifdef CONFIG_NUMA
if (num_node_state(N_MEMORY) == 1)
hashdist = 0;
#endif
ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
"mm/page_alloc:dead", NULL,
page_alloc_cpu_dead);
@@ -7908,19 +7997,6 @@ out:
return ret;
}
#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;
static int __init set_hashdist(char *str)
{
if (!str)
return 0;
hashdist = simple_strtoul(str, &str, 0);
return 1;
}
__setup("hashdist=", set_hashdist);
#endif
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but
@@ -7967,6 +8043,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long log2qty, size;
void *table = NULL;
gfp_t gfp_flags;
bool virt;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -8023,6 +8100,7 @@ void *__init alloc_large_system_hash(const char *tablename,
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
virt = false;
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
@@ -8030,26 +8108,26 @@ void *__init alloc_large_system_hash(const char *tablename,
else
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (hashdist) {
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
virt = true;
} else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
* alloc_pages_exact() automatically does
*/
if (get_order(size) < MAX_ORDER) {
table = alloc_pages_exact(size, gfp_flags);
kmemleak_alloc(table, size, 1, gfp_flags);
}
table = alloc_pages_exact(size, gfp_flags);
kmemleak_alloc(table, size, 1, gfp_flags);
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
virt ? "vmalloc" : "linear");
if (_hash_shift)
*_hash_shift = log2qty;

Ver fichero

@@ -59,9 +59,6 @@
*/
static struct page_ext_operations *page_ext_ops[] = {
#ifdef CONFIG_DEBUG_PAGEALLOC
&debug_guardpage_ops,
#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif

Ver fichero

@@ -163,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
blocks_per_page = PAGE_SIZE >> blkbits;
/*
* Map all the blocks into the extent list. This code doesn't try
* Map all the blocks into the extent tree. This code doesn't try
* to be very smart.
*/
probe_block = 0;

Ver fichero

@@ -230,7 +230,7 @@ undo:
/*
* Make isolated pages available again.
*/
int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype)
{
unsigned long pfn;
@@ -247,7 +247,6 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
continue;
unset_migratetype_isolate(page, migratetype);
}
return 0;
}
/*
* Test all pages in the range is free(means isolated) or not.

Ver fichero

@@ -371,12 +371,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
struct page *page = virt_to_head_page(obj);
return page->slab_cache;
}
static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
unsigned int idx)
{
@@ -1245,7 +1239,7 @@ void __init kmem_cache_init(void)
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
memcg_link_cache(kmem_cache);
memcg_link_cache(kmem_cache, NULL);
slab_state = PARTIAL;
/*
@@ -1366,7 +1360,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct page *page;
int nr_pages;
flags |= cachep->allocflags;
@@ -1376,17 +1369,11 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
__free_pages(page, cachep->gfporder);
return NULL;
}
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);
else
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1401,12 +1388,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
int order = cachep->gfporder;
unsigned long nr_freed = (1 << order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
else
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed);
BUG_ON(!PageSlab(page));
__ClearPageSlabPfmemalloc(page);
@@ -1415,8 +1396,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
memcg_uncharge_slab(page, order, cachep);
current->reclaim_state->reclaimed_slab += 1 << order;
uncharge_slab_page(page, order, cachep);
__free_pages(page, order);
}
@@ -1830,6 +1811,14 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
cachep->num = 0;
/*
* If slab auto-initialization on free is enabled, store the freelist
* off-slab, so that its contents don't end up in one of the allocated
* objects.
*/
if (unlikely(slab_want_init_on_free(cachep)))
return false;
if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
return false;
@@ -2258,6 +2247,10 @@ void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
{
__kmem_cache_shrink(cachep);
}
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
}
#endif
int __kmem_cache_shutdown(struct kmem_cache *cachep)
@@ -3263,7 +3256,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
if (unlikely(flags & __GFP_ZERO) && ptr)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);
slab_post_alloc_hook(cachep, flags, 1, &ptr);
@@ -3320,7 +3313,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
if (unlikely(flags & __GFP_ZERO) && objp)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
slab_post_alloc_hook(cachep, flags, 1, &objp);
@@ -3441,6 +3434,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
if (unlikely(slab_want_init_on_free(cachep)))
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3528,7 +3523,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
/* Clear memory outside IRQ disabled section */
if (unlikely(flags & __GFP_ZERO))
if (unlikely(slab_want_init_on_alloc(flags, s)))
for (i = 0; i < size; i++)
memset(p[i], 0, s->object_size);
@@ -3715,6 +3710,8 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
s = virt_to_cache(objp);
else
s = cache_from_obj(orig_s, objp);
if (!s)
continue;
debug_check_no_locks_freed(objp, s->object_size);
if (!(s->flags & SLAB_DEBUG_OBJECTS))
@@ -3749,6 +3746,10 @@ void kfree(const void *objp)
local_irq_save(flags);
kfree_debugcheck(objp);
c = virt_to_cache(objp);
if (!c) {
local_irq_restore(flags);
return;
}
debug_check_no_locks_freed(objp, c->object_size);
debug_check_no_obj_freed(objp, c->object_size);
@@ -4204,33 +4205,23 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
#endif /* CONFIG_HARDENED_USERCOPY */
/**
* ksize - get the actual amount of memory allocated for a given object
* @objp: Pointer to the object
* __ksize -- Uninstrumented ksize.
*
* kmalloc may internally round up allocations and return more memory
* than requested. ksize() can be used to determine the actual amount of
* memory allocated. The caller may use this additional memory, even though
* a smaller amount of memory was initially specified with the kmalloc call.
* The caller must guarantee that objp points to a valid object previously
* allocated with either kmalloc() or kmem_cache_alloc(). The object
* must not be freed during the duration of the call.
*
* Return: size of the actual memory used by @objp in bytes
* Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
* safety checks as ksize() with KASAN instrumentation enabled.
*/
size_t ksize(const void *objp)
size_t __ksize(const void *objp)
{
struct kmem_cache *c;
size_t size;
BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
size = virt_to_cache(objp)->object_size;
/* We assume that ksize callers could use the whole allocated area,
* so we need to unpoison this area.
*/
kasan_unpoison_shadow(objp, size);
c = virt_to_cache(objp);
size = c ? c->object_size : 0;
return size;
}
EXPORT_SYMBOL(ksize);
EXPORT_SYMBOL(__ksize);

201
mm/slab.h
Ver fichero

@@ -172,6 +172,7 @@ int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -204,6 +205,12 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
/*
 * Pick the vmstat counter a cache's slab pages are accounted to:
 * NR_SLAB_RECLAIMABLE for SLAB_RECLAIM_ACCOUNT caches,
 * NR_SLAB_UNRECLAIMABLE for everything else.
 */
static inline int cache_vmstat_idx(struct kmem_cache *s)
{
	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		return NR_SLAB_RECLAIMABLE;

	return NR_SLAB_UNRECLAIMABLE;
}
#ifdef CONFIG_MEMCG_KMEM
/* List of all root caches. */
@@ -241,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
/*
* Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
* That said the caller must assure the memcg's cache won't go away by either
* taking a css reference to the owner cgroup, or holding the slab_mutex.
*/
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
struct kmem_cache *cachep;
struct memcg_cache_array *arr;
rcu_read_lock();
arr = rcu_dereference(s->memcg_params.memcg_caches);
/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match this (see
* memcg_create_kmem_cache()).
*/
cachep = READ_ONCE(arr->entries[idx]);
rcu_read_unlock();
return cachep;
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
if (is_root_cache(s))
@@ -273,25 +255,94 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
return s->memcg_params.root_cache;
}
/*
* Expects a pointer to a slab page. Please note, that PageSlab() check
* isn't sufficient, as it returns true also for tail compound slab pages,
* which do not have slab_cache pointer set.
* So this function assumes that the page can pass PageHead() and PageSlab()
* checks.
*
* The kmem_cache can be reparented asynchronously. The caller must ensure
* the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
*/
static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
{
	struct kmem_cache *cache = READ_ONCE(page->slab_cache);

	/* No cache pointer, or a root cache: there is no owning memcg. */
	if (!cache || is_root_cache(cache))
		return NULL;

	return READ_ONCE(cache->memcg_params.memcg);
}
/*
* Charge the slab page belonging to the non-root kmem_cache.
* Can be called for non-root kmem_caches only.
*/
static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
if (is_root_cache(s))
struct mem_cgroup *memcg;
struct lruvec *lruvec;
int ret;
rcu_read_lock();
memcg = READ_ONCE(s->memcg_params.memcg);
while (memcg && !css_tryget_online(&memcg->css))
memcg = parent_mem_cgroup(memcg);
rcu_read_unlock();
if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
(1 << order));
percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
return 0;
return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
}
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (ret)
goto out;
lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
/* transfer try_charge() page references to kmem_cache */
percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
css_put_many(&memcg->css, 1 << order);
out:
css_put(&memcg->css);
return ret;
}
/*
* Uncharge a slab page belonging to a non-root kmem_cache.
* Can be called for non-root kmem_caches only.
*/
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
memcg_kmem_uncharge(page, order);
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
memcg = READ_ONCE(s->memcg_params.memcg);
if (likely(!mem_cgroup_is_root(memcg))) {
lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
memcg_kmem_uncharge_memcg(page, order, memcg);
} else {
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-(1 << order));
}
rcu_read_unlock();
percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
extern void memcg_link_cache(struct kmem_cache *s);
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *));
extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
#else /* CONFIG_MEMCG_KMEM */
@@ -310,7 +361,7 @@ static inline bool is_root_cache(struct kmem_cache *s)
static inline bool slab_equal_or_root(struct kmem_cache *s,
struct kmem_cache *p)
{
return true;
return s == p;
}
static inline const char *cache_name(struct kmem_cache *s)
@@ -318,17 +369,16 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
return NULL;
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
return s;
}
static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
{
return NULL;
}
static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
struct kmem_cache *s)
{
@@ -344,16 +394,52 @@ static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
static inline void memcg_link_cache(struct kmem_cache *s)
static inline void memcg_link_cache(struct kmem_cache *s,
struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
/*
 * Map an object address to the kmem_cache recorded in its head page.
 *
 * Warns once and returns NULL when the head page is not a slab page, so
 * callers must be prepared for a NULL result.
 */
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *head = virt_to_head_page(obj);

	if (WARN_ONCE(!PageSlab(head), "%s: Object is not a Slab page!\n",
		      __func__))
		return NULL;

	return head->slab_cache;
}
/*
 * Account a freshly allocated slab page of 2^@order pages to cache @s.
 *
 * Non-root caches go through memcg_charge_slab(); for root caches only the
 * node-level vmstat counter is bumped and 0 is returned.
 */
static __always_inline int charge_slab_page(struct page *page,
					    gfp_t gfp, int order,
					    struct kmem_cache *s)
{
	if (!is_root_cache(s))
		return memcg_charge_slab(page, gfp, order, s);

	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
			    1 << order);
	return 0;
}
/*
 * Undo the accounting done by charge_slab_page() for a slab page of
 * 2^@order pages: root caches only adjust the node-level vmstat counter,
 * all other caches go through memcg_uncharge_slab().
 */
static __always_inline void uncharge_slab_page(struct page *page, int order,
					       struct kmem_cache *s)
{
	if (is_root_cache(s))
		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
				    -(1 << order));
	else
		memcg_uncharge_slab(page, order, s);
}
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
struct kmem_cache *cachep;
struct page *page;
/*
* When kmemcg is not being used, both assignments should return the
@@ -363,18 +449,15 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
* will also be a constant.
*/
if (!memcg_kmem_enabled() &&
!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
!unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
return s;
page = virt_to_head_page(x);
cachep = page->slab_cache;
if (slab_equal_or_root(cachep, s))
return cachep;
pr_err("%s: Wrong slab cache. %s but object is from %s\n",
__func__, s->name, cachep->name);
WARN_ON_ONCE(1);
return s;
cachep = virt_to_cache(x);
WARN_ONCE(cachep && !slab_equal_or_root(cachep, s),
"%s: Wrong slab cache. %s but object is from %s\n",
__func__, s->name, cachep->name);
return cachep;
}
static inline size_t slab_ksize(const struct kmem_cache *s)
@@ -524,4 +607,24 @@ static inline int cache_random_seq_create(struct kmem_cache *cachep,
static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
/*
 * Decide whether an object allocated from @c with @flags must be zeroed.
 *
 * With init_on_alloc disabled this is simply "was __GFP_ZERO requested".
 * With it enabled: caches with a constructor are never zeroed, caches with
 * SLAB_TYPESAFE_BY_RCU or SLAB_POISON are zeroed only on explicit
 * __GFP_ZERO, and every other cache is always zeroed.
 */
static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
{
	const bool zero_requested = !!(flags & __GFP_ZERO);

	if (static_branch_unlikely(&init_on_alloc)) {
		if (c->ctor)
			return false;
		return (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ?
			zero_requested : true;
	}

	return zero_requested;
}
/*
 * Decide whether objects of cache @c must be zeroed when freed.
 *
 * Only true when init_on_free is enabled and the cache has neither a
 * constructor nor the SLAB_TYPESAFE_BY_RCU / SLAB_POISON flags.
 */
static inline bool slab_want_init_on_free(struct kmem_cache *c)
{
	if (static_branch_unlikely(&init_on_free)) {
		if (c->ctor)
			return false;
		return !(c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON));
	}

	return false;
}
#endif /* MM_SLAB_H */

Ver fichero

@@ -17,6 +17,7 @@
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -130,6 +131,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
#ifdef CONFIG_MEMCG_KMEM
LIST_HEAD(slab_root_caches);
static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
void slab_init_memcg_params(struct kmem_cache *s)
{
@@ -140,13 +144,18 @@ void slab_init_memcg_params(struct kmem_cache *s)
}
static int init_memcg_params(struct kmem_cache *s,
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
struct kmem_cache *root_cache)
{
struct memcg_cache_array *arr;
if (root_cache) {
int ret = percpu_ref_init(&s->memcg_params.refcnt,
kmemcg_cache_shutdown,
0, GFP_KERNEL);
if (ret)
return ret;
s->memcg_params.root_cache = root_cache;
s->memcg_params.memcg = memcg;
INIT_LIST_HEAD(&s->memcg_params.children_node);
INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
return 0;
@@ -171,6 +180,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
{
if (is_root_cache(s))
kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
else
percpu_ref_exit(&s->memcg_params.refcnt);
}
static void free_memcg_params(struct rcu_head *rcu)
@@ -221,11 +232,13 @@ int memcg_update_all_caches(int num_memcgs)
return ret;
}
void memcg_link_cache(struct kmem_cache *s)
void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
{
if (is_root_cache(s)) {
list_add(&s->root_caches_node, &slab_root_caches);
} else {
css_get(&memcg->css);
s->memcg_params.memcg = memcg;
list_add(&s->memcg_params.children_node,
&s->memcg_params.root_cache->memcg_params.children);
list_add(&s->memcg_params.kmem_caches_node,
@@ -240,11 +253,13 @@ static void memcg_unlink_cache(struct kmem_cache *s)
} else {
list_del(&s->memcg_params.children_node);
list_del(&s->memcg_params.kmem_caches_node);
mem_cgroup_put(s->memcg_params.memcg);
WRITE_ONCE(s->memcg_params.memcg, NULL);
}
}
#else
static inline int init_memcg_params(struct kmem_cache *s,
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
struct kmem_cache *root_cache)
{
return 0;
}
@@ -384,7 +399,7 @@ static struct kmem_cache *create_cache(const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
err = init_memcg_params(s, memcg, root_cache);
err = init_memcg_params(s, root_cache);
if (err)
goto out_free_cache;
@@ -394,7 +409,7 @@ static struct kmem_cache *create_cache(const char *name,
s->refcount = 1;
list_add(&s->list, &slab_caches);
memcg_link_cache(s);
memcg_link_cache(s, memcg);
out:
if (err)
return ERR_PTR(err);
@@ -640,7 +655,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying)
if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -677,7 +692,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
}
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* Since readers won't lock (see memcg_kmem_get_cache()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
* initialized.
*/
@@ -691,74 +706,95 @@ out_unlock:
put_online_cpus();
}
static void kmemcg_deactivate_workfn(struct work_struct *work)
static void kmemcg_workfn(struct work_struct *work)
{
struct kmem_cache *s = container_of(work, struct kmem_cache,
memcg_params.deact_work);
memcg_params.work);
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
s->memcg_params.deact_fn(s);
s->memcg_params.work_fn(s);
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
css_put(&s->memcg_params.memcg->css);
}
static void kmemcg_deactivate_rcufn(struct rcu_head *head)
static void kmemcg_rcufn(struct rcu_head *head)
{
struct kmem_cache *s = container_of(head, struct kmem_cache,
memcg_params.deact_rcu_head);
memcg_params.rcu_head);
/*
* We need to grab blocking locks. Bounce to ->deact_work. The
* We need to grab blocking locks. Bounce to ->work. The
* work item shares the space with the RCU head and can't be
* initialized earlier.
*/
INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
}
/**
* slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
* sched RCU grace period
* @s: target kmem_cache
* @deact_fn: deactivation function to call
*
* Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
* held after a sched RCU grace period. The slab is guaranteed to stay
* alive until @deact_fn is finished. This is to be used from
* __kmemcg_cache_deactivate().
*/
void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *))
static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
{
if (WARN_ON_ONCE(is_root_cache(s)) ||
WARN_ON_ONCE(s->memcg_params.deact_fn))
return;
if (s->memcg_params.root_cache->memcg_params.dying)
return;
/* pin memcg so that @s doesn't get destroyed in the middle */
css_get(&s->memcg_params.memcg->css);
s->memcg_params.deact_fn = deact_fn;
call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
WARN_ON(shutdown_cache(s));
}
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
/*
 * Release callback of a memcg cache's percpu refcount (it is the callback
 * passed to percpu_ref_init() for memcg_params.refcnt).
 *
 * Schedules kmemcg_cache_shutdown_fn() for the cache on
 * memcg_kmem_cache_wq, unless the root cache is already marked dying, in
 * which case nothing is queued.
 */
static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
{
/* Recover the kmem_cache that embeds this refcount. */
struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
memcg_params.refcnt);
unsigned long flags;
/*
 * The dying check and the queueing are done under memcg_kmem_wq_lock.
 * NOTE(review): the irqsave variant suggests this may run with IRQs
 * already disabled - confirm against the percpu_ref release context.
 */
spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
if (s->memcg_params.root_cache->memcg_params.dying)
goto unlock;
/* kmemcg_workfn() will invoke work_fn from the workqueue. */
s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
unlock:
spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
}
/*
 * Second stage of memcg cache deactivation; installed as work_fn by
 * kmemcg_cache_deactivate() and run after the RCU callback has bounced to
 * the workqueue.
 */
static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
/* Allocator-specific part of the post-RCU deactivation. */
__kmemcg_cache_deactivate_after_rcu(s);
/*
 * Kill the cache's percpu refcount.
 * NOTE(review): presumably this lets kmemcg_cache_shutdown() run once the
 * remaining references are dropped - confirm against percpu_ref docs.
 */
percpu_ref_kill(&s->memcg_params.refcnt);
}
/*
 * First stage of deactivating memcg cache @s.
 *
 * Warns once and does nothing for root caches. Otherwise runs the
 * allocator-specific __kmemcg_cache_deactivate(), marks the cache
 * SLAB_DEACTIVATED, and - unless the root cache is dying - schedules
 * kmemcg_cache_deactivate_after_rcu() via call_rcu().
 */
static void kmemcg_cache_deactivate(struct kmem_cache *s)
{
if (WARN_ON_ONCE(is_root_cache(s)))
return;
__kmemcg_cache_deactivate(s);
s->flags |= SLAB_DEACTIVATED;
/*
* memcg_kmem_wq_lock is used to synchronize memcg_params.dying
* flag and make sure that no new kmem_cache deactivation tasks
* are queued (see flush_memcg_workqueue() ).
*/
spin_lock_irq(&memcg_kmem_wq_lock);
if (s->memcg_params.root_cache->memcg_params.dying)
goto unlock;
/* kmemcg_rcufn() bounces to a work item that calls work_fn. */
s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
unlock:
spin_unlock_irq(&memcg_kmem_wq_lock);
}
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
{
int idx;
struct memcg_cache_array *arr;
struct kmem_cache *s, *c;
unsigned int nr_reparented;
idx = memcg_cache_id(memcg);
@@ -773,30 +809,20 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
if (!c)
continue;
__kmemcg_cache_deactivate(c);
kmemcg_cache_deactivate(c);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
}
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *s, *s2;
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
memcg_params.kmem_caches_node) {
/*
* The cgroup is about to be freed and therefore has no charges
* left. Hence, all its caches must be empty by now.
*/
BUG_ON(shutdown_cache(s));
nr_reparented = 0;
list_for_each_entry(s, &memcg->kmem_caches,
memcg_params.kmem_caches_node) {
WRITE_ONCE(s->memcg_params.memcg, parent);
css_put(&memcg->css);
nr_reparented++;
}
if (nr_reparented) {
list_splice_init(&memcg->kmem_caches,
&parent->kmem_caches);
css_get_many(&parent->css, nr_reparented);
}
mutex_unlock(&slab_mutex);
@@ -861,16 +887,15 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
static void flush_memcg_workqueue(struct kmem_cache *s)
{
mutex_lock(&slab_mutex);
spin_lock_irq(&memcg_kmem_wq_lock);
s->memcg_params.dying = true;
mutex_unlock(&slab_mutex);
spin_unlock_irq(&memcg_kmem_wq_lock);
/*
* SLUB deactivates the kmem_caches through call_rcu. Make
* SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
* sure all registered rcu callbacks have been invoked.
*/
if (IS_ENABLED(CONFIG_SLUB))
rcu_barrier();
rcu_barrier();
/*
* SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
@@ -997,7 +1022,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags, useroffset, usersize);
list_add(&s->list, &slab_caches);
memcg_link_cache(s);
memcg_link_cache(s, NULL);
s->refcount = 1;
return s;
}
@@ -1498,6 +1523,64 @@ static int __init slab_proc_init(void)
return 0;
}
module_init(slab_proc_init);
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
/*
 * Display information about kmem caches that have child memcg caches.
 *
 * seq_file show routine: prints a header line, then for every root cache
 * with at least one memcg child a "root" line followed by one line per
 * child cache. Each line carries active/total object and slab counts; child
 * lines also carry the css id and a ":dead" / ":deact" status suffix.
 * The whole walk runs under slab_mutex. Always returns 0.
 */
static int memcg_slabinfo_show(struct seq_file *m, void *unused)
{
struct kmem_cache *s, *c;
struct slabinfo sinfo;
mutex_lock(&slab_mutex);
/* Column header, emitted in two pieces. */
seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
seq_puts(m, " <active_slabs> <num_slabs>\n");
list_for_each_entry(s, &slab_root_caches, root_caches_node) {
/*
* Skip kmem caches that don't have any memcg children.
*/
if (list_empty(&s->memcg_params.children))
continue;
/* Start from a zeroed slabinfo before filling it for the root cache. */
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(s, &sinfo);
seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n",
cache_name(s), sinfo.active_objs, sinfo.num_objs,
sinfo.active_slabs, sinfo.num_slabs);
for_each_memcg_cache(c, s) {
struct cgroup_subsys_state *css;
char *status = "";
css = &c->memcg_params.memcg->css;
/* ":dead" (css not online) takes precedence over ":deact". */
if (!(css->flags & CSS_ONLINE))
status = ":dead";
else if (c->flags & SLAB_DEACTIVATED)
status = ":deact";
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(c, &sinfo);
seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
cache_name(c), css->id, status,
sinfo.active_objs, sinfo.num_objs,
sinfo.active_slabs, sinfo.num_slabs);
}
}
mutex_unlock(&slab_mutex);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
/*
 * Create the "memcg_slabinfo" debugfs file with mode S_IFREG | S_IRUGO and
 * memcg_slabinfo_fops as its file operations.  The value returned by
 * debugfs_create_file() is not inspected; this function always returns 0.
 */
static int __init memcg_slabinfo_init(void)
{
	debugfs_create_file("memcg_slabinfo",
			    S_IFREG | S_IRUGO,
			    NULL,
			    NULL,
			    &memcg_slabinfo_fops);

	return 0;
}
late_initcall(memcg_slabinfo_init);
#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
@@ -1597,6 +1680,52 @@ void kzfree(const void *p)
}
EXPORT_SYMBOL(kzfree);
/**
 * ksize - report the actual amount of memory backing an object
 * @objp: Pointer to the object
 *
 * kmalloc() may round a request up internally and hand back more memory
 * than was asked for.  ksize() tells the caller how much memory is really
 * there; the caller may use all of it, even if a smaller size was passed
 * to kmalloc() originally.
 *
 * @objp must point to a valid object previously allocated with either
 * kmalloc() or kmem_cache_alloc(), and that object must not be freed for
 * the duration of the call.
 *
 * Return: size of the actual memory used by @objp in bytes, or 0 when
 * @objp is NULL (a one-time warning is emitted), is ZERO_SIZE_PTR, or
 * fails the KASAN read check.
 */
size_t ksize(const void *objp)
{
	size_t usable;

	if (WARN_ON_ONCE(!objp))
		return 0;

	if (unlikely(objp == ZERO_SIZE_PTR))
		return 0;

	/*
	 * Validate the object before touching its shadow memory.  Checking
	 * with __kasan_check_read() here yields a report at the time ksize()
	 * is called, rather than later when behaviour is already undefined
	 * because of a potential use-after-free or double-free.
	 *
	 * Returning 0 for invalid memory keeps ksize() users from writing to,
	 * and possibly corrupting, that region.  The check must come before
	 * __ksize() so that __ksize() is not asked to read invalid metadata.
	 */
	if (!__kasan_check_read(objp, 1))
		return 0;

	usable = __ksize(objp);

	/*
	 * Callers are assumed to use the whole allocated area, so all of it
	 * has to be unpoisoned.
	 */
	kasan_unpoison_shadow(objp, usable);

	return usable;
}
EXPORT_SYMBOL(ksize);
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);

Ver fichero

@@ -527,7 +527,7 @@ void kfree(const void *block)
EXPORT_SYMBOL(kfree);
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t ksize(const void *block)
size_t __ksize(const void *block)
{
struct page *sp;
int align;
@@ -545,7 +545,7 @@ size_t ksize(const void *block)
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(ksize);
EXPORT_SYMBOL(__ksize);
int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{

Ver fichero

@@ -1279,6 +1279,10 @@ check_slabs:
if (*str == ',')
slub_debug_slabs = str + 1;
out:
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
return 1;
}
@@ -1313,9 +1317,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
char *end, *glob;
size_t cmplen;
end = strchr(iter, ',');
if (!end)
end = iter + strlen(iter);
end = strchrnul(iter, ',');
glob = strnchr(iter, end - iter, '*');
if (glob)
@@ -1424,6 +1426,28 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void **head, void **tail)
{
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
int rsize;
if (slab_want_init_on_free(s))
do {
object = next;
next = get_freepointer(s, object);
/*
* Clear the object and the metadata, but don't touch
* the redzone.
*/
memset(object, 0, s->object_size);
rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
: 0;
memset((char *)object + s->inuse, 0,
s->size - s->inuse - rsize);
set_freepointer(s, object, next);
} while (object != old_tail);
/*
* Compiler cannot detect this function can be removed if slab_free_hook()
* evaluates to nothing. Thus, catch all relevant config debug options here.
@@ -1433,9 +1457,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
defined(CONFIG_DEBUG_OBJECTS_FREE) || \
defined(CONFIG_KASAN)
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
next = *head;
/* Head and tail of the reconstructed freelist */
*head = NULL;
@@ -1490,7 +1512,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
else
page = __alloc_pages_node(node, flags, order);
if (page && memcg_charge_slab(page, flags, order, s)) {
if (page && charge_slab_page(page, flags, order, s)) {
__free_pages(page, order);
page = NULL;
}
@@ -1683,11 +1705,6 @@ out:
if (!page)
return NULL;
mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1 << oo_order(oo));
inc_slabs_node(s, page_to_nid(page), page->objects);
return page;
@@ -1721,18 +1738,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
check_object(s, page, p, SLUB_RED_INACTIVE);
}
mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
memcg_uncharge_slab(page, order, s);
uncharge_slab_page(page, order, s);
__free_pages(page, order);
}
@@ -2741,8 +2753,14 @@ redo:
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
/*
* If the object has been wiped upon free, make sure it's fully
* initialized by zeroing out freelist pointer.
*/
if (unlikely(slab_want_init_on_free(s)) && object)
memset(object + s->offset, 0, sizeof(void *));
if (unlikely(gfpflags & __GFP_ZERO) && object)
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
slab_post_alloc_hook(s, gfpflags, 1, &object);
@@ -3163,7 +3181,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_irq_enable();
/* Clear memory outside IRQ disabled fastpath loop */
if (unlikely(flags & __GFP_ZERO)) {
if (unlikely(slab_want_init_on_alloc(flags, s))) {
int j;
for (j = 0; j < i; j++)
@@ -3652,10 +3670,6 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
s->name, s->size, s->size,
oo_order(s->oo), s->offset, (unsigned long)flags);
return -EINVAL;
}
@@ -3901,7 +3915,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
}
#endif /* CONFIG_HARDENED_USERCOPY */
static size_t __ksize(const void *object)
size_t __ksize(const void *object)
{
struct page *page;
@@ -3917,17 +3931,7 @@ static size_t __ksize(const void *object)
return slab_ksize(page->slab_cache);
}
size_t ksize(const void *object)
{
size_t size = __ksize(object);
/* We assume that ksize callers could use whole allocated area,
* so we need to unpoison this area.
*/
kasan_unpoison_shadow(object, size);
return size;
}
EXPORT_SYMBOL(ksize);
EXPORT_SYMBOL(__ksize);
void kfree(const void *x)
{
@@ -4024,7 +4028,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
}
#ifdef CONFIG_MEMCG
static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
/*
* Called with all the locks held after a sched RCU grace period.
@@ -4050,12 +4054,6 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
*/
slub_set_cpu_partial(s, 0);
s->min_partial = 0;
/*
* s->cpu_partial is checked locklessly (see put_cpu_partial), so
* we have to make sure the change is visible before shrinking.
*/
slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
#endif /* CONFIG_MEMCG */
@@ -4215,7 +4213,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
}
slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
memcg_link_cache(s);
memcg_link_cache(s, NULL);
return s;
}

Ver fichero

@@ -73,23 +73,24 @@ unsigned long total_swapcache_pages(void)
unsigned int i, j, nr;
unsigned long ret = 0;
struct address_space *spaces;
struct swap_info_struct *si;
rcu_read_lock();
for (i = 0; i < MAX_SWAPFILES; i++) {
/*
* The corresponding entries in nr_swapper_spaces and
* swapper_spaces will be reused only after at least
* one grace period. So it is impossible for them
* belongs to different usage.
*/
nr = nr_swapper_spaces[i];
spaces = rcu_dereference(swapper_spaces[i]);
if (!nr || !spaces)
swp_entry_t entry = swp_entry(i, 1);
/* Avoid get_swap_device() to warn for bad swap entry */
if (!swp_swap_info(entry))
continue;
/* Prevent swapoff to free swapper_spaces */
si = get_swap_device(entry);
if (!si)
continue;
nr = nr_swapper_spaces[i];
spaces = swapper_spaces[i];
for (j = 0; j < nr; j++)
ret += spaces[j].nrpages;
put_swap_device(si);
}
rcu_read_unlock();
return ret;
}
@@ -310,8 +311,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
struct swap_info_struct *si;
si = get_swap_device(entry);
if (!si)
return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
@@ -354,8 +360,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
struct page *found_page, *new_page = NULL;
struct address_space *swapper_space = swap_address_space(entry);
struct page *found_page = NULL, *new_page = NULL;
struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -365,7 +371,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
found_page = find_get_page(swapper_space, swp_offset(entry));
si = get_swap_device(entry);
if (!si)
break;
found_page = find_get_page(swap_address_space(entry),
swp_offset(entry));
put_swap_device(si);
if (found_page)
break;
@@ -601,20 +612,16 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
mapping_set_no_writeback_tags(space);
}
nr_swapper_spaces[type] = nr;
rcu_assign_pointer(swapper_spaces[type], spaces);
swapper_spaces[type] = spaces;
return 0;
}
void exit_swap_address_space(unsigned int type)
{
struct address_space *spaces;
spaces = swapper_spaces[type];
kvfree(swapper_spaces[type]);
nr_swapper_spaces[type] = 0;
rcu_assign_pointer(swapper_spaces[type], NULL);
synchronize_rcu();
kvfree(spaces);
swapper_spaces[type] = NULL;
}
static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,

Ver fichero

@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
return ret;
}
/*
 * Return the swap_extent that embeds the node rb_first() yields for
 * @sis->swap_extent_root.  The rb_first() result is converted without a
 * NULL check.
 */
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	return rb_entry(rb_first(&sis->swap_extent_root),
			struct swap_extent, rb_node);
}
/*
 * Return the swap_extent that follows @se in rb_next() order, or NULL when
 * rb_next() reports no further node.
 */
static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *succ = rb_next(&se->rb_node);

	if (!succ)
		return NULL;

	return rb_entry(succ, struct swap_extent, rb_node);
}
/*
* swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
int err = 0;
/* Do not discard the swap header page! */
se = &si->first_swap_extent;
se = first_se(si);
start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
cond_resched();
}
list_for_each_entry(se, &si->first_swap_extent.list, list) {
for (se = next_se(se); se; se = next_se(se)) {
start_block = se->start_block << (PAGE_SHIFT - 9);
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
return err; /* That will often be -EOPNOTSUPP */
}
/*
 * Find the swap extent of @sis whose page range
 * [start_page, start_page + nr_pages) contains @offset.
 *
 * The extent tree is descended from its root: left when @offset lies below
 * the current extent, right when it lies at or beyond its end.  Running off
 * the tree without a match hits BUG().
 */
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct rb_node *node = sis->swap_extent_root.rb_node;

	while (node) {
		struct swap_extent *cur;

		cur = rb_entry(node, struct swap_extent, rb_node);

		if (offset < cur->start_page) {
			node = node->rb_left;
			continue;
		}

		if (offset < cur->start_page + cur->nr_pages)
			return cur;

		node = node->rb_right;
	}

	/* It *must* be present */
	BUG();
}
/*
* swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
static void discard_swap_cluster(struct swap_info_struct *si,
pgoff_t start_page, pgoff_t nr_pages)
{
struct swap_extent *se = si->curr_swap_extent;
int found_extent = 0;
struct swap_extent *se = offset_to_swap_extent(si, start_page);
while (nr_pages) {
if (se->start_page <= start_page &&
start_page < se->start_page + se->nr_pages) {
pgoff_t offset = start_page - se->start_page;
sector_t start_block = se->start_block + offset;
sector_t nr_blocks = se->nr_pages - offset;
pgoff_t offset = start_page - se->start_page;
sector_t start_block = se->start_block + offset;
sector_t nr_blocks = se->nr_pages - offset;
if (nr_blocks > nr_pages)
nr_blocks = nr_pages;
start_page += nr_blocks;
nr_pages -= nr_blocks;
if (nr_blocks > nr_pages)
nr_blocks = nr_pages;
start_page += nr_blocks;
nr_pages -= nr_blocks;
if (!found_extent++)
si->curr_swap_extent = se;
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_NOIO, 0))
break;
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_NOIO, 0))
break;
}
se = list_next_entry(se, list);
se = next_se(se);
}
}
@@ -1079,12 +1104,11 @@ fail:
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset, type;
unsigned long offset;
if (!entry.val)
goto out;
type = swp_type(entry);
p = swap_type_to_swap_info(type);
p = swp_swap_info(entry);
if (!p)
goto bad_nofile;
if (!(p->flags & SWP_USED))
@@ -1187,6 +1211,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
return usage;
}
/*
* Check whether the swap entry is valid in the swap device. If so,
* return a pointer to its swap_info_struct and keep the swap entry valid
* by preventing the swap device from being swapped off until
* put_swap_device() is called. Otherwise return NULL.
*
* The entirety of the RCU read critical section must come before the
* return from or after the call to synchronize_rcu() in
* enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
* true, the si->map, si->cluster_info, etc. must be valid in the
* critical section.
*
* Notice that swapoff or swapoff+swapon can still happen before the
* rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
* in put_swap_device() if there isn't any other way to prevent
* swapoff, such as page lock, page table lock, etc. The caller must
* be prepared for that. For example, the following situation is
* possible.
*
* CPU1 CPU2
* do_swap_page()
* ... swapoff+swapon
* __read_swap_cache_async()
* swapcache_prepare()
* __swap_duplicate()
* // check swap_map
* // verify PTE not changed
*
* In __swap_duplicate(), the swap_map need to be checked before
* changing partly because the specified swap entry may be for another
* swap device which has been swapoff. And in do_swap_page(), after
* the page is read from the swap device, the PTE is verified not
* changed with the page table locked to check whether the swap device
* has been swapoff or swapoff+swapon.
*/
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	struct swap_info_struct *si;

	/* A zero entry is rejected silently. */
	if (!entry.val)
		return NULL;

	/* No swap_info for this entry: log it and give up. */
	si = swp_swap_info(entry);
	if (!si) {
		pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
		return NULL;
	}

	rcu_read_lock();

	/*
	 * On success the RCU read lock is deliberately left held; it is
	 * released by the rcu_read_unlock() in put_swap_device().
	 */
	if ((si->flags & SWP_VALID) && swp_offset(entry) < si->max)
		return si;

	/* Device not marked valid, or offset out of range: release and fail. */
	rcu_read_unlock();
	return NULL;
}
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1358,11 +1445,18 @@ int page_swapcount(struct page *page)
return count;
}
int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
int __swap_count(swp_entry_t entry)
{
struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
int count = 0;
return swap_count(si->swap_map[offset]);
si = get_swap_device(entry);
if (si) {
count = swap_count(si->swap_map[offset]);
put_swap_device(si);
}
return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1387,9 +1481,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
si = __swap_info_get(entry);
if (si)
si = get_swap_device(entry);
if (si) {
count = swap_swapcount(si, entry);
put_swap_device(si);
}
return count;
}
@@ -1684,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
return type;
}
if (bdev == sis->bdev) {
struct swap_extent *se = &sis->first_swap_extent;
struct swap_extent *se = first_se(sis);
if (se->start_block == offset) {
if (bdev_p)
@@ -2161,7 +2257,6 @@ static void drain_mmlist(void)
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
struct swap_info_struct *sis;
struct swap_extent *start_se;
struct swap_extent *se;
pgoff_t offset;
@@ -2169,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
*bdev = sis->bdev;
offset = swp_offset(entry);
start_se = sis->curr_swap_extent;
se = start_se;
for ( ; ; ) {
if (se->start_page <= offset &&
offset < (se->start_page + se->nr_pages)) {
return se->start_block + (offset - se->start_page);
}
se = list_next_entry(se, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
}
se = offset_to_swap_extent(sis, offset);
return se->start_block + (offset - se->start_page);
}
/*
@@ -2198,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
while (!list_empty(&sis->first_swap_extent.list)) {
struct swap_extent *se;
while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
struct rb_node *rb = sis->swap_extent_root.rb_node;
struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
se = list_first_entry(&sis->first_swap_extent.list,
struct swap_extent, list);
list_del(&se->list);
rb_erase(rb, &sis->swap_extent_root);
kfree(se);
}
@@ -2219,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
/*
* Add a block range (and the corresponding page range) into this swapdev's
* extent list. The extent list is kept sorted in page order.
* extent tree.
*
* This function rather assumes that it is called in ascending page order.
*/
@@ -2227,20 +2311,21 @@ int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
{
struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
struct swap_extent *se;
struct swap_extent *new_se;
struct list_head *lh;
if (start_page == 0) {
se = &sis->first_swap_extent;
sis->curr_swap_extent = se;
se->start_page = 0;
se->nr_pages = nr_pages;
se->start_block = start_block;
return 1;
} else {
lh = sis->first_swap_extent.list.prev; /* Highest extent */
se = list_entry(lh, struct swap_extent, list);
/*
* place the new node at the right most since the
* function is called in ascending page order.
*/
while (*link) {
parent = *link;
link = &parent->rb_right;
}
if (parent) {
se = rb_entry(parent, struct swap_extent, rb_node);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
/* Merge it */
@@ -2249,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
}
}
/*
* No merge. Insert a new extent, preserving ordering.
*/
/* No merge, insert a new extent. */
new_se = kmalloc(sizeof(*se), GFP_KERNEL);
if (new_se == NULL)
return -ENOMEM;
@@ -2259,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;
list_add_tail(&new_se->list, &sis->first_swap_extent.list);
rb_link_node(&new_se->rb_node, parent, link);
rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2335,9 +2419,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
static void setup_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
int i;
@@ -2362,7 +2446,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
}
static void _enable_swap_info(struct swap_info_struct *p)
{
p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2389,7 +2477,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p, prio, swap_map, cluster_info);
setup_swap_info(p, prio, swap_map, cluster_info);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/*
* Guarantee swap_map, cluster_info, etc. fields are valid
* between get/put_swap_device() if SWP_VALID bit is set
*/
synchronize_rcu();
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2398,7 +2496,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
_enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2501,6 +2600,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
spin_lock(&swap_lock);
spin_lock(&p->lock);
p->flags &= ~SWP_VALID; /* mark swap device as invalid */
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/*
* wait for swap operations protected by get/put_swap_device()
* to complete
*/
synchronize_rcu();
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -2749,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
* would be relying on p->type to remain valid.
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
for_each_node(i)
plist_node_init(&p->avail_lists[i], 0);
@@ -3265,17 +3375,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned char has_cache;
int err = -EINVAL;
if (non_swap_entry(entry))
goto out;
p = swp_swap_info(entry);
p = get_swap_device(entry);
if (!p)
goto bad_file;
goto out;
offset = swp_offset(entry);
if (unlikely(offset >= p->max))
goto out;
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3321,11 +3425,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
if (p)
put_swap_device(p);
return err;
bad_file:
pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
/*
@@ -3417,6 +3519,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3424,15 +3527,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
si = swap_info_get(entry);
si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
* __swap_duplicate(): the swap entry has been freed,
* perhaps even the whole swap_map cleared for swapoff.
* __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3450,9 +3553,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
unlock_cluster(ci);
spin_unlock(&si->lock);
return -ENOMEM;
ret = -ENOMEM;
goto out;
}
/*
@@ -3504,10 +3606,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
put_swap_device(si);
outer:
if (page)
__free_page(page);
return 0;
return ret;
}
/*

Ver fichero

@@ -300,53 +300,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
}
#endif
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
* Note a difference with get_user_pages_fast: this always returns the
* number of pages pinned, 0 if no pages were pinned.
* If the architecture does not support this function, simply return with no
* pages pinned.
*/
int __weak __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return 0;
}
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* get_user_pages_fast provides equivalent functionality to get_user_pages,
* operating on current and current->mm, with force=0 and vma=NULL. However
* unlike get_user_pages, it must be called without mmap_sem held.
*
* get_user_pages_fast may take mmap_sem and page table locks, so no
* assumptions can be made about lack of locking. get_user_pages_fast is to be
* implemented in a way that is advantageous (vs get_user_pages()) when the
* user memory area is already faulted in and present in ptes. However if the
* pages have to be faulted in, it may turn out to be slightly slower so
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*
* Return: number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, unsigned int gup_flags,
struct page **pages)
{
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)

Ver fichero

@@ -365,6 +365,13 @@ static LIST_HEAD(free_vmap_area_list);
*/
static struct rb_root free_vmap_area_root = RB_ROOT;
/*
* Preload a CPU with one object for "no edge" split case. The
* aim is to get rid of allocations from the atomic context, thus
* to use more permissive allocation masks.
*/
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
@@ -399,6 +406,13 @@ static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);
static atomic_long_t nr_vmalloc_pages;
/* Return the current value of the nr_vmalloc_pages counter. */
unsigned long vmalloc_nr_pages(void)
{
	unsigned long nr = atomic_long_read(&nr_vmalloc_pages);

	return nr;
}
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
@@ -527,20 +541,17 @@ link_va(struct vmap_area *va, struct rb_root *root,
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
/*
* During merging a VA node can be empty, therefore
* not linked with the tree nor list. Just check it.
*/
if (!RB_EMPTY_NODE(&va->rb_node)) {
if (root == &free_vmap_area_root)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
rb_erase(&va->rb_node, root);
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
return;
list_del(&va->list);
RB_CLEAR_NODE(&va->rb_node);
}
if (root == &free_vmap_area_root)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
rb_erase(&va->rb_node, root);
list_del(&va->list);
RB_CLEAR_NODE(&va->rb_node);
}
#if DEBUG_AUGMENT_PROPAGATE_CHECK
@@ -712,9 +723,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
/* Check and update the tree if needed. */
augment_tree_propagate_from(sibling);
/* Remove this VA, it has been merged. */
unlink_va(va, root);
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -739,12 +747,11 @@ merge_or_add_vmap_area(struct vmap_area *va,
/* Check and update the tree if needed. */
augment_tree_propagate_from(sibling);
/* Remove this VA, it has been merged. */
unlink_va(va, root);
if (merged)
unlink_va(va, root);
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
return;
}
}
@@ -951,9 +958,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
* L V NVA V R
* |---|-------|---|
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (unlikely(!lva))
return -1;
lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
if (unlikely(!lva)) {
/*
* For percpu allocator we do not do any pre-allocation
* and leave it as it is. The reason is it most likely
* never ends up with NE_FIT_TYPE splitting. In case of
* percpu allocations offsets and sizes are aligned to
* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
* are its main fitting cases.
*
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
return -1;
}
/*
* Build the remainder.
@@ -986,7 +1008,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
*/
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend, int node)
unsigned long vstart, unsigned long vend)
{
unsigned long nva_start_addr;
struct vmap_area *va;
@@ -1032,7 +1054,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
struct vmap_area *va, *pva;
unsigned long addr;
int purged = 0;
@@ -1057,13 +1079,38 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
retry:
/*
* Preload this CPU with one extra vmap_area object to ensure
* that we have it available when fit type of free area is
* NE_FIT_TYPE.
*
* The preload is done in non-atomic context, thus it allows us
* to use more permissive allocation masks to be more stable under
* low memory condition and high memory pressure.
*
* Even if it fails we do not really care about that. Just proceed
* as it is. "overflow" path will refill the cache we allocate from.
*/
preempt_disable();
if (!__this_cpu_read(ne_fit_preload_node)) {
preempt_enable();
pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
preempt_disable();
if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
if (pva)
kmem_cache_free(vmap_area_cachep, pva);
}
}
spin_lock(&vmap_area_lock);
preempt_enable();
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
addr = __alloc_vmap_area(size, align, vstart, vend, node);
addr = __alloc_vmap_area(size, align, vstart, vend);
if (unlikely(addr == vend))
goto overflow;
@@ -1119,8 +1166,6 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
/*
* Remove from the busy tree/list.
*/
@@ -2199,6 +2244,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
BUG_ON(!page);
__free_pages(page, 0);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
kvfree(area->pages);
}
@@ -2376,12 +2422,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
}
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (map_vm_area(area, prot, pages))
goto fail;
@@ -2774,7 +2822,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
* any informaion, as /dev/kmem.
* any information, as /dev/kmem.
*
* Return: number of bytes for which addr and buf should be increased
* (same number as @count) or %0 if [addr...addr+count) doesn't
@@ -2853,7 +2901,7 @@ finished:
* Note: In usual ops, vwrite() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
* any informaion, as /dev/kmem.
* any information, as /dev/kmem.
*
* Return: number of bytes for which addr and buf should be
* increased (same number as @count) or %0 if [addr...addr+count)
@@ -2996,7 +3044,7 @@ void __weak vmalloc_sync_all(void)
}
static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
static int f(pte_t *pte, unsigned long addr, void *data)
{
pte_t ***p = data;

Ver fichero

@@ -1118,6 +1118,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
@@ -1129,7 +1130,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
VM_BUG_ON_PAGE(PageActive(page), page);
sc->nr_scanned++;
nr_pages = 1 << compound_order(page);
/* Account for the number of base pages, even for a THP */
sc->nr_scanned += nr_pages;
if (unlikely(!page_evictable(page)))
goto activate_locked;
@@ -1137,11 +1141,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
if ((page_mapped(page) || PageSwapCache(page)) &&
!(PageAnon(page) && !PageSwapBacked(page)))
sc->nr_scanned++;
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -1255,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
stat->nr_ref_keep++;
stat->nr_ref_keep += nr_pages;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1287,7 +1286,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (!add_to_swap(page)) {
if (!PageTransHuge(page))
goto activate_locked;
goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_huge_page_to_list(page,
page_list))
@@ -1296,7 +1295,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(page))
goto activate_locked;
goto activate_locked_split;
}
may_enter_fs = 1;
@@ -1310,6 +1309,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
}
/*
* The THP may have been split above; subtract the tail pages and
* reset nr_pages to avoid accounting the tail pages twice.
*
* The tail pages that were successfully added to the swap cache
* reach here.
*/
if ((nr_pages > 1) && !PageTransHuge(page)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
@@ -1320,7 +1331,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
stat->nr_unmap_fail++;
stat->nr_unmap_fail += nr_pages;
goto activate_locked;
}
}
@@ -1447,7 +1458,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
unlock_page(page);
free_it:
nr_reclaimed++;
/*
* A THP may be swapped out as a whole, so account for
* all of its base pages.
*/
nr_reclaimed += nr_pages;
/*
* Is there need to periodically free_page_list? It would
@@ -1460,6 +1475,15 @@ free_it:
list_add(&page->lru, &free_pages);
continue;
activate_locked_split:
/*
* The tail pages that failed to be added to the swap cache
* reach here. Fix up nr_scanned and nr_pages.
*/
if (nr_pages > 1) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
@@ -1469,8 +1493,7 @@ activate_locked:
if (!PageMlocked(page)) {
int type = page_is_file_cache(page);
SetPageActive(page);
pgactivate++;
stat->nr_activate[type] += hpage_nr_pages(page);
stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1480,6 +1503,8 @@ keep:
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_unref_page_list(&free_pages);
@@ -1651,10 +1676,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
LIST_HEAD(pages_skipped);
isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
total_scan = 0;
scan = 0;
for (total_scan = 0;
scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
total_scan++) {
while (scan < nr_to_scan && !list_empty(src)) {
struct page *page;
page = lru_to_page(src);
@@ -1662,9 +1686,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
VM_BUG_ON_PAGE(!PageLRU(page), page);
nr_pages = 1 << compound_order(page);
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
list_move(&page->lru, &pages_skipped);
nr_skipped[page_zonenum(page)]++;
nr_skipped[page_zonenum(page)] += nr_pages;
continue;
}
@@ -1673,11 +1700,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
*
* Account for all tail pages of a THP. This would not cause
* premature OOM since __isolate_lru_page() returns -EBUSY
* only when the page is being freed somewhere else.
*/
scan++;
scan += nr_pages;
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
@@ -2125,7 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
struct scan_control *sc, bool actual_reclaim)
struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2181,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
* rid of the stale workingset quickly.
*/
refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
if (file && actual_reclaim && lruvec->refaults != refaults) {
if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2191,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive_ratio = 1;
}
if (actual_reclaim)
if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,

Ver fichero

@@ -924,7 +924,16 @@ retry:
set_bit(PAGE_HEADLESS, &page->private);
goto headless;
}
__SetPageMovable(page, pool->inode->i_mapping);
if (can_sleep) {
lock_page(page);
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
} else {
if (trylock_page(page)) {
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
}
}
z3fold_page_lock(zhdr);
found:
@@ -1331,6 +1340,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);