diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 7d73882e2c27..a48baf202265 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
+- extra_free_kbytes
 - hugetlb_shm_group
 - laptop_mode
 - legacy_va_layout
@@ -274,6 +275,21 @@ any throttling.
 
 ==============================================================
 
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
 hugetlb_shm_group
 
 hugetlb_shm_group contains group id that is allowed to create SysV
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cc02050fd0c4..8c7635ecb752 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,6 +106,7 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 #endif
 extern int pid_max;
+extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int percpu_pagelist_fraction;
 extern int latencytop_enabled;
@@ -1459,6 +1460,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &one,
 		.extra2		= &one_thousand,
 	},
+	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
 	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2ef1c17942f..2c543599c489 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -261,10 +261,22 @@ compound_page_dtor * const compound_page_dtors[] = {
 #endif
 };
 
+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
 int watermark_scale_factor = 10;
 
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
 static unsigned long nr_kernel_pages __meminitdata;
 static unsigned long nr_all_pages __meminitdata;
 static unsigned long dma_reserve __meminitdata;
@@ -7214,6 +7226,7 @@ static void setup_per_zone_lowmem_reserve(void)
 static void __setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -7225,11 +7238,14 @@ static void __setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->managed_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->managed_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->managed_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -7250,7 +7266,7 @@
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
 		/*
@@ -7258,12 +7274,14 @@
 		 * scale factor in proportion to available memory, but
 		 * ensure a minimum size on small systems.
 		 */
-		tmp = max_t(u64, tmp >> 2,
+		min = max_t(u64, min >> 2,
 			    mult_frac(zone->managed_pages,
 				      watermark_scale_factor, 10000));
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					      low + min;
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					      low + min * 2;
 
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -7346,7 +7364,7 @@ core_initcall(init_per_zone_wmark_min)
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
  *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ *	or extra_free_kbytes changes.
  */
 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
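
To make the new watermark arithmetic in __setup_per_zone_wmarks() easier to
follow, here is a minimal standalone userspace sketch of the same calculation.
It is not part of the patch: the 4 KiB page size, the two zone sizes and the
tunable values are made-up assumptions for illustration, and the names
managed[], lowmem_pages and vm_total_pages merely mirror their kernel
counterparts.

	/*
	 * Standalone sketch of the per-zone watermark arithmetic in
	 * __setup_per_zone_wmarks() after this patch.  All sizes and
	 * tunable values below are assumptions for illustration only.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12	/* assumed: 4 KiB pages */

	int main(void)
	{
		/* assumed tunables: extra_free_kbytes sized for a 200 MB burst */
		uint64_t min_free_kbytes        = 67584;
		uint64_t extra_free_kbytes      = 204800;
		uint64_t watermark_scale_factor = 10;

		/* two hypothetical lowmem zones, sizes in pages (1 GiB + 3 GiB) */
		uint64_t managed[2]     = { 262144, 786432 };
		uint64_t lowmem_pages   = managed[0] + managed[1];
		uint64_t vm_total_pages = lowmem_pages;

		uint64_t pages_min = min_free_kbytes   >> (PAGE_SHIFT - 10);
		uint64_t pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);

		for (int i = 0; i < 2; i++) {
			/* min is split across zones in proportion to zone size;
			 * low (the extra_free_kbytes share) is divided by
			 * vm_total_pages, as in the patched kernel code */
			uint64_t min = pages_min * managed[i] / lowmem_pages;
			uint64_t low = pages_low * managed[i] / vm_total_pages;
			uint64_t wmark_min = min;

			/* distance between the watermarks: at least a quarter of
			 * min, or watermark_scale_factor/10000 of the zone,
			 * whichever is larger */
			uint64_t gap    = min >> 2;
			uint64_t scaled = managed[i] * watermark_scale_factor / 10000;
			if (scaled > gap)
				gap = scaled;

			uint64_t wmark_low  = wmark_min + low + gap;
			uint64_t wmark_high = wmark_min + low + gap * 2;

			printf("zone %d: min=%llu low=%llu high=%llu (pages)\n", i,
			       (unsigned long long)wmark_min,
			       (unsigned long long)wmark_low,
			       (unsigned long long)wmark_high);
		}
		return 0;
	}

Once the patch is applied, the tunable can be adjusted at runtime, for example
with sysctl vm.extra_free_kbytes=204800 or by writing to
/proc/sys/vm/extra_free_kbytes; because it shares min_free_kbytes_sysctl_handler
with min_free_kbytes, the per-zone watermarks are recomputed as soon as the
value is written.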