memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance bottleneck. One strong technique to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge characteristics, - charge is done one by one via demand-paging. - uncharge is done - in chunks at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is for coalescing uncharges. To avoid scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. The reason for per-task batching is to make use of the caller's context information. We do batched uncharge (delayed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/truncate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Thus, we will not coalesce too much. On an x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per cpu by taskset and see the sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amount of improvement. (root cgroup isn't affected by this patch) Another patch for "charge" will follow this and the above will be improved more. Changelog(since 2009/10/02): - renamed fields of memcg_batch (as pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialize code to copy_process(). 
(possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added comments. - make ->do_batch as bool. - removed css_get() et al. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:

committed by
Linus Torvalds

parent
cd9b45b78a
commit
569b846df5
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
|
||||
css_put(&mem->css);
|
||||
}
|
||||
|
||||
static void
|
||||
__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
|
||||
{
|
||||
struct memcg_batch_info *batch = NULL;
|
||||
bool uncharge_memsw = true;
|
||||
/* If swapout, usage of swap doesn't decrease */
|
||||
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
|
||||
uncharge_memsw = false;
|
||||
/*
|
||||
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
|
||||
* In those cases, all pages freed continously can be expected to be in
|
||||
* the same cgroup and we have chance to coalesce uncharges.
|
||||
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
|
||||
* because we want to do uncharge as soon as possible.
|
||||
*/
|
||||
if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
|
||||
goto direct_uncharge;
|
||||
|
||||
batch = ¤t->memcg_batch;
|
||||
/*
|
||||
* In usual, we do css_get() when we remember memcg pointer.
|
||||
* But in this case, we keep res->usage until end of a series of
|
||||
* uncharges. Then, it's ok to ignore memcg's refcnt.
|
||||
*/
|
||||
if (!batch->memcg)
|
||||
batch->memcg = mem;
|
||||
/*
|
||||
* In typical case, batch->memcg == mem. This means we can
|
||||
* merge a series of uncharges to an uncharge of res_counter.
|
||||
* If not, we uncharge res_counter ony by one.
|
||||
*/
|
||||
if (batch->memcg != mem)
|
||||
goto direct_uncharge;
|
||||
/* remember freed charge and uncharge it later */
|
||||
batch->bytes += PAGE_SIZE;
|
||||
if (uncharge_memsw)
|
||||
batch->memsw_bytes += PAGE_SIZE;
|
||||
return;
|
||||
direct_uncharge:
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
if (uncharge_memsw)
|
||||
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* uncharge if !page_mapped(page)
|
||||
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!mem_cgroup_is_root(mem)) {
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
if (do_swap_account &&
|
||||
(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
|
||||
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
||||
}
|
||||
if (!mem_cgroup_is_root(mem))
|
||||
__do_uncharge(mem, ctype);
|
||||
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
|
||||
mem_cgroup_swap_statistics(mem, true);
|
||||
mem_cgroup_charge_statistics(mem, pc, false);
|
||||
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Batch_start/batch_end are called in unmap_page_range/invalidate/truncate.
|
||||
* In those cases, pages are freed continuously and we can expect pages
|
||||
* are in the same memcg. All these calls itself limits the number of
|
||||
* pages freed at once, then uncharge_start/end() is called properly.
|
||||
* This may be called multiple (nested) times in a context.
|
||||
*/
|
||||
|
||||
void mem_cgroup_uncharge_start(void)
|
||||
{
|
||||
current->memcg_batch.do_batch++;
|
||||
/* We can do nest. */
|
||||
if (current->memcg_batch.do_batch == 1) {
|
||||
current->memcg_batch.memcg = NULL;
|
||||
current->memcg_batch.bytes = 0;
|
||||
current->memcg_batch.memsw_bytes = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void mem_cgroup_uncharge_end(void)
|
||||
{
|
||||
struct memcg_batch_info *batch = ¤t->memcg_batch;
|
||||
|
||||
if (!batch->do_batch)
|
||||
return;
|
||||
|
||||
batch->do_batch--;
|
||||
if (batch->do_batch) /* If stacked, do nothing. */
|
||||
return;
|
||||
|
||||
if (!batch->memcg)
|
||||
return;
|
||||
/*
|
||||
* This "batch->memcg" is valid without any css_get/put etc...
|
||||
* bacause we hide charges behind us.
|
||||
*/
|
||||
if (batch->bytes)
|
||||
res_counter_uncharge(&batch->memcg->res, batch->bytes);
|
||||
if (batch->memsw_bytes)
|
||||
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
|
||||
/* forget this pointer (for sanity check) */
|
||||
batch->memcg = NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
/*
|
||||
* called after __delete_from_swap_cache() and drop "page" account.
|
||||
|
Reference in New Issue
Block a user