mm: memcontrol: fix excessive complexity in memory.stat reporting
We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache.

Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this:

	nr_cgroups * nr_stat_items * nr_possible_cpus

where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous, setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale.

Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented.

Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller.

Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity.

[hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls]
  Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org
Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit a983b5ebee
parent 284542656e
committed by Linus Torvalds
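The core idea of the patch - accumulate small deltas per CPU and fold them into a shared atomic counter only once they exceed a fixed threshold - can be shown in a minimal, self-contained userspace sketch. This is not the kernel code: the names (counter_add, counter_read, THRESHOLD) and the C11 atomic/thread-local plumbing are hypothetical stand-ins for the memcg per-cpu counters, the shared atomic_long_t fields, and MEMCG_CHARGE_BATCH.

/* Hypothetical sketch of threshold-based counter batching; not kernel code. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define THRESHOLD 32             /* stand-in for MEMCG_CHARGE_BATCH */

static atomic_long shared;       /* stand-in for the shared atomic_long_t */
static _Thread_local long delta; /* stand-in for the per-cpu counter */

/* Write path: accumulate locally; spill to the shared counter only
 * when the local error would exceed the threshold. */
static void counter_add(long val)
{
	long x = delta + val;

	if (labs(x) > THRESHOLD) {
		atomic_fetch_add(&shared, x);
		x = 0;
	}
	delta = x;
}

/* Read path: look only at the shared counter. The result may lag by
 * at most THRESHOLD per thread/CPU, but reading is O(1) per item
 * instead of O(nr_cpus). */
static long counter_read(void)
{
	long x = atomic_load(&shared);

	return x < 0 ? 0 : x;
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		counter_add(1);
	printf("read: %ld (true total 1000, error <= %d)\n",
	       counter_read(), THRESHOLD);
	return 0;
}

The payoff is on the read side: memory.stat now reads one atomic per stat item per cgroup instead of summing nr_possible_cpus per-cpu slots per item - with ~70 items and 128 CPUs, roughly 9,000 per-cpu reads per cgroup shrink to ~70 atomic reads, at the cost of a bounded per-cpu error of at most the batch size.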
include/linux/memcontrol.h
@@ -108,7 +108,10 @@ struct lruvec_stat {
  */
 struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
-	struct lruvec_stat __percpu *lruvec_stat;
+
+	struct lruvec_stat __percpu *lruvec_stat_cpu;
+	atomic_long_t		lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
 	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
@@ -227,10 +230,10 @@ struct mem_cgroup {
 	spinlock_t		move_lock;
 	struct task_struct	*move_lock_task;
 	unsigned long		move_lock_flags;
-	/*
-	 * percpu counter.
-	 */
-	struct mem_cgroup_stat_cpu __percpu *stat;
+
+	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+	atomic_long_t		stat[MEMCG_NR_STAT];
+	atomic_long_t		events[MEMCG_NR_EVENTS];
 
 	unsigned long		socket_pressure;
 
@@ -265,6 +268,12 @@ struct mem_cgroup {
 	/* WARNING: nodeinfo must be the last member here */
 };
 
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define MEMCG_CHARGE_BATCH 32U
+
 extern struct mem_cgroup *root_mem_cgroup;
 
 static inline bool mem_cgroup_disabled(void)
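Note on the reused batch size (an editorial observation, not part of the patch text): MEMCG_CHARGE_BATCH is 32, and the counters being batched are kept in page-sized units for the stat items, so each CPU can lag the shared atomic by at most 32 pages. Assuming the common 4KB page size, that is 32 * 4096 bytes = 128KB, matching the "128k" per-cpu error figure quoted in the changelog; larger page sizes scale the error accordingly.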
@@ -485,32 +494,38 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
-	long val = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->count[idx], cpu);
-
-	if (val < 0)
-		val = 0;
-
-	return val;
+	long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 				     int idx, int val)
 {
-	if (!mem_cgroup_disabled())
-		__this_cpu_add(memcg->stat->count[idx], val);
+	long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->stat_cpu->count[idx], x);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
 				   int idx, int val)
 {
-	if (!mem_cgroup_disabled())
-		this_cpu_add(memcg->stat->count[idx], val);
+	preempt_disable();
+	__mod_memcg_state(memcg, idx, val);
+	preempt_enable();
 }
 
 /**
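A design note on the paired helpers above (editorial): __mod_memcg_state() now does a read-modify-write of the per-cpu counter via __this_cpu_read()/__this_cpu_write(), which is only safe when the caller cannot migrate between CPUs, typically because preemption or interrupts are already disabled. The plain mod_memcg_state() therefore brackets the call with preempt_disable()/preempt_enable(), replacing the old single preempt-safe this_cpu_add(), so callers in arbitrary context still get a consistent update.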
@@ -548,26 +563,25 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
 					      enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
-	long val = 0;
-	int cpu;
+	long x;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	for_each_possible_cpu(cpu)
-		val += per_cpu(pn->lruvec_stat->count[idx], cpu);
-
-	if (val < 0)
-		val = 0;
-
-	return val;
+	x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
 				      enum node_stat_item idx, int val)
 {
 	struct mem_cgroup_per_node *pn;
+	long x;
 
 	/* Update node */
 	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
@@ -581,7 +595,12 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec,
 	__mod_memcg_state(pn->memcg, idx, val);
 
 	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stat->count[idx], val);
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &pn->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
@@ -624,16 +643,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 static inline void __count_memcg_events(struct mem_cgroup *memcg,
 					int idx, unsigned long count)
 {
-	if (!mem_cgroup_disabled())
-		__this_cpu_add(memcg->stat->events[idx], count);
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->events[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->stat_cpu->events[idx], x);
 }
 
 /* idx can be of type enum memcg_event_item or vm_event_item */
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      int idx, unsigned long count)
 {
-	if (!mem_cgroup_disabled())
-		this_cpu_add(memcg->stat->events[idx], count);
+	preempt_disable();
+	__count_memcg_events(memcg, idx, count);
+	preempt_enable();
 }
 
 /* idx can be of type enum memcg_event_item or vm_event_item */