@@ -89,7 +89,9 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
+	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+	/* incremented at every pagein/pageout */
+	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
 	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
 
 	MEM_CGROUP_STAT_NSTATS,
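
Side note on the hunk above, not part of the patch: MEM_CGROUP_STAT_DATA is a sentinel rather than a new counter. Indices below it are per-cpu data that must be preserved when a cpu goes away; MEM_CGROUP_EVENTS takes the sentinel's value, so it and everything after it are excluded from that treatment without renumbering the enum. A minimal sketch of the idiom, with invented EX_* names:

	enum example_stat_index {
		EX_STAT_CACHE,
		EX_STAT_RSS,
		EX_STAT_DATA,			/* sentinel: end of preserved data */
		EX_EVENTS = EX_STAT_DATA,	/* alias; numbering stays compact */
		EX_NSTATS,			/* total array size */
	};

	/* a drain loop stops at the sentinel and never touches EX_EVENTS */
	for (i = 0; i < EX_STAT_DATA; i++)
		base[i] += counts[i];
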
@@ -255,6 +257,12 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
+	/*
+	 * used when a cpu is offlined, and for other synchronization.
+	 * See mem_cgroup_read_stat().
+	 */
+	struct mem_cgroup_stat_cpu nocpu_base;
+	spinlock_t pcp_counter_lock;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -531,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 	return mz;
 }
 
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter use thresholds and periodic
+ * synchronization to implement a "quick" read. There is a trade-off between
+ * read cost and precision of the value, so we could likewise implement
+ * periodic synchronization of the counters in memcg.
+ *
+ * But this _read() function is used for the user interface now. The user
+ * accounts memory usage per memory cgroup and _always_ requires an exact
+ * value for that accounting. Even if we provided a quick-and-fuzzy read,
+ * we would still have to visit all online cpus and compute the sum. So,
+ * for now, such synchronization is implemented only for cpu hotplug.
+ *
+ * If there are kernel-internal users which can make use of a not-exact
+ * value, and reading all cpu values becomes a performance bottleneck in
+ * some common workload, thresholds and synchronization as in vmstat[]
+ * should be implemented.
+ */
 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 		enum mem_cgroup_stat_index idx)
 {
 	int cpu;
 	s64 val = 0;
 
-	for_each_possible_cpu(cpu)
+	get_online_cpus();
+	for_each_online_cpu(cpu)
 		val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	spin_lock(&mem->pcp_counter_lock);
+	val += mem->nocpu_base.count[idx];
+	spin_unlock(&mem->pcp_counter_lock);
+#endif
+	put_online_cpus();
 	return val;
 }
 
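The invariant behind the #ifdef CONFIG_HOTPLUG_CPU block above: at any instant the exact total is the sum over online cpus plus whatever earlier hotplug events drained into nocpu_base, and get_online_cpus() keeps the online set stable while summing. A self-contained userspace model of the scheme, illustration only, with invented names and a pthread mutex standing in for pcp_counter_lock:

	#include <pthread.h>
	#include <stdio.h>

	#define NCPU 4

	static long cpu_count[NCPU];			/* models the per-cpu slots */
	static int cpu_online[NCPU] = { 1, 1, 1, 1 };
	static long nocpu_base;				/* models mem->nocpu_base */
	static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;

	/* models mem_cgroup_drain_pcp_counter(): move a dead cpu's count aside */
	static void offline_cpu(int cpu)
	{
		pthread_mutex_lock(&pcp_lock);
		nocpu_base += cpu_count[cpu];
		cpu_count[cpu] = 0;
		pthread_mutex_unlock(&pcp_lock);
		cpu_online[cpu] = 0;
	}

	/* models mem_cgroup_read_stat(): online slots + drained base = exact */
	static long read_stat(void)
	{
		long val = 0;
		int cpu;

		for (cpu = 0; cpu < NCPU; cpu++)
			if (cpu_online[cpu])
				val += cpu_count[cpu];
		pthread_mutex_lock(&pcp_lock);
		val += nocpu_base;
		pthread_mutex_unlock(&pcp_lock);
		return val;
	}

	int main(void)
	{
		cpu_count[0] = 10; cpu_count[1] = 20;
		cpu_count[2] = 30; cpu_count[3] = 40;
		printf("before offline: %ld\n", read_stat());	/* 100 */
		offline_cpu(2);
		printf("after offline:  %ld\n", read_stat());	/* still 100 */
		return 0;
	}

The ordering in offline_cpu() mirrors the real code: the dead slot is zeroed and credited to the base under one lock hold, so a reader can never count it twice or miss it.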
@@ -663,9 +697,28 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 /* The caller has to guarantee "mem" exists before calling this */
 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 {
-	if (mem && css_tryget(&mem->css))
-		return mem;
-	return NULL;
+	struct cgroup_subsys_state *css;
+	int found;
+
+	if (!mem) /* ROOT cgroup has the smallest ID */
+		return root_mem_cgroup; /* css_put/get against root is ignored */
+	if (!mem->use_hierarchy) {
+		if (css_tryget(&mem->css))
+			return mem;
+		return NULL;
+	}
+	rcu_read_lock();
+	/*
+	 * search for the memory cgroup which has the smallest ID under the
+	 * given ROOT cgroup (ID >= 1).
+	 */
+	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+	if (css && css_tryget(css))
+		mem = container_of(css, struct mem_cgroup, css);
+	else
+		mem = NULL;
+	rcu_read_unlock();
+	return mem;
 }
 
 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
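A sketch of the iteration pattern this builds on, assuming css_get_next() returns the css with the smallest cgroup ID >= the requested one under the given root and reports the ID actually found; visit() is a hypothetical callback and refcount drops are elided:

	struct cgroup_subsys_state *css;
	int id = 1, found;

	rcu_read_lock();
	while ((css = css_get_next(&mem_cgroup_subsys, id, &root->css, &found))) {
		if (css_tryget(css))	/* may fail for a dying cgroup */
			visit(container_of(css, struct mem_cgroup, css));
		id = found + 1;		/* resume just past the visited ID */
	}
	rcu_read_unlock();
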
@@ -680,9 +733,13 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 	hierarchy_used = iter->use_hierarchy;
 
 	css_put(&iter->css);
-	if (!cond || !hierarchy_used)
+	/* If no ROOT, walk all, ignore hierarchy */
+	if (!cond || (root && !hierarchy_used))
 		return NULL;
 
+	if (!root)
+		root = root_mem_cgroup;
+
 	do {
 		iter = NULL;
 		rcu_read_lock();
@@ -711,6 +768,9 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 #define for_each_mem_cgroup_tree(iter, root) \
 	for_each_mem_cgroup_tree_cond(iter, root, true)
 
+#define for_each_mem_cgroup_all(iter) \
+	for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 {
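
With a NULL root, mem_cgroup_get_next() substitutes root_mem_cgroup and skips the use_hierarchy check (see the hunk above), so for_each_mem_cgroup_all() reaches every memcg on the system even when hierarchies are disabled. A sketch contrasting the two walks; example_update_one() is a hypothetical helper:

	struct mem_cgroup *iter;

	/* subtree walk: bounded by "root", honors use_hierarchy */
	for_each_mem_cgroup_tree(iter, root)
		example_update_one(iter);

	/* system-wide walk: every memcg, hierarchy setting ignored */
	for_each_mem_cgroup_all(iter)
		example_update_one(iter);
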
@@ -1676,15 +1736,38 @@ static void drain_all_stock_sync(void)
 	atomic_dec(&memcg_drain_count);
 }
 
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains the percpu counter values of a dead cpu into
+ * nocpu_base. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+	int i;
+
+	spin_lock(&mem->pcp_counter_lock);
+	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+		s64 x = per_cpu(mem->stat->count[i], cpu);
+
+		per_cpu(mem->stat->count[i], cpu) = 0;
+		mem->nocpu_base.count[i] += x;
+	}
+	spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *iter;
 
-	if (action != CPU_DEAD)
+	if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
 		return NOTIFY_OK;
+
+	for_each_mem_cgroup_all(iter)
+		mem_cgroup_drain_pcp_counter(iter, cpu);
+
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
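CPU_DEAD_FROZEN is the CPU_DEAD notification delivered on the suspend/resume path, so the two must be handled identically; note that a guard written as "action != CPU_DEAD || action != CPU_DEAD_FROZEN" would be true for every action and the drain would never run. The switch form used by many hotplug callbacks makes that slip harder; a sketch of the equivalent guard:

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;			/* fall through to the drain */
	default:
		return NOTIFY_OK;	/* ignore all other notifications */
	}
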
@@ -4098,6 +4181,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		vfree(mem);
 		mem = NULL;
 	}
+	if (mem)
+		spin_lock_init(&mem->pcp_counter_lock);
 	return mem;
 }
 
@@ -4224,7 +4308,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 						&per_cpu(memcg_stock, cpu);
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
-		hotcpu_notifier(memcg_stock_cpu_callback, 0);
+		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
|