@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages, swapped out */
 	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
+	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
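The new MEM_CGROUP_ON_MOVE entry is not a statistic: it borrows a slot in the same per-cpu counter array so that the update fast path can test it with a cheap CPU-local read instead of taking a shared lock. Below is a minimal userspace sketch of that idea, with thread-local storage standing in for per-cpu data; every name in it is illustrative, not kernel API.

/* build: gcc -std=c11 flag_slot.c */
#include <stdio.h>

enum stat_index {
	STAT_FILE_MAPPED,	/* an ordinary statistic */
	STAT_ON_MOVE,		/* a flag living in the stats array */
	STAT_NSTATS,
};

/* One array per thread, as the kernel keeps one per CPU. */
static _Thread_local long pcpu_stat[STAT_NSTATS];

static int fast_path_allowed(void)
{
	/* No lock, no atomics: the slot is in this thread's own memory,
	 * just as this_cpu_read() touches only the local CPU's copy. */
	return pcpu_stat[STAT_ON_MOVE] == 0;
}

int main(void)
{
	printf("fast path allowed: %d\n", fast_path_allowed());
	/* In the kernel a mover raises the flag on *all* CPUs under
	 * mc.lock; here one thread flips its own copy. */
	pcpu_stat[STAT_ON_MOVE] += 1;
	printf("fast path allowed: %d\n", fast_path_allowed());
	return 0;
}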
@@ -1051,7 +1052,46 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+	int cpu;
+	/* Since this is about moving accounting, reuse mc.lock */
+	spin_lock(&mc.lock);
+	for_each_possible_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+	spin_unlock(&mc.lock);
+
+	synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+	int cpu;
+
+	if (!mem)
+		return;
+	spin_lock(&mc.lock);
+	for_each_possible_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+	spin_unlock(&mc.lock);
+}
+/*
+ * Two routines for checking whether "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
+ *			  to avoid races in accounting. If true,
+ *			  pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
+ *			  under the hierarchy of the moving cgroups. This is
+ *			  used for waiting at high memory pressure caused by
+ *			  a "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
 
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
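To see how the routines above cooperate, here is a compilable userspace model of the same handshake. It assumes the userspace-rcu library (liburcu) for synchronize_rcu(); all names are stand-ins for the kernel symbols, not the real implementation. The property it demonstrates: once start_move() returns, every updater has either finished before the flag became visible or will observe the flag and fall back to the lock.

/* build: gcc -std=c11 -pthread move_model.c -lurcu */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <urcu.h>	/* rcu_read_lock(), synchronize_rcu(), ... */

static pthread_mutex_t move_lock = PTHREAD_MUTEX_INITIALIZER;	/* plays mc.lock */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;	/* plays lock_page_cgroup() */
static atomic_int on_move;		/* plays the MEM_CGROUP_ON_MOVE slots */
static _Thread_local long file_mapped;	/* plays the per-cpu FILE_MAPPED counter */

/* Like mem_cgroup_start_move(): raise the flag, then wait out every
 * read-side section that may have sampled it as zero. */
static void start_move(void)
{
	pthread_mutex_lock(&move_lock);
	atomic_fetch_add(&on_move, 1);
	pthread_mutex_unlock(&move_lock);
	synchronize_rcu();	/* after this, no lockless update is in flight */
}

static void end_move(void)
{
	pthread_mutex_lock(&move_lock);
	atomic_fetch_sub(&on_move, 1);
	pthread_mutex_unlock(&move_lock);
}

/* Like mem_cgroup_update_file_mapped(): lockless unless a move is running.
 * (The kernel re-reads pc->mem_cgroup after taking the lock; this model
 * has no ownership pointer to re-read.) */
static void update_file_mapped(int val)
{
	bool need_unlock = false;

	rcu_read_lock();
	if (atomic_load(&on_move) > 0) {	/* plays mem_cgroup_stealed() */
		pthread_mutex_lock(&page_lock);
		need_unlock = true;
	}
	file_mapped += val;	/* thread-local, so safe without the lock */
	if (need_unlock)
		pthread_mutex_unlock(&page_lock);
	rcu_read_unlock();
}

static void *updater(void *arg)
{
	rcu_register_thread();	/* liburcu readers must register themselves */
	for (int i = 0; i < 100000; i++)
		update_file_mapped(1);
	rcu_unregister_thread();
	return arg;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, updater, NULL);
	start_move();	/* a real mover would now rewrite ownership under page_lock */
	end_move();
	pthread_join(t, NULL);
	puts("done");
	return 0;
}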
@@ -1462,35 +1502,62 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup members, but
+ * it tends to be costly. Under some conditions, we don't need
+ * to take it _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to the radix-tree.
+ * There is no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg intentionally doesn't clear
+ * pc->mem_cgroup at "uncharge". So we always see a valid pc->mem_cgroup
+ * even if we race with "uncharge". The statistic itself is properly
+ * handled by flags.
+ *
+ * Considering "move", this is the only case where we see a race. To make
+ * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value and
+ * detect whether a race is possible. If it is, we take the lock.
 */
 void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct page_cgroup *pc = lookup_page_cgroup(page);
+	bool need_unlock = false;
 
-	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
+	rcu_read_lock();
 	mem = pc->mem_cgroup;
-	if (!mem || !PageCgroupUsed(pc))
-		goto done;
-
-	/*
-	 * Preemption is already disabled. We can use __this_cpu_xxx
-	 */
+	if (unlikely(!mem || !PageCgroupUsed(pc)))
+		goto out;
+	/* is pc->mem_cgroup unstable? */
+	if (unlikely(mem_cgroup_stealed(mem))) {
+		/* take a lock to safely access pc->mem_cgroup */
+		lock_page_cgroup(pc);
+		need_unlock = true;
+		mem = pc->mem_cgroup;
+		if (!mem || !PageCgroupUsed(pc))
+			goto out;
+	}
 	if (val > 0) {
-		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		SetPageCgroupFileMapped(pc);
 	} else {
-		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		if (!page_mapped(page)) /* for race between dec->inc counter */
 			ClearPageCgroupFileMapped(pc);
 	}
 
-done:
-	unlock_page_cgroup(pc);
+out:
+	if (unlikely(need_unlock))
+		unlock_page_cgroup(pc);
+	rcu_read_unlock();
+	return;
 }
 
 /*
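The "tends to be costly" claim in the comment block is the whole motivation for the lockless fast path. Below is a rough userspace illustration of the gap, pitting a shared mutex against the thread-local analogue of this_cpu_inc(). It benchmarks pthreads on whatever machine runs it, not the kernel, so treat the numbers as intuition only.

/* build: gcc -std=c11 -O2 -pthread stat_bench.c */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define THREADS 4
#define ITERS   1000000L

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long shared_counter;
/* volatile so the loop below isn't folded into a single addition */
static _Thread_local volatile long local_counter;

static void *locked_worker(void *arg)
{
	for (long i = 0; i < ITERS; i++) {
		pthread_mutex_lock(&lock);	/* cf. lock_page_cgroup() */
		shared_counter++;
		pthread_mutex_unlock(&lock);
	}
	return arg;
}

static void *local_worker(void *arg)
{
	for (long i = 0; i < ITERS; i++)
		local_counter++;		/* cf. this_cpu_inc() */
	return arg;
}

static double run(void *(*fn)(void *))
{
	pthread_t t[THREADS];
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (int i = 0; i < THREADS; i++)
		pthread_create(&t[i], NULL, fn, NULL);
	for (int i = 0; i < THREADS; i++)
		pthread_join(t[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &b);
	return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
	printf("shared mutex:  %.3fs\n", run(locked_worker));
	printf("thread-local:  %.3fs\n", run(local_worker));
	return 0;
}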
@@ -3039,6 +3106,7 @@ move_account:
 	lru_add_drain_all();
 	drain_all_stock_sync();
 	ret = 0;
+	mem_cgroup_start_move(mem);
 	for_each_node_state(node, N_HIGH_MEMORY) {
 		for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 			enum lru_list l;
@@ -3052,6 +3120,7 @@ move_account:
 		if (ret)
 			break;
 	}
+	mem_cgroup_end_move(mem);
 	memcg_oom_recover(mem);
 	/* it seems parent cgroup doesn't have enough mem */
 	if (ret == -ENOMEM)
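Note how these two hunks bracket the whole LRU walk: the flag goes up before any page is touched and comes down on every exit, including the -ENOMEM path that sends the caller back for a retry. A tiny self-contained sketch of that discipline follows; empty_one_zone() and the zone count are invented for illustration.

/* build: gcc -std=c11 bracket.c */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_move;			/* plays MEM_CGROUP_ON_MOVE */

static void start_move(void) { atomic_fetch_add(&on_move, 1); }
static void end_move(void)   { atomic_fetch_sub(&on_move, 1); }

/* Hypothetical stand-in for the per-zone force_empty walk. */
static int empty_one_zone(int zone) { return zone == 3 ? -ENOMEM : 0; }

static int force_empty(void)
{
	int zone, ret = 0;

	start_move();				/* slow-path window opens */
	for (zone = 0; zone < 5 && !ret; zone++)
		ret = empty_one_zone(zone);
	end_move();				/* ...and always closes */
	return ret;				/* caller may retry on -ENOMEM */
}

int main(void)
{
	printf("force_empty() = %d, on_move = %d\n",
	       force_empty(), atomic_load(&on_move));
	return 0;
}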
@@ -4514,6 +4583,7 @@ static void mem_cgroup_clear_mc(void)
 	mc.to = NULL;
 	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4544,6 +4614,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 	VM_BUG_ON(mc.moved_charge);
 	VM_BUG_ON(mc.moved_swap);
 	VM_BUG_ON(mc.moving_task);
+	mem_cgroup_start_move(from);
 	spin_lock(&mc.lock);
 	mc.from = from;
 	mc.to = mem;
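The pairing in the last two hunks crosses function boundaries: mem_cgroup_can_attach() raises the flag and mem_cgroup_clear_mc() drops it, so the slow-path window covers the entire task move rather than a single call. A userspace sketch of that lifecycle, with all names invented:

/* build: gcc -std=c11 cross_pair.c */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_move;		/* plays the per-cpu ON_MOVE flag */
static struct { int from; } mc;		/* plays the move-charge state */

static void can_attach(int from)	/* plays mem_cgroup_can_attach() */
{
	atomic_fetch_add(&on_move, 1);	/* mem_cgroup_start_move(from) */
	mc.from = from;
}

static void clear_mc(void)		/* plays mem_cgroup_clear_mc() */
{
	int from = mc.from;		/* save it before clearing mc */

	mc.from = 0;
	atomic_fetch_sub(&on_move, 1);	/* mem_cgroup_end_move(from) */
	(void)from;			/* a real end_move() takes the saved group */
}

int main(void)
{
	can_attach(42);
	printf("during move: on_move=%d\n", atomic_load(&on_move));
	clear_mc();
	printf("after move:  on_move=%d\n", atomic_load(&on_move));
	assert(atomic_load(&on_move) == 0);
	return 0;
}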