@@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 }
 
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
@@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * There are two main reasons for not using the css_id for this:
+ * 1) this works better in sparse environments, where we have a lot of memcgs,
+ *    but only a few kmem-limited ones. If, for instance, we have 200 memcgs
+ *    and none but the 200th is kmem-limited, we would still need a
+ *    200-entry array.
+ *
+ * 2) In order not to violate the cgroup API, we would like to do all memory
+ *    allocation in ->create(). At that point, we haven't yet allocated the
+ *    css_id. Having a separate index keeps us from messing with the cgroup
+ *    core for this.
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+static int memcg_limited_groups_array_size;
+/*
+ * MIN_SIZE is different from 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * css_id space is not getting any smaller, and we don't necessarily have to
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE 65535
+
 struct static_key memcg_kmem_enabled_key;
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg))
+	if (memcg_kmem_is_active(memcg)) {
 		static_key_slow_dec(&memcg_kmem_enabled_key);
+		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
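
A note on the IDA used above: ida_simple_get() hands out the smallest unused integer in [start, end), and ida_simple_remove() returns an id to the pool, which is what keeps the memcg_caches index space dense. A minimal sketch of that allocate/release pairing, outside the patch (the example_* names are illustrative, not part of the patch):

#include <linux/idr.h>		/* DEFINE_IDA, ida_simple_get/remove */

static DEFINE_IDA(example_ida);

/* Returns a dense id in [0, MEMCG_CACHES_MAX_SIZE), or a -errno. */
static int example_activate(void)
{
	/* The smallest free id wins, so ids stay packed near zero. */
	return ida_simple_get(&example_ida, 0, MEMCG_CACHES_MAX_SIZE,
			      GFP_KERNEL);
}

static void example_deactivate(int id)
{
	/* Makes 'id' immediately reusable by the next activation. */
	ida_simple_remove(&example_ida, id);
}
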
@@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+	int num, ret;
+
+	num = ida_simple_get(&kmem_limited_groups,
+				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (num < 0)
+		return num;
+	/*
+	 * After this point, kmem_accounted (which our caller tested
+	 * atomically) is no longer 0, which guarantees that only one
+	 * process will set the activated bit. We don't need
+	 * test_and_set because we're protected by the set_limit_mutex
+	 * anyway.
+	 */
+	memcg_kmem_set_activated(memcg);
+
+	ret = memcg_update_all_caches(num+1);
+	if (ret) {
+		ida_simple_remove(&kmem_limited_groups, num);
+		memcg_kmem_clear_activated(memcg);
+		return ret;
+	}
+
+	memcg->kmemcg_id = num;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+	mutex_init(&memcg->slab_caches_mutex);
+	return 0;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+	ssize_t size;
+	if (num_groups <= 0)
+		return 0;
+
+	size = 2 * num_groups;
+	if (size < MEMCG_CACHES_MIN_SIZE)
+		size = MEMCG_CACHES_MIN_SIZE;
+	else if (size > MEMCG_CACHES_MAX_SIZE)
+		size = MEMCG_CACHES_MAX_SIZE;
+
+	return size;
+}
+
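
To make the doubling-with-clamps policy concrete, sample return values of memcg_caches_array_size(), computed straight from the code above:

	num_groups	returned size
	1		4	(2 would be below MEMCG_CACHES_MIN_SIZE)
	3		6
	200		400
	40000		65535	(80000 clamped to MEMCG_CACHES_MAX_SIZE)
	0 or less	0
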
+/*
+ * We should update the current array size iff all cache updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+	if (num > memcg_limited_groups_array_size)
+		memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+	struct memcg_cache_params *cur_params = s->memcg_params;
+
+	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+
+	if (num_groups > memcg_limited_groups_array_size) {
+		int i;
+		ssize_t size = memcg_caches_array_size(num_groups);
+
+		size *= sizeof(void *);
+		size += sizeof(struct memcg_cache_params);
+
+		s->memcg_params = kzalloc(size, GFP_KERNEL);
+		if (!s->memcg_params) {
+			s->memcg_params = cur_params;
+			return -ENOMEM;
+		}
+
+		s->memcg_params->is_root_cache = true;
+
+		/*
+		 * There is the chance it will be bigger than
+		 * memcg_limited_groups_array_size, if we failed an allocation
+		 * in a cache, in which case all caches updated before it will
+		 * have a bigger array.
+		 *
+		 * But if that is the case, the data after
+		 * memcg_limited_groups_array_size is certainly unused.
+		 */
+		for (i = 0; i < memcg_limited_groups_array_size; i++) {
+			if (!cur_params->memcg_caches[i])
+				continue;
+			s->memcg_params->memcg_caches[i] =
+						cur_params->memcg_caches[i];
+		}
+
+		/*
+		 * Ideally, we would wait until all caches succeed, and only
+		 * then free the old one. But this is not worth the extra
+		 * pointer per cache we'd have to have for this.
+		 *
+		 * It is not a big deal if some caches are left with a size
+		 * bigger than the others. And all updates will reset this
+		 * anyway.
+		 */
+		kfree(cur_params);
+	}
+	return 0;
+}
+
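
The "bigger than memcg_limited_groups_array_size" case discussed in the comments above is easiest to see with a concrete (made-up) failure: say the array size is currently 4 and a new id pushes num_groups to 5, so the target size is memcg_caches_array_size(5) = 10.

	cache A: kzalloc 10-entry params, copy 4 old slots  -> ok
	cache B: kzalloc 10-entry params, copy 4 old slots  -> ok
	cache C: kzalloc fails                              -> -ENOMEM

	memcg_update_all_caches() returns the error, so
	memcg_update_array_size() never runs and
	memcg_limited_groups_array_size stays at 4. A and B keep their
	10-entry arrays, but since no index >= 4 was ever handed out
	(the failed id is returned to the IDA), the extra slots are dead
	weight, not dangling state.
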
 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 {
 	size_t size = sizeof(struct memcg_cache_params);
@@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s)
 	if (!memcg_kmem_enabled())
 		return 0;
 
+	if (!memcg)
+		size += memcg_limited_groups_array_size * sizeof(void *);
+
 	s->memcg_params = kzalloc(size, GFP_KERNEL);
 	if (!s->memcg_params)
 		return -ENOMEM;
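
The size arithmetic in memcg_register_cache() and memcg_update_cache_size() assumes roughly the following layout; this is a simplified sketch (the struct in the full series also carries per-memcg fields, unioned with the array, which are omitted here):

struct memcg_cache_params {
	bool is_root_cache;
	/*
	 * Root caches only: one slot per kmemcg_id. This flexible array
	 * is why a root cache allocates
	 *   sizeof(struct memcg_cache_params) +
	 *   memcg_limited_groups_array_size * sizeof(void *)
	 * while a per-memcg child cache allocates the header alone.
	 */
	struct kmem_cache *memcg_caches[];
};
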
@@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 		ret = res_counter_set_limit(&memcg->kmem, val);
 		VM_BUG_ON(ret);
 
-		/*
-		 * After this point, kmem_accounted (that we test atomically in
-		 * the beginning of this conditional), is no longer 0. This
-		 * guarantees only one process will set the following boolean
-		 * to true. We don't need test_and_set because we're protected
-		 * by the set_limit_mutex anyway.
-		 */
-		memcg_kmem_set_activated(memcg);
+		ret = memcg_update_cache_sizes(memcg);
+		if (ret) {
+			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+			goto out;
+		}
 		must_inc_static_branch = true;
 		/*
 		 * kmem charges can outlive the cgroup. In the case of slab
@@ -4372,11 +4527,13 @@ out:
 	return ret;
 }
 
-static void memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
+	int ret = 0;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	if (!parent)
-		return;
+		goto out;
+
 	memcg->kmem_account_flags = parent->kmem_account_flags;
 #ifdef CONFIG_MEMCG_KMEM
 	/*
@@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg)
 	 * It is a lot simpler just to do static_key_slow_inc() on every child
 	 * that is accounted.
 	 */
-	if (memcg_kmem_is_active(memcg)) {
-		mem_cgroup_get(memcg);
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-	}
+	if (!memcg_kmem_is_active(memcg))
+		goto out;
+
+	/*
+	 * destroy(), called if we fail, will issue static_key_slow_dec() and
+	 * mem_cgroup_put() if kmem is enabled. We have to either call them
+	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+	 * this more consistent, since it always leads to the same destroy
+	 * path.
+	 */
+	mem_cgroup_get(memcg);
+	static_key_slow_inc(&memcg_kmem_enabled_key);
+
+	mutex_lock(&set_limit_mutex);
+	ret = memcg_update_cache_sizes(memcg);
+	mutex_unlock(&set_limit_mutex);
 #endif
+out:
+	return ret;
 }
 
 /*
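
The ordering above is deliberate: the reference and the static-branch count are taken before the fallible memcg_update_cache_sizes() call, so success and failure funnel into the same destroy path. As a pattern, stripped of the memcg specifics (the example_* names are illustrative, not patch code):

static int example_activate_child(struct example_group *g)
{
	int ret;

	/* Pin resources first; destroy() will always release them. */
	example_get(g);
	static_key_slow_inc(&example_key);

	mutex_lock(&example_mutex);
	ret = example_fallible_setup(g);	/* may return -ENOMEM */
	mutex_unlock(&example_mutex);

	/*
	 * No unwind here on failure: the caller tears the group down
	 * through the one common destroy path, which drops the key
	 * and the reference whether setup succeeded or not.
	 */
	return ret;
}
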
@@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
+	int ret;
+
 	memcg->kmemcg_id = -1;
-	memcg_propagate_kmem(memcg);
+	ret = memcg_propagate_kmem(memcg);
+	if (ret)
+		return ret;
 
 	return mem_cgroup_sockets_init(memcg, ss);
 };
@@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
+
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.