@@ -63,9 +63,6 @@
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
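
The two hunks above retire the deactivation-bias scheme: css->refcnt was a
plain atomic_t, and adding INT_MIN flipped a non-negative count negative so
concurrent trygets could detect a dying css while the true count stayed
recoverable via css_unbias_refcnt().  A minimal userspace model of the trick
being removed (illustrative only, not kernel code):

#include <assert.h>
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

#define DEACT_BIAS	INT_MIN

static int unbias(int v)
{
	return v >= 0 ? v : v - DEACT_BIAS;
}

static bool tryget(atomic_int *ref)
{
	int v = atomic_load(ref);

	while (v >= 0)		/* negative means deactivated */
		if (atomic_compare_exchange_weak(ref, &v, v + 1))
			return true;
	return false;
}

int main(void)
{
	atomic_int ref = 1;			/* base reference */

	assert(tryget(&ref) && unbias(atomic_load(&ref)) == 2);
	atomic_fetch_add(&ref, DEACT_BIAS);	/* deactivate */
	assert(!tryget(&ref) && unbias(atomic_load(&ref)) == 2);
	return 0;
}

The cost motivating this series: every get/put is an atomic RMW on one
shared cache line, which bounces between CPUs under load.  percpu_ref keeps
the hot path on per-CPU counters instead.
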
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work)
 	deactivate_super(sb);
 }
 
+static void css_release(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	schedule_work(&css->dput_work);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys *ss,
 			    struct cgroup *cgrp)
 {
 	css->cgroup = cgrp;
-	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
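
css_release() is the release callback wired into the percpu_ref that now
backs css->refcnt: it runs on the final put and punts the dentry put to a
workqueue via css->dput_work.  A sketch of the percpu_ref lifecycle being
adopted, assuming the 3.11-era API (percpu_ref_init() did not yet take a
GFP argument):

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct obj {
	struct percpu_ref ref;
};

/* invoked once, after the last percpu_ref_put() */
static void obj_release(struct percpu_ref *ref)
{
	kfree(container_of(ref, struct obj, ref));
}

static int obj_setup(struct obj *o)
{
	/* count starts at 1; gets/puts hit per-CPU counters while live */
	return percpu_ref_init(&o->ref, obj_release);
}

static void obj_teardown(struct obj *o)
{
	/* degrade to atomic mode, then drop the base ref after an RCU grace period */
	percpu_ref_kill(&o->ref);
}

Until the kill, percpu_ref_tryget() can succeed; afterwards it reliably
fails, which is exactly the property css_tryget() needs.
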
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			err = PTR_ERR(css);
 			goto err_free_all;
 		}
+
+		err = percpu_ref_init(&css->refcnt, css_release);
+		if (err)
+			goto err_free_all;
+
 		init_cgroup_css(css, ss, cgrp);
+
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 err_free_all:
 	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+		if (css) {
+			percpu_ref_cancel_init(&css->refcnt);
 			ss->css_free(cgrp);
+		}
 	}
 	mutex_unlock(&cgroup_mutex);
 	/* Release the reference count that we took on the superblock */
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+		return;
+
+	/* percpu ref's of all css's are killed, kick off the next step */
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	schedule_work(&cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts which are RCU protected and whose
+ * killing latency shouldn't be exposed to userland.  Also, cgroup core
+ * needs to guarantee that css_tryget() won't succeed by the time
+ * ->css_offline() is invoked.  To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
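
cgroup_css_killed() is a countdown latch: cgroup_destroy_locked() primes
css_kill_cnt at 1 so the offline work item cannot fire while confirmation
callbacks race with the kill loop, takes one count per css, and drops the
prime once the loop is done.  A runnable userspace model of the pattern
(names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int kill_cnt;

static void next_stage(void)
{
	puts("all refs confirmed killed; run stage two");
}

/* stands in for css_ref_killed_fn(); may run at any time, on any CPU */
static void one_killed(void)
{
	if (atomic_fetch_sub(&kill_cnt, 1) == 1)
		next_stage();
}

int main(void)
{
	int nr_css = 3;

	atomic_store(&kill_cnt, 1);		/* prime: hold off stage two */
	for (int i = 0; i < nr_css; i++) {
		atomic_fetch_add(&kill_cnt, 1);	/* one count per pending kill */
		one_killed();			/* confirmation arrives */
	}
	one_killed();				/* drop the prime */
	return 0;
}

The last decrement to reach zero, whether a late confirmation or the prime
drop, schedules cgroup_offline_fn() exactly once.
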
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
-	 * removed.  This makes future css_tryget() attempts fail which we
-	 * guarantee to ->css_offline() callbacks.
+	 * Block new css_tryget() by killing css refcnts.  cgroup core
+	 * guarantees that, by the time ->css_offline() is invoked, no new
+	 * css reference will be given out via css_tryget().  We can't
+	 * simply call percpu_ref_kill() and proceed to offlining css's
+	 * because percpu_ref_kill() doesn't guarantee that the ref is seen
+	 * as killed on all CPUs on return.
+	 *
+	 * Use percpu_ref_kill_and_confirm() to get notifications as each
+	 * css is confirmed to be seen as killed on all CPUs.  The
+	 * notification callback keeps track of the number of css's to be
+	 * killed and schedules cgroup_offline_fn() to perform the rest of
+	 * destruction once the percpu refs of all css's are confirmed to
+	 * be killed.
 	 */
+	atomic_set(&cgrp->css_kill_cnt, 1);
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-		WARN_ON(atomic_read(&css->refcnt) < 0);
-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+		/*
+		 * Killing would put the base ref, but we need to keep it
+		 * alive until after ->css_offline.
+		 */
+		percpu_ref_get(&css->refcnt);
+
+		atomic_inc(&cgrp->css_kill_cnt);
+		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
 	}
+	cgroup_css_killed(cgrp);
 
 	/*
 	 * Mark @cgrp dead.  This prevents further task migration and child
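
The rewritten comment above is the crux of the series: percpu_ref_kill()
alone returns before every CPU is guaranteed to see the ref as dead, so
->css_offline() could still race with a succeeding css_tryget().  For
contrast, the simplest synchronous use of the confirm callback looks
roughly like this (the completion is hypothetical, for illustration);
cgroup counts confirmations through css_kill_cnt instead, precisely so
rmdir(2) never blocks on this grace period:

#include <linux/completion.h>
#include <linux/percpu-refcount.h>

static DECLARE_COMPLETION(killed_done);

static void confirm_killed(struct percpu_ref *ref)
{
	/* by now no CPU can succeed in a tryget on @ref */
	complete(&killed_done);
}

static void kill_ref_sync(struct percpu_ref *ref)
{
	percpu_ref_kill_and_confirm(ref, confirm_killed);
	wait_for_completion(&killed_done);	/* blocks for the grace period */
}
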
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
-	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
-	schedule_work(&cgrp->destroy_work);
-
 	return 0;
 };
 
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
 static void cgroup_offline_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work)
 
 	mutex_lock(&cgroup_mutex);
 
-	/* tell subsystems to initate destruction */
+	/*
+	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
+	 * initiate destruction.
+	 */
 	for_each_subsys(cgrp->root, ss)
 		offline_css(ss, cgrp);
 
 	/*
-	 * Put all the base refs.  Each css holds an extra reference to the
-	 * cgroup's dentry and cgroup removal proceeds regardless of css
-	 * refs.  On the last put of each css, whenever that may be, the
-	 * extra dentry ref is put so that dentry destruction happens only
-	 * after all css's are released.
+	 * Put the css refs from cgroup_destroy_locked().  Each css holds
+	 * an extra reference to the cgroup's dentry and cgroup removal
+	 * proceeds regardless of css refs.  On the last put of each css,
+	 * whenever that may be, the extra dentry ref is put so that dentry
+	 * destruction happens only after all css's are released.
 	 */
 	for_each_subsys(cgrp->root, ss)
 		css_put(cgrp->subsys[ss->subsys_id]);
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-	while (true) {
-		int t, v;
-
-		v = css_refcnt(css);
-		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-		if (likely(t == v))
-			return true;
-		else if (t < 0)
-			return false;
-		cpu_relax();
-	}
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-	int v;
-
-	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-	if (v == 0)
-		schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
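
With the cmpxchg loop gone from cgroup.c, the tryget/put fast paths reduce
to thin percpu_ref wrappers on the include/linux/cgroup.h side of this
change.  A sketch under that assumption (the CSS_ROOT test preserves the
old rule that root css's are never killed, hence the deleted "not for root
cgroup" caveats):

static inline bool css_tryget(struct cgroup_subsys_state *css)
{
	if (css->flags & CSS_ROOT)
		return true;
	return percpu_ref_tryget(&css->refcnt);
}

static inline void css_put(struct cgroup_subsys_state *css)
{
	if (!(css->flags & CSS_ROOT))
		percpu_ref_put(&css->refcnt);
}
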
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock().  Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
+	cssid = rcu_dereference_raw(css->id);
 
 	if (cssid)
 		return cssid->id;