12 years ago · 1db1e31b1e
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
 
				  *
			
 
				  * This is the blkcg counterpart of ioc_release_fn().
			
 
				  */
			
 
				-static int blkcg_pre_destroy(struct cgroup *cgroup)
			
 
				+static void blkcg_pre_destroy(struct cgroup *cgroup)
			
 
				 {
			
 
				 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
			
 
				 
			
@@ -622,7 +622,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
 
				 	}
			
 
				 
			
 
				 	spin_unlock_irq(&blkcg->lock);
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static void blkcg_destroy(struct cgroup *cgroup)
			
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -81,8 +81,6 @@ struct cgroup_subsys_state {
 
				 /* bits in struct cgroup_subsys_state flags field */
			
 
				 enum {
			
 
				 	CSS_ROOT, /* This CSS is the root of the subsystem */
			
 
				-	CSS_REMOVED, /* This CSS is dead */
			
 
				-	CSS_CLEAR_CSS_REFS,		/* @ss->__DEPRECATED_clear_css_refs */
			
 
				 };
			
 
				 
			
 
				 /* Caller must verify that the css is not for root cgroup */
			
@@ -105,11 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css)
 
				 		__css_get(css, 1);
			
 
				 }
			
 
				 
			
 
				-static inline bool css_is_removed(struct cgroup_subsys_state *css)
			
 
				-{
			
 
				-	return test_bit(CSS_REMOVED, &css->flags);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Call css_tryget() to take a reference on a css if your existing
			
 
				  * (known-valid) reference isn't already ref-counted. Returns false if
			
@@ -147,10 +140,6 @@ enum {
 
				 	CGRP_RELEASABLE,
			
 
				 	/* Control Group requires release notifications to userspace */
			
 
				 	CGRP_NOTIFY_ON_RELEASE,
			
 
				-	/*
			
 
				-	 * A thread in rmdir() is wating for this cgroup.
			
 
				-	 */
			
 
				-	CGRP_WAIT_ON_RMDIR,
			
 
				 	/*
			
 
				 	 * Clone cgroup values when creating a new child cgroup
			
 
				 	 */
			
@@ -420,23 +409,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
 
				 /* Return true if cgrp is a descendant of the task's cgroup */
			
 
				 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
			
 
				 
			
 
				-/*
			
 
				- * When the subsys has to access css and may add permanent refcnt to css,
			
 
				- * it should take care of racy conditions with rmdir(). Following set of
			
 
				- * functions, is for stop/restart rmdir if necessary.
			
 
				- * Because these will call css_get/put, "css" should be alive css.
			
 
				- *
			
 
				- *  cgroup_exclude_rmdir();
			
 
				- *  ...do some jobs which may access arbitrary empty cgroup
			
 
				- *  cgroup_release_and_wakeup_rmdir();
			
 
				- *
			
 
				- *  When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
			
 
				- *  it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
			
 
				- */
			
 
				-
			
 
				-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
			
 
				-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
			
 
				-
			
 
				 /*
			
 
				  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
			
 
				  * methods.
			
@@ -466,7 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 
				 
			
 
				 struct cgroup_subsys {
			
 
				 	struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
			
 
				-	int (*pre_destroy)(struct cgroup *cgrp);
			
 
				+	void (*pre_destroy)(struct cgroup *cgrp);
			
 
				 	void (*destroy)(struct cgroup *cgrp);
			
 
				 	int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
			
 
				 	void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
			
@@ -487,17 +459,6 @@ struct cgroup_subsys {
 
				 	 */
			
 
				 	bool use_id;
			
 
				 
			
 
				-	/*
			
 
				-	 * If %true, cgroup removal will try to clear css refs by retrying
			
 
				-	 * ss->pre_destroy() until there's no css ref left.  This behavior
			
 
				-	 * is strictly for backward compatibility and will be removed as
			
 
				-	 * soon as the current user (memcg) is updated.
			
 
				-	 *
			
 
				-	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
			
 
				-	 * wait for css refs to drop to zero before proceeding.
			
 
				-	 */
			
 
				-	bool __DEPRECATED_clear_css_refs;
			
 
				-
			
 
				 	/*
			
 
				 	 * If %false, this subsystem is properly hierarchical -
			
 
				 	 * configuration, resource accounting and restriction on a parent
			
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -171,8 +171,8 @@ struct css_id {
 
				 	 * The css to which this ID points. This pointer is set to valid value
			
 
				 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
			
 
				 	 * This pointer is expected to be RCU-safe because destroy()
			
 
				-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
			
 
				-	 * css_tryget() should be used for avoiding race.
			
 
				+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
			
 
				+	 * should be used for avoiding race.
			
 
				 	 */
			
 
				 	struct cgroup_subsys_state __rcu *css;
			
 
				 	/*
			
@@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 
				 	return inode;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Call subsys's pre_destroy handler.
			
 
				- * This is called before css refcnt check.
			
 
				- */
			
 
				-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
			
 
				-{
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	int ret = 0;
			
 
				-
			
 
				-	for_each_subsys(cgrp->root, ss) {
			
 
				-		if (!ss->pre_destroy)
			
 
				-			continue;
			
 
				-
			
 
				-		ret = ss->pre_destroy(cgrp);
			
 
				-		if (ret) {
			
 
				-			/* ->pre_destroy() failure is being deprecated */
			
 
				-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
			
 
				-			break;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
			
 
				 {
			
 
				 	/* is dentry a directory ? if so, kfree() associated cgroup */
			
@@ -1014,33 +990,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 
				 	remove_dir(dentry);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
			
 
				- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
			
 
				- * reference to css->refcnt. In general, this refcnt is expected to goes down
			
 
				- * to zero, soon.
			
 
				- *
			
 
				- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
			
 
				- */
			
 
				-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
			
 
				-
			
 
				-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
			
 
				-{
			
 
				-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
			
 
				-		wake_up_all(&cgroup_rmdir_waitq);
			
 
				-}
			
 
				-
			
 
				-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
			
 
				-{
			
 
				-	css_get(css);
			
 
				-}
			
 
				-
			
 
				-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
			
 
				-{
			
 
				-	cgroup_wakeup_rmdir_waiter(css->cgroup);
			
 
				-	css_put(css);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Call with cgroup_mutex held. Drops reference counts on modules, including
			
 
				  * any duplicate ones that parse_cgroupfs_options took. If this function
			
@@ -2026,12 +1975,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
				 	}
			
 
				 
			
 
				 	synchronize_rcu();
			
 
				-
			
 
				-	/*
			
 
				-	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
			
 
				-	 * is no longer empty.
			
 
				-	 */
			
 
				-	cgroup_wakeup_rmdir_waiter(cgrp);
			
 
				 out:
			
 
				 	if (retval) {
			
 
				 		for_each_subsys(root, ss) {
			
@@ -2201,7 +2144,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 
				 	 * step 5: success! and cleanup
			
 
				 	 */
			
 
				 	synchronize_rcu();
			
 
				-	cgroup_wakeup_rmdir_waiter(cgrp);
			
 
				 	retval = 0;
			
 
				 out_put_css_set_refs:
			
 
				 	if (retval) {
			
@@ -4023,14 +3965,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 
				 	cgrp->subsys[ss->subsys_id] = css;
			
 
				 
			
 
				 	/*
			
 
				-	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
			
 
				-	 * which is put on the last css_put().  dput() requires process
			
 
				-	 * context, which css_put() may be called without.  @css->dput_work
			
 
				-	 * will be used to invoke dput() asynchronously from css_put().
			
 
				+	 * css holds an extra ref to @cgrp->dentry which is put on the last
			
 
				+	 * css_put().  dput() requires process context, which css_put() may
			
 
				+	 * be called without.  @css->dput_work will be used to invoke
			
 
				+	 * dput() asynchronously from css_put().
			
 
				 	 */
			
 
				 	INIT_WORK(&css->dput_work, css_dput_fn);
			
 
				-	if (ss->__DEPRECATED_clear_css_refs)
			
 
				-		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -4054,6 +3994,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
				 	if (!cgrp)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				+	/*
			
 
				+	 * Only live parents can have children.  Note that the liveliness
			
 
				+	 * check isn't strictly necessary because cgroup_mkdir() and
			
 
				+	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
			
 
				+	 * anyway so that locking is contained inside cgroup proper and we
			
 
				+	 * don't get nasty surprises if we ever grow another caller.
			
 
				+	 */
			
 
				+	if (!cgroup_lock_live_group(parent)) {
			
 
				+		err = -ENODEV;
			
 
				+		goto err_free;
			
 
				+	}
			
 
				+
			
 
				 	/* Grab a reference on the superblock so the hierarchy doesn't
			
 
				 	 * get deleted on unmount if there are child cgroups.  This
			
 
				 	 * can be done outside cgroup_mutex, since the sb can't
			
@@ -4061,8 +4013,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
				 	 * fs */
			
 
				 	atomic_inc(&sb->s_active);
			
 
				 
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-
			
 
				 	init_cgroup_housekeeping(cgrp);
			
 
				 
			
 
				 	cgrp->parent = parent;
			
@@ -4110,10 +4060,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
				 	if (err < 0)
			
 
				 		goto err_remove;
			
 
				 
			
 
				-	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
			
 
				+	/* each css holds a ref to the cgroup's dentry */
			
 
				 	for_each_subsys(root, ss)
			
 
				-		if (!ss->__DEPRECATED_clear_css_refs)
			
 
				-			dget(dentry);
			
 
				+		dget(dentry);
			
 
				 
			
 
				 	/* The cgroup directory was pre-locked for us */
			
 
				 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
			
@@ -4144,7 +4093,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
				 
			
 
				 	/* Release the reference count that we took on the superblock */
			
 
				 	deactivate_super(sb);
			
 
				-
			
 
				+err_free:
			
 
				 	kfree(cgrp);
			
 
				 	return err;
			
 
				 }
			
@@ -4198,71 +4147,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Atomically mark all (or else none) of the cgroup's CSS objects as
			
 
				- * CSS_REMOVED. Return true on success, or false if the cgroup has
			
 
				- * busy subsystems. Call with cgroup_mutex held
			
 
				- *
			
 
				- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
			
 
				- * not, cgroup removal behaves differently.
			
 
				- *
			
 
				- * If clear is set, css refcnt for the subsystem should be zero before
			
 
				- * cgroup removal can be committed.  This is implemented by
			
 
				- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
			
 
				- * called multiple times until all css refcnts reach zero and is allowed to
			
 
				- * veto removal on any invocation.  This behavior is deprecated and will be
			
 
				- * removed as soon as the existing user (memcg) is updated.
			
 
				- *
			
 
				- * If clear is not set, each css holds an extra reference to the cgroup's
			
 
				- * dentry and cgroup removal proceeds regardless of css refs.
			
 
				- * ->pre_destroy() will be called at least once and is not allowed to fail.
			
 
				- * On the last put of each css, whenever that may be, the extra dentry ref
			
 
				- * is put so that dentry destruction happens only after all css's are
			
 
				- * released.
			
 
				- */
			
 
				-static int cgroup_clear_css_refs(struct cgroup *cgrp)
			
 
				-{
			
 
				-	struct cgroup_subsys *ss;
			
 
				-	unsigned long flags;
			
 
				-	bool failed = false;
			
 
				-
			
 
				-	local_irq_save(flags);
			
 
				-
			
 
				-	/*
			
 
				-	 * Block new css_tryget() by deactivating refcnt.  If all refcnts
			
 
				-	 * for subsystems w/ clear_css_refs set were 1 at the moment of
			
 
				-	 * deactivation, we succeeded.
			
 
				-	 */
			
 
				-	for_each_subsys(cgrp->root, ss) {
			
 
				-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
			
 
				-
			
 
				-		WARN_ON(atomic_read(&css->refcnt) < 0);
			
 
				-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
			
 
				-
			
 
				-		if (ss->__DEPRECATED_clear_css_refs)
			
 
				-			failed |= css_refcnt(css) != 1;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * If succeeded, set REMOVED and put all the base refs; otherwise,
			
 
				-	 * restore refcnts to positive values.  Either way, all in-progress
			
 
				-	 * css_tryget() will be released.
			
 
				-	 */
			
 
				-	for_each_subsys(cgrp->root, ss) {
			
 
				-		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
			
 
				-
			
 
				-		if (!failed) {
			
 
				-			set_bit(CSS_REMOVED, &css->flags);
			
 
				-			css_put(css);
			
 
				-		} else {
			
 
				-			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	local_irq_restore(flags);
			
 
				-	return !failed;
			
 
				-}
			
 
				-
			
 
				 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
			
 
				 {
			
 
				 	struct cgroup *cgrp = dentry->d_fsdata;
			
@@ -4270,70 +4154,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
				 	struct cgroup *parent;
			
 
				 	DEFINE_WAIT(wait);
			
 
				 	struct cgroup_event *event, *tmp;
			
 
				-	int ret;
			
 
				+	struct cgroup_subsys *ss;
			
 
				 
			
 
				 	/* the vfs holds both inode->i_mutex already */
			
 
				-again:
			
 
				 	mutex_lock(&cgroup_mutex);
			
 
				-	if (atomic_read(&cgrp->count) != 0) {
			
 
				-		mutex_unlock(&cgroup_mutex);
			
 
				-		return -EBUSY;
			
 
				-	}
			
 
				-	if (!list_empty(&cgrp->children)) {
			
 
				+	parent = cgrp->parent;
			
 
				+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
			
 
				 		mutex_unlock(&cgroup_mutex);
			
 
				 		return -EBUSY;
			
 
				 	}
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				 
			
 
				 	/*
			
 
				-	 * In general, subsystem has no css->refcnt after pre_destroy(). But
			
 
				-	 * in racy cases, subsystem may have to get css->refcnt after
			
 
				-	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
			
 
				-	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
			
 
				-	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
			
 
				-	 * and subsystem's reference count handling. Please see css_get/put
			
 
				-	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
			
 
				+	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
			
 
				+	 * removed.  This makes future css_tryget() and child creation
			
 
				+	 * attempts fail thus maintaining the removal conditions verified
			
 
				+	 * above.
			
 
				 	 */
			
 
				-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
			
 
				+	for_each_subsys(cgrp->root, ss) {
			
 
				+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
			
 
				 
			
 
				-	/*
			
 
				-	 * Call pre_destroy handlers of subsys. Notify subsystems
			
 
				-	 * that rmdir() request comes.
			
 
				-	 */
			
 
				-	ret = cgroup_call_pre_destroy(cgrp);
			
 
				-	if (ret) {
			
 
				-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
			
 
				-		return ret;
			
 
				+		WARN_ON(atomic_read(&css->refcnt) < 0);
			
 
				+		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
			
 
				 	}
			
 
				+	set_bit(CGRP_REMOVED, &cgrp->flags);
			
 
				 
			
 
				+	/*
			
 
				+	 * Tell subsystems to initiate destruction.  pre_destroy() should be
			
 
				+	 * called with cgroup_mutex unlocked.  See 3fa59dfbc3 ("cgroup: fix
			
 
				+	 * potential deadlock in pre_destroy") for details.
			
 
				+	 */
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				+	for_each_subsys(cgrp->root, ss)
			
 
				+		if (ss->pre_destroy)
			
 
				+			ss->pre_destroy(cgrp);
			
 
				 	mutex_lock(&cgroup_mutex);
			
 
				-	parent = cgrp->parent;
			
 
				-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
			
 
				-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
			
 
				-		mutex_unlock(&cgroup_mutex);
			
 
				-		return -EBUSY;
			
 
				-	}
			
 
				-	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
			
 
				-	if (!cgroup_clear_css_refs(cgrp)) {
			
 
				-		mutex_unlock(&cgroup_mutex);
			
 
				-		/*
			
 
				-		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
			
 
				-		 * prepare_to_wait(), we need to check this flag.
			
 
				-		 */
			
 
				-		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
			
 
				-			schedule();
			
 
				-		finish_wait(&cgroup_rmdir_waitq, &wait);
			
 
				-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
			
 
				-		if (signal_pending(current))
			
 
				-			return -EINTR;
			
 
				-		goto again;
			
 
				-	}
			
 
				-	/* NO css_tryget() can success after here. */
			
 
				-	finish_wait(&cgroup_rmdir_waitq, &wait);
			
 
				-	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
			
 
				+
			
 
				+	/*
			
 
				+	 * Put all the base refs.  Each css holds an extra reference to the
			
 
				+	 * cgroup's dentry and cgroup removal proceeds regardless of css
			
 
				+	 * refs.  On the last put of each css, whenever that may be, the
			
 
				+	 * extra dentry ref is put so that dentry destruction happens only
			
 
				+	 * after all css's are released.
			
 
				+	 */
			
 
				+	for_each_subsys(cgrp->root, ss)
			
 
				+		css_put(cgrp->subsys[ss->subsys_id]);
			
 
				 
			
 
				 	raw_spin_lock(&release_list_lock);
			
 
				-	set_bit(CGRP_REMOVED, &cgrp->flags);
			
 
				 	if (!list_empty(&cgrp->release_list))
			
 
				 		list_del_init(&cgrp->release_list);
			
 
				 	raw_spin_unlock(&release_list_lock);
			
@@ -5041,15 +4907,17 @@ static void check_for_release(struct cgroup *cgrp)
 
				 /* Caller must verify that the css is not for root cgroup */
			
 
				 bool __css_tryget(struct cgroup_subsys_state *css)
			
 
				 {
			
 
				-	do {
			
 
				-		int v = css_refcnt(css);
			
 
				+	while (true) {
			
 
				+		int t, v;
			
 
				 
			
 
				-		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
			
 
				+		v = css_refcnt(css);
			
 
				+		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
			
 
				+		if (likely(t == v))
			
 
				 			return true;
			
 
				+		else if (t < 0)
			
 
				+			return false;
			
 
				 		cpu_relax();
			
 
				-	} while (!test_bit(CSS_REMOVED, &css->flags));
			
 
				-
			
 
				-	return false;
			
 
				+	}
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(__css_tryget);
			
 
				 
			
@@ -5068,11 +4936,9 @@ void __css_put(struct cgroup_subsys_state *css)
 
				 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
			
 
				 			check_for_release(cgrp);
			
 
				 		}
			
 
				-		cgroup_wakeup_rmdir_waiter(cgrp);
			
 
				 		break;
			
 
				 	case 0:
			
 
				-		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
			
 
				-			schedule_work(&css->dput_work);
			
 
				+		schedule_work(&css->dput_work);
			
 
				 		break;
			
 
				 	}
			
 
				 	rcu_read_unlock();
			
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -155,18 +155,13 @@ out:
 
				  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
			
 
				  * the parent cgroup.
			
 
				  */
			
 
				-static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
			
 
				+static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
			
 
				 {
			
 
				 	struct hstate *h;
			
 
				 	struct page *page;
			
 
				-	int ret = 0, idx = 0;
			
 
				+	int idx = 0;
			
 
				 
			
 
				 	do {
			
 
				-		if (cgroup_task_count(cgroup) ||
			
 
				-		    !list_empty(&cgroup->children)) {
			
 
				-			ret = -EBUSY;
			
 
				-			goto out;
			
 
				-		}
			
 
				 		for_each_hstate(h) {
			
 
				 			spin_lock(&hugetlb_lock);
			
 
				 			list_for_each_entry(page, &h->hugepage_activelist, lru)
			
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
 
				 		}
			
 
				 		cond_resched();
			
 
				 	} while (hugetlb_cgroup_have_usage(cgroup));
			
 
				-out:
			
 
				-	return ret;
			
 
				 }
			
 
				 
			
 
				 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
			
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2337,7 +2337,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
				 again:
			
 
				 	if (*ptr) { /* css should be a valid one */
			
 
				 		memcg = *ptr;
			
 
				-		VM_BUG_ON(css_is_removed(&memcg->css));
			
 
				 		if (mem_cgroup_is_root(memcg))
			
 
				 			goto done;
			
 
				 		if (nr_pages == 1 && consume_stock(memcg))
			
@@ -2477,9 +2476,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 
				 
			
 
				 /*
			
 
				  * A helper function to get mem_cgroup from ID. must be called under
			
 
				- * rcu_read_lock(). The caller must check css_is_removed() or some if
			
 
				- * it's concern. (dropping refcnt from swap can be called against removed
			
 
				- * memcg.)
			
 
				+ * rcu_read_lock().  The caller is responsible for calling css_tryget if
			
 
				+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
			
 
				+ * called against removed memcg.)
			
 
				  */
			
 
				 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
			
 
				 {
			
@@ -2676,13 +2675,6 @@ static int mem_cgroup_move_account(struct page *page,
 
				 	/* caller should have done css_get */
			
 
				 	pc->mem_cgroup = to;
			
 
				 	mem_cgroup_charge_statistics(to, anon, nr_pages);
			
 
				-	/*
			
 
				-	 * We charges against "to" which may not have any tasks. Then, "to"
			
 
				-	 * can be under rmdir(). But in current implementation, caller of
			
 
				-	 * this function is just force_empty() and move charge, so it's
			
 
				-	 * guaranteed that "to" is never removed. So, we don't check rmdir
			
 
				-	 * status here.
			
 
				-	 */
			
 
				 	move_unlock_mem_cgroup(from, &flags);
			
 
				 	ret = 0;
			
 
				 unlock:
			
@@ -2696,10 +2688,27 @@ out:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * move charges to its parent.
			
 
				+/**
			
 
				+ * mem_cgroup_move_parent - moves page to the parent group
			
 
				+ * @page: the page to move
			
 
				+ * @pc: page_cgroup of the page
			
 
				+ * @child: page's cgroup
			
 
				+ *
			
 
				+ * move charges to its parent or the root cgroup if the group has no
			
 
				+ * parent (aka use_hierarchy==0).
			
 
				+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
			
 
				+ * mem_cgroup_move_account fails) the failure is always temporary and
			
 
				+ * it signals a race with a page removal/uncharge or migration. In the
			
 
				+ * first case the page is on the way out and it will vanish from the LRU
			
 
				+ * on the next attempt and the call should be retried later.
			
 
				+ * Isolation from the LRU fails only if page has been isolated from
			
 
				+ * the LRU since we looked at it and that usually means either global
			
 
				+ * reclaim or migration going on. The page will either get back to the
			
 
				+ * LRU or vanish.
			
 
				+ * Finally mem_cgroup_move_account fails only if the page got uncharged
			
 
				+ * (!PageCgroupUsed) or moved to a different group. The page will
			
 
				+ * disappear in the next attempt.
			
 
				  */
			
 
				-
			
 
				 static int mem_cgroup_move_parent(struct page *page,
			
 
				 				  struct page_cgroup *pc,
			
 
				 				  struct mem_cgroup *child)
			
@@ -2709,9 +2718,7 @@ static int mem_cgroup_move_parent(struct page *page,
 
				 	unsigned long uninitialized_var(flags);
			
 
				 	int ret;
			
 
				 
			
 
				-	/* Is ROOT ? */
			
 
				-	if (mem_cgroup_is_root(child))
			
 
				-		return -EINVAL;
			
 
				+	VM_BUG_ON(mem_cgroup_is_root(child));
			
 
				 
			
 
				 	ret = -EBUSY;
			
 
				 	if (!get_page_unless_zero(page))
			
@@ -2728,8 +2735,10 @@ static int mem_cgroup_move_parent(struct page *page,
 
				 	if (!parent)
			
 
				 		parent = root_mem_cgroup;
			
 
				 
			
 
				-	if (nr_pages > 1)
			
 
				+	if (nr_pages > 1) {
			
 
				+		VM_BUG_ON(!PageTransHuge(page));
			
 
				 		flags = compound_lock_irqsave(page);
			
 
				+	}
			
 
				 
			
 
				 	ret = mem_cgroup_move_account(page, nr_pages,
			
 
				 				pc, child, parent);
			
@@ -2871,7 +2880,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 
				 		return;
			
 
				 	if (!memcg)
			
 
				 		return;
			
 
				-	cgroup_exclude_rmdir(&memcg->css);
			
 
				 
			
 
				 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
			
 
				 	/*
			
@@ -2885,12 +2893,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 
				 		swp_entry_t ent = {.val = page_private(page)};
			
 
				 		mem_cgroup_uncharge_swap(ent);
			
 
				 	}
			
 
				-	/*
			
 
				-	 * At swapin, we may charge account against cgroup which has no tasks.
			
 
				-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
			
 
				-	 * In that case, we need to call pre_destroy() again. check it here.
			
 
				-	 */
			
 
				-	cgroup_release_and_wakeup_rmdir(&memcg->css);
			
 
				 }
			
 
				 
			
 
				 void mem_cgroup_commit_charge_swapin(struct page *page,
			
@@ -3338,8 +3340,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 
				 
			
 
				 	if (!memcg)
			
 
				 		return;
			
 
				-	/* blocks rmdir() */
			
 
				-	cgroup_exclude_rmdir(&memcg->css);
			
 
				+
			
 
				 	if (!migration_ok) {
			
 
				 		used = oldpage;
			
 
				 		unused = newpage;
			
@@ -3373,13 +3374,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 
				 	 */
			
 
				 	if (anon)
			
 
				 		mem_cgroup_uncharge_page(used);
			
 
				-	/*
			
 
				-	 * At migration, we may charge account against cgroup which has no
			
 
				-	 * tasks.
			
 
				-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
			
 
				-	 * In that case, we need to call pre_destroy() again. check it here.
			
 
				-	 */
			
 
				-	cgroup_release_and_wakeup_rmdir(&memcg->css);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3679,17 +3673,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 
				 	return nr_reclaimed;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				+/**
			
 
				+ * mem_cgroup_force_empty_list - clears LRU of a group
			
 
				+ * @memcg: group to clear
			
 
				+ * @node: NUMA node
			
 
				+ * @zid: zone id
			
 
				+ * @lru: lru to clear
			
 
				+ *
			
 
				  * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
			
 
				- * reclaim the pages page themselves - it just removes the page_cgroups.
			
 
				- * Returns true if some page_cgroups were not freed, indicating that the caller
			
 
				- * must retry this operation.
			
 
				+ * reclaim the pages themselves - pages are moved to the parent (or root)
			
 
				+ * group.
			
 
				  */
			
 
				-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
			
 
				+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
			
 
				 				int node, int zid, enum lru_list lru)
			
 
				 {
			
 
				 	struct mem_cgroup_per_zone *mz;
			
 
				-	unsigned long flags, loop;
			
 
				+	unsigned long flags;
			
 
				 	struct list_head *list;
			
 
				 	struct page *busy;
			
 
				 	struct zone *zone;
			
@@ -3698,11 +3697,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 
				 	mz = mem_cgroup_zoneinfo(memcg, node, zid);
			
 
				 	list = &mz->lruvec.lists[lru];
			
 
				 
			
 
				-	loop = mz->lru_size[lru];
			
 
				-	/* give some margin against EBUSY etc...*/
			
 
				-	loop += 256;
			
 
				 	busy = NULL;
			
 
				-	while (loop--) {
			
 
				+	do {
			
 
				 		struct page_cgroup *pc;
			
 
				 		struct page *page;
			
 
				 
			
@@ -3728,76 +3724,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 
				 			cond_resched();
			
 
				 		} else
			
 
				 			busy = NULL;
			
 
				-	}
			
 
				-	return !list_empty(list);
			
 
				+	} while (!list_empty(list));
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * make mem_cgroup's charge to be 0 if there is no task.
			
 
				+ * make mem_cgroup's charge to be 0 if there is no task by moving
			
 
				+ * all the charges and pages to the parent.
			
 
				  * This enables deleting this mem_cgroup.
			
 
				+ *
			
 
				+ * Caller is responsible for holding css reference on the memcg.
			
 
				  */
			
 
				-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
			
 
				+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
			
 
				 {
			
 
				-	int ret;
			
 
				-	int node, zid, shrink;
			
 
				-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
			
 
				-	struct cgroup *cgrp = memcg->css.cgroup;
			
 
				-
			
 
				-	css_get(&memcg->css);
			
 
				+	int node, zid;
			
 
				 
			
 
				-	shrink = 0;
			
 
				-	/* should free all ? */
			
 
				-	if (free_all)
			
 
				-		goto try_to_free;
			
 
				-move_account:
			
 
				 	do {
			
 
				-		ret = -EBUSY;
			
 
				-		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			
 
				-			goto out;
			
 
				 		/* This is for making all *used* pages to be on LRU. */
			
 
				 		lru_add_drain_all();
			
 
				 		drain_all_stock_sync(memcg);
			
 
				-		ret = 0;
			
 
				 		mem_cgroup_start_move(memcg);
			
 
				 		for_each_node_state(node, N_HIGH_MEMORY) {
			
 
				-			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
			
 
				+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			
 
				 				enum lru_list lru;
			
 
				 				for_each_lru(lru) {
			
 
				-					ret = mem_cgroup_force_empty_list(memcg,
			
 
				+					mem_cgroup_force_empty_list(memcg,
			
 
				 							node, zid, lru);
			
 
				-					if (ret)
			
 
				-						break;
			
 
				 				}
			
 
				 			}
			
 
				-			if (ret)
			
 
				-				break;
			
 
				 		}
			
 
				 		mem_cgroup_end_move(memcg);
			
 
				 		memcg_oom_recover(memcg);
			
 
				 		cond_resched();
			
 
				-	/* "ret" should also be checked to ensure all lists are empty. */
			
 
				-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
			
 
				-out:
			
 
				-	css_put(&memcg->css);
			
 
				-	return ret;
			
 
				 
			
 
				-try_to_free:
			
 
				+		/*
			
 
				+		 * This is a safety check because mem_cgroup_force_empty_list
			
 
				+		 * could have raced with mem_cgroup_replace_page_cache callers
			
 
				+		 * so the lru seemed empty but the page could have been added
			
 
				+		 * right after the check. RES_USAGE should be safe as we always
			
 
				+		 * charge before adding to the LRU.
			
 
				+		 */
			
 
				+	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Reclaims as many pages from the given memcg as possible and moves
			
 
				+ * the rest to the parent.
			
 
				+ *
			
 
				+ * Caller is responsible for holding css reference for memcg.
			
 
				+ */
			
 
				+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
			
 
				+{
			
 
				+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
			
 
				+	struct cgroup *cgrp = memcg->css.cgroup;
			
 
				+
			
 
				 	/* returns EBUSY if there is a task or if we come here twice. */
			
 
				-	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
			
 
				-		ret = -EBUSY;
			
 
				-		goto out;
			
 
				-	}
			
 
				+	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			
 
				+		return -EBUSY;
			
 
				+
			
 
				 	/* we call try-to-free pages for make this cgroup empty */
			
 
				 	lru_add_drain_all();
			
 
				 	/* try to free all pages in this cgroup */
			
 
				-	shrink = 1;
			
 
				 	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
			
 
				 		int progress;
			
 
				 
			
 
				-		if (signal_pending(current)) {
			
 
				-			ret = -EINTR;
			
 
				-			goto out;
			
 
				-		}
			
 
				+		if (signal_pending(current))
			
 
				+			return -EINTR;
			
 
				+
			
 
				 		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
			
 
				 						false);
			
 
				 		if (!progress) {
			
@@ -3808,13 +3800,23 @@ try_to_free:
 
				 
			
 
				 	}
			
 
				 	lru_add_drain();
			
 
				-	/* try move_account...there may be some *locked* pages. */
			
 
				-	goto move_account;
			
 
				+	mem_cgroup_reparent_charges(memcg);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
			
 
				 {
			
 
				-	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
			
 
				+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
			
 
				+	int ret;
			
 
				+
			
 
				+	if (mem_cgroup_is_root(memcg))
			
 
				+		return -EINVAL;
			
 
				+	css_get(&memcg->css);
			
 
				+	ret = mem_cgroup_force_empty(memcg);
			
 
				+	css_put(&memcg->css);
			
 
				+
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 
			
@@ -5001,11 +5003,11 @@ free_out:
 
				 	return ERR_PTR(error);
			
 
				 }
			
 
				 
			
 
				-static int mem_cgroup_pre_destroy(struct cgroup *cont)
			
 
				+static void mem_cgroup_pre_destroy(struct cgroup *cont)
			
 
				 {
			
 
				 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
			
 
				 
			
 
				-	return mem_cgroup_force_empty(memcg, false);
			
 
				+	mem_cgroup_reparent_charges(memcg);
			
 
				 }
			
 
				 
			
 
				 static void mem_cgroup_destroy(struct cgroup *cont)
			
@@ -5607,7 +5609,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
 
				 	.base_cftypes = mem_cgroup_files,
			
 
				 	.early_init = 0,
			
 
				 	.use_id = 1,
			
 
				-	.__DEPRECATED_clear_css_refs = true,
			
 
				 };
			
 
				 
			
 
				 #ifdef CONFIG_MEMCG_SWAP