
Merge branch 'blkcg-cfq-hierarchy' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-3.9/core

Tejun writes:

Hello, Jens.

Please consider pulling from the following branch to receive cfq blkcg
hierarchy support.  The branch is based on top of v3.8-rc2.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git blkcg-cfq-hierarchy

The patchset was reviewed in the following thread.

  http://thread.gmane.org/gmane.linux.kernel.cgroups/5571
Jens Axboe, 12 years ago, commit ac9a197451

+ 58 - 0
Documentation/block/cfq-iosched.txt

@@ -102,6 +102,64 @@ processing of request. Therefore, increasing the value can improve the
 performance although this can cause the latency of some I/O to increase due
 to a larger number of requests.
 
+CFQ Group scheduling
+====================
+
+CFQ supports blkio cgroup and has "blkio." prefixed files in each
+blkio cgroup directory. It is weight-based and there are four knobs
+for configuration - weight[_device] and leaf_weight[_device].
+Internal cgroup nodes (the ones with children) can also have tasks in
+them, so the former two configure what proportion the cgroup as a
+whole is entitled to at its parent's level, while the latter two
+configure what proportion the tasks in the cgroup get compared to the
+cgroup's direct children.
+
+Another way to think about it is that each internal node has an
+implicit leaf child node which hosts all of its tasks and whose weight
+is configured by leaf_weight[_device]. Let's assume a blkio hierarchy
+composed of five cgroups - root, A, B, AA and AB - with the following
+weights where the names represent the hierarchy.
+
+        weight leaf_weight
+ root :  125    125
+ A    :  500    750
+ B    :  250    500
+ AA   :  500    500
+ AB   : 1000    500
+
+root never has a parent, so its weight is meaningless. For backward
+compatibility, weight is always kept in sync with leaf_weight. B, AA
+and AB have no children and thus their tasks have no child cgroups to
+compete with. They always get 100% of what the cgroup won at the
+parent level. Considering only the weights which matter, the hierarchy
+looks like the following.
+
+          root
+       /    |   \
+      A     B    leaf
+     500   250   125
+   /  |  \
+  AA  AB  leaf
+ 500 1000 750
+
+If all cgroups have active IOs and are competing with each other, disk
+time will be distributed like the following.
+
+Distribution below root. The total active weight at this level is
+A:500 + B:250 + root-leaf:125 = 875.
+
+ root-leaf :   125 /  875      =~ 14%
+ A         :   500 /  875      =~ 57%
+ B(-leaf)  :   250 /  875      =~ 28%
+
+A has children and further distributes its 57% among the children and
+the implicit leaf node. The total active weight at this level is
+AA:500 + AB:1000 + A-leaf:750 = 2250.
+
+ A-leaf    : ( 750 / 2250) * A =~ 19%
+ AA(-leaf) : ( 500 / 2250) * A =~ 12%
+ AB(-leaf) : (1000 / 2250) * A =~ 25%
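To make the arithmetic above concrete, here is a minimal userspace C sketch, not part of the patch, that recomputes these shares for the example hierarchy; all variable names are made up for illustration.

#include <stdio.h>

int main(void)
{
	/* level below root: A, B and root's implicit leaf */
	double a_w = 500, b_w = 250, root_leaf_w = 125;
	double top = a_w + b_w + root_leaf_w;	/* 875 */
	double a_share = a_w / top;		/* ~57% */

	/* level below A: AA, AB and A's implicit leaf (leaf_weight) */
	double aa_w = 500, ab_w = 1000, a_leaf_w = 750;
	double sub = aa_w + ab_w + a_leaf_w;	/* 2250 */

	printf("root-leaf: %4.1f%%\n", 100 * root_leaf_w / top);
	printf("A        : %4.1f%%\n", 100 * a_share);
	printf("B        : %4.1f%%\n", 100 * b_w / top);
	printf("A-leaf   : %4.1f%%\n", 100 * a_share * a_leaf_w / sub);
	printf("AA       : %4.1f%%\n", 100 * a_share * aa_w / sub);
	printf("AB       : %4.1f%%\n", 100 * a_share * ab_w / sub);
	return 0;
}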
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority

+ 24 - 11
Documentation/cgroups/blkio-controller.txt

@@ -94,13 +94,11 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently none of the IO control policy supports hierarchical groups. But
-  cgroup interface does allow creation of hierarchical cgroups and internally
-  IO policies treat them as flat hierarchy.
+- Currently only CFQ supports hierarchical groups. For throttling,
+  the cgroup interface does allow creation of hierarchical cgroups,
+  but internally it treats them as a flat hierarchy.
 
-  So this patch will allow creation of cgroup hierarchcy but at the backend
-  everything will be treated as flat. So if somebody created a hierarchy like
-  as follows.
+  If somebody created a hierarchy like the following.
 
 			root
 			/  \
@@ -108,16 +106,20 @@ Hierarchical Cgroups
 			|
 		     test3
 
-  CFQ and throttling will practically treat all groups at same level.
+  CFQ will handle the hierarchy correctly but throttling will
+  practically treat all groups at the same level. For details on CFQ
+  hierarchy support, refer to Documentation/block/cfq-iosched.txt.
+  Throttling will treat the hierarchy as if it looks like the
+  following.
 
 				pivot
 			     /  /   \  \
 			root  test1 test2  test3
 
-  Down the line we can implement hierarchical accounting/control support
-  and also introduce a new cgroup file "use_hierarchy" which will control
-  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
-  This is how memory controller also has implemented the things.
+  Nesting cgroups, while allowed, isn't officially supported and blkio
+  generates a warning when cgroups nest. Once throttling implements
+  hierarchy support, nested cgroups will be fully supported and the
+  warning will be removed.
 
 Various user visible config options
 ===================================
@@ -172,6 +174,12 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight the tasks in the given cgroup have
+          while competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
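As a rough usage sketch, not part of the patch, the two knobs could be set from userspace as below; the /sys/fs/cgroup/blkio mount point and the "grp" cgroup are assumptions for illustration.

#include <stdio.h>

/* write a single value to a cgroup control file */
static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* share of "grp" when competing with its sibling cgroups */
	write_knob("/sys/fs/cgroup/blkio/grp/blkio.weight", "500");
	/* share of grp's own tasks when competing with grp's children */
	write_knob("/sys/fs/cgroup/blkio/grp/blkio.leaf_weight", "750");
	return 0;
}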
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -279,6 +287,11 @@ Proportional weight policy files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
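For instance, a sketch along the following lines could compare a stat file with its _recursive counterpart, again assuming a /sys/fs/cgroup/blkio mount point and an existing "grp" cgroup.

#include <stdio.h>

/* print the contents of one cgroup stat file */
static void dump(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);
	fclose(f);
}

int main(void)
{
	/* stats for IOs issued from "grp" itself */
	dump("/sys/fs/cgroup/blkio/grp/blkio.io_serviced");
	/* same stats including all descendant cgroups of "grp" */
	dump("/sys/fs/cgroup/blkio/grp/blkio.io_serviced_recursive");
	return 0;
}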
 Throttling/Upper limit policy files
 -----------------------------------
 - blkio.throttle.read_bps_device

+ 236 - 41
block/blk-cgroup.c

@@ -26,11 +26,32 @@
 
 static DEFINE_MUTEX(blkcg_pol_mutex);
 
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
+struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
+			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+				      struct request_queue *q, bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
+ * subtree.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
+	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+					      (p_blkg)->q, false)))
+
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
+		pd->plid = i;
 
 		/* invoke per-policy init */
-		if (blkcg_policy_enabled(blkg->q, pol))
+		if (pol->pd_init_fn)
 			pol->pd_init_fn(blkg);
 	}
 
@@ -125,8 +147,19 @@ err_free:
 	return NULL;
 }
 
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is the internal version and shouldn't be used by policy
+ * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state.  If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
-				      struct request_queue *q)
+				      struct request_queue *q, bool update_hint)
 {
 	struct blkcg_gq *blkg;
 
@@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 		return blkg;
 
 	/*
-	 * Hint didn't match.  Look up from the radix tree.  Note that we
-	 * may not be holding queue_lock and thus are not sure whether
-	 * @blkg from blkg_tree has already been removed or not, so we
-	 * can't update hint to the lookup result.  Leave it to the caller.
+	 * Hint didn't match.  Look up from the radix tree.  Note that the
+	 * hint can only be updated under queue_lock as otherwise @blkg
+	 * could have already been removed from blkg_tree.  The caller is
+	 * responsible for grabbing queue_lock if @update_hint.
 	 */
 	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
-	if (blkg && blkg->q == q)
+	if (blkg && blkg->q == q) {
+		if (update_hint) {
+			lockdep_assert_held(q->queue_lock);
+			rcu_assign_pointer(blkcg->blkg_hint, blkg);
+		}
 		return blkg;
+	}
 
 	return NULL;
 }
@@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 
 	if (unlikely(blk_queue_bypass(q)))
 		return NULL;
-	return __blkg_lookup(blkcg, q);
+	return __blkg_lookup(blkcg, q, false);
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
 
@@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
  * If @new_blkg is %NULL, this function tries to allocate a new one as
  * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
  */
-static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
-					     struct request_queue *q,
-					     struct blkcg_gq *new_blkg)
+static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
+				    struct request_queue *q,
+				    struct blkcg_gq *new_blkg)
 {
 	struct blkcg_gq *blkg;
-	int ret;
+	int i, ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
-	/* lookup and update hint on success, see __blkg_lookup() for details */
-	blkg = __blkg_lookup(blkcg, q);
-	if (blkg) {
-		rcu_assign_pointer(blkcg->blkg_hint, blkg);
-		goto out_free;
-	}
-
 	/* blkg holds a reference to blkcg */
 	if (!css_tryget(&blkcg->css)) {
-		blkg = ERR_PTR(-EINVAL);
-		goto out_free;
+		ret = -EINVAL;
+		goto err_free_blkg;
 	}
 
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
 		if (unlikely(!new_blkg)) {
-			blkg = ERR_PTR(-ENOMEM);
-			goto out_put;
+			ret = -ENOMEM;
+			goto err_put_css;
 		}
 	}
 	blkg = new_blkg;
 
-	/* insert */
+	/* link parent and insert */
+	if (blkcg_parent(blkcg)) {
+		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
+		if (WARN_ON_ONCE(!blkg->parent)) {
+			blkg = ERR_PTR(-EINVAL);
+			goto err_put_css;
+		}
+		blkg_get(blkg->parent);
+	}
+
 	spin_lock(&blkcg->lock);
 	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 	if (likely(!ret)) {
 		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 		list_add(&blkg->q_node, &q->blkg_list);
+
+		for (i = 0; i < BLKCG_MAX_POLS; i++) {
+			struct blkcg_policy *pol = blkcg_policy[i];
+
+			if (blkg->pd[i] && pol->pd_online_fn)
+				pol->pd_online_fn(blkg);
+		}
 	}
+	blkg->online = true;
 	spin_unlock(&blkcg->lock);
 
 	if (!ret)
 		return blkg;
 
-	blkg = ERR_PTR(ret);
-out_put:
+	/* @blkg failed to be fully initialized, use the usual release path */
+	blkg_put(blkg);
+	return ERR_PTR(ret);
+
+err_put_css:
 	css_put(&blkcg->css);
-out_free:
+err_free_blkg:
 	blkg_free(new_blkg);
-	return blkg;
+	return ERR_PTR(ret);
 }
 
+/**
+ * blkg_lookup_create - lookup blkg, try to create one if not there
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
+ * create one.  blkg creation is performed recursively from blkcg_root such
+ * that all non-root blkg's have access to the parent blkg.  This function
+ * should be called under RCU read lock and @q->queue_lock.
+ *
+ * Returns pointer to the looked up or created blkg on success, ERR_PTR()
+ * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
+ * dead and bypassing, returns ERR_PTR(-EBUSY).
+ */
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 				    struct request_queue *q)
 {
+	struct blkcg_gq *blkg;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
+
 	/*
 	 * This could be the first entry point of blkcg implementation and
 	 * we shouldn't allow anything to go through for a bypassing queue.
 	 */
 	if (unlikely(blk_queue_bypass(q)))
 		return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
-	return __blkg_lookup_create(blkcg, q, NULL);
+
+	blkg = __blkg_lookup(blkcg, q, true);
+	if (blkg)
+		return blkg;
+
+	/*
+	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
+	 * non-root blkgs have access to their parents.
+	 */
+	while (true) {
+		struct blkcg *pos = blkcg;
+		struct blkcg *parent = blkcg_parent(blkcg);
+
+		while (parent && !__blkg_lookup(parent, q, false)) {
+			pos = parent;
+			parent = blkcg_parent(parent);
+		}
+
+		blkg = blkg_create(pos, q, NULL);
+		if (pos == blkcg || IS_ERR(blkg))
+			return blkg;
+	}
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
+	int i;
 
 	lockdep_assert_held(blkg->q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
@@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && pol->pd_offline_fn)
+			pol->pd_offline_fn(blkg);
+	}
+	blkg->online = false;
+
 	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
@@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head)
 
 void __blkg_release(struct blkcg_gq *blkg)
 {
-	/* release the extra blkcg reference this blkg has been holding */
+	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
+	if (blkg->parent)
+		blkg_put(blkg->parent);
 
 	/*
 	 * A group is freed in rcu manner. But having an rcu lock does not
@@ -402,8 +504,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg)
  *
  * This function invokes @prfill on each blkg of @blkcg if pd for the
  * policy specified by @pol exists.  @prfill is invoked with @sf, the
- * policy data and @data.  If @show_total is %true, the sum of the return
- * values from @prfill is printed with "Total" label at the end.
+ * policy data and @data and the matching queue lock held.  If @show_total
+ * is %true, the sum of the return values from @prfill is printed with
+ * "Total" label at the end.
  *
  * This is to be used to construct print functions for
  * cftype->read_seq_string method.
@@ -418,11 +521,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 	struct hlist_node *n;
 	u64 total = 0;
 
-	spin_lock_irq(&blkcg->lock);
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		spin_lock_irq(blkg->q->queue_lock);
 		if (blkcg_policy_enabled(blkg->q, pol))
 			total += prfill(sf, blkg->pd[pol->plid], data);
-	spin_unlock_irq(&blkcg->lock);
+		spin_unlock_irq(blkg->q->queue_lock);
+	}
+	rcu_read_unlock();
 
 	if (show_total)
 		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
@@ -481,6 +587,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 	return v;
 }
+EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
 
 /**
  * blkg_prfill_stat - prfill callback for blkg_stat
@@ -513,6 +620,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
+/**
+ * blkg_stat_recursive_sum - collect hierarchical blkg_stat
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_stat in @pd
+ *
+ * Collect the blkg_stat specified by @off from @pd and all its online
+ * descendants and return the sum.  The caller must be holding the queue
+ * lock for online tests.
+ */
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	struct blkcg_policy *pol = blkcg_policy[pd->plid];
+	struct blkcg_gq *pos_blkg;
+	struct cgroup *pos_cgrp;
+	u64 sum;
+
+	lockdep_assert_held(pd->blkg->q->queue_lock);
+
+	sum = blkg_stat_read((void *)pd + off);
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
+		struct blkg_stat *stat = (void *)pos_pd + off;
+
+		if (pos_blkg->online)
+			sum += blkg_stat_read(stat);
+	}
+	rcu_read_unlock();
+
+	return sum;
+}
+EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
+
+/**
+ * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_stat in @pd
+ *
+ * Collect the blkg_rwstat specified by @off from @pd and all its online
+ * descendants and return the sum.  The caller must be holding the queue
+ * lock for online tests.
+ */
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off)
+{
+	struct blkcg_policy *pol = blkcg_policy[pd->plid];
+	struct blkcg_gq *pos_blkg;
+	struct cgroup *pos_cgrp;
+	struct blkg_rwstat sum;
+	int i;
+
+	lockdep_assert_held(pd->blkg->q->queue_lock);
+
+	sum = blkg_rwstat_read((void *)pd + off);
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
+		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
+		struct blkg_rwstat tmp;
+
+		if (!pos_blkg->online)
+			continue;
+
+		tmp = blkg_rwstat_read(rwstat);
+
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			sum.cnt[i] += tmp.cnt[i];
+	}
+	rcu_read_unlock();
+
+	return sum;
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
+
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -658,6 +841,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
 		return ERR_PTR(-ENOMEM);
 
 	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
+	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
 	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
@@ -777,7 +961,7 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *blkg, *new_blkg;
 	struct blkg_policy_data *pd, *n;
 	int cnt = 0, ret;
 	bool preloaded;
@@ -786,19 +970,27 @@ int blkcg_activate_policy(struct request_queue *q,
 		return 0;
 
 	/* preallocations for root blkg */
-	blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-	if (!blkg)
+	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+	if (!new_blkg)
 		return -ENOMEM;
 
 	preloaded = !radix_tree_preload(GFP_KERNEL);
 
 	blk_queue_bypass_start(q);
 
-	/* make sure the root blkg exists and count the existing blkgs */
+	/*
+	 * Make sure the root blkg exists and count the existing blkgs.  As
+	 * @q is bypassing at this point, blkg_lookup_create() can't be
+	 * used.  Open code it.
+	 */
 	spin_lock_irq(q->queue_lock);
 
 	rcu_read_lock();
-	blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
+	blkg = __blkg_lookup(&blkcg_root, q, false);
+	if (blkg)
+		blkg_free(new_blkg);
+	else
+		blkg = blkg_create(&blkcg_root, q, new_blkg);
 	rcu_read_unlock();
 
 	if (preloaded)
@@ -846,6 +1038,7 @@ int blkcg_activate_policy(struct request_queue *q,
 
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
+		pd->plid = pol->plid;
 		pol->pd_init_fn(blkg);
 
 		spin_unlock(&blkg->blkcg->lock);
@@ -892,6 +1085,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
+		if (pol->pd_offline_fn)
+			pol->pd_offline_fn(blkg);
 		if (pol->pd_exit_fn)
 			pol->pd_exit_fn(blkg);
 

+ 65 - 3
block/blk-cgroup.h

@@ -54,6 +54,7 @@ struct blkcg {
 
 	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
+	unsigned int			cfq_leaf_weight;
 };
 
 struct blkg_stat {
@@ -80,8 +81,9 @@ struct blkg_rwstat {
  * beginning and pd_size can't be smaller than pd.
  */
 struct blkg_policy_data {
-	/* the blkg this per-policy data belongs to */
+	/* the blkg and policy id this per-policy data belongs to */
 	struct blkcg_gq			*blkg;
+	int				plid;
 
 	/* used during policy activation */
 	struct list_head		alloc_node;
@@ -94,17 +96,27 @@ struct blkcg_gq {
 	struct list_head		q_node;
 	struct hlist_node		blkcg_node;
 	struct blkcg			*blkcg;
+
+	/* all non-root blkcg_gq's are guaranteed to have access to parent */
+	struct blkcg_gq			*parent;
+
 	/* request allocation list for this blkcg-q pair */
 	struct request_list		rl;
+
 	/* reference count */
 	int				refcnt;
 
+	/* is this blkg online? protected by both blkcg and q locks */
+	bool				online;
+
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
 };
 
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
@@ -117,6 +129,8 @@ struct blkcg_policy {
 
 	/* operations */
 	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_online_pd_fn		*pd_online_fn;
+	blkcg_pol_offline_pd_fn		*pd_offline_fn;
 	blkcg_pol_exit_pd_fn		*pd_exit_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
@@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
 
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off);
+
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
 	struct blkcg_gq			*blkg;
@@ -180,6 +198,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return task_blkcg(current);
 }
 
+/**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg.  Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+	struct cgroup *pcg = blkcg->css.cgroup->parent;
+
+	return pcg ? cgroup_to_blkcg(pcg) : NULL;
+}
+
 /**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
@@ -386,6 +417,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat)
 	stat->cnt = 0;
 }
 
+/**
+ * blkg_stat_merge - merge a blkg_stat into another
+ * @to: the destination blkg_stat
+ * @from: the source
+ *
+ * Add @from's count to @to.
+ */
+static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+{
+	blkg_stat_add(to, blkg_stat_read(from));
+}
+
 /**
  * blkg_rwstat_add - add a value to a blkg_rwstat
  * @rwstat: target blkg_rwstat
@@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 }
 
 /**
- * blkg_rwstat_sum - read the total count of a blkg_rwstat
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
  * Return the total count of @rwstat regardless of the IO direction.  This
  * function can be called without synchronization and takes care of u64
  * atomicity.
  */
-static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
 	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
@@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
 }
 
+/**
+ * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's counts to @to.
+ */
+static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+				     struct blkg_rwstat *from)
+{
+	struct blkg_rwstat v = blkg_rwstat_read(from);
+	int i;
+
+	u64_stats_update_begin(&to->syncp);
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		to->cnt[i] += v.cnt[i];
+	u64_stats_update_end(&to->syncp);
+}
+
 #else	/* CONFIG_BLK_CGROUP */
 
 struct cgroup;

+ 8 - 1
block/blk-sysfs.c

@@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	return res;
 }
 
+static void blk_free_queue_rcu(struct rcu_head *rcu_head)
+{
+	struct request_queue *q = container_of(rcu_head, struct request_queue,
+					       rcu_head);
+	kmem_cache_free(blk_requestq_cachep, q);
+}
+
 /**
  * blk_release_queue: - release a &struct request_queue when it is no longer needed
  * @kobj:    the kobj belonging to the request queue to be released
@@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj)
 	bdi_destroy(&q->backing_dev_info);
 
 	ida_simple_remove(&blk_queue_ida, q->id);
-	kmem_cache_free(blk_requestq_cachep, q);
+	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
 static const struct sysfs_ops queue_sysfs_ops = {

File diff suppressed because it is too large
+ 487 - 100
block/cfq-iosched.c


+ 2 - 0
include/linux/blkdev.h

@@ -19,6 +19,7 @@
 #include <linux/gfp.h>
 #include <linux/bsg.h>
 #include <linux/smp.h>
+#include <linux/rcupdate.h>
 
 #include <asm/scatterlist.h>
 
@@ -437,6 +438,7 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+	struct rcu_head		rcu_head;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */

Some files were not shown because too many files changed in this diff