@@ -45,6 +45,7 @@
 #include <linux/hashtable.h>
 #include <linux/rculist.h>
 #include <linux/nodemask.h>
+#include <linux/moduleparam.h>
 
 #include "workqueue_internal.h"
 
@@ -245,6 +246,7 @@ struct workqueue_struct {
         int saved_max_active;                  /* WQ: saved pwq max_active */
 
         struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
+        struct pool_workqueue *dfl_pwq;        /* WQ: only for unbound wqs */
 
 #ifdef CONFIG_SYSFS
         struct wq_device *wq_dev;              /* I: for sysfs interface */
@@ -268,6 +270,9 @@ static cpumask_var_t *wq_numa_possible_cpumask;
 
 static bool wq_numa_enabled;            /* unbound NUMA affinity enabled */
 
+/* buf for wq_update_unbound_numa(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+
 static DEFINE_MUTEX(wq_pool_mutex);     /* protects pools and workqueues list */
 static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
 
@@ -3710,6 +3715,61 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
         return pwq;
 }
 
+/* undo alloc_unbound_pwq(), used only in the error path */
+static void free_unbound_pwq(struct pool_workqueue *pwq)
+{
+        lockdep_assert_held(&wq_pool_mutex);
+
+        if (pwq) {
+                put_unbound_pool(pwq->pool);
+                kfree(pwq);
+        }
+}
+
+/**
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of interest
+ * @node: the target NUMA node
+ * @cpu_going_down: if >= 0, the CPU to consider as offline
+ * @cpumask: outarg, the resulting cpumask
+ *
+ * Calculate the cpumask a workqueue with @attrs should use on @node. If
+ * @cpu_going_down is >= 0, that cpu is considered offline during
+ * calculation. The result is stored in @cpumask. This function returns
+ * %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
+ *
+ * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
+ * enabled and @node has online CPUs requested by @attrs, the returned
+ * cpumask is the intersection of the possible CPUs of @node and
+ * @attrs->cpumask.
+ *
+ * The caller is responsible for ensuring that the cpumask of @node stays
+ * stable.
+ */
+static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
+                                 int cpu_going_down, cpumask_t *cpumask)
+{
+        if (!wq_numa_enabled)
+                goto use_dfl;
+
+        /* does @node have any online CPUs @attrs wants? */
+        cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+        if (cpu_going_down >= 0)
+                cpumask_clear_cpu(cpu_going_down, cpumask);
+
+        if (cpumask_empty(cpumask))
+                goto use_dfl;
+
+        /* yeap, return possible CPUs in @node that @attrs wants */
+        cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+        return !cpumask_equal(cpumask, attrs->cpumask);
+
+use_dfl:
+        cpumask_copy(cpumask, attrs->cpumask);
+        return false;
+}
+
 /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
 static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
                                                    int node,
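
As a rough illustration of wq_calc_node_cpumask(), here is a standalone model in plain C; the two-node layout (CPUs 0-3 on node 0, CPUs 4-7 on node 1) and the mask values are made up for the example and are not taken from the patch.

/* Toy model of the per-node cpumask calculation, plain bitmasks only. */
#include <stdbool.h>
#include <stdio.h>

/* hypothetical topology: node 0 owns CPUs 0-3, node 1 owns CPUs 4-7 */
static const unsigned long node_online[2]   = { 0x0fUL, 0xf0UL };
static const unsigned long node_possible[2] = { 0x0fUL, 0xf0UL };

/* returns true iff the per-node mask differs from the wq's own mask */
static bool calc_node_mask(unsigned long wq_mask, int node, int cpu_going_down,
                           bool numa_enabled, unsigned long *out)
{
        unsigned long online;

        if (!numa_enabled)
                goto use_dfl;

        /* does the node have any online CPUs the wq wants? */
        online = node_online[node] & wq_mask;
        if (cpu_going_down >= 0)
                online &= ~(1UL << cpu_going_down);
        if (!online)
                goto use_dfl;

        /* yes: intersect the wq mask with the node's possible CPUs */
        *out = wq_mask & node_possible[node];
        return *out != wq_mask;

use_dfl:
        *out = wq_mask;
        return false;
}

int main(void)
{
        unsigned long mask;
        bool differs;

        /* wq allowed on CPUs 0-7: node 1 gets its own 0xf0 sub-mask */
        differs = calc_node_mask(0xffUL, 1, -1, true, &mask);
        printf("differs=%d mask=%#lx\n", differs, mask);

        /* wq allowed on CPUs 4-5, CPU 5 going down: mask equals wq mask */
        differs = calc_node_mask(0x30UL, 1, 5, true, &mask);
        printf("differs=%d mask=%#lx\n", differs, mask);

        /* wq allowed on CPU 4 only and CPU 4 going down: fall back */
        differs = calc_node_mask(0x10UL, 1, 4, true, &mask);
        printf("differs=%d mask=%#lx\n", differs, mask);
        return 0;
}
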
@@ -3732,11 +3792,12 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
  * @wq: the target workqueue
  * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
  *
- * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the
- * current attributes, a new pwq is created and made the first pwq which
- * will serve all new work items. Older pwqs are released as in-flight
- * work items finish. Note that a work item which repeatedly requeues
- * itself back-to-back will stay on its current pwq.
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node they were issued on. Older pwqs are released as in-flight
+ * work items finish. Note that a work item which repeatedly requeues
+ * itself back-to-back will stay on its current pwq.
  *
  * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on
  * failure.
@@ -3744,8 +3805,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 int apply_workqueue_attrs(struct workqueue_struct *wq,
                           const struct workqueue_attrs *attrs)
 {
-        struct workqueue_attrs *new_attrs;
-        struct pool_workqueue *pwq, *last_pwq = NULL;
+        struct workqueue_attrs *new_attrs, *tmp_attrs;
+        struct pool_workqueue **pwq_tbl, *dfl_pwq;
         int node, ret;
 
         /* only unbound workqueues can change attributes */
@@ -3756,40 +3817,191 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
         if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
                 return -EINVAL;
 
-        /* make a copy of @attrs and sanitize it */
+        pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
         new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-        if (!new_attrs)
+        tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+        if (!pwq_tbl || !new_attrs || !tmp_attrs)
                 goto enomem;
 
+        /* make a copy of @attrs and sanitize it */
         copy_workqueue_attrs(new_attrs, attrs);
         cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
 
+        /*
+         * We may create multiple pwqs with differing cpumasks. Make a
+         * copy of @new_attrs which will be modified and used to obtain
+         * pools.
+         */
+        copy_workqueue_attrs(tmp_attrs, new_attrs);
+
+        /*
+         * CPUs should stay stable across pwq creations and installations.
+         * Pin CPUs, determine the target cpumask for each node and create
+         * pwqs accordingly.
+         */
+        get_online_cpus();
+
         mutex_lock(&wq_pool_mutex);
-        pwq = alloc_unbound_pwq(wq, new_attrs);
+
+        /*
+         * If something goes wrong during CPU up/down, we'll fall back to
+         * the default pwq covering the whole @attrs->cpumask. Always
+         * create it even if we don't use it immediately.
+         */
+        dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+        if (!dfl_pwq)
+                goto enomem_pwq;
+
+        for_each_node(node) {
+                if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+                        pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+                        if (!pwq_tbl[node])
+                                goto enomem_pwq;
+                } else {
+                        dfl_pwq->refcnt++;
+                        pwq_tbl[node] = dfl_pwq;
+                }
+        }
+
         mutex_unlock(&wq_pool_mutex);
-        if (!pwq)
-                goto enomem;
 
+        /* all pwqs have been created successfully, let's install 'em */
         mutex_lock(&wq->mutex);
 
         copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+
+        /* save the previous pwq and install the new one */
         for_each_node(node)
-                last_pwq = numa_pwq_tbl_install(wq, node, pwq);
+                pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+
+        /* @dfl_pwq might not have been used, ensure it's linked */
+        link_pwq(dfl_pwq);
+        swap(wq->dfl_pwq, dfl_pwq);
 
         mutex_unlock(&wq->mutex);
 
-        put_pwq_unlocked(last_pwq);
+        /* put the old pwqs */
+        for_each_node(node)
+                put_pwq_unlocked(pwq_tbl[node]);
+        put_pwq_unlocked(dfl_pwq);
+
+        put_online_cpus();
         ret = 0;
         /* fall through */
 out_free:
+        free_workqueue_attrs(tmp_attrs);
         free_workqueue_attrs(new_attrs);
+        kfree(pwq_tbl);
         return ret;
 
+enomem_pwq:
+        free_unbound_pwq(dfl_pwq);
+        for_each_node(node)
+                if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
+                        free_unbound_pwq(pwq_tbl[node]);
+        mutex_unlock(&wq_pool_mutex);
+        put_online_cpus();
 enomem:
         ret = -ENOMEM;
         goto out_free;
 }
 
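
For reference, a minimal usage sketch of this interface: an unbound workqueue restricted to a hand-picked cpumask. The workqueue name, nice value and CPU numbers below are arbitrary illustrations, not something the patch itself adds.

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

/* illustrative only: name, nice level and CPU list are made up */
static int example_restrict_unbound_wq(void)
{
        struct workqueue_struct *wq;
        struct workqueue_attrs *attrs;
        int ret;

        wq = alloc_workqueue("example_unbound", WQ_UNBOUND, 0);
        if (!wq)
                return -ENOMEM;

        attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!attrs) {
                destroy_workqueue(wq);
                return -ENOMEM;
        }

        attrs->nice = -5;                       /* arbitrary nice level */
        cpumask_clear(attrs->cpumask);
        cpumask_set_cpu(0, attrs->cpumask);     /* restrict to CPUs 0 and 1 */
        cpumask_set_cpu(1, attrs->cpumask);

        /* per-node pwqs get created for nodes with possible CPUs in the mask */
        ret = apply_workqueue_attrs(wq, attrs);

        free_workqueue_attrs(attrs);
        if (ret)
                destroy_workqueue(wq);
        return ret;
}
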
+/**
+ * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * @wq: the target workqueue
+ * @cpu: the CPU coming up or going down
+ * @online: %true if @cpu is coming up, %false if it is going down
+ *
+ * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
+ * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * @wq accordingly.
+ *
+ * If NUMA affinity can't be adjusted due to memory allocation failure, it
+ * falls back to @wq->dfl_pwq which may not be optimal but is always
+ * correct.
+ *
+ * Note that when the last allowed CPU of a NUMA node goes offline for a
+ * workqueue with a cpumask spanning multiple nodes, the workers which were
+ * already executing the work items for the workqueue will lose their CPU
+ * affinity and may execute on any CPU. This is similar to how per-cpu
+ * workqueues behave on CPU_DOWN. If a workqueue user wants strict
+ * affinity, it's the user's responsibility to flush the work item from
+ * CPU_DOWN_PREPARE.
+ */
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
+                                   bool online)
+{
+        int node = cpu_to_node(cpu);
+        int cpu_off = online ? -1 : cpu;
+        struct pool_workqueue *old_pwq = NULL, *pwq;
+        struct workqueue_attrs *target_attrs;
+        cpumask_t *cpumask;
+
+        lockdep_assert_held(&wq_pool_mutex);
+
+        if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+                return;
+
+        /*
+         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
+         * Let's use a preallocated one. The following buf is protected by
+         * CPU hotplug exclusion.
+         */
+        target_attrs = wq_update_unbound_numa_attrs_buf;
+        cpumask = target_attrs->cpumask;
+
+        mutex_lock(&wq->mutex);
+
+        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
+        pwq = unbound_pwq_by_node(wq, node);
+
+        /*
+         * Let's determine what needs to be done. If the target cpumask is
+         * different from wq's, we need to compare it to @pwq's and create
+         * a new one if they don't match. If the target cpumask equals
+         * wq's, the default pwq should be used. If @pwq is already the
+         * default one, nothing to do; otherwise, install the default one.
+         */
+        if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+                if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+                        goto out_unlock;
+        } else {
+                if (pwq == wq->dfl_pwq)
+                        goto out_unlock;
+                else
+                        goto use_dfl_pwq;
+        }
+
+        mutex_unlock(&wq->mutex);
+
+        /* create a new pwq */
+        pwq = alloc_unbound_pwq(wq, target_attrs);
+        if (!pwq) {
+                pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+                           wq->name);
+                mutex_lock(&wq->mutex);
+                goto use_dfl_pwq;
+        }
+
+        /*
+         * Install the new pwq. As this function is called only from CPU
+         * hotplug callbacks and applying a new attrs is wrapped with
+         * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
+         * in between.
+         */
+        mutex_lock(&wq->mutex);
+        old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+        goto out_unlock;
+
+use_dfl_pwq:
+        spin_lock_irq(&wq->dfl_pwq->pool->lock);
+        get_pwq(wq->dfl_pwq);
+        spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+        old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+out_unlock:
+        mutex_unlock(&wq->mutex);
+        put_pwq_unlocked(old_pwq);
+}
+
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
         bool highpri = wq->flags & WQ_HIGHPRI;
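
The branch structure in wq_update_unbound_numa() reduces to a small decision table. A toy model of just that decision follows (plain C; the scenarios are made up for illustration and are not part of the patch):

/* Toy truth-table for the per-node pwq update decision, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

enum action { KEEP_CURRENT, CREATE_NODE_PWQ, INSTALL_DFL_PWQ };

static const char * const action_name[] = {
        "keep current pwq", "create node pwq", "install dfl_pwq"
};

/*
 * @differs:     wq_calc_node_cpumask() returned true (node mask != wq mask)
 * @matches_pwq: the calculated mask equals the installed pwq's pool mask
 * @cur_is_dfl:  the installed pwq for this node is wq->dfl_pwq
 */
static enum action decide(bool differs, bool matches_pwq, bool cur_is_dfl)
{
        if (differs)
                return matches_pwq ? KEEP_CURRENT : CREATE_NODE_PWQ;
        return cur_is_dfl ? KEEP_CURRENT : INSTALL_DFL_PWQ;
}

int main(void)
{
        /* a CPU of the node comes online while the node used dfl_pwq */
        printf("%s\n", action_name[decide(true, false, true)]);
        /* the node's last wanted CPU goes down, a node pwq is installed */
        printf("%s\n", action_name[decide(false, false, false)]);
        /* nothing changed: calculated mask matches the installed node pwq */
        printf("%s\n", action_name[decide(true, true, false)]);
        return 0;
}
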
@@ -3942,6 +4154,7 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 void destroy_workqueue(struct workqueue_struct *wq)
 {
         struct pool_workqueue *pwq;
+        int node;
 
         /* drain it before proceeding with destruction */
         drain_workqueue(wq);
@@ -3993,11 +4206,21 @@ void destroy_workqueue(struct workqueue_struct *wq)
         } else {
                 /*
                  * We're the sole accessor of @wq at this point. Directly
-                 * access the first pwq and put the base ref. @wq will be
-                 * freed when the last pwq is released.
+                 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
+                 * @wq will be freed when the last pwq is released.
                  */
-                pwq = list_first_entry(&wq->pwqs, struct pool_workqueue,
-                                       pwqs_node);
+                for_each_node(node) {
+                        pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+                        RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
+                        put_pwq_unlocked(pwq);
+                }
+
+                /*
+                 * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
+                 * put. Don't access it afterwards.
+                 */
+                pwq = wq->dfl_pwq;
+                wq->dfl_pwq = NULL;
                 put_pwq_unlocked(pwq);
         }
 }
@@ -4285,6 +4508,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 {
         int cpu = (unsigned long)hcpu;
         struct worker_pool *pool;
+        struct workqueue_struct *wq;
         int pi;
 
         switch (action & ~CPU_TASKS_FROZEN) {
@@ -4317,6 +4541,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                         mutex_unlock(&pool->manager_mutex);
                 }
 
+                /* update NUMA affinity of unbound workqueues */
+                list_for_each_entry(wq, &workqueues, list)
+                        wq_update_unbound_numa(wq, cpu, true);
+
                 mutex_unlock(&wq_pool_mutex);
                 break;
         }
@@ -4333,12 +4561,21 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
         int cpu = (unsigned long)hcpu;
         struct work_struct unbind_work;
+        struct workqueue_struct *wq;
 
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_DOWN_PREPARE:
-                /* unbinding should happen on the local CPU */
+                /* unbinding per-cpu workers should happen on the local CPU */
                 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
                 queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+                /* update NUMA affinity of unbound workqueues */
+                mutex_lock(&wq_pool_mutex);
+                list_for_each_entry(wq, &workqueues, list)
+                        wq_update_unbound_numa(wq, cpu, false);
+                mutex_unlock(&wq_pool_mutex);
+
+                /* wait for per-cpu unbinding to finish */
                 flush_work(&unbind_work);
                 break;
         }
@@ -4526,6 +4763,9 @@ static void __init wq_numa_init(void)
         if (num_possible_nodes() <= 1)
                 return;
 
+        wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+        BUG_ON(!wq_update_unbound_numa_attrs_buf);
+
         /*
          * We want masks of possible CPUs of each node which isn't readily
          * available. Build one from cpu_to_node() which should have been