@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively)
+		 * simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *t1 = p1;
+	struct task_struct *t2 = p2;
+	return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
  */
@@ -708,8 +739,15 @@ done:
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
-	int cpus_changed, is_load_balanced;
+	int retval, i;
+	int is_load_balanced;
+	struct cgroup_iter it;
+	struct cgroup *cgrp = cs->css.cgroup;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap heap;
+	struct timespec latest_time = { 0, 0 };
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 
-	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	/* Nothing to do if the cpus didn't change */
+	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+		return 0;
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_changed && is_load_balanced)
+ again:
+	/*
+	 * Scan tasks in the cpuset, and update the cpumasks of any
+	 * that need an update. Since we can't call set_cpus_allowed()
+	 * while holding tasklist_lock, gather tasks to be processed
+	 * in a heap structure. If the statically-sized heap fills up,
+	 * overflow tasks that started later, and in future iterations
+	 * only consider tasks that started after the latest task in
+	 * the previous pass. This guarantees forward progress and
+	 * that we don't miss any tasks
+	 */
+	heap.size = 0;
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		/* Only affect tasks that don't have the right cpus_allowed */
+		if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(&heap, p);
+		if (dropped == NULL) {
+			get_task_struct(p);
+		} else if (dropped != p) {
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+	}
+	cgroup_iter_end(cgrp, &it);
+	if (heap.size) {
+		for (i = 0; i < heap.size; i++) {
+			struct task_struct *p = heap.ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			set_cpus_allowed(p, cs->cpus_allowed);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't notice the new cpumask
+		 * restriction. Not the most efficient way to do it,
+		 * but it avoids having to take callback_mutex in the
+		 * fork path
+		 */
+		goto again;
+	}
+	heap_free(&heap);
+	if (is_load_balanced)
 		rebuild_sched_domains();
 
 	return 0;
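
For reference, the gather/process/rescan pattern used in update_cpumask() above can be illustrated outside the kernel. The sketch below is a minimal, standalone userspace approximation, not part of the patch: a small fixed array plays the role of the kernel's ptr_heap, a fake_task struct stands in for task_struct, and clearing a needs_update flag stands in for set_cpus_allowed(); every identifier in it (fake_task, BUF_MAX, scan_and_update, ...) is illustrative only. It shows why keeping only the earliest-started stale tasks per pass, and restricting each following pass to tasks that started after the latest task already handled, eventually updates every task in a bounded number of passes.

/*
 * Minimal userspace sketch (not part of the patch) of the bounded-buffer
 * rescan pattern that update_cpumask() uses above.  A fixed array plays
 * the role of the kernel's ptr_heap: it keeps only the earliest-started
 * tasks that still need updating; later starters are picked up on a
 * subsequent pass.  All identifiers here are illustrative.
 */
#include <stdio.h>

#define NR_TASKS	10
#define BUF_MAX		3	/* deliberately tiny to force several passes */

struct fake_task {
	long start_time;	/* stands in for task_struct.start_time */
	int needs_update;	/* stands in for a stale cpus_allowed mask */
};

/* Same ordering rule as started_after_time(): later start wins, ties by address */
static int started_after(struct fake_task *t, long time, struct fake_task *ref)
{
	if (t->start_time != time)
		return t->start_time > time;
	return t > ref;
}

static void scan_and_update(struct fake_task *tasks, int n)
{
	struct fake_task *buf[BUF_MAX];
	struct fake_task *latest_task = NULL;
	long latest_time = 0;
	int passes = 0;

	for (;;) {
		int count = 0, i;

		/* Gather pass: keep the BUF_MAX earliest-started stale tasks */
		for (i = 0; i < n; i++) {
			struct fake_task *p = &tasks[i];
			int j, latest_idx;

			if (!p->needs_update)
				continue;
			/* Skip tasks already handled in an earlier pass */
			if (!started_after(p, latest_time, latest_task))
				continue;
			if (count < BUF_MAX) {
				buf[count++] = p;
				continue;
			}
			/* Buffer full: evict the latest starter if p started earlier */
			latest_idx = 0;
			for (j = 1; j < BUF_MAX; j++)
				if (started_after(buf[j], buf[latest_idx]->start_time,
						  buf[latest_idx]))
					latest_idx = j;
			if (started_after(buf[latest_idx], p->start_time, p))
				buf[latest_idx] = p;
		}
		if (!count)
			break;			/* every task is up to date */

		/* Process pass: note the latest task handled, then "update" them */
		passes++;
		for (i = 0; i < count; i++) {
			if (started_after(buf[i], latest_time, latest_task)) {
				latest_time = buf[i]->start_time;
				latest_task = buf[i];
			}
			buf[i]->needs_update = 0;	/* the set_cpus_allowed() step */
		}
	}
	printf("all %d tasks updated in %d bounded passes\n", n, passes);
}

int main(void)
{
	struct fake_task tasks[NR_TASKS];
	int i;

	for (i = 0; i < NR_TASKS; i++) {
		tasks[i].start_time = 100 + i;
		tasks[i].needs_update = 1;
	}
	scan_and_update(tasks, NR_TASKS);
	return 0;
}

Bounding the per-pass working set is what lets the real code drop tasklist_lock before calling set_cpus_allowed(); the cost is rescanning the cgroup once per filled buffer, which the patch accepts to keep the fork path free of callback_mutex.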