
[PATCH] Dynamic sched domains: cpuset changes

Adds the core update_cpu_domains code and updates the cpusets documentation

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Dinakar Guniguntala, 20 years ago
commit 85d7b94981
2 files changed, 92 insertions, 13 deletions
  1. Documentation/cpusets.txt  + 16 - 0
  2. kernel/cpuset.c  + 76 - 13

+ 16 - 0
Documentation/cpusets.txt

@@ -51,6 +51,14 @@ mems_allowed vector.
 
 If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct
 ancestor or descendent, may share any of the same CPUs or Memory Nodes.
+A cpuset that is cpu exclusive has a sched domain associated with it.
+The sched domain consists of all cpus in the current cpuset that are not
+part of any exclusive child cpusets.
+This ensures that the scheduler load balancing code only balances
+against the cpus that are in the sched domain as defined above, and not
+all of the cpus in the system. This removes any overhead due to the
+load balancing code trying to pull tasks outside of the cpu exclusive
+cpuset only to be prevented by the tasks' cpus_allowed mask.
 
 User level code may create and destroy cpusets by name in the cpuset
 virtual file system, manage the attributes and permissions of these
@@ -84,6 +92,9 @@ This can be especially valuable on:
       and a database), or
     * NUMA systems running large HPC applications with demanding
       performance characteristics.
+    * Also cpu_exclusive cpusets are useful for servers running orthogonal
+      workloads such as RT applications requiring low latency and HPC
+      applications that are throughput sensitive.
 
 These subsets, or "soft partitions" must be able to be dynamically
 adjusted, as the job mix changes, without impacting other concurrently
@@ -125,6 +136,8 @@ Cpusets extends these two mechanisms as follows:
  - A cpuset may be marked exclusive, which ensures that no other
    cpuset (except direct ancestors and descendents) may contain
    any overlapping CPUs or Memory Nodes.
+   Also, a cpu_exclusive cpuset is associated with a sched
+   domain.
  - You can list all the tasks (by pid) attached to any cpuset.
 
 The implementation of cpusets requires a few, simple hooks
@@ -136,6 +149,9 @@ into the rest of the kernel, none in performance critical paths:
    allowed in that tasks cpuset.
  - in sched.c migrate_all_tasks(), to keep migrating tasks within
    the CPUs allowed by their cpuset, if possible.
+ - in sched.c, a new API, partition_sched_domains, for handling
+   sched domain changes associated with cpu_exclusive cpusets,
+   and related changes in both sched.c and arch/ia64/kernel/domain.c
 - in the mbind and set_mempolicy system calls, to mask the requested
    Memory Nodes by what's allowed in that tasks cpuset.
  - in page_alloc, to restrict memory to allowed nodes.
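
As a usage sketch of the behavior documented above (not part of the patch):
the program below carves cpus 2-3 into a cpu_exclusive cpuset through the
cpuset virtual file system, assuming it is mounted at /dev/cpuset; the cpuset
name "rt" and the CPU numbers are made up for illustration.

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <unistd.h>

	/* Write a short string to a cpuset control file. */
	static int write_file(const char *path, const char *buf)
	{
		int fd = open(path, O_WRONLY);
		int ok;

		if (fd < 0) {
			perror(path);
			return -1;
		}
		ok = write(fd, buf, strlen(buf)) == (ssize_t)strlen(buf);
		close(fd);
		return ok ? 0 : -1;
	}

	int main(void)
	{
		/* Create a child cpuset named "rt" (hypothetical name). */
		if (mkdir("/dev/cpuset/rt", 0755) && errno != EEXIST) {
			perror("mkdir");
			return 1;
		}
		/* Give it cpus 2 and 3... */
		write_file("/dev/cpuset/rt/cpus", "2-3");
		/*
		 * ...and mark it cpu_exclusive: per the text above, the
		 * kernel now builds a sched domain over cpus 2-3, and load
		 * balancing in the parent no longer considers them.
		 */
		write_file("/dev/cpuset/rt/cpu_exclusive", "1");
		return 0;
	}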

+ 76 - 13
kernel/cpuset.c

@@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	return 0;
 }
 
+/*
+ * For a given cpuset cur, partition the system as follows
+ * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
+ *    exclusive child cpusets
+ * b. All cpus in the current cpuset's cpus_allowed that are not part of any
+ *    exclusive child cpusets
+ * Build these two partitions by calling partition_sched_domains
+ *
+ * Call with cpuset_sem held.  May nest a call to the
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ */
+static void update_cpu_domains(struct cpuset *cur)
+{
+	struct cpuset *c, *par = cur->parent;
+	cpumask_t pspan, cspan;
+
+	if (par == NULL || cpus_empty(cur->cpus_allowed))
+		return;
+
+	/*
+	 * Get all cpus from parent's cpus_allowed not part of exclusive
+	 * children
+	 */
+	pspan = par->cpus_allowed;
+	list_for_each_entry(c, &par->children, sibling) {
+		if (is_cpu_exclusive(c))
+			cpus_andnot(pspan, pspan, c->cpus_allowed);
+	}
+	if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+		cpus_or(pspan, pspan, cur->cpus_allowed);
+		if (cpus_equal(pspan, cur->cpus_allowed))
+			return;
+		cspan = CPU_MASK_NONE;
+	} else {
+		if (cpus_empty(pspan))
+			return;
+		cspan = cur->cpus_allowed;
+		/*
+		 * Get all cpus from current cpuset's cpus_allowed not part
+		 * of exclusive children
+		 */
+		list_for_each_entry(c, &cur->children, sibling) {
+			if (is_cpu_exclusive(c))
+				cpus_andnot(cspan, cspan, c->cpus_allowed);
+		}
+	}
+
+	lock_cpu_hotplug();
+	partition_sched_domains(&pspan, &cspan);
+	unlock_cpu_hotplug();
+}
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
+	int retval, cpus_unchanged;
 
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
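
To make the pspan/cspan computation in update_cpu_domains() above concrete,
here is a minimal userspace analogue (not kernel code): plain unsigned long
bitmasks stand in for cpumask_t, bit N meaning cpu N, and the cpuset
hierarchy is reduced to three hard-coded masks.

	#include <stdio.h>

	int main(void)
	{
		/* Parent cpuset owns cpus 0-7; the current, cpu_exclusive
		 * cpuset owns cpus 4-7; one exclusive sibling owns cpus 0-1. */
		unsigned long parent_cpus  = 0xffUL;	/* cpus 0-7 */
		unsigned long cur_cpus     = 0xf0UL;	/* cpus 4-7 */
		unsigned long excl_sibling = 0x03UL;	/* cpus 0-1 */

		/*
		 * pspan: the parent's cpus minus every exclusive child --
		 * the analogue of the cpus_andnot() loop over par->children.
		 */
		unsigned long pspan = parent_cpus & ~(cur_cpus | excl_sibling);

		/* cspan: the current cpuset's cpus minus its own exclusive
		 * children (none in this example). */
		unsigned long cspan = cur_cpus;

		/*
		 * partition_sched_domains(&pspan, &cspan) would then rebuild
		 * one sched domain per non-empty span: here cpus 2-3 stay
		 * with the parent and cpus 4-7 get their own domain.
		 */
		printf("pspan=%#lx cspan=%#lx\n", pspan, cspan); /* 0xc 0xf0 */
		return 0;
	}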
@@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (cpus_empty(trialcs.cpus_allowed))
 		return -ENOSPC;
 	retval = validate_change(cs, &trialcs);
-	if (retval == 0)
-		cs->cpus_allowed = trialcs.cpus_allowed;
-	return retval;
+	if (retval < 0)
+		return retval;
+	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	cs->cpus_allowed = trialcs.cpus_allowed;
+	if (is_cpu_exclusive(cs) && !cpus_unchanged)
+		update_cpu_domains(cs);
+	return 0;
 }
 
 static int update_nodemask(struct cpuset *cs, char *buf)
@@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 {
 	int turning_on;
 	struct cpuset trialcs;
-	int err;
+	int err, cpu_exclusive_changed;
 
 	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
 
@@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		clear_bit(bit, &trialcs.flags);
 
 	err = validate_change(cs, &trialcs);
-	if (err == 0) {
-		if (turning_on)
-			set_bit(bit, &cs->flags);
-		else
-			clear_bit(bit, &cs->flags);
-	}
-	return err;
+	if (err < 0)
+		return err;
+	cpu_exclusive_changed =
+		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+	if (turning_on)
+		set_bit(bit, &cs->flags);
+	else
+		clear_bit(bit, &cs->flags);
+
+	if (cpu_exclusive_changed)
+		update_cpu_domains(cs);
+	return 0;
 }
 
 static int attach_task(struct cpuset *cs, char *buf)
@@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		up(&cpuset_sem);
 		return -EBUSY;
 	}
-	spin_lock(&cs->dentry->d_lock);
 	parent = cs->parent;
 	set_bit(CS_REMOVED, &cs->flags);
+	if (is_cpu_exclusive(cs))
+		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	if (list_empty(&parent->children))
 		check_for_release(parent);
+	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
 	spin_unlock(&d->d_lock);
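
The reordering in cpuset_rmdir() above appears deliberate (my reading; the
changelog does not spell it out): update_cpu_domains() may take
lock_cpu_hotplug(), which can sleep, so it must run before the
cs->dentry->d_lock spinlock is taken, not while holding it as the old
placement of spin_lock() would have forced. Condensed, the resulting
ordering is (kernel-style sketch, not a standalone program):

	/* cpuset_rmdir(), condensed: anything that may sleep
	 * (update_cpu_domains() -> lock_cpu_hotplug()) happens first... */
	set_bit(CS_REMOVED, &cs->flags);
	if (is_cpu_exclusive(cs))
		update_cpu_domains(cs);
	list_del(&cs->sibling);
	/* ...and only then is the d_lock spinlock taken, covering the
	 * short, non-sleeping dentry teardown. */
	spin_lock(&cs->dentry->d_lock);
	d = dget(cs->dentry);
	cs->dentry = NULL;
	spin_unlock(&d->d_lock);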