|
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
|
|
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
|
|
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
|
|
* hold the hotplug lock.
|
|
* hold the hotplug lock.
|
|
*/
|
|
*/
|
|
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
|
|
|
|
|
|
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
|
|
{
|
|
{
|
|
runqueue_t *rq = cpu_rq(cpu);
|
|
runqueue_t *rq = cpu_rq(cpu);
|
|
struct sched_domain *tmp;
|
|
struct sched_domain *tmp;
|
|
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
|
|
}
|
|
}
|
|
|
|
|
|
/* cpus with isolated domains */
|
|
/* cpus with isolated domains */
|
|
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
|
|
|
|
|
|
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
|
|
|
|
|
|
/* Setup the mask of cpus configured for isolated domains */
|
|
/* Setup the mask of cpus configured for isolated domains */
|
|
static int __init isolated_cpu_setup(char *str)
|
|
static int __init isolated_cpu_setup(char *str)
|
|
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
|
|
* covered by the given span, and will set each group's ->cpumask correctly,
|
|
* covered by the given span, and will set each group's ->cpumask correctly,
|
|
* and ->cpu_power to 0.
|
|
* and ->cpu_power to 0.
|
|
*/
|
|
*/
|
|
-void init_sched_build_groups(struct sched_group groups[],
|
|
|
|
- cpumask_t span, int (*group_fn)(int cpu))
|
|
|
|
|
|
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
|
|
|
|
+ int (*group_fn)(int cpu))
|
|
{
|
|
{
|
|
struct sched_group *first = NULL, *last = NULL;
|
|
struct sched_group *first = NULL, *last = NULL;
|
|
cpumask_t covered = CPU_MASK_NONE;
|
|
cpumask_t covered = CPU_MASK_NONE;
|
|
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
|
|
last->next = first;
|
|
last->next = first;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+#define SD_NODES_PER_DOMAIN 16
|
|
|
|
|
|
-#ifdef ARCH_HAS_SCHED_DOMAIN
|
|
|
|
-extern void build_sched_domains(const cpumask_t *cpu_map);
|
|
|
|
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
|
|
|
|
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
|
|
|
|
-#else
|
|
|
|
|
|
+#ifdef CONFIG_NUMA
|
|
|
|
+/**
+ * find_next_best_node - pick the next node to add to a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: bitmap of nodes already in the sched_domain
+ *
+ * Scans every node number, starting at @node and wrapping around, and picks
+ * the one with the smallest node_distance() from @node among those that have
+ * cpus and are not yet set in @used_nodes.  On equal distance the node seen
+ * first in that scan order wins.
+ *
+ * The chosen node is marked in @used_nodes and returned.  When no node
+ * qualifies, node 0 is marked and returned.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int closest = 0;
+	int shortest = INT_MAX;
+	int offset;
+
+	for (offset = 0; offset < MAX_NUMNODES; offset++) {
+		/* Walk the node numbers beginning at @node */
+		int candidate = (node + offset) % MAX_NUMNODES;
+		int distance;
+
+		/* Ignore cpu-less nodes and nodes that were already picked */
+		if (!nr_cpus_node(candidate) || test_bit(candidate, used_nodes))
+			continue;
+
+		/* Keep the candidate only if it is strictly closer */
+		distance = node_distance(node, candidate);
+		if (distance >= shortest)
+			continue;
+
+		shortest = distance;
+		closest = candidate;
+	}
+
+	set_bit(closest, used_nodes);
+	return closest;
+}
|
|
|
|
+
|
|
|
|
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ *
+ * The returned mask is the union of the cpumask of @node and the cpumasks of
+ * the nodes returned by SD_NODES_PER_DOMAIN - 1 successive calls to
+ * find_next_best_node().
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	/* The domain always contains the cpus of @node itself */
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	/*
+	 * i starts at 1 because @node already takes one of the
+	 * SD_NODES_PER_DOMAIN slots.  Once no unused node with cpus is left,
+	 * find_next_best_node() keeps returning node 0.
+	 * NOTE(review): presumably OR-ing node 0 again adds nothing new to
+	 * the span - confirm.
+	 */
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
|
|
|
|
+ * can switch it on easily if needed.
|
|
|
|
+ */
|
|
#ifdef CONFIG_SCHED_SMT
|
|
#ifdef CONFIG_SCHED_SMT
|
|
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
|
|
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
|
|
static struct sched_group sched_group_cpus[NR_CPUS];
|
|
static struct sched_group sched_group_cpus[NR_CPUS];
|
|
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
|
|
}
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
#ifdef CONFIG_NUMA
|
|
-
|
|
|
|
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
|
|
|
|
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
|
|
|
|
-static int cpu_to_node_group(int cpu)
|
|
|
|
-{
|
|
|
|
- return cpu_to_node(cpu);
|
|
|
|
-}
|
|
|
|
-#endif
|
|
|
|
-
|
|
|
|
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
|
|
|
|
/*
|
|
/*
|
|
- * The domains setup code relies on siblings not spanning
|
|
|
|
- * multiple nodes. Make sure the architecture has a proper
|
|
|
|
- * siblings map:
|
|
|
|
|
|
+ * The init_sched_build_groups can't handle what we want to do with node
|
|
|
|
+ * groups, so roll our own. Now each node has its own list of groups which
|
|
|
|
+ * gets dynamically allocated.
|
|
*/
|
|
*/
|
|
-static void check_sibling_maps(void)
|
|
|
|
-{
|
|
|
|
- int i, j;
|
|
|
|
|
|
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
|
|
|
|
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
|
|
|
|
|
|
- for_each_online_cpu(i) {
|
|
|
|
- for_each_cpu_mask(j, cpu_sibling_map[i]) {
|
|
|
|
- if (cpu_to_node(i) != cpu_to_node(j)) {
|
|
|
|
- printk(KERN_INFO "warning: CPU %d siblings map "
|
|
|
|
- "to different node - isolating "
|
|
|
|
- "them.\n", i);
|
|
|
|
- cpu_sibling_map[i] = cpumask_of_cpu(i);
|
|
|
|
- break;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
|
|
|
|
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
|
|
|
|
+
|
|
|
|
+/*
+ * Index into sched_group_allnodes[] for @cpu: the group number is the node
+ * number that cpu_to_node() reports for @cpu.
+ */
+static int cpu_to_allnodes_group(int cpu)
+{
+	int node = cpu_to_node(cpu);
+
+	return node;
}
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
|
|
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void)
|
|
* Build sched domains for a given set of cpus and attach the sched domains
|
|
* Build sched domains for a given set of cpus and attach the sched domains
|
|
* to the individual cpus
|
|
* to the individual cpus
|
|
*/
|
|
*/
|
|
-static void build_sched_domains(const cpumask_t *cpu_map)
|
|
|
|
|
|
+void build_sched_domains(const cpumask_t *cpu_map)
|
|
{
|
|
{
|
|
int i;
|
|
int i;
|
|
|
|
|
|
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map)
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
#ifdef CONFIG_NUMA
|
|
|
|
+ if (num_online_cpus()
|
|
|
|
+ > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
|
|
|
|
+ sd = &per_cpu(allnodes_domains, i);
|
|
|
|
+ *sd = SD_ALLNODES_INIT;
|
|
|
|
+ sd->span = *cpu_map;
|
|
|
|
+ group = cpu_to_allnodes_group(i);
|
|
|
|
+ sd->groups = &sched_group_allnodes[group];
|
|
|
|
+ p = sd;
|
|
|
|
+ } else
|
|
|
|
+ p = NULL;
|
|
|
|
+
|
|
sd = &per_cpu(node_domains, i);
|
|
sd = &per_cpu(node_domains, i);
|
|
- group = cpu_to_node_group(i);
|
|
|
|
*sd = SD_NODE_INIT;
|
|
*sd = SD_NODE_INIT;
|
|
- sd->span = *cpu_map;
|
|
|
|
- sd->groups = &sched_group_nodes[group];
|
|
|
|
|
|
+ sd->span = sched_domain_node_span(cpu_to_node(i));
|
|
|
|
+ sd->parent = p;
|
|
|
|
+ cpus_and(sd->span, sd->span, *cpu_map);
|
|
#endif
|
|
#endif
|
|
|
|
|
|
p = sd;
|
|
p = sd;
|
|
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
|
|
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
#ifdef CONFIG_SCHED_SMT
|
|
/* Set up CPU (sibling) groups */
|
|
/* Set up CPU (sibling) groups */
|
|
- for_each_online_cpu(i) {
|
|
|
|
|
|
+ for_each_cpu_mask(i, *cpu_map) {
|
|
cpumask_t this_sibling_map = cpu_sibling_map[i];
|
|
cpumask_t this_sibling_map = cpu_sibling_map[i];
|
|
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
|
|
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
|
|
if (i != first_cpu(this_sibling_map))
|
|
if (i != first_cpu(this_sibling_map))
|
|
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map)
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
#ifdef CONFIG_NUMA
|
|
/* Set up node groups */
|
|
/* Set up node groups */
|
|
- init_sched_build_groups(sched_group_nodes, *cpu_map,
|
|
|
|
- &cpu_to_node_group);
|
|
|
|
|
|
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
|
|
|
|
+ &cpu_to_allnodes_group);
|
|
|
|
+
|
|
|
|
+	/*
+	 * For every node that has cpus in cpu_map, build a circular list of
+	 * dynamically allocated groups headed by sched_group_nodes[i].  The
+	 * first group holds the node's own cpus; each following group holds
+	 * the not yet covered cpus of one other node inside the node's
+	 * domain span.
+	 */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		/*
+		 * Point every cpu of this node at the head group before the
+		 * allocation result is checked, so that a failed allocation
+		 * also clears the flags of their node domains.
+		 */
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			/* Stop once every cpu of the domain span is covered */
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			/* Node n contributes nothing new: try the next one */
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				/* Report the node (n), not the loop offset (j) */
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", n);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		/* Close the ring, also after a partial build */
+		prev->next = sched_group_nodes[i];
+	}
|
|
#endif
|
|
#endif
|
|
|
|
|
|
/* Calculate CPU power for physical packages and nodes */
|
|
/* Calculate CPU power for physical packages and nodes */
|
|
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
|
|
sd->groups->cpu_power = power;
|
|
sd->groups->cpu_power = power;
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
#ifdef CONFIG_NUMA
|
|
- if (i == first_cpu(sd->groups->cpumask)) {
|
|
|
|
- /* Only add "power" once for each physical package. */
|
|
|
|
- sd = &per_cpu(node_domains, i);
|
|
|
|
- sd->groups->cpu_power += power;
|
|
|
|
|
|
+ sd = &per_cpu(allnodes_domains, i);
|
|
|
|
+ if (sd->groups) {
|
|
|
|
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
|
|
|
|
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
|
|
|
|
+ sd->groups->cpu_power = power;
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+#ifdef CONFIG_NUMA
+	/*
+	 * Accumulate cpu_power for every group on each node's circular group
+	 * list.
+	 */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *head = sched_group_nodes[i];
+		struct sched_group *grp = head;
+		int cpu;
+
+		if (head == NULL)
+			continue;
+
+		do {
+			for_each_cpu_mask(cpu, grp->cpumask) {
+				struct sched_domain *phys;
+				int power;
+
+				phys = &per_cpu(phys_domains, cpu);
+				/*
+				 * Only add "power" once for each physical
+				 * package: for the first cpu of its group.
+				 */
+				if (cpu == first_cpu(phys->groups->cpumask)) {
+					power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+						(cpus_weight(phys->groups->cpumask)-1) / 10;
+					grp->cpu_power += power;
+				}
+			}
+			grp = grp->next;
+		} while (grp != head);
+	}
+#endif
|
|
|
|
+
|
|
/* Attach the domains */
|
|
/* Attach the domains */
|
|
for_each_cpu_mask(i, *cpu_map) {
|
|
for_each_cpu_mask(i, *cpu_map) {
|
|
struct sched_domain *sd;
|
|
struct sched_domain *sd;
|
|
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
|
|
/*
|
|
/*
|
|
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
|
|
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
|
|
*/
|
|
*/
|
|
-static void arch_init_sched_domains(cpumask_t *cpu_map)
|
|
|
|
|
|
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
|
|
{
|
|
{
|
|
cpumask_t cpu_default_map;
|
|
cpumask_t cpu_default_map;
|
|
|
|
|
|
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
|
|
|
|
- check_sibling_maps();
|
|
|
|
-#endif
|
|
|
|
/*
|
|
/*
|
|
* Setup mask for cpus without special case scheduling requirements.
|
|
* Setup mask for cpus without special case scheduling requirements.
|
|
* For now this just excludes isolated cpus, but could be used to
|
|
* For now this just excludes isolated cpus, but could be used to
|
|
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
|
|
|
|
|
|
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
|
|
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
|
|
{
|
|
{
|
|
- /* Do nothing: everything is statically allocated. */
|
|
|
|
-}
|
|
|
|
|
|
+#ifdef CONFIG_NUMA
+	int node;
+	/*
+	 * Free the circular group list of every node that has cpus in
+	 * cpu_map and clear its slot in sched_group_nodes[].
+	 */
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		struct sched_group *head = sched_group_nodes[node];
+		struct sched_group *cur, *victim;
+		cpumask_t node_cpus = node_to_cpumask(node);

-#endif /* ARCH_HAS_SCHED_DOMAIN */
+		cpus_and(node_cpus, node_cpus, *cpu_map);
+		if (cpus_empty(node_cpus) || head == NULL)
+			continue;
+
+		/* Start behind the head so that the head is freed last */
+		cur = head->next;
+		do {
+			victim = cur;
+			cur = cur->next;
+			kfree(victim);
+		} while (victim != head);
+
+		sched_group_nodes[node] = NULL;
+	}
+#endif
+}
|
|
|
|
|
|
/*
|
|
/*
|
|
* Detach sched domains from a group of cpus specified in cpu_map
|
|
* Detach sched domains from a group of cpus specified in cpu_map
|