@@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	cpumask_t cpus = CPU_MASK_ALL;
 
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2638,7 +2644,7 @@ redo:
 	}
 
 	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return nr_moved;
 
@@ -2654,7 +2660,7 @@ out_one_pinned:
 	sd->balance_interval *= 2;
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return 0;
 }
@@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int sd_idle = 0;
 	cpumask_t cpus = CPU_MASK_ALL;
 
-	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
+	if (sd->flags & SD_SHARE_CPUPOWER &&
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2717,7 +2730,8 @@ redo:
 
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
 	} else
 		sd->nr_balance_failed = 0;
@@ -2727,7 +2741,7 @@ redo:
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	sd->nr_balance_failed = 0;
 
@@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
 	if (sd->flags & (SD_LOAD_BALANCE |
 			 SD_BALANCE_NEWIDLE |
 			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC)) {
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 		pflags &= ~(SD_LOAD_BALANCE |
 			    SD_BALANCE_NEWIDLE |
 			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC);
+			    SD_BALANCE_EXEC |
+			    SD_SHARE_CPUPOWER |
+			    SD_SHARE_PKG_RESOURCES);
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -6240,6 +6258,58 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 }
 #endif
 
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *group;
+
+	WARN_ON(!sd || !sd->groups);
+
+	if (cpu != first_cpu(sd->groups->cpumask))
+		return;
+
+	child = sd->child;
+
+	/*
+	 * For perf policy, if the groups in child domain share resources
+	 * (for example cores sharing some portions of the cache hierarchy
+	 * or SMT), then set this domain groups cpu_power such that each group
+	 * can handle only one task, when there are other idle groups in the
+	 * same sched domain.
+	 */
+	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+		       (child->flags &
+			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		return;
+	}
+
+	sd->groups->cpu_power = 0;
+
+	/*
+	 * add cpu_power of each child group to this groups cpu_power
+	 */
+	group = child->groups;
+	do {
+		sd->groups->cpu_power += group->cpu_power;
+		group = group->next;
+	} while (group != child->groups);
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -6247,6 +6317,7 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
 		sd = &per_cpu(cpu_domains, i);
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-					    * SCHED_LOAD_SCALE / 10;
-		sd->groups->cpu_power = power;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-		sd = &per_cpu(phys_domains, i);
-		if (i != first_cpu(sd->groups->cpumask))
-			continue;
-
-		sd->groups->cpu_power = 0;
-		if (sched_mc_power_savings || sched_smt_power_savings) {
-			int j;
-
-			for_each_cpu_mask(j, sd->groups->cpumask) {
-				struct sched_domain *sd1;
-				sd1 = &per_cpu(core_domains, j);
-				/*
-				 * for each core we will add once
-				 * to the group in physical domain
-				 */
-				if (j != first_cpu(sd1->groups->cpumask))
-					continue;
-
-				if (sched_smt_power_savings)
-					sd->groups->cpu_power += sd1->groups->cpu_power;
-				else
-					sd->groups->cpu_power += SCHED_LOAD_SCALE;
-			}
-		} else
-			/*
-			 * This has to be < 2 * SCHED_LOAD_SCALE
-			 * Lets keep it SCHED_LOAD_SCALE, so that
-			 * while calculating NUMA group's cpu_power
-			 * we can simply do
-			 *  numa_group->cpu_power += phys_group->cpu_power;
-			 *
-			 * See "only add power once for each physical pkg"
-			 * comment below
-			 */
-			sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-		int power;
 		sd = &per_cpu(phys_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
+		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA