
Merge branch 'sched/balancing' into sched/core

Ingo Molnar 16 years ago
commit 66fef08f7d
1 changed file with 515 additions and 250 deletions
      kernel/sched.c


@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 
 	return 0;
 }
+/********** Helpers for find_busiest_group ************************/
+/**
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * 		during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest; /* Busiest group in this sd */
+	struct sched_group *this;  /* Local group in this sd */
+	unsigned long total_load;  /* Total load of all groups in sd */
+	unsigned long total_pwr;   /* Total power of all groups in sd */
+	unsigned long avg_load;	   /* Average load across all groups in sd */
+
+	/** Statistics of this group */
+	unsigned long this_load;
+	unsigned long this_load_per_task;
+	unsigned long this_nr_running;
+
+	/* Statistics of the busiest group */
+	unsigned long max_load;
+	unsigned long busiest_load_per_task;
+	unsigned long busiest_nr_running;
+
+	int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance; /* Is powersave balance needed for this sd */
+	struct sched_group *group_min; /* Least loaded group in sd */
+	struct sched_group *group_leader; /* Group which relieves group_min */
+	unsigned long min_load_per_task; /* load_per_task in group_min */
+	unsigned long leader_nr_running; /* Nr running of group_leader */
+	unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
 
 
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+	unsigned long avg_load; /* Avg load across the CPUs of the group */
+	unsigned long group_load; /* Total load over the CPUs of the group */
+	unsigned long sum_nr_running; /* Nr tasks running in the group */
+	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+	unsigned long group_capacity;
+	int group_imb; /* Is there an imbalance in the group ? */
+};
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
  */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+static inline unsigned int group_first_cpu(struct sched_group *group)
 {
-	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-	unsigned long max_pull;
-	unsigned long busiest_load_per_task, busiest_nr_running;
-	unsigned long this_load_per_task, this_nr_running;
-	int load_idx, group_imb = 0;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance = 1;
-	unsigned long leader_nr_running = 0, min_load_per_task = 0;
-	unsigned long min_nr_running = ULONG_MAX;
-	struct sched_group *group_min = NULL, *group_leader = NULL;
-#endif
+	return cpumask_first(sched_group_cpus(group));
+}
 
 
-	max_load = this_load = total_load = total_pwr = 0;
-	busiest_load_per_task = busiest_nr_running = 0;
-	this_load_per_task = this_nr_running = 0;
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+					enum cpu_idle_type idle)
+{
+	int load_idx;
 
 
-	if (idle == CPU_NOT_IDLE)
+	switch (idle) {
+	case CPU_NOT_IDLE:
 		load_idx = sd->busy_idx;
-	else if (idle == CPU_NEWLY_IDLE)
+		break;
+
+	case CPU_NEWLY_IDLE:
 		load_idx = sd->newidle_idx;
-	else
+		break;
+	default:
 		load_idx = sd->idle_idx;
+		break;
+	}
 
 
-	do {
-		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
-		int local_group;
-		int i;
-		int __group_imb = 0;
-		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_nr_running, sum_weighted_load;
-		unsigned long sum_avg_load_per_task;
-		unsigned long avg_load_per_task;
+	return load_idx;
+}
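
The returned index selects one of the decayed load averages kept in each runqueue's cpu_load[] array. A rough sketch of how the index is consumed, simplified from sched.c's existing source_load() helper (the _sketch name is illustrative and the feature-flag handling is omitted):

	/*
	 * Simplified sketch of source_load(): a larger load_idx selects a
	 * more slowly decaying, more conservative load figure; index 0 is
	 * the instantaneous weighted load.
	 */
	static unsigned long source_load_sketch(int cpu, int type)
	{
		struct rq *rq = cpu_rq(cpu);
		unsigned long total = weighted_cpuload(cpu);

		if (type == 0)
			return total;

		/* When pulling from a cpu, underestimate it so we don't over-pull. */
		return min(rq->cpu_load[type - 1], total);
	}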
 
 
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
 
 
-		if (local_group)
-			balance_cpu = cpumask_first(sched_group_cpus(group));
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	/*
+	 * Busy processors will not participate in power savings
+	 * balance.
+	 */
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		sds->power_savings_balance = 0;
+	else {
+		sds->power_savings_balance = 1;
+		sds->min_nr_running = ULONG_MAX;
+		sds->leader_nr_running = 0;
+	}
+}
 
 
-		/* Tally up the load of all CPUs in the group */
-		sum_weighted_load = sum_nr_running = avg_load = 0;
-		sum_avg_load_per_task = avg_load_per_task = 0;
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ * 		load balancing?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
 
 
-		max_cpu_load = 0;
-		min_cpu_load = ~0UL;
+	if (!sds->power_savings_balance)
+		return;
 
 
-		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-			struct rq *rq = cpu_rq(i);
+	/*
+	 * If the local group is idle or completely loaded
+	 * no need to do power savings balance at this domain
+	 */
+	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+				!sds->this_nr_running))
+		sds->power_savings_balance = 0;
 
 
-			if (*sd_idle && rq->nr_running)
-				*sd_idle = 0;
+	/*
+	 * If a group is already running at full capacity or idle,
+	 * don't include that group in power savings calculations
+	 */
+	if (!sds->power_savings_balance ||
+		sgs->sum_nr_running >= sgs->group_capacity ||
+		!sgs->sum_nr_running)
+		return;
 
 
-			/* Bias balancing toward cpus of our domain */
-			if (local_group) {
-				if (idle_cpu(i) && !first_idle_cpu) {
-					first_idle_cpu = 1;
-					balance_cpu = i;
-				}
+	/*
+	 * Calculate the group which has the least non-idle load.
+	 * This is the group from where we need to pick up the load
+	 * for saving power
+	 */
+	if ((sgs->sum_nr_running < sds->min_nr_running) ||
+	    (sgs->sum_nr_running == sds->min_nr_running &&
+	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+		sds->group_min = group;
+		sds->min_nr_running = sgs->sum_nr_running;
+		sds->min_load_per_task = sgs->sum_weighted_load /
+						sgs->sum_nr_running;
+	}
 
 
-				load = target_load(i, load_idx);
-			} else {
-				load = source_load(i, load_idx);
-				if (load > max_cpu_load)
-					max_cpu_load = load;
-				if (min_cpu_load > load)
-					min_cpu_load = load;
-			}
+	/*
+	 * Calculate the group which is almost near its
+	 * capacity but still has some space to pick up some load
+	 * from other group and save more power
+	 */
+	if (sgs->sum_nr_running > sgs->group_capacity - 1)
+		return;
 
 
-			avg_load += load;
-			sum_nr_running += rq->nr_running;
-			sum_weighted_load += weighted_cpuload(i);
+	if (sgs->sum_nr_running > sds->leader_nr_running ||
+	    (sgs->sum_nr_running == sds->leader_nr_running &&
+	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+		sds->group_leader = group;
+		sds->leader_nr_running = sgs->sum_nr_running;
+	}
+}
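
Note the asymmetric tie-breaks above: among equally lightly loaded groups, group_min prefers the one whose first CPU is numbered higher, while group_leader prefers the one whose first CPU is numbered lower. For example, with groups A (CPUs 0-1) and B (CPUs 2-3) each running one task and each with capacity 2, A becomes group_leader and B becomes group_min, so B's task is pulled toward the lower-numbered package and the higher-numbered one can go fully idle.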
 
 
-			sum_avg_load_per_task += cpu_avg_load_per_task(i);
-		}
+/**
+ * check_power_save_busiest_group - Check if we have potential to perform
+ *	some power-savings balance. If yes, set the busiest group to be
+ *	the least loaded group in the sched_domain, so that its CPUs can
+ *	be put to idle.
+ *
+ * @sds: Variable containing the statistics of the sched_domain
+ *	under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	if (!sds->power_savings_balance)
+		return 0;
 
 
-		/*
-		 * First idle cpu or the first cpu(busiest) in this sched group
-		 * is eligible for doing load balancing at this and above
-		 * domains. In the newly idle case, we will allow all the cpu's
-		 * to do the newly idle load balance.
-		 */
-		if (idle != CPU_NEWLY_IDLE && local_group &&
-		    balance_cpu != this_cpu && balance) {
-			*balance = 0;
-			goto ret;
-		}
+	if (sds->this != sds->group_leader ||
+			sds->group_leader == sds->group_min)
+		return 0;
 
 
-		total_load += avg_load;
-		total_pwr += group->__cpu_power;
+	*imbalance = sds->min_load_per_task;
+	sds->busiest = sds->group_min;
 
 
-		/* Adjust by relative CPU power of the group */
-		avg_load = sg_div_cpu_power(group,
-				avg_load * SCHED_LOAD_SCALE);
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+			group_first_cpu(sds->group_leader);
+	}
 
 
+	return 1;
 
 
-		/*
-		 * Consider the group unbalanced when the imbalance is larger
-		 * than the average weight of two tasks.
-		 *
-		 * APZ: with cgroup the avg task weight can vary wildly and
-		 *      might not be a suitable number - should we keep a
-		 *      normalized nr_running number somewhere that negates
-		 *      the hierarchy?
-		 */
-		avg_load_per_task = sg_div_cpu_power(group,
-				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+	return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			int local_group, const struct cpumask *cpus,
+			int *balance, struct sg_lb_stats *sgs)
+{
+	unsigned long load, max_cpu_load, min_cpu_load;
+	int i;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0;
+	unsigned long sum_avg_load_per_task;
+	unsigned long avg_load_per_task;
 
 
-		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			__group_imb = 1;
+	if (local_group)
+		balance_cpu = group_first_cpu(group);
 
 
-		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+	/* Tally up the load of all CPUs in the group */
+	sum_avg_load_per_task = avg_load_per_task = 0;
+	max_cpu_load = 0;
+	min_cpu_load = ~0UL;
 
 
+	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+		struct rq *rq = cpu_rq(i);
+
+		if (*sd_idle && rq->nr_running)
+			*sd_idle = 0;
+
+		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			this_load = avg_load;
-			this = group;
-			this_nr_running = sum_nr_running;
-			this_load_per_task = sum_weighted_load;
-		} else if (avg_load > max_load &&
-			   (sum_nr_running > group_capacity || __group_imb)) {
-			max_load = avg_load;
-			busiest = group;
-			busiest_nr_running = sum_nr_running;
-			busiest_load_per_task = sum_weighted_load;
-			group_imb = __group_imb;
+			if (idle_cpu(i) && !first_idle_cpu) {
+				first_idle_cpu = 1;
+				balance_cpu = i;
+			}
+
+			load = target_load(i, load_idx);
+		} else {
+			load = source_load(i, load_idx);
+			if (load > max_cpu_load)
+				max_cpu_load = load;
+			if (min_cpu_load > load)
+				min_cpu_load = load;
 		}
 
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-		/*
-		 * Busy processors will not participate in power savings
-		 * balance.
-		 */
-		if (idle == CPU_NOT_IDLE ||
-				!(sd->flags & SD_POWERSAVINGS_BALANCE))
-			goto group_next;
+		sgs->group_load += load;
+		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_weighted_load += weighted_cpuload(i);
 
 
-		/*
-		 * If the local group is idle or completely loaded
-		 * no need to do power savings balance at this domain
-		 */
-		if (local_group && (this_nr_running >= group_capacity ||
-				    !this_nr_running))
-			power_savings_balance = 0;
+		sum_avg_load_per_task += cpu_avg_load_per_task(i);
+	}
 
 
-		/*
-		 * If a group is already running at full capacity or idle,
-		 * don't include that group in power savings calculations
-		 */
-		if (!power_savings_balance || sum_nr_running >= group_capacity
-		    || !sum_nr_running)
-			goto group_next;
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above
+	 * domains. In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (idle != CPU_NEWLY_IDLE && local_group &&
+	    balance_cpu != this_cpu && balance) {
+		*balance = 0;
+		return;
+	}
 
 
-		/*
-		 * Calculate the group which has the least non-idle load.
-		 * This is the group from where we need to pick up the load
-		 * for saving power
-		 */
-		if ((sum_nr_running < min_nr_running) ||
-		    (sum_nr_running == min_nr_running &&
-		     cpumask_first(sched_group_cpus(group)) >
-		     cpumask_first(sched_group_cpus(group_min)))) {
-			group_min = group;
-			min_nr_running = sum_nr_running;
-			min_load_per_task = sum_weighted_load /
-						sum_nr_running;
-		}
+	/* Adjust by relative CPU power of the group */
+	sgs->avg_load = sg_div_cpu_power(group,
+			sgs->group_load * SCHED_LOAD_SCALE);
 
 
-		/*
-		 * Calculate the group which is almost near its
-		 * capacity but still has some space to pick up some load
-		 * from other group and save more power
-		 */
-		if (sum_nr_running <= group_capacity - 1) {
-			if (sum_nr_running > leader_nr_running ||
-			    (sum_nr_running == leader_nr_running &&
-			     cpumask_first(sched_group_cpus(group)) <
-			     cpumask_first(sched_group_cpus(group_leader)))) {
-				group_leader = group;
-				leader_nr_running = sum_nr_running;
-			}
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	avg_load_per_task = sg_div_cpu_power(group,
+			sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+		sgs->group_imb = 1;
+
+	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
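
To put numbers on the group_imb test above, assume default nice-0 task weights so SCHED_LOAD_SCALE == 1024, and a two-CPU group with __cpu_power == 2048 where one CPU runs two nice-0 tasks (load 2048) and the other is idle (load 0). Then max_cpu_load - min_cpu_load == 2048; the busy CPU contributes 2048/2 == 1024 to sum_avg_load_per_task and the idle CPU contributes 0, so avg_load_per_task == 1024 * 1024 / 2048 == 512. Since 2048 > 2 * 512, the group is marked imbalanced, which lets it be picked as busiest even though its two tasks exactly match its group_capacity of 2.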
+
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+			enum cpu_idle_type idle, int *sd_idle,
+			const struct cpumask *cpus, int *balance,
+			struct sd_lb_stats *sds)
+{
+	struct sched_group *group = sd->groups;
+	struct sg_lb_stats sgs;
+	int load_idx;
+
+	init_sd_power_savings_stats(sd, sds, idle);
+	load_idx = get_sd_load_idx(sd, idle);
+
+	do {
+		int local_group;
+
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
+		memset(&sgs, 0, sizeof(sgs));
+		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+				local_group, cpus, balance, &sgs);
+
+		if (local_group && balance && !(*balance))
+			return;
+
+		sds->total_load += sgs.group_load;
+		sds->total_pwr += group->__cpu_power;
+
+		if (local_group) {
+			sds->this_load = sgs.avg_load;
+			sds->this = group;
+			sds->this_nr_running = sgs.sum_nr_running;
+			sds->this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > sds->max_load &&
+			   (sgs.sum_nr_running > sgs.group_capacity ||
+				sgs.group_imb)) {
+			sds->max_load = sgs.avg_load;
+			sds->busiest = group;
+			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->group_imb = sgs.group_imb;
 		}
-group_next:
-#endif
+
+		update_sd_power_savings_stats(group, sds, local_group, &sgs);
 		group = group->next;
 	} while (group != sd->groups);
 
 
-	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
-		goto out_balanced;
-
-	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+}
 
 
-	if (this_load >= avg_load ||
-			100*max_load <= sd->imbalance_pct*this_load)
-		goto out_balanced;
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *			amongst the groups of a sched_domain, during
+ *			load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+				int this_cpu, unsigned long *imbalance)
+{
+	unsigned long tmp, pwr_now = 0, pwr_move = 0;
+	unsigned int imbn = 2;
+
+	if (sds->this_nr_running) {
+		sds->this_load_per_task /= sds->this_nr_running;
+		if (sds->busiest_load_per_task >
+				sds->this_load_per_task)
+			imbn = 1;
+	} else
+		sds->this_load_per_task =
+			cpu_avg_load_per_task(this_cpu);
 
 
-	busiest_load_per_task /= busiest_nr_running;
-	if (group_imb)
-		busiest_load_per_task = min(busiest_load_per_task, avg_load);
+	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+			sds->busiest_load_per_task * imbn) {
+		*imbalance = sds->busiest_load_per_task;
+		return;
+	}
 
 
 	/*
-	 * We're trying to get all the cpus to the average_load, so we don't
-	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded cpu below the average load, as either of these
-	 * actions would just result in more rebalancing later, and ping-pong
-	 * tasks around. Thus we look for the minimum possible imbalance.
-	 * Negative imbalances (*we* are more loaded than anyone else) will
-	 * be counted as no imbalance for these purposes -- we can't fix that
-	 * by pulling tasks to us. Be careful of negative numbers as they'll
-	 * appear as very large values with unsigned longs.
+	 * OK, we don't have enough imbalance to justify moving tasks,
+	 * however we may be able to increase total CPU power used by
+	 * moving them.
 	 */
-	if (max_load <= busiest_load_per_task)
-		goto out_balanced;
 
 
+	pwr_now += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load);
+	pwr_now += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load);
+	pwr_now /= SCHED_LOAD_SCALE;
+
+	/* Amount of load we'd subtract */
+	tmp = sg_div_cpu_power(sds->busiest,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	if (sds->max_load > tmp)
+		pwr_move += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+	/* Amount of load we'd add */
+	if (sds->max_load * sds->busiest->__cpu_power <
+		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+		tmp = sg_div_cpu_power(sds->this,
+			sds->max_load * sds->busiest->__cpu_power);
+	else
+		tmp = sg_div_cpu_power(sds->this,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	pwr_move += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load + tmp);
+	pwr_move /= SCHED_LOAD_SCALE;
+
+	/* Move if we gain throughput */
+	if (pwr_move > pwr_now)
+		*imbalance = sds->busiest_load_per_task;
+}
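
A concrete case for the early return above: this_cpu's group is idle (this_nr_running == 0, this_load == 0), the busiest group carries a single nice-0 task (max_load == busiest_load_per_task == 1024, with SCHED_LOAD_SCALE == 1024), and imbn stays 2. Then max_load - this_load + busiest_load_per_task == 2048 >= 2 * 1024, so *imbalance is set to one task's load and the lone task migrates to the idle group instead of being left in place by the averaging logic in calculate_imbalance().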
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *			 groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which load balance is currently being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+		unsigned long *imbalance)
+{
+	unsigned long max_pull;
 	/*
 	 * In the presence of smp nice balancing, certain scenarios can have
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (max_load < avg_load) {
+	if (sds->max_load < sds->avg_load) {
 		*imbalance = 0;
-		goto small_imbalance;
+		return fix_small_imbalance(sds, this_cpu, imbalance);
 	}
 
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+	max_pull = min(sds->max_load - sds->avg_load,
+			sds->max_load - sds->busiest_load_per_task);
 
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->__cpu_power,
-				(avg_load - this_load) * this->__cpu_power)
+	*imbalance = min(max_pull * sds->busiest->__cpu_power,
+		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 
 	/*
@@ -3437,78 +3670,110 @@ group_next:
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < busiest_load_per_task) {
-		unsigned long tmp, pwr_now, pwr_move;
-		unsigned int imbn;
-
-small_imbalance:
-		pwr_move = pwr_now = 0;
-		imbn = 2;
-		if (this_nr_running) {
-			this_load_per_task /= this_nr_running;
-			if (busiest_load_per_task > this_load_per_task)
-				imbn = 1;
-		} else
-			this_load_per_task = cpu_avg_load_per_task(this_cpu);
+	if (*imbalance < sds->busiest_load_per_task)
+		return fix_small_imbalance(sds, this_cpu, imbalance);
 
 
-		if (max_load - this_load + busiest_load_per_task >=
-					busiest_load_per_task * imbn) {
-			*imbalance = busiest_load_per_task;
-			return busiest;
-		}
+}
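
Worked through with two single-CPU groups (each with __cpu_power == SCHED_LOAD_SCALE == 1024): the busiest runs three nice-0 tasks (max_load == 3072, busiest_load_per_task == 1024) and the local group runs one (this_load == 1024), giving avg_load == (1024 * 4096) / 2048 == 2048. Then max_pull == min(3072 - 2048, 3072 - 1024) == 1024 and *imbalance == min(1024 * 1024, (2048 - 1024) * 1024) / 1024 == 1024: exactly one task's worth of weighted load is moved, bringing both groups to the average.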
+/******* find_busiest_group() helpers end here *********************/
 
 
-		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
-		 */
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *		be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *	is the appropriate cpu to perform load balancing at this level.
+ *
+ * Returns:	- the busiest group if imbalance exists.
+ *		- If no imbalance and user has opted for power-savings balance,
+ *		   return the least loaded group whose CPUs can be
+ *		   put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum cpu_idle_type idle,
+		   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+	struct sd_lb_stats sds;
 
 
-		pwr_now += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load);
-		pwr_now += this->__cpu_power *
-				min(this_load_per_task, this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = sg_div_cpu_power(busiest,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		if (max_load > tmp)
-			pwr_move += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load - tmp);
-
-		/* Amount of load we'd add */
-		if (max_load * busiest->__cpu_power <
-				busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = sg_div_cpu_power(this,
-					max_load * busiest->__cpu_power);
-		else
-			tmp = sg_div_cpu_power(this,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		pwr_move += this->__cpu_power *
-				min(this_load_per_task, this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
+	memset(&sds, 0, sizeof(sds));
 
 
-		/* Move if we gain throughput */
-		if (pwr_move > pwr_now)
-			*imbalance = busiest_load_per_task;
-	}
+	/*
+	 * Compute the various statistics relevant for load balancing at
+	 * this level.
+	 */
+	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+					balance, &sds);
+
+	/* Cases where imbalance does not exist from POV of this_cpu */
+	/* 1) this_cpu is not the appropriate cpu to perform load balancing
+	 *    at this level.
+	 * 2) There is no busy sibling group to pull from.
+	 * 3) This group is the busiest group.
+	 * 4) This group is busier than the average busyness at this
+	 *    sched_domain.
+	 * 5) The imbalance is within the specified limit.
+	 * 6) Any rebalance would lead to ping-pong
+	 */
+	if (balance && !(*balance))
+		goto ret;
 
 
-	return busiest;
+	if (!sds.busiest || sds.busiest_nr_running == 0)
+		goto out_balanced;
 
 
-out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		goto ret;
+	if (sds.this_load >= sds.max_load)
+		goto out_balanced;
 
 
-	if (this == group_leader && group_leader != group_min) {
-		*imbalance = min_load_per_task;
-		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-				cpumask_first(sched_group_cpus(group_leader));
-		}
-		return group_min;
-	}
-#endif
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+	if (sds.this_load >= sds.avg_load)
+		goto out_balanced;
+
+	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+		goto out_balanced;
+
+	sds.busiest_load_per_task /= sds.busiest_nr_running;
+	if (sds.group_imb)
+		sds.busiest_load_per_task =
+			min(sds.busiest_load_per_task, sds.avg_load);
+
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load, as either of these
+	 * actions would just result in more rebalancing later, and ping-pong
+	 * tasks around. Thus we look for the minimum possible imbalance.
+	 * Negative imbalances (*we* are more loaded than anyone else) will
+	 * be counted as no imbalance for these purposes -- we can't fix that
+	 * by pulling tasks to us. Be careful of negative numbers as they'll
+	 * appear as very large values with unsigned longs.
+	 */
+	if (sds.max_load <= sds.busiest_load_per_task)
+		goto out_balanced;
+
+	/* Looks like there is an imbalance. Compute it */
+	calculate_imbalance(&sds, this_cpu, imbalance);
+	return sds.busiest;
+
+out_balanced:
+	/*
+	 * There is no obvious imbalance. But check if we can do some balancing
+	 * to save power.
+	 */
+	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+		return sds.busiest;
 ret:
 	*imbalance = 0;
 	return NULL;
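
For context, a condensed, paraphrased sketch of how load_balance() consumes the three possible outcomes (the actual caller lives further down in sched.c and is unchanged by this diff; this is not literal kernel code):

	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
				   &sd_idle, cpus, &balance);
	if (!balance)		/* this_cpu shouldn't balance at this level */
		goto out_balanced;
	if (!group)		/* balanced, and no power-savings candidate */
		goto out_balanced;

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	/* ... then move up to 'imbalance' weighted load off 'busiest' */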