@@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -3064,6 +3065,8 @@ struct lb_env {
 	int			dst_cpu;
 	struct rq		*dst_rq;
 
+	struct cpumask		*dst_grpmask;
+	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
 	unsigned int		flags;
@@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 3) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+		int new_dst_cpu;
+
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+		/*
+		 * Remember if this task can be migrated to any other cpu in
+		 * our sched_group. We may want to revisit it if we couldn't
+		 * meet load balance goals by pulling other tasks on src_cpu.
+		 *
+		 * Also avoid computing new_dst_cpu if we have already computed
+		 * one in the current iteration.
+		 */
+		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+			return 0;
+
+		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+						tsk_cpus_allowed(p));
+		if (new_dst_cpu < nr_cpu_ids) {
+			env->flags |= LBF_SOME_PINNED;
+			env->new_dst_cpu = new_dst_cpu;
+		}
 		return 0;
 	}
+
+	/* Record that we found at least one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
 	if (task_running(env->src_rq, p)) {
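For illustration, a minimal user-space sketch of the decision above, under the assumption that plain unsigned long bitmasks can stand in for struct cpumask and GCC's __builtin_ctzl() for cpumask_first_and(); the helper can_pull() is made up for the sketch and omits the NULL dst_grpmask case:

#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_SOME_PINNED 0x04

/*
 * Sketch of the affinity check: dst_cpu is where we want to pull the
 * task, grp_mask covers the CPUs of the destination sched_group, and
 * allowed is the task's cpus_allowed mask.  Returns 1 if the task may
 * be pulled to dst_cpu, 0 otherwise, updating *flags and *new_dst_cpu
 * the way the patch updates env->flags and env->new_dst_cpu.
 */
static int can_pull(int dst_cpu, unsigned long grp_mask,
		    unsigned long allowed, unsigned int *flags,
		    int *new_dst_cpu)
{
	if (!(allowed & (1UL << dst_cpu))) {
		unsigned long cand = grp_mask & allowed;

		/* Remember one alternate CPU in the group, if any. */
		if (cand && !(*flags & LBF_SOME_PINNED)) {
			*new_dst_cpu = __builtin_ctzl(cand);
			*flags |= LBF_SOME_PINNED;
		}
		return 0;
	}

	/* Found at least one task that could run on dst_cpu. */
	*flags &= ~LBF_ALL_PINNED;
	return 1;
}

int main(void)
{
	unsigned int flags = LBF_ALL_PINNED;
	int new_dst = -1;

	/* Task pinned to CPUs {2,3}; we try to pull it to CPU 0 of group {0-3}. */
	int ok = can_pull(0, 0xfUL, 0xcUL, &flags, &new_dst);

	printf("pull=%d some_pinned=%d new_dst_cpu=%d\n",
	       ok, !!(flags & LBF_SOME_PINNED), new_dst);
	return 0;
}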
@@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, active_balance = 0;
+	int ld_moved, cur_ld_moved, active_balance = 0;
+	int lb_iterations, max_lb_iterations;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
@@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
+		.dst_grpmask	= sched_group_cpus(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
+	max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
 	schedstat_inc(sd, lb_count[idle]);
 
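As a rough worked example of the bound set up here: sched_group_cpus(sd->groups) is the CPU mask of the local group, so on a domain whose local group spans four CPUs, cpumask_weight() returns 4; since lb_iterations starts at 1 and is post-incremented, the more_balance path below can retarget dst_cpu at most three more times for the same src_cpu.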
@@ -4253,6 +4281,7 @@ redo:
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
+	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -4270,7 +4299,13 @@ more_balance:
 		double_rq_lock(this_rq, busiest);
 		if (!env.loop)
 			update_h_load(env.src_cpu);
-		ld_moved += move_tasks(&env);
+
+		/*
+		 * cur_ld_moved - load moved in current iteration
+		 * ld_moved - cumulative load moved across iterations
+		 */
+		cur_ld_moved = move_tasks(&env);
+		ld_moved += cur_ld_moved;
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
@@ -4282,8 +4317,43 @@ more_balance:
 		/*
 		 * some other cpu did the load balance for us.
 		 */
-		if (ld_moved && this_cpu != smp_processor_id())
-			resched_cpu(this_cpu);
+		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+			resched_cpu(env.dst_cpu);
+
+		/*
+		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+		 * us and move them to an alternate dst_cpu in our sched_group
+		 * where they can run. The upper limit on how many times we
+		 * iterate on the same src_cpu depends on the number of CPUs
+		 * in our sched_group.
+		 *
+		 * This changes the load balance semantics a bit on who can
+		 * move load to a given_cpu. In addition to the given_cpu itself
+		 * (or an ilb_cpu acting on its behalf where given_cpu is
+		 * nohz-idle), we now have balance_cpu in a position to move
+		 * load to given_cpu. In rare situations, this may cause
+		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+		 * _independently_ and at the _same_ time to move some load to
+		 * given_cpu), causing excess load to be moved to given_cpu.
+		 * This however should not happen often in practice, and
+		 * moreover subsequent load balance cycles should correct the
+		 * excess load moved.
+		 */
+		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+				lb_iterations++ < max_lb_iterations) {
+
+			this_rq		 = cpu_rq(env.new_dst_cpu);
+			env.dst_rq	 = this_rq;
+			env.dst_cpu	 = env.new_dst_cpu;
+			env.flags	&= ~LBF_SOME_PINNED;
+			env.loop	 = 0;
+			env.loop_break	 = sched_nr_migrate_break;
+			/*
+			 * Go back to "more_balance" rather than "redo" since we
+			 * need to continue with the same src_cpu.
+			 */
+			goto more_balance;
+		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
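Putting the load_balance() side together, the following is a minimal user-space model of just the retry flow added above, assuming try_move() as a made-up stand-in for move_tasks() and with locking, statistics and the rest of the function omitted:

#include <stdio.h>

#define LBF_SOME_PINNED 0x04

struct env {
	int dst_cpu;
	int new_dst_cpu;
	long imbalance;
	unsigned int flags;
};

/*
 * Hypothetical stand-in for move_tasks(): pretend nothing can be moved
 * to CPU 0, but one pinned task could run on CPU 2 of the same group.
 */
static long try_move(struct env *env)
{
	if (env->dst_cpu == 0) {
		env->flags |= LBF_SOME_PINNED;
		env->new_dst_cpu = 2;
		return 0;
	}
	env->imbalance = 0;
	return 1;
}

int main(void)
{
	struct env env = { .dst_cpu = 0, .imbalance = 1 };
	int lb_iterations = 1, max_lb_iterations = 4;	/* local group of 4 CPUs */
	long ld_moved = 0, cur_ld_moved;

more_balance:
	cur_ld_moved = try_move(&env);
	ld_moved += cur_ld_moved;

	/*
	 * Same retry condition as the patch: some task was pinned away
	 * from dst_cpu, imbalance remains, and the iteration bound set
	 * by the group weight has not been hit yet.
	 */
	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
	    lb_iterations++ < max_lb_iterations) {
		env.dst_cpu = env.new_dst_cpu;
		env.flags &= ~LBF_SOME_PINNED;
		goto more_balance;
	}

	printf("ld_moved=%ld final dst_cpu=%d iterations=%d\n",
	       ld_moved, env.dst_cpu, lb_iterations);
	return 0;
}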