@@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -3064,6 +3065,8 @@ struct lb_env {
 	int			dst_cpu;
 	struct rq		*dst_rq;
 
+	struct cpumask		*dst_grpmask;
+	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
 	unsigned int		flags;
@@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 3) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+		int new_dst_cpu;
+
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+		/*
+		 * Remember if this task can be migrated to any other cpu in
+		 * our sched_group. We may want to revisit it if we couldn't
+		 * meet load balance goals by pulling other tasks on src_cpu.
+		 *
+		 * Also avoid computing new_dst_cpu if we have already computed
+		 * one in the current iteration.
+		 */
+		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+			return 0;
+
+		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+						tsk_cpus_allowed(p));
+		if (new_dst_cpu < nr_cpu_ids) {
+			env->flags |= LBF_SOME_PINNED;
+			env->new_dst_cpu = new_dst_cpu;
+		}
 		return 0;
 	}
+
+	/* Record that we found at least one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
 	if (task_running(env->src_rq, p)) {
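For illustration, a minimal user-space sketch of the decision above, under the assumption that plain unsigned long bitmasks can stand in for struct cpumask and GCC's __builtin_ctzl() for cpumask_first_and(); the helper can_pull() is made up for the sketch and omits the NULL dst_grpmask case:

#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_SOME_PINNED 0x04

/*
 * Sketch of the affinity check: dst_cpu is where we want to pull the
 * task, grp_mask covers the CPUs of the destination sched_group, and
 * allowed is the task's cpus_allowed mask.  Returns 1 if the task may
 * be pulled to dst_cpu, 0 otherwise, updating *flags and *new_dst_cpu
 * the way the patch updates env->flags and env->new_dst_cpu.
 */
static int can_pull(int dst_cpu, unsigned long grp_mask,
		    unsigned long allowed, unsigned int *flags,
		    int *new_dst_cpu)
{
	if (!(allowed & (1UL << dst_cpu))) {
		unsigned long cand = grp_mask & allowed;

		/* Remember one alternate CPU in the group, if any. */
		if (cand && !(*flags & LBF_SOME_PINNED)) {
			*new_dst_cpu = __builtin_ctzl(cand);
			*flags |= LBF_SOME_PINNED;
		}
		return 0;
	}

	/* Found at least one task that could run on dst_cpu. */
	*flags &= ~LBF_ALL_PINNED;
	return 1;
}

int main(void)
{
	unsigned int flags = LBF_ALL_PINNED;
	int new_dst = -1;

	/* Task pinned to CPUs {2,3}; we try to pull it to CPU 0 of group {0-3}. */
	int ok = can_pull(0, 0xfUL, 0xcUL, &flags, &new_dst);

	printf("pull=%d some_pinned=%d new_dst_cpu=%d\n",
	       ok, !!(flags & LBF_SOME_PINNED), new_dst);
	return 0;
}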
@@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, active_balance = 0;
+	int ld_moved, cur_ld_moved, active_balance = 0;
+	int lb_iterations, max_lb_iterations;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
@@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
+		.dst_grpmask	= sched_group_cpus(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
+	max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
 	schedstat_inc(sd, lb_count[idle]);
 
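As a rough worked example of the bound set up here: sched_group_cpus(sd->groups) is the CPU mask of the local group, so on a domain whose local group spans four CPUs, cpumask_weight() returns 4; since lb_iterations starts at 1 and is post-incremented, the more_balance path below can retarget dst_cpu at most three more times for the same src_cpu.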
@@ -4253,6 +4281,7 @@ redo:
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
+	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -4270,7 +4299,13 @@ more_balance:
 		double_rq_lock(this_rq, busiest);
 		if (!env.loop)
 			update_h_load(env.src_cpu);
-		ld_moved += move_tasks(&env);
+
+		/*
+		 * cur_ld_moved - load moved in current iteration
+		 * ld_moved - cumulative load moved across iterations
+		 */
+		cur_ld_moved = move_tasks(&env);
+		ld_moved += cur_ld_moved;
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
@@ -4282,8 +4317,43 @@ more_balance:
 		/*
 		 * some other cpu did the load balance for us.
 		 */
-		if (ld_moved && this_cpu != smp_processor_id())
-			resched_cpu(this_cpu);
+		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+			resched_cpu(env.dst_cpu);
+
+		/*
+		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+		 * us and move them to an alternate dst_cpu in our sched_group
+		 * where they can run. The upper limit on how many times we
+		 * iterate on the same src_cpu depends on the number of CPUs
+		 * in our sched_group.
+		 *
+		 * This changes the load balance semantics a bit on who can
+		 * move load to a given_cpu. In addition to the given_cpu itself
+		 * (or an ilb_cpu acting on its behalf where given_cpu is
+		 * nohz-idle), we now have balance_cpu in a position to move
+		 * load to given_cpu. In rare situations, this may cause
+		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+		 * _independently_ and at the _same_ time to move some load to
+		 * given_cpu), causing excess load to be moved to given_cpu.
+		 * This however should not happen often in practice, and
+		 * moreover subsequent load balance cycles should correct the
+		 * excess load moved.
+		 */
+		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+				lb_iterations++ < max_lb_iterations) {
+
+			this_rq		 = cpu_rq(env.new_dst_cpu);
+			env.dst_rq	 = this_rq;
+			env.dst_cpu	 = env.new_dst_cpu;
+			env.flags	&= ~LBF_SOME_PINNED;
+			env.loop	 = 0;
+			env.loop_break	 = sched_nr_migrate_break;
+			/*
+			 * Go back to "more_balance" rather than "redo" since we
+			 * need to continue with the same src_cpu.
+			 */
+			goto more_balance;
+		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
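Putting the load_balance() side together, the following is a minimal user-space model of just the retry flow added above, assuming try_move() as a made-up stand-in for move_tasks() and with locking, statistics and the rest of the function omitted:

#include <stdio.h>

#define LBF_SOME_PINNED 0x04

struct env {
	int dst_cpu;
	int new_dst_cpu;
	long imbalance;
	unsigned int flags;
};

/*
 * Hypothetical stand-in for move_tasks(): pretend nothing can be moved
 * to CPU 0, but one pinned task could run on CPU 2 of the same group.
 */
static long try_move(struct env *env)
{
	if (env->dst_cpu == 0) {
		env->flags |= LBF_SOME_PINNED;
		env->new_dst_cpu = 2;
		return 0;
	}
	env->imbalance = 0;
	return 1;
}

int main(void)
{
	struct env env = { .dst_cpu = 0, .imbalance = 1 };
	int lb_iterations = 1, max_lb_iterations = 4;	/* local group of 4 CPUs */
	long ld_moved = 0, cur_ld_moved;

more_balance:
	cur_ld_moved = try_move(&env);
	ld_moved += cur_ld_moved;

	/*
	 * Same retry condition as the patch: some task was pinned away
	 * from dst_cpu, imbalance remains, and the iteration bound set
	 * by the group weight has not been hit yet.
	 */
	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
	    lb_iterations++ < max_lb_iterations) {
		env.dst_cpu = env.new_dst_cpu;
		env.flags &= ~LBF_SOME_PINNED;
		goto more_balance;
	}

	printf("ld_moved=%ld final dst_cpu=%d iterations=%d\n",
	       ld_moved, env.dst_cpu, lb_iterations);
	return 0;
}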