@@ -901,28 +901,114 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
 }
 
 static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+struct numa_stats {
+	unsigned long load;
+	s64 eff_load;
+	unsigned long faults;
+};
 
-static int
-find_idlest_cpu_node(int this_cpu, int nid)
-{
-	unsigned long load, min_load = ULONG_MAX;
-	int i, idlest_cpu = this_cpu;
+struct task_numa_env {
+	struct task_struct *p;
 
-	BUG_ON(cpu_to_node(this_cpu) == nid);
+	int src_cpu, src_nid;
+	int dst_cpu, dst_nid;
 
-	rcu_read_lock();
-	for_each_cpu(i, cpumask_of_node(nid)) {
-		load = weighted_cpuload(i);
+	struct numa_stats src_stats, dst_stats;
 
-		if (load < min_load) {
-			min_load = load;
-			idlest_cpu = i;
+	unsigned long best_load;
+	int best_cpu;
+};
+
+static int task_numa_migrate(struct task_struct *p)
+{
+	int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
+	struct task_numa_env env = {
+		.p = p,
+		.src_cpu = task_cpu(p),
+		.src_nid = cpu_to_node(task_cpu(p)),
+		.dst_cpu = node_cpu,
+		.dst_nid = p->numa_preferred_nid,
+		.best_load = ULONG_MAX,
+		.best_cpu = task_cpu(p),
+	};
+	struct sched_domain *sd;
+	int cpu;
+	struct task_group *tg = task_group(p);
+	unsigned long weight;
+	bool balanced;
+	int imbalance_pct, idx = -1;
+
+	/*
+	 * Find the lowest common scheduling domain covering the nodes of both
+	 * the CPU the task is currently running on and the target NUMA node.
+	 */
+	rcu_read_lock();
+	for_each_domain(env.src_cpu, sd) {
+		if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
+			/*
+			 * busy_idx is used for the load decision as it is the
+			 * same index used by the regular load balancer for an
+			 * active cpu.
+			 */
+			idx = sd->busy_idx;
+			imbalance_pct = sd->imbalance_pct;
+			break;
 		}
 	}
 	rcu_read_unlock();
 
-	return idlest_cpu;
+	if (WARN_ON_ONCE(idx == -1))
+		return 0;
+
+	/*
+	 * XXX the below is mostly nicked from wake_affine(); we should
+	 * see about sharing a bit if at all possible; also it might want
+	 * some per entity weight love.
+	 */
+	weight = p->se.load.weight;
+	env.src_stats.load = source_load(env.src_cpu, idx);
+	env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
+	env.src_stats.eff_load *= power_of(env.src_cpu);
+	env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
+
+	for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
+		env.dst_cpu = cpu;
+		env.dst_stats.load = target_load(cpu, idx);
+
+		/* If the CPU is idle, use it */
+		if (!env.dst_stats.load) {
+			env.best_cpu = cpu;
+			goto migrate;
+		}
+
+		/* Otherwise check the target CPU load */
+		env.dst_stats.eff_load = 100;
+		env.dst_stats.eff_load *= power_of(cpu);
+		env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+
+		/*
+		 * Destination is considered balanced if the destination CPU is
+		 * less loaded than the source CPU. Unfortunately there is a
+		 * risk that a task running on a lightly loaded CPU will not
+		 * migrate to its preferred node due to load imbalances.
+		 */
+		balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
+		if (!balanced)
+			continue;
+
+		if (env.dst_stats.eff_load < env.best_load) {
+			env.best_load = env.dst_stats.eff_load;
+			env.best_cpu = cpu;
+		}
+	}
+
+migrate:
+	return migrate_task_to(p, env.best_cpu);
 }
 
 static void task_numa_placement(struct task_struct *p)
@@ -966,22 +1052,10 @@ static void task_numa_placement(struct task_struct *p)
 	 * the working set placement.
 	 */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
-		int preferred_cpu;
-
-		/*
-		 * If the task is not on the preferred node then find the most
-		 * idle CPU to migrate to.
-		 */
-		preferred_cpu = task_cpu(p);
-		if (cpu_to_node(preferred_cpu) != max_nid) {
-			preferred_cpu = find_idlest_cpu_node(preferred_cpu,
-							     max_nid);
-		}
-
 		/* Update the preferred nid and migrate task if possible */
 		p->numa_preferred_nid = max_nid;
 		p->numa_migrate_seq = 1;
-		migrate_task_to(p, preferred_cpu);
+		task_numa_migrate(p);
 	}
 }
 
@@ -3292,7 +3366,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent) /* the trivial, non-cgroup case */
+	if (!tg->parent || !wl) /* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
@@ -3345,8 +3419,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-		unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	return wl;
 }
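
For reference, and not part of the patch itself: the accept/reject decision inside task_numa_migrate() reduces to comparing two scaled load figures, with the source side granted half of the domain's imbalance_pct as slack (the same idea the patch's own comment attributes to wake_affine()). The standalone C sketch below models just that arithmetic; it deliberately omits the effective_load() group-weight terms and the task's own weight, and the figures in main() are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the balance test in task_numa_migrate(): the source
 * side is scaled by 100 plus half of the imbalance_pct bonus, the
 * destination by a flat 100, and both sides by their CPU power.  The
 * destination is treated as balanced while its scaled load does not exceed
 * the source's.  (The real code also folds in effective_load() and the
 * migrating task's weight.)
 */
static bool numa_dst_balanced(unsigned long src_load, unsigned long src_power,
			      unsigned long dst_load, unsigned long dst_power,
			      int imbalance_pct)
{
	long long src_eff = 100 + (imbalance_pct - 100) / 2;
	long long dst_eff = 100;

	src_eff *= src_power;
	src_eff *= src_load;

	dst_eff *= dst_power;
	dst_eff *= dst_load;

	return dst_eff <= src_eff;
}

int main(void)
{
	/* Hypothetical loads, equal CPU power, imbalance_pct of 125. */
	printf("%d\n", numa_dst_balanced(1024, 1024, 1100, 1024, 125)); /* 1: dst ~7% busier, still accepted */
	printf("%d\n", numa_dst_balanced(1024, 1024, 1200, 1024, 125)); /* 0: past the ~12% allowance */
	return 0;
}

With a typical imbalance_pct of 125, the destination may be roughly 12% busier than the source and still be accepted, which is what lets a task reach its preferred node despite small load imbalances.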