@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+        rq->nr_numa_running += (p->numa_preferred_nid != -1);
+        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+        rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+        rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
 struct numa_group {
         atomic_t refcount;
 
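
The two counters incremented above are per-runqueue fields added to struct rq by a companion hunk outside this section. As a hedged illustration of the invariant they maintain (nr_numa_running counts queued tasks that have a preferred node, nr_preferred_running the subset already running on it), here is a standalone model; the names are illustrative and none of this is kernel code:

    /* Toy model of the accounting above; not kernel code. */
    #include <assert.h>
    #include <stdio.h>

    struct toy_rq { unsigned int nr_numa_running, nr_preferred_running; };
    /* In the kernel the "current node" is task_node(p), which is never -1. */
    struct toy_task { int preferred_nid; int current_nid; };

    static void toy_numa_enqueue(struct toy_rq *rq, const struct toy_task *p)
    {
            rq->nr_numa_running += (p->preferred_nid != -1);
            rq->nr_preferred_running += (p->preferred_nid == p->current_nid);
    }

    static void toy_numa_dequeue(struct toy_rq *rq, const struct toy_task *p)
    {
            rq->nr_numa_running -= (p->preferred_nid != -1);
            rq->nr_preferred_running -= (p->preferred_nid == p->current_nid);
    }

    int main(void)
    {
            struct toy_rq rq = { 0, 0 };
            struct toy_task well_placed = { .preferred_nid = 0, .current_nid = 0 };
            struct toy_task misplaced   = { .preferred_nid = 1, .current_nid = 0 };

            toy_numa_enqueue(&rq, &well_placed);
            toy_numa_enqueue(&rq, &misplaced);
            /* Both tasks have a preferred node, only one runs on it. */
            printf("numa=%u preferred=%u\n", rq.nr_numa_running, rq.nr_preferred_running);

            toy_numa_dequeue(&rq, &misplaced);
            assert(rq.nr_numa_running == 1 && rq.nr_preferred_running == 1);
            return 0;
    }

Because the counters are only adjusted at enqueue/dequeue time, a task whose numa_preferred_nid changes while it is queued has to be re-accounted; the sched_setnuma() helper called in the hunks below is presumably where that dequeue/update/requeue cycle lives (its definition is outside this section).
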
@@ -1227,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p)
         if (env.best_cpu == -1)
                 return -EAGAIN;
 
+        sched_setnuma(p, env.dst_nid);
+
         if (env.best_task == NULL) {
                 int ret = migrate_task_to(p, env.best_cpu);
                 return ret;
@@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p)
         /* Preferred node as the node with the most faults */
         if (max_faults && max_nid != p->numa_preferred_nid) {
                 /* Update the preferred nid and migrate task if possible */
-                p->numa_preferred_nid = max_nid;
-                p->numa_migrate_seq = 1;
+                sched_setnuma(p, max_nid);
                 numa_migrate_preferred(p);
         }
 }
@@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         if (!parent_entity(se))
                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-        if (entity_is_task(se))
-                list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+        if (entity_is_task(se)) {
+                struct rq *rq = rq_of(cfs_rq);
+
+                account_numa_enqueue(rq, task_of(se));
+                list_add(&se->group_node, &rq->cfs_tasks);
+        }
 #endif
         cfs_rq->nr_running++;
 }
@@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_sub(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-        if (entity_is_task(se))
+        if (entity_is_task(se)) {
+                account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                 list_del_init(&se->group_node);
+        }
         cfs_rq->nr_running--;
 }
 
@@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED 0x04
@@ -4631,6 +4660,8 @@ struct lb_env {
         unsigned int loop;
         unsigned int loop_break;
         unsigned int loop_max;
+
+        enum fbq_type fbq_type;
 };
 
 /*
@@ -5092,6 +5123,10 @@ struct sg_lb_stats {
         unsigned int group_weight;
         int group_imb; /* Is there an imbalance in the group ? */
         int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+        unsigned int nr_numa_running;
+        unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
                 sgs->group_load += load;
                 sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+                sgs->nr_numa_running += rq->nr_numa_running;
+                sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
                 sgs->sum_weighted_load += weighted_cpuload(i);
                 if (idle_cpu(i))
                         sgs->idle_cpus++;
@@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
         return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+        if (sgs->sum_nr_running > sgs->nr_numa_running)
+                return regular;
+        if (sgs->sum_nr_running > sgs->nr_preferred_running)
+                return remote;
+        return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+        if (rq->nr_running > rq->nr_numa_running)
+                return regular;
+        if (rq->nr_running > rq->nr_preferred_running)
+                return remote;
+        return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+        return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+        return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-                                        struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
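
The declaration order of enum fbq_type (regular < remote < all) is what the classification above relies on: a load source is "regular" while it still has non-NUMA tasks, "remote" while it only has NUMA tasks but some sit on the wrong node, and "all" once every task is on its preferred node. A hedged, standalone sketch of the same decision on plain counters (illustrative names, not kernel code):

    /* Standalone illustration of the fbq_type ordering; not kernel code. */
    #include <stdio.h>

    enum fbq_type { regular, remote, all };         /* regular < remote < all */

    /* nr_running: all queued tasks; nr_numa: tasks with a preferred node;
     * nr_preferred: tasks running on their preferred node. */
    static enum fbq_type classify(unsigned int nr_running,
                                  unsigned int nr_numa,
                                  unsigned int nr_preferred)
    {
            if (nr_running > nr_numa)       /* some tasks have no NUMA placement */
                    return regular;
            if (nr_running > nr_preferred)  /* all NUMA, some on the wrong node */
                    return remote;
            return all;                     /* every task is ideally placed */
    }

    int main(void)
    {
            printf("%d\n", classify(4, 2, 2));      /* 0: regular */
            printf("%d\n", classify(3, 3, 1));      /* 1: remote  */
            printf("%d\n", classify(2, 2, 2));      /* 2: all     */
            return 0;
    }

Note that the !CONFIG_NUMA_BALANCING stubs return the two extremes (all for the group, regular for the runqueue), so the rt > env->fbq_type test introduced further down can never skip a runqueue when NUMA balancing is compiled out.
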
@@ -5538,6 +5606,9 @@ next_group:
 
                 sg = sg->next;
         } while (sg != env->sd->groups);
+
+        if (env->sd->flags & SD_NUMA)
+                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
         int i;
 
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-                unsigned long power = power_of(i);
-                unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                           SCHED_POWER_SCALE);
-                unsigned long wl;
+                unsigned long power, capacity, wl;
+                enum fbq_type rt;
+
+                rq = cpu_rq(i);
+                rt = fbq_classify_rq(rq);
 
+                /*
+                 * We classify groups/runqueues into three groups:
+                 *  - regular: there are !numa tasks
+                 *  - remote:  there are numa tasks that run on the 'wrong' node
+                 *  - all:     there is no distinction
+                 *
+                 * In order to avoid migrating ideally placed numa tasks,
+                 * ignore those when there's better options.
+                 *
+                 * If we ignore the actual busiest queue to migrate another
+                 * task, the next balance pass can still reduce the busiest
+                 * queue by moving tasks around inside the node.
+                 *
+                 * If we cannot move enough load due to this classification
+                 * the next pass will adjust the group classification and
+                 * allow migration of more tasks.
+                 *
+                 * Both cases only affect the total convergence complexity.
+                 */
+                if (rt > env->fbq_type)
+                        continue;
+
+                power = power_of(i);
+                capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
                 if (!capacity)
                         capacity = fix_small_capacity(env->sd, group);
 
-                rq = cpu_rq(i);
                 wl = weighted_cpuload(i);
 
                 /*
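
To make the comment above concrete: env->fbq_type is the classification of the busiest group (set at the end of update_sd_lb_stats() for SD_NUMA domains), and a candidate runqueue is skipped only when it is strictly "cleaner" than that group. A hedged, standalone walk-through (illustrative names, not kernel code):

    /* Toy walk-through of the rt > env->fbq_type filter; not kernel code. */
    #include <stdio.h>

    enum fbq_type { regular, remote, all };

    struct toy_rq {
            const char *name;
            enum fbq_type type;     /* what fbq_classify_rq() would report */
            unsigned long load;
    };

    int main(void)
    {
            /* The busiest group was classified "remote": it still has
             * misplaced NUMA tasks, so fully converged ("all") runqueues
             * are off limits this pass. */
            enum fbq_type group_type = remote;

            struct toy_rq rqs[] = {
                    { "A (has !numa tasks)",       regular, 3 },
                    { "B (numa, some misplaced)",  remote,  5 },
                    { "C (all on preferred node)", all,     7 },
            };
            const struct toy_rq *busiest = NULL;

            for (int i = 0; i < 3; i++) {
                    if (rqs[i].type > group_type)   /* same test as above */
                            continue;
                    if (!busiest || rqs[i].load > busiest->load)
                            busiest = &rqs[i];
            }

            /* C carries the most load but is skipped; B is picked instead.
             * A later balance pass can still drain C if the imbalance
             * persists, which is the convergence argument made above. */
            printf("picked %s\n", busiest->name);
            return 0;
    }
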
@@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .idle = idle,
                 .loop_break = sched_nr_migrate_break,
                 .cpus = cpus,
+                .fbq_type = all,
         };
 
         /*