@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 }
 #endif
 
+/*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                delta = calc_delta_mine(delta,
+                                se->load.weight, &cfs_rq_of(se)->load);
+        }
+
+        return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                delta = calc_delta_mine(delta,
+                                cfs_rq_of(se)->load.weight, &se->load);
+        }
+
+        return delta;
+}
+
 /*
  * The idea is to set a period in which each task runs once.
  *
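
The two helpers above differ only in which weight ends up in the numerator: sched_slice() will scale a wall-clock period down by w/rw at every level of the group hierarchy, while the vruntime path scales runtime up by rw/w. Below is a stand-alone user-space sketch of that arithmetic with invented types and numbers, not kernel code; the kernel avoids the division by using the cached fixed-point inverse in calc_delta_mine().

#include <stdio.h>
#include <stdint.h>

struct fake_se {
        uint64_t weight;        /* this entity's weight (w) */
        uint64_t rq_weight;     /* total weight of its runqueue (rw) */
};

static uint64_t scale_down(uint64_t delta, const struct fake_se *se, int depth)
{
        /* delta *= w / rw at every level, as in calc_delta_weight() */
        for (int i = 0; i < depth; i++)
                delta = delta * se[i].weight / se[i].rq_weight;
        return delta;
}

static uint64_t scale_up(uint64_t delta, const struct fake_se *se, int depth)
{
        /* delta *= rw / w at every level, as in calc_delta_fair() */
        for (int i = 0; i < depth; i++)
                delta = delta * se[i].rq_weight / se[i].weight;
        return delta;
}

int main(void)
{
        /* a task of weight 1024 on a queue of total weight 3072, inside a
         * group entity that holds 2048 of its parent's 4096 */
        struct fake_se path[2] = {
                { .weight = 1024, .rq_weight = 3072 },
                { .weight = 2048, .rq_weight = 4096 },
        };
        uint64_t period = 20000000;     /* 20 ms scheduling period, in ns */

        printf("wall-clock slice: %llu ns\n",
               (unsigned long long)scale_down(period, path, 2));
        printf("10 ms of runtime in virtual time: %llu ns\n",
               (unsigned long long)scale_up(10000000, path, 2));
        return 0;
}

With these numbers the task owns 1024/3072 of its group and the group owns 2048/4096 of the CPU, so a 20 ms period shrinks to about 3.3 ms of wall-clock slice, and 10 ms of execution advances vruntime by about 60 ms.
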
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        u64 slice = __sched_period(cfs_rq->nr_running);
-
-        for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
-
-                slice *= se->load.weight;
-                do_div(slice, cfs_rq->load.weight);
-        }
-
-
-        return slice;
+        return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         unsigned long nr_running = cfs_rq->nr_running;
-        unsigned long weight;
-        u64 vslice;
 
         if (!se->on_rq)
                 nr_running++;
 
-        vslice = __sched_period(nr_running);
+        return __sched_period(nr_running);
+}
+
+/*
+ * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+        struct load_weight lw = {
+                .weight = NICE_0_LOAD,
+                .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+        };
 
         for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
+                struct load_weight *se_lw = &se->load;
+                unsigned long rw = cfs_rq_of(se)->load.weight;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+                struct cfs_rq *cfs_rq = se->my_q;
+                struct task_group *tg = NULL;
+
+                if (cfs_rq)
+                        tg = cfs_rq->tg;
+
+                if (tg && tg->shares < NICE_0_LOAD) {
+                        /*
+                         * scale shares to what it would have been had
+                         * tg->weight been NICE_0_LOAD:
+                         *
+                         *   weight = 1024 * shares / tg->weight
+                         */
+                        lw.weight *= se->load.weight;
+                        lw.weight /= tg->shares;
+
+                        lw.inv_weight = 0;
+
+                        se_lw = &lw;
+                        rw += lw.weight - se->load.weight;
+                } else
+#endif
 
-                weight = cfs_rq->load.weight;
-                if (!se->on_rq)
-                        weight += se->load.weight;
+                if (se->load.weight < NICE_0_LOAD) {
+                        se_lw = &lw;
+                        rw += NICE_0_LOAD - se->load.weight;
+                }
 
-                vslice *= NICE_0_LOAD;
-                do_div(vslice, weight);
+                delta = calc_delta_mine(delta, rw, se_lw);
         }
 
-        return vslice;
+        return delta;
 }
 
 /*
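
To see what the asymmetry buys, compare the single-level, non-group branch of calc_delta_asym() with calc_delta_fair() for a few nice levels. This is a stand-alone sketch with a made-up runqueue, not kernel code; the weights are the usual nice-level values (nice 19 = 15, nice 0 = 1024, nice -20 = 88761).

#include <stdio.h>

#define NICE_0_LOAD     1024ULL

/* delta *= rw / w: one level of calc_delta_fair() */
static unsigned long long
fair(unsigned long long delta, unsigned long long w, unsigned long long rw)
{
        return delta * rw / w;
}

/*
 * One level of calc_delta_asym() without the group-scheduling branch:
 * weights below NICE_0_LOAD are lifted to NICE_0_LOAD and rw is bumped
 * by the same amount.
 */
static unsigned long long
asym(unsigned long long delta, unsigned long long w, unsigned long long rw)
{
        if (w < NICE_0_LOAD) {
                rw += NICE_0_LOAD - w;
                w = NICE_0_LOAD;
        }
        return delta * rw / w;
}

int main(void)
{
        unsigned long long gran = 5000000;              /* 5 ms, in ns */
        unsigned long long rw = 15 + 1024 + 88761;      /* nice 19 + 0 + -20 */

        printf("nice  19: fair %llu  asym %llu\n",
               fair(gran, 15, rw), asym(gran, 15, rw));
        printf("nice   0: fair %llu  asym %llu\n",
               fair(gran, 1024, rw), asym(gran, 1024, rw));
        printf("nice -20: fair %llu  asym %llu\n",
               fair(gran, 88761, rw), asym(gran, 88761, rw));
        return 0;
}

The two scalings agree for weights at or above NICE_0_LOAD and diverge below it, where the asymmetric variant caps the divisor at NICE_0_LOAD; that is the >=0 versus <0 split the diagram in the comment refers to.
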
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
         curr->sum_exec_runtime += delta_exec;
         schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = delta_exec;
-        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-                                                        &curr->load);
-        }
+        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
         curr->vruntime += delta_exec_weighted;
 }
 
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+        cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_add(&cfs_rq->load, se->load.weight);
+        if (!parent_entity(se))
+                inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+        if (entity_is_task(se))
+                add_cfs_task_weight(cfs_rq, se->load.weight);
         cfs_rq->nr_running++;
         se->on_rq = 1;
         list_add(&se->group_node, &cfs_rq->tasks);
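
The enqueue side now maintains the entity's weight at three scopes: the owning cfs_rq as before, the per-CPU rq only for entities without a parent (so nested group weights are not double counted at the root), and task_weight only for real tasks, which is what lets the load-balancing hunks further down skip groups that contain no runnable tasks. A stand-alone sketch of that bookkeeping, with invented toy_* structures rather than the real ones:

#include <stdio.h>

struct toy_cfs_rq {
        unsigned long load;             /* sum of queued entity weights */
        unsigned long task_weight;      /* weight contributed by tasks only */
};

struct toy_rq {
        unsigned long cpu_load;         /* root-level load for this CPU */
};

struct toy_se {
        unsigned long weight;
        int is_task;
        struct toy_se *parent;          /* NULL for a top-level entity */
        struct toy_cfs_rq *cfs_rq;      /* queue this entity sits on */
};

static void toy_account_enqueue(struct toy_rq *rq, struct toy_se *se)
{
        se->cfs_rq->load += se->weight;
        if (!se->parent)                /* only top-level entities */
                rq->cpu_load += se->weight;
        if (se->is_task)                /* groups don't count here */
                se->cfs_rq->task_weight += se->weight;
}

int main(void)
{
        struct toy_rq rq = { 0 };
        struct toy_cfs_rq root_q = { 0 }, group_q = { 0 };
        struct toy_se group = { .weight = 2048, .is_task = 0,
                                .parent = NULL, .cfs_rq = &root_q };
        struct toy_se task  = { .weight = 1024, .is_task = 1,
                                .parent = &group, .cfs_rq = &group_q };

        toy_account_enqueue(&rq, &group);
        toy_account_enqueue(&rq, &task);

        printf("cpu_load=%lu root.load=%lu group.load=%lu group.task_weight=%lu\n",
               rq.cpu_load, root_q.load, group_q.load, group_q.task_weight);
        return 0;
}

Dequeue mirrors this with the same three updates negated.
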
@@ -523,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_sub(&cfs_rq->load, se->load.weight);
+        if (!parent_entity(se))
+                dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+        if (entity_is_task(se))
+                add_cfs_task_weight(cfs_rq, -se->load.weight);
         cfs_rq->nr_running--;
         se->on_rq = 0;
         list_del_init(&se->group_node);
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
         if (!initial) {
                 /* sleeps upto a single latency don't count. */
-                if (sched_feat(NEW_FAIR_SLEEPERS))
-                        vruntime -= sysctl_sched_latency;
+                if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                        unsigned long thresh = sysctl_sched_latency;
+
+                        /*
+                         * convert the sleeper threshold into virtual time
+                         */
+                        if (sched_feat(NORMALIZED_SLEEPER))
+                                thresh = calc_delta_fair(thresh, se);
+
+                        vruntime -= thresh;
+                }
 
                 /* ensure we never gain time by being placed backwards. */
                 vruntime = max_vruntime(se->vruntime, vruntime);
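
A worked example of the normalized case, with numbers of my own choosing: at sysctl_sched_latency = 20 ms, a nice 19 task (weight 15) sharing a queue with a nice 0 task (weight 1024, so rw is about 1039) gets thresh = calc_delta_fair(20 ms, se) ≈ 20 ms * 1039 / 15 ≈ 1.39 s of vruntime credit, which is what a full latency of execution would add to its vruntime under the new calc_delta_fair(); without NORMALIZED_SLEEPER every sleeper is credited the same 20 ms regardless of weight.
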
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                 __enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-        s64 diff = sample - *avg;
-        *avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-        if (!se->last_wakeup)
-                return;
-
-        update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-        se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
         update_stats_dequeue(cfs_rq, se);
         if (sleep) {
-                update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
                 if (entity_is_task(se)) {
                         struct task_struct *tsk = task_of(se);
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        if (!cfs_rq->next)
-                return se;
+        struct rq *rq = rq_of(cfs_rq);
+        u64 pair_slice = rq->clock - cfs_rq->pair_start;
 
-        if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+        if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+                cfs_rq->pair_start = rq->clock;
                 return se;
+        }
 
         return cfs_rq->next;
 }
@@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                 hrtick_start(rq, delta, requeue);
         }
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p)
         }
         return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
         return cpu;
@@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 static const struct sched_class fair_sched_class;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate for this by not only taking the current delta into account,
+ * but also considering the delta between when the shares were last adjusted
+ * and now.
+ *
+ * We still saw a performance dip, and some tracing showed us that between
+ * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
+ * significantly. Therefore try to bias the error in the direction of failing
+ * the affine wakeup.
+ *
+ */
+static long effective_load(struct task_group *tg, int cpu,
+                long wl, long wg)
+{
+        struct sched_entity *se = tg->se[cpu];
+        long more_w;
+
+        if (!tg->parent)
+                return wl;
+
+        /*
+         * By not taking the decrease of shares on the other cpu into
+         * account our error leans towards reducing the affine wakeups.
+         */
+        if (!wl && sched_feat(ASYM_EFF_LOAD))
+                return wl;
+
+        /*
+         * Instead of using this increment, also add the difference
+         * between when the shares were last updated and now.
+         */
+        more_w = se->my_q->load.weight - se->my_q->rq_weight;
+        wl += more_w;
+        wg += more_w;
+
+        for_each_sched_entity(se) {
+#define D(n) (likely(n) ? (n) : 1)
+
+                long S, rw, s, a, b;
+
+                S = se->my_q->tg->shares;
+                s = se->my_q->shares;
+                rw = se->my_q->rq_weight;
+
+                a = S*(rw + wl);
+                b = S*rw + s*wg;
+
+                wl = s*(a-b)/D(b);
+                /*
+                 * Assume the group is already running and will
+                 * thus already be accounted for in the weight.
+                 *
+                 * That is, moving shares between CPUs does not
+                 * alter the group weight.
+                 */
+                wg = 0;
+#undef D
+        }
+
+        return wl;
+}
+
+#else
+
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+                unsigned long wl, unsigned long wg)
+{
+        return wl;
+}
+
+#endif
+
 static int
 wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
             struct task_struct *p, int prev_cpu, int this_cpu, int sync,
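
As I read the loop, each level answers: if wl of weight arrives on this CPU's queue (and wg on the group overall), how much does this CPU's slice of the group's shares grow? Writing the group-wide queue weight as R and using s = S*rw/R, the new share is S*(rw+wl)/(R+wg) = s*a/b with a and b as in the code, so the change is s*(a-b)/b. A stand-alone check of that step with invented numbers, not kernel code:

#include <stdio.h>

int main(void)
{
        /* made-up numbers: the group owns S shares in total, of which this
         * CPU currently holds s, backed by a local queue weight of rw */
        long long S = 2048, s = 512, rw = 3072;
        long long wl = 1024, wg = 1024;         /* task weight being added */

        /* one iteration of the loop in effective_load() */
        long long a = S * (rw + wl);
        long long b = S * rw + s * wg;
        long long dwl = s * (a - b) / b;

        /* the same thing derived directly: the group-wide queue weight R
         * satisfies s = S * rw / R, and the new share on this CPU would be
         * S * (rw + wl) / (R + wg) */
        double R = (double)S * rw / s;
        double direct = (double)S * (rw + wl) / (R + wg) - s;

        printf("effective_load step: %lld (direct: %.1f)\n", dwl, direct);
        return 0;
}

That per-level change becomes the wl fed to the next level up, while wg is zeroed because, as the comment says, moving shares between CPUs does not alter the group's own weight.
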
@@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
             unsigned int imbalance)
 {
         struct task_struct *curr = this_rq->curr;
+        struct task_group *tg;
         unsigned long tl = this_load;
         unsigned long tl_per_task;
+        unsigned long weight;
         int balanced;
 
         if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
          * effect of the currently running task from the load
          * of the current CPU:
          */
-        if (sync)
-                tl -= current->se.load.weight;
+        if (sync) {
+                tg = task_group(current);
+                weight = current->se.load.weight;
+
+                tl += effective_load(tg, this_cpu, -weight, -weight);
+                load += effective_load(tg, prev_cpu, 0, -weight);
+        }
 
-        balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+        tg = task_group(p);
+        weight = p->se.load.weight;
+
+        balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
         /*
          * If the currently running task will sleep within
          * a reasonable amount of time then attract this newly
          * woken task:
          */
-        if (sync && balanced && curr->sched_class == &fair_sched_class) {
+        if (sync && balanced) {
                 if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                        p->se.avg_overlap < sysctl_sched_migration_cost)
+                    p->se.avg_overlap < sysctl_sched_migration_cost)
                         return 1;
         }
 
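
The balanced test keeps its old shape, 100 * (this-side load after the move) <= imbalance * (prev-side load after the move); what is new is that both sides are adjusted with effective_load(), so the comparison is made in root-level weight even when the tasks live in groups. A minimal sketch of just the inequality, with the effective-load deltas passed in as plain numbers (names and values are mine, not from the kernel):

#include <stdio.h>

/*
 * tl, load: this CPU's and the previous CPU's load, already adjusted for a
 * sync wakeup.  eff_this: effective_load() for adding the woken task's
 * weight here; eff_prev: effective_load() for the weight change it causes
 * on the previous CPU.  imbalance: the percentage threshold the caller uses.
 */
static int balanced(unsigned long tl, unsigned long load,
                    long eff_this, long eff_prev, unsigned int imbalance)
{
        return 100 * (tl + eff_this) <= imbalance * (load + eff_prev);
}

int main(void)
{
        /* an ungrouped task of weight 1024: effective_load() is the identity */
        printf("flat:    %d\n", balanced(2048, 1600, 1024, 1024, 125));
        /* the same task inside a small group: the root-level deltas shrink */
        printf("grouped: %d\n", balanced(2048, 1600, 118, 118, 125));
        return 0;
}

With these made-up numbers the group-aware deltas flip the decision; the raw task weight and its root-level effect can differ by a large factor once groups are involved.
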
@@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
         unsigned long gran = sysctl_sched_wakeup_granularity;
 
         /*
-         * More easily preempt - nice tasks, while not making
-         * it harder for + nice tasks.
+         * More easily preempt - nice tasks, while not making it harder for
+         * + nice tasks.
          */
-        if (unlikely(se->load.weight > NICE_0_LOAD))
-                gran = calc_delta_fair(gran, &se->load);
+        if (sched_feat(ASYM_GRAN))
+                gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+        else
+                gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
         return gran;
 }
@@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
                 return;
         }
 
-        se->last_wakeup = se->sum_exec_runtime;
         if (unlikely(se == pse))
                 return;
 
@@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
         struct task_struct *p = NULL;
         struct sched_entity *se;
 
-        if (next == &cfs_rq->tasks)
-                return NULL;
-
-        /* Skip over entities that are not tasks */
-        do {
+        while (next != &cfs_rq->tasks) {
                 se = list_entry(next, struct sched_entity, group_node);
                 next = next->next;
-        } while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-        if (next == &cfs_rq->tasks)
-                return NULL;
+                /* Skip over entities that are not tasks */
+                if (entity_is_task(se)) {
+                        p = task_of(se);
+                        break;
+                }
+        }
 
         cfs_rq->balance_iterator = next;
-
-        if (entity_is_task(se))
-                p = task_of(se);
-
         return p;
 }
 
@@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                unsigned long max_load_move, struct sched_domain *sd,
+                enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+                struct cfs_rq *cfs_rq)
 {
-        struct sched_entity *curr;
-        struct task_struct *p;
-
-        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-                return MAX_PRIO;
-
-        curr = cfs_rq->curr;
-        if (!curr)
-                curr = __pick_next_entity(cfs_rq);
+        struct rq_iterator cfs_rq_iterator;
 
-        p = task_of(curr);
+        cfs_rq_iterator.start = load_balance_start_fair;
+        cfs_rq_iterator.next = load_balance_next_fair;
+        cfs_rq_iterator.arg = cfs_rq;
 
-        return p->prio;
+        return balance_tasks(this_rq, this_cpu, busiest,
+                        max_load_move, sd, idle, all_pinned,
+                        this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
                   struct sched_domain *sd, enum cpu_idle_type idle,
                   int *all_pinned, int *this_best_prio)
 {
-        struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
-        struct rq_iterator cfs_rq_iterator;
-
-        cfs_rq_iterator.start = load_balance_start_fair;
-        cfs_rq_iterator.next = load_balance_next_fair;
+        int busiest_cpu = cpu_of(busiest);
+        struct task_group *tg;
 
-        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq;
-                long imbalance;
-                unsigned long maxload;
+        rcu_read_lock();
+        update_h_load(busiest_cpu);
 
-                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+        list_for_each_entry(tg, &task_groups, list) {
+                struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+                unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+                unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+                u64 rem_load, moved_load;
 
-                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-                if (imbalance <= 0)
+                /*
+                 * empty group
+                 */
+                if (!busiest_cfs_rq->task_weight)
                         continue;
 
-                /* Don't pull more than imbalance/2 */
-                imbalance /= 2;
-                maxload = min(rem_load_move, imbalance);
+                rem_load = (u64)rem_load_move * busiest_weight;
+                rem_load = div_u64(rem_load, busiest_h_load + 1);
 
-                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-                /*
-                 * pass busy_cfs_rq argument into
-                 * load_balance_[start|next]_fair iterators
-                 */
-                cfs_rq_iterator.arg = busy_cfs_rq;
-                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-                                               maxload, sd, idle, all_pinned,
-                                               this_best_prio,
-                                               &cfs_rq_iterator);
+                moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+                                rem_load, sd, idle, all_pinned, this_best_prio,
+                                tg->cfs_rq[busiest_cpu]);
+
+                if (!moved_load)
+                        continue;
+
+                moved_load *= busiest_h_load;
+                moved_load = div_u64(moved_load, busiest_weight + 1);
 
-                if (rem_load_move <= 0)
+                rem_load_move -= moved_load;
+                if (rem_load_move < 0)
                         break;
         }
+        rcu_read_unlock();
 
         return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
                   struct sched_domain *sd, enum cpu_idle_type idle,
                   int *all_pinned, int *this_best_prio)
+{
+        return __load_balance_fair(this_rq, this_cpu, busiest,
+                        max_load_move, sd, idle, all_pinned,
+                        this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
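
The two div_u64() conversions bracket the actual movement: rem_load_move is expressed in root-level (hierarchical) load, so it is first converted into the group's local weight via busiest_weight / h_load before __load_balance_fair() is asked to move that much, and whatever got moved is converted back with the reciprocal factor before being subtracted. A stand-alone sketch of the two conversions with toy numbers (not kernel code; the +1 mirrors the code's guard against a zero divisor):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t rem_load_move = 1024;  /* wanted, in root-level weight */
        uint64_t busiest_weight = 2048; /* group-local queue weight */
        uint64_t busiest_h_load = 512;  /* its contribution at the root */

        /* root-level -> group-local, as done before __load_balance_fair() */
        uint64_t rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1);

        /* pretend the iterator managed to move half of that */
        uint64_t moved_local = rem_load / 2;

        /* group-local -> root-level, as done after __load_balance_fair() */
        uint64_t moved_load = moved_local * busiest_h_load / (busiest_weight + 1);

        printf("ask for %llu local weight, moved %llu local -> %llu root\n",
               (unsigned long long)rem_load,
               (unsigned long long)moved_local,
               (unsigned long long)moved_load);
        return 0;
}
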
@@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
         return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * scheduler tick hitting a task of our scheduling class: