@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+	long tg_weight;
+
+	/*
+	 * Use this CPU's actual weight instead of the last load_contribution
+	 * to gain a more accurate current total weight. See
+	 * update_cfs_rq_load_contribution().
+	 */
+	tg_weight = atomic_read(&tg->load_weight);
+	tg_weight -= cfs_rq->load_contribution;
+	tg_weight += cfs_rq->load.weight;
+
+	return tg_weight;
+}
+
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
-	long load_weight, load, shares;
+	long tg_weight, load, shares;
 
+	tg_weight = calc_tg_weight(tg, cfs_rq);
 	load = cfs_rq->load.weight;
 
-	load_weight = atomic_read(&tg->load_weight);
-	load_weight += load;
-	load_weight -= cfs_rq->load_contribution;
-
 	shares = (tg->shares * load);
-	if (load_weight)
-		shares /= load_weight;
+	if (tg_weight)
+		shares /= tg_weight;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
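As a quick sanity check of the new shares arithmetic, the following standalone userspace sketch mirrors the calc_tg_weight()/calc_cfs_shares() computation with plain integers; every value, and the MIN_SHARES definition, is a made-up stand-in rather than anything the kernel actually uses:

#include <stdio.h>

#define MIN_SHARES 2	/* stand-in only; the kernel defines its own value */

int main(void)
{
	/* all numbers below are illustrative */
	long tg_load_weight    = 3072;	/* atomic_read(&tg->load_weight) */
	long load_contribution = 1024;	/* cfs_rq->load_contribution (possibly stale) */
	long load_weight       = 512;	/* cfs_rq->load.weight (current) */
	long tg_shares         = 1024;	/* tg->shares */

	/* calc_tg_weight(): swap the stale contribution for the live weight */
	long tg_weight = tg_load_weight - load_contribution + load_weight;

	/* calc_cfs_shares(): this cpu's slice of the group's shares */
	long shares = tg_shares * load_weight;
	if (tg_weight)
		shares /= tg_weight;
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;

	printf("tg_weight=%ld shares=%ld\n", tg_weight, shares);
	return 0;
}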
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
 		return;
 
 	__return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ *   s_i = rw_i / \Sum rw_j						(1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ *   rw_i = {   2,   4,   1,   0 }
+ *   s_i  = { 2/7, 4/7, 1/7,   0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wg we can compute the new shares distribution (s'_i) using:
+ *
+ *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ *   rw'_i = {   3,   4,   1,   0 }
+ *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ *   dw_i = S * (s'_i - s_i)						(3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent)
+	if (!tg->parent)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
-		long lw, w;
+		long w, W;
 
 		tg = se->my_q->tg;
-		w = se->my_q->load.weight;
 
-		/* use this cpu's instantaneous contribution */
-		lw = atomic_read(&tg->load_weight);
-		lw -= se->my_q->load_contribution;
-		lw += w + wg;
+		/*
+		 * W = @wg + \Sum rw_j
+		 */
+		W = wg + calc_tg_weight(tg, se->my_q);
 
-		wl += w;
+		/*
+		 * w = rw_i + @wl
+		 */
+		w = se->my_q->load.weight + wl;
 
-		if (lw > 0 && wl < lw)
-			wl = (wl * tg->shares) / lw;
+		/*
+		 * wl = S * s'_i; see (2)
+		 */
+		if (W > 0 && w < W)
+			wl = (w * tg->shares) / W;
 		else
 			wl = tg->shares;
 
-		/* zero point is MIN_SHARES */
+		/*
+		 * Per the above, wl is the new se->load.weight value; since
+		 * those are clipped to [MIN_SHARES, ...) do so now. See
+		 * calc_cfs_shares().
+		 */
 		if (wl < MIN_SHARES)
 			wl = MIN_SHARES;
+
+		/*
+		 * wl = dw_i = S * (s'_i - s_i); see (3)
+		 */
 		wl -= se->load.weight;
+
+		/*
+		 * Recursively apply this logic to all parent groups to compute
+		 * the final effective load change on the root group. Since
+		 * only the @tg group gets extra weight, all parent groups can
+		 * only redistribute existing shares. @wl is the shift in shares
+		 * resulting from this level per the above.
+		 */
 		wg = 0;
 	}
 
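The worked example in the comment above can be reproduced with a few lines of ordinary userspace C; the group weight S and the unit task weight below are arbitrary assumptions, only the 5/56 and -4/56 ratios matter:

#include <stdio.h>

int main(void)
{
	double rw[4] = { 2, 4, 1, 0 };	/* per-cpu runqueue weights (rw_i) */
	double S = 1024;		/* assumed: group weight as seen by its parent */
	double sum = rw[0] + rw[1] + rw[2] + rw[3];	/* \Sum rw_j = 7 */
	double wl = 1, wg = 1;		/* wake one extra unit-weight task on CPU 0 */
	int i;

	for (i = 0; i < 2; i++) {	/* CPUs 0 and 1, as in the comment */
		double s_old = rw[i] / sum;				/* (1) */
		double s_new = (rw[i] + (i == 0 ? wl : 0)) / (wg + sum);/* (2) */
		double dw = S * (s_new - s_old);			/* (3) */

		printf("cpu%d: dw = %.4f, i.e. %.4f of the group weight\n",
		       i, dw, dw / S);
	}
	return 0;
}

This prints 5/56 (about 0.0893) of the group weight for CPU 0 and -4/56 (about -0.0714) for CPU 1, matching the figures in the comment.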
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	int i;
+	struct sched_group *sg;
+	int i, smt = 0;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	rcu_read_lock();
+again:
 	for_each_domain(target, sd) {
-		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
-			break;
+		if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+			continue;
 
-		for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
-			if (idle_cpu(i)) {
-				target = i;
-				break;
+		if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
+			if (!smt) {
+				smt = 1;
+				goto again;
 			}
+			break;
 		}
 
-		/*
-		 * Lets stop looking for an idle sibling when we reached
-		 * the domain that spans the current cpu and prev_cpu.
-		 */
-		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
-			break;
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
+
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
 	}
+done:
 	rcu_read_unlock();
 
 	return target;
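The do/while loop above scans each sched_group of the current domain and takes the first group whose CPUs are all idle, returning the first CPU of that group the task is allowed on. A toy userspace sketch of that same group scan, over plain bitmasks rather than the kernel's cpumask/sched_group API (all masks are made up):

#include <stdio.h>

int main(void)
{
	unsigned int group[2] = { 0x3, 0xc };	/* groups: cpus {0,1} and {2,3} */
	unsigned int idle     = 0xe;		/* cpus 1,2,3 idle; cpu 0 busy */
	unsigned int allowed  = 0xf;		/* task may run on any cpu */
	int g, target = -1;

	for (g = 0; g < 2; g++) {
		/* cpumask_intersects(): skip groups the task can't use at all */
		if (!(group[g] & allowed))
			continue;
		/* the for_each_cpu() loop: every cpu in the group must be idle */
		if ((group[g] & idle) != group[g])
			continue;
		/* cpumask_first_and(): lowest allowed cpu in the group */
		target = __builtin_ctz(group[g] & allowed);
		break;
	}
	printf("target = %d\n", target);	/* picks cpu 2 here */
	return 0;
}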
@@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 }
 
 /**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu