@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *        / | \          A - group
- *       A   1   B
- *      /|\   / \
- *     C 2 D 3   4
- *     |   |
- *     5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
#endif
 
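
Not part of the patch: the sketch below is a plain userspace C program that walks the two formulas the new code relies on, the per-cpu slice of tg->shares computed by tg_shares_up()/__update_group_shares_cpu() and the top-down h_load fraction computed by tg_load_down(). The two-cpu array, the group's 1024 shares and the 3072/1024 runqueue weights are invented for illustration; the kernel operates on struct cfs_rq over a sched_domain span rather than plain arrays.

#include <stdio.h>

#define NCPUS 2

int main(void)
{
	/* per-cpu cfs_rq weight of one task group over sd->span (invented numbers) */
	unsigned long rq_weight[NCPUS] = { 3072, 1024 };
	unsigned long tg_shares = 1024;		/* the group's tg->shares */
	unsigned long sum_rq_weight = 0;
	unsigned long cpu_shares[NCPUS];
	unsigned long parent_h_load, child_h_load;
	int i;

	for (i = 0; i < NCPUS; i++)
		sum_rq_weight += rq_weight[i];

	/*
	 * tg_shares_up() / __update_group_shares_cpu(): each cpu receives a
	 * slice of tg->shares proportional to its part of the summed
	 * runqueue weight:
	 *
	 *	shares_i = tg->shares * rq_weight_i / \Sum rq_weight
	 *
	 * (the patch divides by \Sum rq_weight + 1 to avoid a zero divisor).
	 */
	for (i = 0; i < NCPUS; i++) {
		cpu_shares[i] = (tg_shares * rq_weight[i]) / (sum_rq_weight + 1);
		printf("cpu%d: shares = %lu\n", i, cpu_shares[i]);
	}

	/*
	 * tg_load_down(): a child group's hierarchical load on a cpu is the
	 * parent's h_load scaled by the child's fraction of the parent's
	 * runqueue weight; for the root group h_load is the rq weight itself.
	 */
	parent_h_load = rq_weight[0];
	child_h_load = parent_h_load * cpu_shares[0] / (rq_weight[0] + 1);
	printf("cpu0: child h_load = %lu\n", child_h_load);

	return 0;
}

With these made-up weights the group's 1024 shares split roughly 767/255 across the two cpus, the same 3:1 ratio as the runqueue weights, which is exactly the redistribution tg_shares_up() performs over sd->span before the load balancer reads the per-cpu shares.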