|
@@ -253,6 +253,8 @@ struct task_group {
|
|
|
/* runqueue "owned" by this group on each cpu */
|
|
|
struct cfs_rq **cfs_rq;
|
|
|
unsigned long shares;
|
|
|
+
|
|
|
+ atomic_t load_weight;
|
|
|
#endif
|
|
|
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
|
@@ -359,15 +361,11 @@ struct cfs_rq {
|
|
|
*/
|
|
|
unsigned long h_load;
|
|
|
|
|
|
- /*
|
|
|
- * this cpu's part of tg->shares
|
|
|
- */
|
|
|
- unsigned long shares;
|
|
|
+ u64 load_avg;
|
|
|
+ u64 load_period;
|
|
|
+ u64 load_stamp;
|
|
|
|
|
|
- /*
|
|
|
- * load.weight at the time we set shares
|
|
|
- */
|
|
|
- unsigned long rq_weight;
|
|
|
+ unsigned long load_contribution;
|
|
|
#endif
|
|
|
#endif
|
|
|
};
|
|
@@ -806,20 +804,6 @@ late_initcall(sched_init_debug);
|
|
|
*/
|
|
|
const_debug unsigned int sysctl_sched_nr_migrate = 32;
|
|
|
|
|
|
-/*
|
|
|
- * ratelimit for updating the group shares.
|
|
|
- * default: 0.25ms
|
|
|
- */
|
|
|
-unsigned int sysctl_sched_shares_ratelimit = 250000;
|
|
|
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
|
|
|
-
|
|
|
-/*
|
|
|
- * Inject some fuzzyness into changing the per-cpu group shares
|
|
|
- * this avoids remote rq-locks at the expense of fairness.
|
|
|
- * default: 4
|
|
|
- */
|
|
|
-unsigned int sysctl_sched_shares_thresh = 4;
|
|
|
-
|
|
|
/*
|
|
|
* period over which we average the RT time consumption, measured
|
|
|
* in ms.
|
|
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
|
|
|
lw->inv_weight = 0;
|
|
|
}
|
|
|
|
|
|
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
|
|
|
+{
|
|
|
+ lw->weight = w;
|
|
|
+ lw->inv_weight = 0;
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
* To aid in avoiding the subversion of "niceness" due to uneven distribution
|
|
|
* of tasks with abnormal "nice" values across CPUs the contribution that
|
|
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
|
|
-static __read_mostly unsigned long __percpu *update_shares_data;
|
|
|
-
|
|
|
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
|
|
|
-
|
|
|
-/*
|
|
|
- * Calculate and set the cpu's group shares.
|
|
|
- */
|
|
|
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
|
|
|
- unsigned long sd_shares,
|
|
|
- unsigned long sd_rq_weight,
|
|
|
- unsigned long *usd_rq_weight)
|
|
|
-{
|
|
|
- unsigned long shares, rq_weight;
|
|
|
- int boost = 0;
|
|
|
-
|
|
|
- rq_weight = usd_rq_weight[cpu];
|
|
|
- if (!rq_weight) {
|
|
|
- boost = 1;
|
|
|
- rq_weight = NICE_0_LOAD;
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * \Sum_j shares_j * rq_weight_i
|
|
|
- * shares_i = -----------------------------
|
|
|
- * \Sum_j rq_weight_j
|
|
|
- */
|
|
|
- shares = (sd_shares * rq_weight) / sd_rq_weight;
|
|
|
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
|
|
|
-
|
|
|
- if (abs(shares - tg->se[cpu]->load.weight) >
|
|
|
- sysctl_sched_shares_thresh) {
|
|
|
- struct rq *rq = cpu_rq(cpu);
|
|
|
- unsigned long flags;
|
|
|
-
|
|
|
- raw_spin_lock_irqsave(&rq->lock, flags);
|
|
|
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
|
|
|
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
|
|
|
- __set_se_shares(tg->se[cpu], shares);
|
|
|
- raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
- }
|
|
|
-}
|
|
|
+static void update_cfs_load(struct cfs_rq *cfs_rq);
|
|
|
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
|
|
|
|
|
|
/*
|
|
|
- * Re-compute the task group their per cpu shares over the given domain.
|
|
|
- * This needs to be done in a bottom-up fashion because the rq weight of a
|
|
|
- * parent group depends on the shares of its child groups.
|
|
|
+ * update tg->load_weight by folding this cpu's load_avg
|
|
|
*/
|
|
|
static int tg_shares_up(struct task_group *tg, void *data)
|
|
|
{
|
|
|
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
|
|
|
- unsigned long *usd_rq_weight;
|
|
|
- struct sched_domain *sd = data;
|
|
|
+ long load_avg;
|
|
|
+ struct cfs_rq *cfs_rq;
|
|
|
unsigned long flags;
|
|
|
- int i;
|
|
|
+ int cpu = (long)data;
|
|
|
+ struct rq *rq;
|
|
|
|
|
|
- if (!tg->se[0])
|
|
|
+ if (!tg->se[cpu])
|
|
|
return 0;
|
|
|
|
|
|
- local_irq_save(flags);
|
|
|
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
|
|
|
-
|
|
|
- for_each_cpu(i, sched_domain_span(sd)) {
|
|
|
- weight = tg->cfs_rq[i]->load.weight;
|
|
|
- usd_rq_weight[i] = weight;
|
|
|
-
|
|
|
- rq_weight += weight;
|
|
|
- /*
|
|
|
- * If there are currently no tasks on the cpu pretend there
|
|
|
- * is one of average load so that when a new task gets to
|
|
|
- * run here it will not get delayed by group starvation.
|
|
|
- */
|
|
|
- if (!weight)
|
|
|
- weight = NICE_0_LOAD;
|
|
|
+ rq = cpu_rq(cpu);
|
|
|
+ cfs_rq = tg->cfs_rq[cpu];
|
|
|
|
|
|
- sum_weight += weight;
|
|
|
- shares += tg->cfs_rq[i]->shares;
|
|
|
- }
|
|
|
+ raw_spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
|
|
- if (!rq_weight)
|
|
|
- rq_weight = sum_weight;
|
|
|
+ update_rq_clock(rq);
|
|
|
+ update_cfs_load(cfs_rq);
|
|
|
|
|
|
- if ((!shares && rq_weight) || shares > tg->shares)
|
|
|
- shares = tg->shares;
|
|
|
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
|
|
|
+ load_avg -= cfs_rq->load_contribution;
|
|
|
|
|
|
- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
|
|
|
- shares = tg->shares;
|
|
|
+ atomic_add(load_avg, &tg->load_weight);
|
|
|
+ cfs_rq->load_contribution += load_avg;
|
|
|
|
|
|
- for_each_cpu(i, sched_domain_span(sd))
|
|
|
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
|
|
|
+ /*
|
|
|
+ * We need to update shares after updating tg->load_weight in
|
|
|
+ * order to adjust the weight of groups with long running tasks.
|
|
|
+ */
|
|
|
+ update_cfs_shares(cfs_rq);
|
|
|
|
|
|
- local_irq_restore(flags);
|
|
|
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
|
|
|
load = cpu_rq(cpu)->load.weight;
|
|
|
} else {
|
|
|
load = tg->parent->cfs_rq[cpu]->h_load;
|
|
|
- load *= tg->cfs_rq[cpu]->shares;
|
|
|
+ load *= tg->se[cpu]->load.weight;
|
|
|
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
|
|
|
}
|
|
|
|
|
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static void update_shares(struct sched_domain *sd)
|
|
|
+static void update_shares(long cpu)
|
|
|
{
|
|
|
- s64 elapsed;
|
|
|
- u64 now;
|
|
|
-
|
|
|
if (root_task_group_empty())
|
|
|
return;
|
|
|
|
|
|
- now = local_clock();
|
|
|
- elapsed = now - sd->last_update;
|
|
|
+ /*
|
|
|
+	 * XXX: walking the whole task_group tree to fold a single cpu's load is wasteful; replace with an on-demand list of groups with load on this cpu
|
|
|
+ */
|
|
|
|
|
|
- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
|
|
|
- sd->last_update = now;
|
|
|
- walk_tg_tree(tg_nop, tg_shares_up, sd);
|
|
|
- }
|
|
|
+ walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
|
|
|
}
|
|
|
|
|
|
static void update_h_load(long cpu)
|
|
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
|
|
|
|
|
|
#else
|
|
|
|
|
|
-static inline void update_shares(struct sched_domain *sd)
|
|
|
+static inline void update_shares(int cpu)
|
|
|
{
|
|
|
}
|
|
|
|
|
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
|
|
|
|
|
|
#endif
|
|
|
|
|
|
-#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
|
|
|
-{
|
|
|
-#ifdef CONFIG_SMP
|
|
|
- cfs_rq->shares = shares;
|
|
|
-#endif
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
static void calc_load_account_idle(struct rq *this_rq);
|
|
|
static void update_sysctl(void);
|
|
|
static int get_update_sysctl_factor(void);
|
|
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
|
|
|
SET_SYSCTL(sched_min_granularity);
|
|
|
SET_SYSCTL(sched_latency);
|
|
|
SET_SYSCTL(sched_wakeup_granularity);
|
|
|
- SET_SYSCTL(sched_shares_ratelimit);
|
|
|
#undef SET_SYSCTL
|
|
|
}
|
|
|
|
|
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
|
|
|
se->cfs_rq = parent->my_q;
|
|
|
|
|
|
se->my_q = cfs_rq;
|
|
|
- se->load.weight = tg->shares;
|
|
|
- se->load.inv_weight = 0;
|
|
|
+ update_load_set(&se->load, tg->shares);
|
|
|
se->parent = parent;
|
|
|
}
|
|
|
#endif
|
|
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
|
|
|
|
|
|
#endif /* CONFIG_CGROUP_SCHED */
|
|
|
|
|
|
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
|
|
|
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
|
|
|
- __alignof__(unsigned long));
|
|
|
-#endif
|
|
|
for_each_possible_cpu(i) {
|
|
|
struct rq *rq;
|
|
|
|
|
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
|
|
|
if (on_rq)
|
|
|
dequeue_entity(cfs_rq, se, 0);
|
|
|
|
|
|
- se->load.weight = shares;
|
|
|
- se->load.inv_weight = 0;
|
|
|
+ update_load_set(&se->load, shares);
|
|
|
|
|
|
if (on_rq)
|
|
|
enqueue_entity(cfs_rq, se, 0);
|
|
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
|
|
|
/*
|
|
|
* force a rebalance
|
|
|
*/
|
|
|
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
|
|
|
set_se_shares(tg->se[i], shares);
|
|
|
}
|
|
|
|