11 years ago · fe8a45df36
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
				 				__func__, cpu);
			
 
				 		goto out_release;
			
 
				 	}
			
 
				-	smpboot_park_threads(cpu);
			
 
				 
			
 
				 	/*
			
 
				 	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
			
@@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
				 	 *
			
 
				 	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
			
 
				 	 * not imply sync_sched(), so explicitly call both.
			
 
				+	 *
			
 
				+	 * Do the sync before parking the smpboot threads to take care of the rcu boost case.
			
 
				 	 */
			
 
				 #ifdef CONFIG_PREEMPT
			
 
				 	synchronize_sched();
			
 
				 #endif
			
 
				 	synchronize_rcu();
			
 
				 
			
 
				+	smpboot_park_threads(cpu);
			
 
				+
			
 
				 	/*
			
 
				 	 * So now all preempt/rcu users must observe !cpu_active().
			
 
				 	 */
			
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
				 	struct rq *rq;
			
 
				 	u64 ns = 0;
			
 
				 
			
 
				+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
			
 
				+	/*
			
 
				+	 * 64-bit doesn't need locks to atomically read a 64bit value.
			
 
				+	 * So we have an optimization chance when the task's delta_exec is 0.
			
 
				+	 * Reading ->on_cpu is racy, but this is ok.
			
 
				+	 *
			
 
				+	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
			
 
				+	 * If we race with it entering cpu, unaccounted time is 0. This is
			
 
				+	 * indistinguishable from the read occurring a few cycles earlier.
			
 
				+	 */
			
 
				+	if (!p->on_cpu)
			
 
				+		return p->se.sum_exec_runtime;
			
 
				+#endif
			
 
				+
			
 
				 	rq = task_rq_lock(p, &flags);
			
 
				 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
			
 
				 	task_rq_unlock(rq, p, &flags);
			
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1000,7 +1000,7 @@ struct numa_stats {
 
				  */
			
 
				 static void update_numa_stats(struct numa_stats *ns, int nid)
			
 
				 {
			
 
				-	int cpu;
			
 
				+	int cpu, cpus = 0;
			
 
				 
			
 
				 	memset(ns, 0, sizeof(*ns));
			
 
				 	for_each_cpu(cpu, cpumask_of_node(nid)) {
			
@@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 
				 		ns->nr_running += rq->nr_running;
			
 
				 		ns->load += weighted_cpuload(cpu);
			
 
				 		ns->power += power_of(cpu);
			
 
				+
			
 
				+		cpus++;
			
 
				 	}
			
 
				 
			
 
				+	/*
			
 
				+	 * If we raced with hotplug and there are no CPUs left in our mask
			
 
				+	 * the @ns structure is zeroed and task_numa_compare() will
			
 
				+	 * not find this node attractive.
			
 
				+	 *
			
 
				+	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
			
 
				+	 * and bail there.
			
 
				+	 */
			
 
				+	if (!cpus)
			
 
				+		return;
			
 
				+
			
 
				 	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
			
 
				 	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
			
 
				 	ns->has_capacity = (ns->nr_running < ns->capacity);
			
@@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p)
 
				 	 */
			
 
				 	rcu_read_lock();
			
 
				 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
			
 
				-	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
			
 
				+	if (sd)
			
 
				+		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
			
 
				 	rcu_read_unlock();
			
 
				 
			
 
				+	/*
			
 
				+	 * Cpusets can break the scheduler domain tree into smaller
			
 
				+	 * balance domains, some of which do not cross NUMA boundaries.
			
 
				+	 * Tasks that are "trapped" in such domains cannot be migrated
			
 
				+	 * elsewhere, so there is no point in (re)trying.
			
 
				+	 */
			
 
				+	if (unlikely(!sd)) {
			
 
				+		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				 	taskweight = task_weight(p, env.src_nid);
			
 
				 	groupweight = group_weight(p, env.src_nid);
			
 
				 	update_numa_stats(&env.src_stats, env.src_nid);
			
@@ -2153,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 
				 	long contrib;
			
 
				 
			
 
				 	/* The fraction of a cpu used by this cfs_rq */
			
 
				-	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
			
 
				+	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
			
 
				 			  sa->runnable_avg_period + 1);
			
 
				 	contrib -= cfs_rq->tg_runnable_contrib;