@@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
 
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-        if (!rq->skip_clock_update) {
-                int cpu = cpu_of(rq);
-                u64 irq_time;
+        s64 delta;
 
-                rq->clock = sched_clock_cpu(cpu);
-                irq_time = irq_time_cpu(cpu);
-                if (rq->clock - irq_time > rq->clock_task)
-                        rq->clock_task = rq->clock - irq_time;
+        if (rq->skip_clock_update)
+                return;
 
-                sched_irq_time_avg_update(rq, irq_time);
-        }
+        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+        rq->clock += delta;
+        update_rq_clock_task(rq, delta);
 }
 
 /*
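
The rewritten update_rq_clock() above accumulates a signed delta into rq->clock instead of overwriting it, and hands that same delta to update_rq_clock_task() (added further down in this patch) so that interrupt time can be carved out of the per-task clock. A minimal user-space sketch of that split, with a hypothetical irq_portion() helper standing in for the real irqtime bookkeeping:

#include <stdint.h>
#include <stdio.h>

static uint64_t clock_ns;               /* plays the role of rq->clock      */
static uint64_t clock_task_ns;          /* plays the role of rq->clock_task */

/* Hypothetical: how much of this interval was spent in hard/soft irqs. */
static int64_t irq_portion(int64_t delta)
{
        return delta / 4;
}

static void demo_update_clock(uint64_t now_ns)
{
        int64_t delta = now_ns - clock_ns;

        clock_ns += delta;                              /* full delta     */
        clock_task_ns += delta - irq_portion(delta);    /* minus irq time */
}

int main(void)
{
        demo_update_clock(100000);
        printf("clock=%llu clock_task=%llu\n",
               (unsigned long long)clock_ns,
               (unsigned long long)clock_task_ns);      /* 100000 and 75000 */
        return 0;
}
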
@@ -1924,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  * They are read and saved off onto struct rq in update_rq_clock().
  * This may result in other CPU reading this CPU's irq time and can
  * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
  */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1945,19 +1940,58 @@ void disable_sched_clock_irqtime(void)
         sched_clock_irqtime = 0;
 }
 
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
 {
-        if (!sched_clock_irqtime)
-                return 0;
+        __this_cpu_inc(irq_time_seq.sequence);
+        smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+        smp_wmb();
+        __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+        u64 irq_time;
+        unsigned seq;
 
+        do {
+                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+                irq_time = per_cpu(cpu_softirq_time, cpu) +
+                           per_cpu(cpu_hardirq_time, cpu);
+        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+        return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
         return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
 
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
 void account_system_vtime(struct task_struct *curr)
 {
         unsigned long flags;
+        s64 delta;
         int cpu;
-        u64 now, delta;
 
         if (!sched_clock_irqtime)
                 return;
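
On 32-bit kernels a u64 cannot be loaded atomically, so the hunk above brackets the per-cpu irq-time updates with irq_time_write_begin()/irq_time_write_end() and makes remote readers retry through read_seqcount_retry(); on 64-bit the three helpers collapse to plain accesses. The patch open-codes the write side (a sequence increment and a barrier on either side of the update) because the writer is always the local CPU with interrupts disabled. A minimal sketch of the same pairing using the generic seqcount helpers instead, for a hypothetical per-cpu statistic (names are illustrative, not from the patch):

/* Illustrative only: a per-cpu u64 that remote CPUs may read on 32-bit. */
static DEFINE_PER_CPU(seqcount_t, stat_seq);
static DEFINE_PER_CPU(u64, stat_ns);

/* Writer: always the local CPU, with interrupts off. */
static void stat_add(u64 delta)
{
        write_seqcount_begin(this_cpu_ptr(&stat_seq));
        __this_cpu_add(stat_ns, delta);
        write_seqcount_end(this_cpu_ptr(&stat_seq));
}

/* Reader: possibly another CPU; retries if it raced with the writer. */
static u64 stat_read(int cpu)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqcount_begin(&per_cpu(stat_seq, cpu));
                val = per_cpu(stat_ns, cpu);
        } while (read_seqcount_retry(&per_cpu(stat_seq, cpu), seq));

        return val;
}
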
@@ -1965,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr)
         local_irq_save(flags);
 
         cpu = smp_processor_id();
-        now = sched_clock_cpu(cpu);
-        delta = now - per_cpu(irq_start_time, cpu);
-        per_cpu(irq_start_time, cpu) = now;
+        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+        __this_cpu_add(irq_start_time, delta);
+
+        irq_time_write_begin();
         /*
          * We do not account for softirq time from ksoftirqd here.
          * We want to continue accounting softirq time to ksoftirqd thread
@@ -1975,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr)
          * that do not consume any time, but still wants to run.
          */
         if (hardirq_count())
-                per_cpu(cpu_hardirq_time, cpu) += delta;
+                __this_cpu_add(cpu_hardirq_time, delta);
         else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-                per_cpu(cpu_softirq_time, cpu) += delta;
+                __this_cpu_add(cpu_softirq_time, delta);
 
+        irq_time_write_end();
         local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-        if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-                u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-                rq->prev_irq_time = curr_irq_time;
-                sched_rt_avg_update(rq, delta_irq);
-        }
+        s64 irq_delta;
+
+        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+        /*
+         * Since irq_time is only updated on {soft,}irq_exit, we might run into
+         * this case when a previous update_rq_clock() happened inside a
+         * {soft,}irq region.
+         *
+         * When this happens, we stop ->clock_task and only update the
+         * prev_irq_time stamp to account for the part that fit, so that a next
+         * update will consume the rest. This ensures ->clock_task is
+         * monotonic.
+         *
+         * It does however cause some slight miss-attribution of {soft,}irq
+         * time, a more accurate solution would be to update the irq_time using
+         * the current rq->clock timestamp, except that would require using
+         * atomic ops.
+         */
+        if (irq_delta > delta)
+                irq_delta = delta;
+
+        rq->prev_irq_time += irq_delta;
+        delta -= irq_delta;
+        rq->clock_task += delta;
+
+        if (irq_delta && sched_feat(NONIRQ_POWER))
+                sched_rt_avg_update(rq, irq_delta);
 }
 
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-        return 0;
+        rq->clock_task += delta;
 }
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
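
The clamp in update_rq_clock_task() is easiest to see with numbers. A sketch with made-up values (nanoseconds), for the case the comment describes, where the previous update ran inside an irq region and the pending irq time exceeds the raw delta:

/* Hypothetical example, not part of the patch. */
static void clock_task_clamp_example(struct rq *rq)
{
        s64 delta = 2000000;            /* rq->clock advanced 2ms             */
        s64 irq_delta = 3000000;        /* irq time pending since last update */

        if (irq_delta > delta)          /* clamp, as in update_rq_clock_task() */
                irq_delta = delta;

        rq->prev_irq_time += irq_delta;         /* 2ms accounted now          */
        rq->clock_task += delta - irq_delta;    /* advances by 0, never goes  */
                                                /* backwards; the leftover    */
                                                /* 1ms is consumed by the     */
                                                /* next update                */
}
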
@@ -2129,7 +2186,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
          * A queue event has occurred, and we're going to schedule. In
          * this case, we can save a useless back to back clock update.
          */
-        if (test_tsk_need_resched(rq->curr))
+        if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
                 rq->skip_clock_update = 1;
 }
 
@@ -3119,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq)
         return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        load += 1UL << (FSHIFT - 1);
+        return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
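
calc_load() is a fixed-point exponential moving average: load, exp and active are scaled by FIXED_1 (1 << FSHIFT, with FSHIFT = 11 in include/linux/sched.h), and the new load += 1UL << (FSHIFT - 1) line rounds to nearest instead of truncating, so repeated updates do not drift low. A small user-space check of one 1-minute step, using the kernel's EXP_1 constant (1884, roughly FIXED_1 * exp(-5s/60s)); the sample values are only illustrative:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)         /* 2048 */
#define EXP_1   1884                    /* ~ FIXED_1 * exp(-5s/1min) */

static unsigned long demo_calc_load(unsigned long load, unsigned long exp,
                                    unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest, as in the patch */
        return load >> FSHIFT;
}

int main(void)
{
        /* Previous 1-min average 0.50, two runnable tasks at this sample. */
        unsigned long load = FIXED_1 / 2;       /* 1024 == 0.50 */
        unsigned long active = 2 * FIXED_1;     /* 4096 == 2.00 */

        load = demo_calc_load(load, EXP_1, active);
        printf("%lu.%02lu\n", load >> FSHIFT,
               (load & (FIXED_1 - 1)) * 100 / FIXED_1); /* prints 0.62 */
        return 0;
}
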
@@ -3148,6 +3214,128 @@ static long calc_load_fold_idle(void)
 
         return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+        unsigned long result = 1UL << frac_bits;
+
+        if (n) for (;;) {
+                if (n & 1) {
+                        result *= x;
+                        result += 1UL << (frac_bits - 1);
+                        result >>= frac_bits;
+                }
+                n >>= 1;
+                if (!n)
+                        break;
+                x *= x;
+                x += 1UL << (frac_bits - 1);
+                x >>= frac_bits;
+        }
+
+        return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+            unsigned long active, unsigned int n)
+{
+
+        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+        long delta, active, n;
+
+        if (time_before(jiffies, calc_load_update))
+                return;
+
+        /*
+         * If we crossed a calc_load_update boundary, make sure to fold
+         * any pending idle changes, the respective CPUs might have
+         * missed the tick driven calc_load_account_active() update
+         * due to NO_HZ.
+         */
+        delta = calc_load_fold_idle();
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks);
+
+        /*
+         * If we were idle for multiple load cycles, apply them.
+         */
+        if (ticks >= LOAD_FREQ) {
+                n = ticks / LOAD_FREQ;
+
+                active = atomic_long_read(&calc_load_tasks);
+                active = active > 0 ? active * FIXED_1 : 0;
+
+                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+                calc_load_update += n * LOAD_FREQ;
+        }
+
+        /*
+         * Its possible the remainder of the above division also crosses
+         * a LOAD_FREQ period, the regular check in calc_global_load()
+         * which comes after this will take care of that.
+         *
+         * Consider us being 11 ticks before a cycle completion, and us
+         * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+         * age us 4 cycles, and the test in calc_global_load() will
+         * pick up the final one.
+         */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
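
fixed_power_int() is fixed-point exponentiation by squaring, which is what lets calc_load_n() apply n missed LOAD_FREQ periods in one step via the derivation above: a_n = a_0 * e^n + a * (1 - e^n). A user-space sketch of the resulting decay when a CPU sits idle (active == 0) for a dozen periods, reusing the constants from the previous example; the loop mirrors the hunk's algorithm and the numbers are only illustrative:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884

/* Fixed-point x^n by squaring, mirroring fixed_power_int() above. */
static unsigned long fp_pow(unsigned long x, unsigned int frac_bits,
                            unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

int main(void)
{
        /* A 1-min average of 1.00 decaying across 12 idle periods      */
        /* (about one NO_HZ minute): a_n = a_0 * e^n with active == 0.  */
        unsigned long a0 = FIXED_1;                     /* 1.00 */
        unsigned long en = fp_pow(EXP_1, FSHIFT, 12);   /* e^12 */
        unsigned long an = (a0 * en + (1UL << (FSHIFT - 1))) >> FSHIFT;

        printf("%lu.%02lu\n", an >> FSHIFT,
               (an & (FIXED_1 - 1)) * 100 / FIXED_1);   /* prints 0.36 */
        return 0;
}

For the closing example in the comment above: n = ticks / LOAD_FREQ folds 4 cycles in calc_global_nohz(), and the wake-up lands 11 ticks past the fifth boundary, already beyond the 10-tick grace period, so the test in calc_global_load() ages the final cycle.
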
@@ -3157,6 +3345,10 @@ static inline long calc_load_fold_idle(void)
 {
         return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3174,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
         loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-        load *= exp;
-        load += active * (FIXED_1 - exp);
-        return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-        unsigned long upd = calc_load_update + 10;
         long active;
 
-        if (time_before(jiffies, upd))
+        calc_global_nohz(ticks);
+
+        if (time_before(jiffies, calc_load_update + 10))
                 return;
 
         active = atomic_long_read(&calc_load_tasks);
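
calc_global_load() now receives the number of ticks that just elapsed and runs calc_global_nohz() before the usual calc_load_update + 10 test, so a CPU waking from a long NO_HZ sleep folds the missed periods before fresh averages are published. The matching caller change is not part of this hunk; a sketch of what the timer-tick call site presumably looks like in this era, shown only for context:

/*
 * kernel/timer.c (assumed, not shown in this hunk): the tick handler
 * passes the number of jiffies being accounted straight through.
 */
void do_timer(unsigned long ticks)
{
        jiffies_64 += ticks;
        update_wall_time();
        calc_global_load(ticks);        /* was: calc_global_load(void) */
}
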
@@ -3845,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
         if (prev->se.on_rq)
                 update_rq_clock(rq);
-        rq->skip_clock_update = 0;
         prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -3903,7 +4087,6 @@ need_resched_nonpreemptible:
         hrtick_clear(rq);
 
         raw_spin_lock_irq(&rq->lock);
-        clear_tsk_need_resched(prev);
 
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3935,6 +4118,8 @@ need_resched_nonpreemptible:
 
         put_prev_task(rq, prev);
         next = pick_next_task(rq);
+        clear_tsk_need_resched(prev);
+        rq->skip_clock_update = 0;
 
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
@@ -3943,6 +4128,7 @@ need_resched_nonpreemptible:
                 rq->nr_switches++;
                 rq->curr = next;
                 ++*switch_count;
+                WARN_ON_ONCE(test_tsk_need_resched(next));
 
                 context_switch(rq, prev, next); /* unlocks the rq */
                 /*