
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (24 commits)
  sched: Cleanup duplicate local variable in [enqueue|dequeue]_task_fair
  sched: Replace use of entity_key()
  sched: Separate group-scheduling code more clearly
  sched: Reorder root_domain to remove 64 bit alignment padding
  sched: Do not attempt to destroy uninitialized rt_bandwidth
  sched: Remove unused function cpu_cfs_rq()
  sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED'
  sched, cgroup: Optimize load_balance_fair()
  sched: Don't update shares twice on on_rq parent
  sched: update correct entity's runtime in check_preempt_wakeup()
  xtensa: Use generic config PREEMPT definition
  h8300: Use generic config PREEMPT definition
  m32r: Use generic PREEMPT config
  sched: Skip autogroup when looking for all rt sched groups
  sched: Simplify mutex_spin_on_owner()
  sched: Remove rcu_read_lock() from wake_affine()
  sched: Generalize sleep inside spinlock detection
  sched: Make sleeping inside spinlock detection working in !CONFIG_PREEMPT
  sched: Isolate preempt counting in its own config option
  sched: Remove pointless in_atomic() definition check
  ...
Linus Torvalds 14 years ago
parent
commit
bdc7ccfc06

+ 1 - 1
Documentation/DocBook/kernel-hacking.tmpl

@@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */
 
  <para>
   You should always compile your kernel
-   <symbol>CONFIG_DEBUG_SPINLOCK_SLEEP</symbol> on, and it will warn
+   <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn
   you if you break these rules.  If you <emphasis>do</emphasis> break
   the rules, you will eventually lock up your box.
  </para>

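For illustration, the kind of bug CONFIG_DEBUG_ATOMIC_SLEEP is meant to flag looks roughly like the sketch below; the lock, mutex and function are hypothetical, only spin_lock(), mutex_lock() and the config option are real kernel interfaces:

    #include <linux/spinlock.h>
    #include <linux/mutex.h>

    static DEFINE_SPINLOCK(demo_lock);      /* hypothetical example lock  */
    static DEFINE_MUTEX(demo_mutex);        /* hypothetical example mutex */

    static void demo_sleep_under_spinlock(void)
    {
            spin_lock(&demo_lock);          /* enter atomic context */
            mutex_lock(&demo_mutex);        /* mutex_lock() starts with might_sleep();
                                             * with CONFIG_DEBUG_ATOMIC_SLEEP enabled
                                             * this prints a warning here */
            mutex_unlock(&demo_mutex);
            spin_unlock(&demo_lock);
    }

Because DEBUG_ATOMIC_SLEEP selects PREEMPT_COUNT (see the lib/Kconfig.debug hunk below), spin_lock() raises the preempt count even on !SMP, !PREEMPT builds, so the check can fire there too.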
+ 1 - 1
Documentation/SubmitChecklist

@@ -53,7 +53,7 @@ kernel patches.
 
 12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
     CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
-    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously
+    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously
     enabled.
 
 13: Has been build- and runtime tested with and without CONFIG_SMP and

+ 1 - 1
Documentation/development-process/4.Coding

@@ -244,7 +244,7 @@ testing purposes.  In particular, you should turn on:
 - DEBUG_SLAB can find a variety of memory allocation and use errors; it
    should be used on most development kernels.
 
- - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a
+ - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a
    number of common locking errors.
 
 There are quite a few other debugging options, some of which will be

+ 1 - 1
Documentation/ja_JP/SubmitChecklist

@@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト
 
 12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB,
     CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK,
-    CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を
+    CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を
     行ってください。
 
 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で

+ 1 - 1
Documentation/zh_CN/SubmitChecklist

@@ -67,7 +67,7 @@ Linux
 
 12:已经通过CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
     CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
-    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP测试,并且同时都
+    CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP测试,并且同时都
     使能。
 
 13:已经都构建并且使用或者不使用 CONFIG_SMP 和 CONFIG_PREEMPT测试执行时间。

+ 1 - 3
arch/h8300/Kconfig.cpu

@@ -162,9 +162,7 @@ config H8300_TPU_CH
 	int "TPU channel"
 	depends on H8300_TPU
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	default n
+source "kernel/Kconfig.preempt"
 
 source "mm/Kconfig"
 

+ 1 - 11
arch/m32r/Kconfig

@@ -268,17 +268,7 @@ config SCHED_OMIT_FRAME_POINTER
         bool
         default y
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
+source "kernel/Kconfig.preempt"
 
 config SMP
 	bool "Symmetric multi-processing support"

+ 1 - 12
arch/xtensa/Kconfig

@@ -80,18 +80,7 @@ config XTENSA_UNALIGNED_USER
 
 	  Say Y here to enable unaligned memory access in user space.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	help
-          This option reduces the latency of the kernel when reacting to
-          real-time or interactive events by allowing a low priority process to
-          be preempted even if it is in kernel mode executing a system call.
-          Unfortunately the kernel code has some race conditions if both
-          CONFIG_SMP and CONFIG_PREEMPT are enabled, so this option is
-          currently disabled if you are building an SMP kernel.
-
-          Say Y here if you are building a kernel for a desktop, embedded
-          or real-time system.  Say N if you are unsure.
+source "kernel/Kconfig.preempt"
 
 config MATH_EMULATION
 	bool "Math emulation"

+ 1 - 1
include/linux/bit_spinlock.h

@@ -88,7 +88,7 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
 {
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 	return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT
+#elif defined CONFIG_PREEMPT_COUNT
 	return preempt_count();
 #else
 	return 1;

+ 2 - 2
include/linux/hardirq.h

@@ -93,7 +93,7 @@
  */
 #define in_nmi()	(preempt_count() & NMI_MASK)
 
-#if defined(CONFIG_PREEMPT)
+#if defined(CONFIG_PREEMPT_COUNT)
 # define PREEMPT_CHECK_OFFSET 1
 #else
 # define PREEMPT_CHECK_OFFSET 0
@@ -115,7 +115,7 @@
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
 # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
 #else

+ 1 - 1
include/linux/kernel.h

@@ -121,7 +121,7 @@ extern int _cond_resched(void);
 # define might_resched() do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
   void __might_sleep(const char *file, int line, int preempt_offset);
 /**
  * might_sleep - annotation for functions that can sleep

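For context, might_sleep() is normally placed at the top of a function that can block, so a debug kernel warns when such a function is reached from atomic context. A hypothetical helper (only might_sleep(), mutex_lock() and mutex_unlock() are real APIs):

    #include <linux/kernel.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(demo_mutex);        /* hypothetical example mutex */

    static void demo_may_block(void)
    {
            might_sleep();                  /* with CONFIG_DEBUG_ATOMIC_SLEEP this
                                             * expands to __might_sleep() and warns
                                             * if the caller is in atomic context;
                                             * otherwise it only does might_resched() */
            mutex_lock(&demo_mutex);        /* may sleep */
            mutex_unlock(&demo_mutex);
    }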
+ 2 - 2
include/linux/pagemap.h

@@ -134,7 +134,7 @@ static inline int page_cache_get_speculative(struct page *page)
 	VM_BUG_ON(in_interrupt());
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT
+# ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic());
 # endif
 	/*
@@ -172,7 +172,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
 	VM_BUG_ON(in_interrupt());
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT
+# ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic());
 # endif
 	VM_BUG_ON(page_count(page) == 0);

+ 17 - 9
include/linux/preempt.h

@@ -27,6 +27,21 @@
 
 asmlinkage void preempt_schedule(void);
 
+#define preempt_check_resched() \
+do { \
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+		preempt_schedule(); \
+} while (0)
+
+#else /* !CONFIG_PREEMPT */
+
+#define preempt_check_resched()		do { } while (0)
+
+#endif /* CONFIG_PREEMPT */
+
+
+#ifdef CONFIG_PREEMPT_COUNT
+
 #define preempt_disable() \
 do { \
 	inc_preempt_count(); \
@@ -39,12 +54,6 @@ do { \
 	dec_preempt_count(); \
 } while (0)
 
-#define preempt_check_resched() \
-do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
-		preempt_schedule(); \
-} while (0)
-
 #define preempt_enable() \
 do { \
 	preempt_enable_no_resched(); \
@@ -80,18 +89,17 @@ do { \
 	preempt_check_resched(); \
 } while (0)
 
-#else
+#else /* !CONFIG_PREEMPT_COUNT */
 
 #define preempt_disable()		do { } while (0)
 #define preempt_enable_no_resched()	do { } while (0)
 #define preempt_enable()		do { } while (0)
-#define preempt_check_resched()		do { } while (0)
 
 #define preempt_disable_notrace()		do { } while (0)
 #define preempt_enable_no_resched_notrace()	do { } while (0)
 #define preempt_enable_notrace()		do { } while (0)
 
-#endif
+#endif /* CONFIG_PREEMPT_COUNT */
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 

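As a sketch of the split being made here (hypothetical code, standard preempt API): preempt_disable()/preempt_enable() only maintain a per-task nesting counter, while rescheduling on the final enable is a CONFIG_PREEMPT property:

    #include <linux/preempt.h>

    static void demo_nesting(void)
    {
            preempt_disable();              /* count 0 -> 1 (with PREEMPT_COUNT) */
            preempt_disable();              /* count 1 -> 2, nesting is allowed  */
            /* ... access per-CPU state safely ... */
            preempt_enable();               /* count 2 -> 1                      */
            preempt_enable();               /* count 1 -> 0; on CONFIG_PREEMPT
                                             * kernels this may call
                                             * preempt_schedule()                */
    }

Without CONFIG_PREEMPT_COUNT all four calls compile to no-ops, which is why the counter bookkeeping moves under that symbol while preempt_check_resched() stays under CONFIG_PREEMPT.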
+ 6 - 6
include/linux/rcupdate.h

@@ -239,7 +239,7 @@ extern int rcu_read_lock_bh_held(void);
  * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
  * and while lockdep is disabled.
  */
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 static inline int rcu_read_lock_sched_held(void)
 {
 	int lockdep_opinion = 0;
@@ -250,12 +250,12 @@ static inline int rcu_read_lock_sched_held(void)
 		lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
 	return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
 }
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPT_COUNT */
 static inline int rcu_read_lock_sched_held(void)
 {
 	return 1;
 }
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -276,17 +276,17 @@ static inline int rcu_read_lock_bh_held(void)
 	return 1;
 }
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 static inline int rcu_read_lock_sched_held(void)
 {
 	return preempt_count() != 0 || irqs_disabled();
 }
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPT_COUNT */
 static inline int rcu_read_lock_sched_held(void)
 {
 	return 1;
 }
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 

+ 1 - 1
include/linux/sched.h

@@ -2526,7 +2526,7 @@ extern int _cond_resched(void);
 
 extern int __cond_resched_lock(spinlock_t *lock);
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 #define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET
 #else
 #define PREEMPT_LOCK_OFFSET	0

+ 3 - 0
kernel/Kconfig.preempt

@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
 
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
+	select PREEMPT_COUNT
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_COUNT
+       bool

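The practical consequence (an illustrative sketch, not code from this merge) is that PREEMPT_COUNT can now be enabled without CONFIG_PREEMPT, e.g. selected by a debug option, so code that inspects the counter should test CONFIG_PREEMPT_COUNT rather than CONFIG_PREEMPT:

    #include <linux/preempt.h>

    /* hypothetical helper: does the counter say the caller is atomic? */
    static bool demo_counter_says_atomic(void)
    {
    #ifdef CONFIG_PREEMPT_COUNT
            return preempt_count() != 0;    /* counter is maintained */
    #else
            return false;                   /* preempt_disable() does not update the
                                             * counter here, so nothing reliable can
                                             * be reported */
    #endif
    }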
+ 36 - 81
kernel/sched.c

@@ -124,7 +124,7 @@
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
 		return 1;
 	return 0;
 }
@@ -422,6 +422,7 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
+	atomic_t rto_count;
 	struct rcu_head rcu;
 	cpumask_var_t span;
 	cpumask_var_t online;
@@ -431,7 +432,6 @@ struct root_domain {
 	 * one runnable RT task.
 	 */
 	cpumask_var_t rto_mask;
-	atomic_t rto_count;
 	struct cpupri cpupri;
 };
 
@@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return rq->avg_load_per_task;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->load.weight;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->se[cpu]->load.weight;
-		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
 #ifdef CONFIG_PREEMPT
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -2497,7 +2465,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 
-	if (unlikely(rq->idle_stamp)) {
+	if (rq->idle_stamp) {
 		u64 delta = rq->clock - rq->idle_stamp;
 		u64 max = 2*sysctl_sched_migration_cost;
 
@@ -2886,7 +2854,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
@@ -4338,11 +4306,8 @@ EXPORT_SYMBOL(schedule);
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
-	bool ret = false;
-
-	rcu_read_lock();
 	if (lock->owner != owner)
-		goto fail;
+		return false;
 
 	/*
 	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4352,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 	 */
 	barrier();
 
-	ret = owner->on_cpu;
-fail:
-	rcu_read_unlock();
-
-	return ret;
+	return owner->on_cpu;
 }
 
 /*
@@ -4368,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 	if (!sched_feat(OWNER_SPIN))
 		return 0;
 
+	rcu_read_lock();
 	while (owner_running(lock, owner)) {
 		if (need_resched())
-			return 0;
+			break;
 
 		arch_mutex_cpu_relax();
 	}
+	rcu_read_unlock();
 
 	/*
-	 * If the owner changed to another task there is likely
-	 * heavy contention, stop spinning.
+	 * We break out the loop above on need_resched() and when the
+	 * owner changed, which is a sign for heavy contention. Return
+	 * success only when lock->owner is NULL.
 	 */
-	if (lock->owner)
-		return 0;
-
-	return 1;
+	return lock->owner == NULL;
 }
 #endif
 
@@ -7898,17 +7859,10 @@ int in_sched_functions(unsigned long addr)
 		&& addr < (unsigned long)__sched_text_end);
 }
 
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
 	INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->rq = rq;
-	/* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
-	cfs_rq->load_stamp = 1;
-#endif
-#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7928,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
 	rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
@@ -7944,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	rt_rq->rt_throttled = 0;
 	rt_rq->rt_runtime = 0;
 	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	rt_rq->rt_nr_boosted = 0;
-	rt_rq->rq = rq;
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7957,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 				struct sched_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
-	tg->cfs_rq[cpu] = cfs_rq;
-	init_cfs_rq(cfs_rq, rq);
+
 	cfs_rq->tg = tg;
+	cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
+#endif
 
+	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
+
 	/* se could be NULL for root_task_group */
 	if (!se)
 		return;
@@ -7984,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 {
 	struct rq *rq = cpu_rq(cpu);
 
-	tg->rt_rq[cpu] = rt_rq;
-	init_rt_rq(rt_rq, rq);
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->rt_nr_boosted = 0;
+	rt_rq->rq = rq;
 	rt_rq->tg = tg;
-	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	tg->rt_rq[cpu] = rt_rq;
 	tg->rt_se[cpu] = rt_se;
+
 	if (!rt_se)
 		return;
 
@@ -8071,7 +8024,7 @@ void __init sched_init(void)
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
-		init_cfs_rq(&rq->cfs, rq);
+		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = root_task_group_load;
@@ -8185,7 +8138,7 @@ void __init sched_init(void)
 	scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8195,7 +8148,6 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
-#ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8217,7 +8169,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	if (irqs_disabled())
 		print_irqtrace_events(current);
 	dump_stack();
-#endif
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
@@ -8376,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!se)
 			goto err_free_rq;
 
+		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 	}
 
@@ -8403,7 +8355,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8424,7 +8376,8 @@ static void free_rt_sched_group(struct task_group *tg)
 {
 	int i;
 
-	destroy_rt_bandwidth(&tg->rt_bandwidth);
+	if (tg->rt_se)
+		destroy_rt_bandwidth(&tg->rt_bandwidth);
 
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
@@ -8465,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!rt_se)
 			goto err_free_rq;
 
+		init_rt_rq(rt_rq, cpu_rq(i));
+		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 	}
 

+ 1 - 0
kernel/sched_autogroup.h

@@ -13,6 +13,7 @@ struct autogroup {
 	int			nice;
 };
 
+static inline bool task_group_is_autogroup(struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg);
 

+ 42 - 30
kernel/sched_fair.c

@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return NULL;
 }
 
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return &cpu_rq(this_cpu)->cfs;
-}
-
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }
 
-static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	return se->vruntime - cfs_rq->min_vruntime;
-}
-
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 	struct rb_node *parent = NULL;
 	struct sched_entity *entry;
-	s64 key = entity_key(cfs_rq, se);
 	int leftmost = 1;
 
 	/*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * We dont care about collisions. Nodes with
 		 * the same key stay together.
 		 */
-		if (key < entity_key(cfs_rq, entry)) {
+		if (entity_before(se, entry)) {
 			link = &parent->rb_left;
 		} else {
 			link = &parent->rb_right;
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}
 
 	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			 */
 			if (task_sleep && parent_entity(se))
 				set_next_buddy(parent_entity(se));
+
+			/* avoid re-evaluating load for this entity */
+			se = parent_entity(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
 	}
 
 	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	rcu_read_lock();
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 		balanced = this_eff_load <= prev_eff_load;
 	} else
 		balanced = true;
-	rcu_read_unlock();
 
 	/*
 	 * If the currently running task will sleep within
@@ -1921,8 +1903,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
+	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
 	if (wakeup_preempt_entity(se, pse) == 1) {
 		/*
@@ -2231,11 +2213,43 @@ static void update_shares(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 	rcu_read_lock();
+	/*
+	 * Iterates the task_group tree in a bottom up fashion, see
+	 * list_add_leaf_cfs_rq() for details.
+	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq)
 		update_shares_cpu(cfs_rq->tg, cpu);
 	rcu_read_unlock();
 }
 
+/*
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
+ */
+static int tg_load_down(struct task_group *tg, void *data)
+{
+	unsigned long load;
+	long cpu = (long)data;
+
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->se[cpu]->load.weight;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
+
+	tg->cfs_rq[cpu]->h_load = load;
+
+	return 0;
+}
+
+static void update_h_load(long cpu)
+{
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -2243,14 +2257,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  int *all_pinned)
 {
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
+	struct cfs_rq *busiest_cfs_rq;
 
 	rcu_read_lock();
-	update_h_load(busiest_cpu);
+	update_h_load(cpu_of(busiest));
 
-	list_for_each_entry_rcu(tg, &task_groups, list) {
-		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
 		u64 rem_load, moved_load;

+ 19 - 7
kernel/sched_rt.c

@@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 
 typedef struct task_group *rt_rq_iter_t;
 
-#define for_each_rt_rq(rt_rq, iter, rq) \
-	for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
-	     (&iter->list != &task_groups) && \
-	     (rt_rq = iter->rt_rq[cpu_of(rq)]); \
-	     iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+	do {
+		tg = list_entry_rcu(tg->list.next,
+			typeof(struct task_group), list);
+	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
+
+	if (&tg->list == &task_groups)
+		tg = NULL;
+
+	return tg;
+}
+
+#define for_each_rt_rq(rt_rq, iter, rq)					\
+	for (iter = container_of(&task_groups, typeof(*iter), list);	\
+		(iter = next_task_group(iter)) &&			\
+		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
 
 static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 {
@@ -1126,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 
 	rt_rq = &rq->rt;
 
-	if (unlikely(!rt_rq->rt_nr_running))
+	if (!rt_rq->rt_nr_running)
 		return NULL;
 
 	if (rt_rq_throttled(rt_rq))
@@ -1548,7 +1560,7 @@ skip:
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
+	if (rq->rt.highest_prio.curr > prev->prio)
 		pull_rt_task(rq);
 }
 

+ 6 - 3
lib/Kconfig.debug

@@ -648,12 +648,15 @@ config TRACE_IRQFLAGS
 	  Enables hooks to interrupt enabling and disabling for
 	  either tracing or lock debugging.
 
-config DEBUG_SPINLOCK_SLEEP
-	bool "Spinlock debugging: sleep-inside-spinlock checking"
+config DEBUG_ATOMIC_SLEEP
+	bool "Sleep inside atomic section checking"
+	select PREEMPT_COUNT
 	depends on DEBUG_KERNEL
 	help
 	  If you say Y here, various routines which may sleep will become very
-	  noisy if they are called with a spinlock held.
+	  noisy if they are called inside atomic sections: when a spinlock is
+	  held, inside an rcu read side critical section, inside preempt disabled
+	  sections, inside an interrupt, etc...
 
 config DEBUG_LOCKING_API_SELFTESTS
 	bool "Locking API boot-time self-tests"
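To make the broader "atomic section" wording concrete (a hypothetical sketch; only preempt_disable(), preempt_enable() and might_sleep() are real APIs), the renamed option also catches sleeping in atomic regions that never take a spinlock:

    #include <linux/kernel.h>
    #include <linux/preempt.h>

    static void demo_sleep_in_preempt_off(void)
    {
            preempt_disable();              /* atomic section without any lock */
            might_sleep();                  /* stands in for any blocking call;
                                             * warns because DEBUG_ATOMIC_SLEEP
                                             * selects PREEMPT_COUNT and the
                                             * counter is non-zero here */
            preempt_enable();
    }

On a !CONFIG_PREEMPT kernel the old DEBUG_SPINLOCK_SLEEP could not catch this, because preempt_disable() compiled away; selecting PREEMPT_COUNT is what makes the detection work in every configuration.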