16 年之前 · 774a694f8c
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
 
				 #endif
			
 
				 
			
 
				 /* sched_domains SD_NODE_INIT for NUMA machines */
			
 
				-#define SD_NODE_INIT (struct sched_domain) {		\
			
 
				-	.min_interval		= 8,			\
			
 
				-	.max_interval		= 32,			\
			
 
				-	.busy_factor		= 32,			\
			
 
				-	.imbalance_pct		= 125,			\
			
 
				-	.cache_nice_tries	= SD_CACHE_NICE_TRIES,	\
			
 
				-	.busy_idx		= 3,			\
			
 
				-	.idle_idx		= SD_IDLE_IDX,		\
			
 
				-	.newidle_idx		= SD_NEWIDLE_IDX,	\
			
 
				-	.wake_idx		= 1,			\
			
 
				-	.forkexec_idx		= SD_FORKEXEC_IDX,	\
			
 
				-	.flags			= SD_LOAD_BALANCE	\
			
 
				-				| SD_BALANCE_EXEC	\
			
 
				-				| SD_BALANCE_FORK	\
			
 
				-				| SD_WAKE_AFFINE	\
			
 
				-				| SD_WAKE_BALANCE	\
			
 
				-				| SD_SERIALIZE,		\
			
 
				-	.last_balance		= jiffies,		\
			
 
				-	.balance_interval	= 1,			\
			
 
				+#define SD_NODE_INIT (struct sched_domain) {				\
			
 
				+	.min_interval		= 8,					\
			
 
				+	.max_interval		= 32,					\
			
 
				+	.busy_factor		= 32,					\
			
 
				+	.imbalance_pct		= 125,					\
			
 
				+	.cache_nice_tries	= SD_CACHE_NICE_TRIES,			\
			
 
				+	.busy_idx		= 3,					\
			
 
				+	.idle_idx		= SD_IDLE_IDX,				\
			
 
				+	.newidle_idx		= SD_NEWIDLE_IDX,			\
			
 
				+	.wake_idx		= 1,					\
			
 
				+	.forkexec_idx		= SD_FORKEXEC_IDX,			\
			
 
				+									\
			
 
				+	.flags			= 1*SD_LOAD_BALANCE			\
			
 
				+				| 1*SD_BALANCE_NEWIDLE			\
			
 
				+				| 1*SD_BALANCE_EXEC			\
			
 
				+				| 1*SD_BALANCE_FORK			\
			
 
				+				| 0*SD_WAKE_IDLE			\
			
 
				+				| 1*SD_WAKE_AFFINE			\
			
 
				+				| 1*SD_WAKE_BALANCE			\
			
 
				+				| 0*SD_SHARE_CPUPOWER			\
			
 
				+				| 0*SD_POWERSAVINGS_BALANCE		\
			
 
				+				| 0*SD_SHARE_PKG_RESOURCES		\
			
 
				+				| 1*SD_SERIALIZE			\
			
 
				+				| 1*SD_WAKE_IDLE_FAR			\
			
 
				+				| 0*SD_PREFER_SIBLING			\
			
 
				+				,					\
			
 
				+	.last_balance		= jiffies,				\
			
 
				+	.balance_interval	= 1,					\
			
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_X86_64_ACPI_NUMA
			
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
 
				 #include <linux/swap.h>
			
 
				 #include <linux/bootmem.h>
			
 
				 #include <linux/fs_struct.h>
			
 
				+#include <linux/hardirq.h>
			
 
				 #include "internal.h"
			
 
				 
			
 
				 int sysctl_vfs_cache_pressure __read_mostly = 100;
			
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 
				 	 * give it the opportunity to lock the file.
			
 
				 	 */
			
 
				 	if (found)
			
 
				-		cond_resched_bkl();
			
 
				+		cond_resched();
			
 
				 
			
 
				 find_conflict:
			
 
				 	for_each_lock(inode, before) {
			
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,12 @@
 
				 #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
			
 
				 #define NMI_OFFSET	(1UL << NMI_SHIFT)
			
 
				 
			
 
				+#ifndef PREEMPT_ACTIVE	/* fallback, used only when PREEMPT_ACTIVE is not defined yet */
			
 
				+#define PREEMPT_ACTIVE_BITS	1	/* bit count handed to __IRQ_MASK() below */
			
 
				+#define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)	/* same bound the "too low" #if check below tests against */
			
 
				+#define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
			
 
				+#endif	/* !PREEMPT_ACTIVE */
			
 
				+
			
 
				 #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
			
 
				 #error PREEMPT_ACTIVE is too low!
			
 
				 #endif
			
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -125,7 +125,7 @@ extern int _cond_resched(void);
 
				 #endif
			
 
				 
			
 
				 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
			
 
				-  void __might_sleep(char *file, int line);
			
 
				+  void __might_sleep(char *file, int line, int preempt_offset);
			
 
				 /**
			
 
				  * might_sleep - annotation for functions that can sleep
			
 
				  *
			
@@ -137,8 +137,9 @@ extern int _cond_resched(void);
 
				  * supposed to.
			
 
				  */
			
 
				 # define might_sleep() \
			
 
				-	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
			
 
				+	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
			
 
				 #else
			
 
				+  static inline void __might_sleep(char *file, int line, int preempt_offset) { }	/* empty stub for the #else branch of CONFIG_DEBUG_SPINLOCK_SLEEP */
			
 
				 # define might_sleep() do { might_resched(); } while (0)
			
 
				 #endif
			
 
				 
			
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,8 @@
 
				 #define SCHED_BATCH		3
			
 
				 /* SCHED_ISO: reserved but not implemented yet */
			
 
				 #define SCHED_IDLE		5
			
 
				+/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
			
 
				+#define SCHED_RESET_ON_FORK     0x40000000
			
 
				 
			
 
				 #ifdef __KERNEL__
			
 
				 
			
@@ -796,18 +798,19 @@ enum cpu_idle_type {
 
				 #define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
			
 
				 
			
 
				 #ifdef CONFIG_SMP
			
 
				-#define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
			
 
				-#define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
			
 
				-#define SD_BALANCE_EXEC		4	/* Balance on exec */
			
 
				-#define SD_BALANCE_FORK		8	/* Balance on fork, clone */
			
 
				-#define SD_WAKE_IDLE		16	/* Wake to idle CPU on task wakeup */
			
 
				-#define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
			
 
				-#define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
			
 
				-#define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
			
 
				-#define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
			
 
				-#define SD_SHARE_PKG_RESOURCES	512	/* Domain members share cpu pkg resources */
			
 
				-#define SD_SERIALIZE		1024	/* Only a single load balancing instance */
			
 
				-#define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
			
 
				+#define SD_LOAD_BALANCE		0x0001	/* Do load balancing on this domain. */
			
 
				+#define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
			
 
				+#define SD_BALANCE_EXEC		0x0004	/* Balance on exec */
			
 
				+#define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
			
 
				+#define SD_WAKE_IDLE		0x0010	/* Wake to idle CPU on task wakeup */
			
 
				+#define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
			
 
				+#define SD_WAKE_BALANCE		0x0040	/* Perform balancing at task wakeup */
			
 
				+#define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
			
 
				+#define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
			
 
				+#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
			
 
				+#define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
			
 
				+#define SD_WAKE_IDLE_FAR	0x0800	/* Gain latency sacrificing cache hit */
			
 
				+#define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
			
 
				 
			
 
				 enum powersavings_balance_level {
			
 
				 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
			
@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void)
 
				 	if (sched_smt_power_savings)
			
 
				 		return SD_POWERSAVINGS_BALANCE;
			
 
				 
			
 
				-	return 0;
			
 
				+	return SD_PREFER_SIBLING;
			
 
				 }
			
 
				 
			
 
				 static inline int sd_balance_for_package_power(void)
			
@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void)
 
				 	if (sched_mc_power_savings | sched_smt_power_savings)
			
 
				 		return SD_POWERSAVINGS_BALANCE;
			
 
				 
			
 
				-	return 0;
			
 
				+	return SD_PREFER_SIBLING;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -857,15 +860,9 @@ struct sched_group {
 
				 
			
 
				 	/*
			
 
				 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
			
 
				-	 * single CPU. This is read only (except for setup, hotplug CPU).
			
 
				-	 * Note : Never change cpu_power without recompute its reciprocal
			
 
				-	 */
			
 
				-	unsigned int __cpu_power;
			
 
				-	/*
			
 
				-	 * reciprocal value of cpu_power to avoid expensive divides
			
 
				-	 * (see include/linux/reciprocal_div.h)
			
 
				+	 * single CPU.
			
 
				 	 */
			
 
				-	u32 reciprocal_cpu_power;
			
 
				+	unsigned int cpu_power;
			
 
				 
			
 
				 	/*
			
 
				 	 * The CPUs this group covers.
			
@@ -918,6 +915,7 @@ struct sched_domain {
 
				 	unsigned int newidle_idx;
			
 
				 	unsigned int wake_idx;
			
 
				 	unsigned int forkexec_idx;
			
 
				+	unsigned int smt_gain;
			
 
				 	int flags;			/* See SD_* */
			
 
				 	enum sched_domain_level level;
			
 
				 
			
@@ -1045,7 +1043,6 @@ struct sched_class {
 
				 			      struct rq *busiest, struct sched_domain *sd,
			
 
				 			      enum cpu_idle_type idle);
			
 
				 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
			
 
				-	int (*needs_post_schedule) (struct rq *this_rq);
			
 
				 	void (*post_schedule) (struct rq *this_rq);
			
 
				 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
			
 
				 
			
@@ -1110,6 +1107,8 @@ struct sched_entity {
 
				 	u64			wait_max;
			
 
				 	u64			wait_count;
			
 
				 	u64			wait_sum;
			
 
				+	u64			iowait_count;
			
 
				+	u64			iowait_sum;
			
 
				 
			
 
				 	u64			sleep_start;
			
 
				 	u64			sleep_max;
			
@@ -1234,11 +1233,19 @@ struct task_struct {
 
				 	unsigned did_exec:1;
			
 
				 	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
			
 
				 				 * execve */
			
 
				+	unsigned in_iowait:1;
			
 
				+
			
 
				+
			
 
				+	/* Revert to default priority/policy when forking */
			
 
				+	unsigned sched_reset_on_fork:1;
			
 
				+
			
 
				 	pid_t pid;
			
 
				 	pid_t tgid;
			
 
				 
			
 
				+#ifdef CONFIG_CC_STACKPROTECTOR
			
 
				 	/* Canary value for the -fstack-protector gcc feature */
			
 
				 	unsigned long stack_canary;
			
 
				+#endif
			
 
				 
			
 
				 	/* 
			
 
				 	 * pointers to (original) parent process, youngest child, younger sibling,
			
@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity;
 
				 extern unsigned int sysctl_sched_wakeup_granularity;
			
 
				 extern unsigned int sysctl_sched_shares_ratelimit;
			
 
				 extern unsigned int sysctl_sched_shares_thresh;
			
 
				-#ifdef CONFIG_SCHED_DEBUG
			
 
				 extern unsigned int sysctl_sched_child_runs_first;
			
 
				+#ifdef CONFIG_SCHED_DEBUG
			
 
				 extern unsigned int sysctl_sched_features;
			
 
				 extern unsigned int sysctl_sched_migration_cost;
			
 
				 extern unsigned int sysctl_sched_nr_migrate;
			
 
				+extern unsigned int sysctl_sched_time_avg;
			
 
				 extern unsigned int sysctl_timer_migration;
			
 
				 
			
 
				 int sched_nr_latency_handler(struct ctl_table *table, int write,
			
@@ -2308,23 +2316,31 @@ static inline int need_resched(void)
 
				  * cond_resched_softirq() will enable bhs before scheduling.
			
 
				  */
			
 
				 extern int _cond_resched(void);
			
 
				-#ifdef CONFIG_PREEMPT_BKL
			
 
				-static inline int cond_resched(void)
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				+
			
 
				+#define cond_resched() ({	/* statement expression: value is _cond_resched()'s return */	\
			
 
				+	__might_sleep(__FILE__, __LINE__, 0);	/* call site's file/line, preempt_offset 0 */	\
			
 
				+	_cond_resched();			\
			
 
				+})
			
 
				+
			
 
				+extern int __cond_resched_lock(spinlock_t *lock);
			
 
				+
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+#define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET
			
 
				 #else
			
 
				-static inline int cond_resched(void)
			
 
				-{
			
 
				-	return _cond_resched();
			
 
				-}
			
 
				+#define PREEMPT_LOCK_OFFSET	0
			
 
				 #endif
			
 
				-extern int cond_resched_lock(spinlock_t * lock);
			
 
				-extern int cond_resched_softirq(void);
			
 
				-static inline int cond_resched_bkl(void)
			
 
				-{
			
 
				-	return _cond_resched();
			
 
				-}
			
 
				+
			
 
				+#define cond_resched_lock(lock) ({	/* value is __cond_resched_lock(lock)'s return */	\
			
 
				+	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	/* PREEMPT_OFFSET under CONFIG_PREEMPT, else 0 */	\
			
 
				+	__cond_resched_lock(lock);				\
			
 
				+})
			
 
				+
			
 
				+extern int __cond_resched_softirq(void);
			
 
				+
			
 
				+#define cond_resched_softirq() ({	/* value is __cond_resched_softirq()'s return */	\
			
 
				+	__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);	/* presumably the count expected with bhs disabled - TODO confirm */	\
			
 
				+	__cond_resched_softirq();				\
			
 
				+})
			
 
				 
			
 
				 /*
			
 
				  * Does a critical section need to be broken due to another
			
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void);
 
				 #define ARCH_HAS_SCHED_WAKE_IDLE
			
 
				 /* Common values for SMT siblings */
			
 
				 #ifndef SD_SIBLING_INIT
			
 
				-#define SD_SIBLING_INIT (struct sched_domain) {		\
			
 
				-	.min_interval		= 1,			\
			
 
				-	.max_interval		= 2,			\
			
 
				-	.busy_factor		= 64,			\
			
 
				-	.imbalance_pct		= 110,			\
			
 
				-	.flags			= SD_LOAD_BALANCE	\
			
 
				-				| SD_BALANCE_NEWIDLE	\
			
 
				-				| SD_BALANCE_FORK	\
			
 
				-				| SD_BALANCE_EXEC	\
			
 
				-				| SD_WAKE_AFFINE	\
			
 
				-				| SD_WAKE_BALANCE	\
			
 
				-				| SD_SHARE_CPUPOWER,	\
			
 
				-	.last_balance		= jiffies,		\
			
 
				-	.balance_interval	= 1,			\
			
 
				+#define SD_SIBLING_INIT (struct sched_domain) {				\
			
 
				+	.min_interval		= 1,					\
			
 
				+	.max_interval		= 2,					\
			
 
				+	.busy_factor		= 64,					\
			
 
				+	.imbalance_pct		= 110,					\
			
 
				+									\
			
 
				+	.flags			= 1*SD_LOAD_BALANCE			\
			
 
				+				| 1*SD_BALANCE_NEWIDLE			\
			
 
				+				| 1*SD_BALANCE_EXEC			\
			
 
				+				| 1*SD_BALANCE_FORK			\
			
 
				+				| 0*SD_WAKE_IDLE			\
			
 
				+				| 1*SD_WAKE_AFFINE			\
			
 
				+				| 1*SD_WAKE_BALANCE			\
			
 
				+				| 1*SD_SHARE_CPUPOWER			\
			
 
				+				| 0*SD_POWERSAVINGS_BALANCE		\
			
 
				+				| 0*SD_SHARE_PKG_RESOURCES		\
			
 
				+				| 0*SD_SERIALIZE			\
			
 
				+				| 0*SD_WAKE_IDLE_FAR			\
			
 
				+				| 0*SD_PREFER_SIBLING			\
			
 
				+				,					\
			
 
				+	.last_balance		= jiffies,				\
			
 
				+	.balance_interval	= 1,					\
			
 
				+	.smt_gain		= 1178,	/* 15% */			\
			
 
				 }
			
 
				 #endif
			
 
				 #endif /* CONFIG_SCHED_SMT */
			
@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void);
 
				 #ifdef CONFIG_SCHED_MC
			
 
				 /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
			
 
				 #ifndef SD_MC_INIT
			
 
				-#define SD_MC_INIT (struct sched_domain) {		\
			
 
				-	.min_interval		= 1,			\
			
 
				-	.max_interval		= 4,			\
			
 
				-	.busy_factor		= 64,			\
			
 
				-	.imbalance_pct		= 125,			\
			
 
				-	.cache_nice_tries	= 1,			\
			
 
				-	.busy_idx		= 2,			\
			
 
				-	.wake_idx		= 1,			\
			
 
				-	.forkexec_idx		= 1,			\
			
 
				-	.flags			= SD_LOAD_BALANCE	\
			
 
				-				| SD_BALANCE_FORK	\
			
 
				-				| SD_BALANCE_EXEC	\
			
 
				-				| SD_WAKE_AFFINE	\
			
 
				-				| SD_WAKE_BALANCE	\
			
 
				-				| SD_SHARE_PKG_RESOURCES\
			
 
				-				| sd_balance_for_mc_power()\
			
 
				-				| sd_power_saving_flags(),\
			
 
				-	.last_balance		= jiffies,		\
			
 
				-	.balance_interval	= 1,			\
			
 
				+#define SD_MC_INIT (struct sched_domain) {				\
			
 
				+	.min_interval		= 1,					\
			
 
				+	.max_interval		= 4,					\
			
 
				+	.busy_factor		= 64,					\
			
 
				+	.imbalance_pct		= 125,					\
			
 
				+	.cache_nice_tries	= 1,					\
			
 
				+	.busy_idx		= 2,					\
			
 
				+	.wake_idx		= 1,					\
			
 
				+	.forkexec_idx		= 1,					\
			
 
				+									\
			
 
				+	.flags			= 1*SD_LOAD_BALANCE			\
			
 
				+				| 1*SD_BALANCE_NEWIDLE			\
			
 
				+				| 1*SD_BALANCE_EXEC			\
			
 
				+				| 1*SD_BALANCE_FORK			\
			
 
				+				| 1*SD_WAKE_IDLE			\
			
 
				+				| 1*SD_WAKE_AFFINE			\
			
 
				+				| 1*SD_WAKE_BALANCE			\
			
 
				+				| 0*SD_SHARE_CPUPOWER			\
			
 
				+				| 1*SD_SHARE_PKG_RESOURCES		\
			
 
				+				| 0*SD_SERIALIZE			\
			
 
				+				| 0*SD_WAKE_IDLE_FAR			\
			
 
				+				| sd_balance_for_mc_power()		\
			
 
				+				| sd_power_saving_flags()		\
			
 
				+				,					\
			
 
				+	.last_balance		= jiffies,				\
			
 
				+	.balance_interval	= 1,					\
			
 
				 }
			
 
				 #endif
			
 
				 #endif /* CONFIG_SCHED_MC */
			
 
				 
			
 
				 /* Common values for CPUs */
			
 
				 #ifndef SD_CPU_INIT
			
 
				-#define SD_CPU_INIT (struct sched_domain) {		\
			
 
				-	.min_interval		= 1,			\
			
 
				-	.max_interval		= 4,			\
			
 
				-	.busy_factor		= 64,			\
			
 
				-	.imbalance_pct		= 125,			\
			
 
				-	.cache_nice_tries	= 1,			\
			
 
				-	.busy_idx		= 2,			\
			
 
				-	.idle_idx		= 1,			\
			
 
				-	.newidle_idx		= 2,			\
			
 
				-	.wake_idx		= 1,			\
			
 
				-	.forkexec_idx		= 1,			\
			
 
				-	.flags			= SD_LOAD_BALANCE	\
			
 
				-				| SD_BALANCE_EXEC	\
			
 
				-				| SD_BALANCE_FORK	\
			
 
				-				| SD_WAKE_AFFINE	\
			
 
				-				| SD_WAKE_BALANCE	\
			
 
				-				| sd_balance_for_package_power()\
			
 
				-				| sd_power_saving_flags(),\
			
 
				-	.last_balance		= jiffies,		\
			
 
				-	.balance_interval	= 1,			\
			
 
				+#define SD_CPU_INIT (struct sched_domain) {				\
			
 
				+	.min_interval		= 1,					\
			
 
				+	.max_interval		= 4,					\
			
 
				+	.busy_factor		= 64,					\
			
 
				+	.imbalance_pct		= 125,					\
			
 
				+	.cache_nice_tries	= 1,					\
			
 
				+	.busy_idx		= 2,					\
			
 
				+	.idle_idx		= 1,					\
			
 
				+	.newidle_idx		= 2,					\
			
 
				+	.wake_idx		= 1,					\
			
 
				+	.forkexec_idx		= 1,					\
			
 
				+									\
			
 
				+	.flags			= 1*SD_LOAD_BALANCE			\
			
 
				+				| 1*SD_BALANCE_NEWIDLE			\
			
 
				+				| 1*SD_BALANCE_EXEC			\
			
 
				+				| 1*SD_BALANCE_FORK			\
			
 
				+				| 1*SD_WAKE_IDLE			\
			
 
				+				| 0*SD_WAKE_AFFINE			\
			
 
				+				| 1*SD_WAKE_BALANCE			\
			
 
				+				| 0*SD_SHARE_CPUPOWER			\
			
 
				+				| 0*SD_SHARE_PKG_RESOURCES		\
			
 
				+				| 0*SD_SERIALIZE			\
			
 
				+				| 0*SD_WAKE_IDLE_FAR			\
			
 
				+				| sd_balance_for_package_power()	\
			
 
				+				| sd_power_saving_flags()		\
			
 
				+				,					\
			
 
				+	.last_balance		= jiffies,				\
			
 
				+	.balance_interval	= 1,					\
			
 
				 }
			
 
				 #endif
			
 
				 
			
 
				 /* sched_domains SD_ALLNODES_INIT for NUMA machines */
			
 
				-#define SD_ALLNODES_INIT (struct sched_domain) {	\
			
 
				-	.min_interval		= 64,			\
			
 
				-	.max_interval		= 64*num_online_cpus(),	\
			
 
				-	.busy_factor		= 128,			\
			
 
				-	.imbalance_pct		= 133,			\
			
 
				-	.cache_nice_tries	= 1,			\
			
 
				-	.busy_idx		= 3,			\
			
 
				-	.idle_idx		= 3,			\
			
 
				-	.flags			= SD_LOAD_BALANCE	\
			
 
				-				| SD_BALANCE_NEWIDLE	\
			
 
				-				| SD_WAKE_AFFINE	\
			
 
				-				| SD_SERIALIZE,		\
			
 
				-	.last_balance		= jiffies,		\
			
 
				-	.balance_interval	= 64,			\
			
 
				+#define SD_ALLNODES_INIT (struct sched_domain) {			\
			
 
				+	.min_interval		= 64,					\
			
 
				+	.max_interval		= 64*num_online_cpus(),			\
			
 
				+	.busy_factor		= 128,					\
			
 
				+	.imbalance_pct		= 133,					\
			
 
				+	.cache_nice_tries	= 1,					\
			
 
				+	.busy_idx		= 3,					\
			
 
				+	.idle_idx		= 3,					\
			
 
				+	.flags			= 1*SD_LOAD_BALANCE			\
			
 
				+				| 1*SD_BALANCE_NEWIDLE			\
			
 
				+				| 0*SD_BALANCE_EXEC			\
			
 
				+				| 0*SD_BALANCE_FORK			\
			
 
				+				| 0*SD_WAKE_IDLE			\
			
 
				+				| 1*SD_WAKE_AFFINE			\
			
 
				+				| 0*SD_WAKE_BALANCE			\
			
 
				+				| 0*SD_SHARE_CPUPOWER			\
			
 
				+				| 0*SD_POWERSAVINGS_BALANCE		\
			
 
				+				| 0*SD_SHARE_PKG_RESOURCES		\
			
 
				+				| 1*SD_SERIALIZE			\
			
 
				+				| 1*SD_WAKE_IDLE_FAR			\
			
 
				+				| 0*SD_PREFER_SIBLING			\
			
 
				+				,					\
			
 
				+	.last_balance		= jiffies,				\
			
 
				+	.balance_interval	= 64,					\
			
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_NUMA
			
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send,
 
				 		  __entry->sig, __entry->comm, __entry->pid)
			
 
				 );
			
 
				 
			
 
				+/*
			
 
				+ * XXX: the sched_stat tracepoints below only apply to SCHED_OTHER/BATCH/IDLE;
			
 
				+ *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Tracepoint for accounting wait time (time the task is runnable
			
 
				+ * but not actually running due to scheduler contention).
			
 
				+ */
			
 
				+TRACE_EVENT(sched_stat_wait,
			
 
				+
			
 
				+	TP_PROTO(struct task_struct *tsk, u64 delay),
			
 
				+
			
 
				+	TP_ARGS(tsk, delay),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__array( char,	comm,	TASK_COMM_LEN	)
			
 
				+		__field( pid_t,	pid			)
			
 
				+		__field( u64,	delay			)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);	/* fixed-size copy of TASK_COMM_LEN bytes */
			
 
				+		__entry->pid	= tsk->pid;
			
 
				+		__entry->delay	= delay;
			
 
				+	)	/* no comma here: TP_fast_assign() and TP_perf_assign() form one TRACE_EVENT argument */
			
 
				+	TP_perf_assign(
			
 
				+		__perf_count(delay);
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("task: %s:%d wait: %Lu [ns]",
			
 
				+			__entry->comm, __entry->pid,
			
 
				+			(unsigned long long)__entry->delay)	/* cast matches the %Lu conversion */
			
 
				+);
			
 
				+
			
 
				+/*
			
 
				+ * Tracepoint for accounting sleep time (time the task is not runnable,
			
 
				+ * including iowait, see below).
			
 
				+ */
			
 
				+TRACE_EVENT(sched_stat_sleep,
			
 
				+
			
 
				+	TP_PROTO(struct task_struct *tsk, u64 delay),
			
 
				+
			
 
				+	TP_ARGS(tsk, delay),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__array( char,	comm,	TASK_COMM_LEN	)
			
 
				+		__field( pid_t,	pid			)
			
 
				+		__field( u64,	delay			)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);	/* fixed-size copy of TASK_COMM_LEN bytes */
			
 
				+		__entry->pid	= tsk->pid;
			
 
				+		__entry->delay	= delay;
			
 
				+	)	/* no comma here: TP_fast_assign() and TP_perf_assign() form one TRACE_EVENT argument */
			
 
				+	TP_perf_assign(
			
 
				+		__perf_count(delay);
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("task: %s:%d sleep: %Lu [ns]",
			
 
				+			__entry->comm, __entry->pid,
			
 
				+			(unsigned long long)__entry->delay)	/* cast matches the %Lu conversion */
			
 
				+);
			
 
				+
			
 
				+/*
			
 
				+ * Tracepoint for accounting iowait time (time the task is not runnable
			
 
				+ * due to waiting on IO to complete).
			
 
				+ */
			
 
				+TRACE_EVENT(sched_stat_iowait,
			
 
				+
			
 
				+	TP_PROTO(struct task_struct *tsk, u64 delay),
			
 
				+
			
 
				+	TP_ARGS(tsk, delay),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__array( char,	comm,	TASK_COMM_LEN	)
			
 
				+		__field( pid_t,	pid			)
			
 
				+		__field( u64,	delay			)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);	/* fixed-size copy of TASK_COMM_LEN bytes */
			
 
				+		__entry->pid	= tsk->pid;
			
 
				+		__entry->delay	= delay;
			
 
				+	)	/* no comma here: TP_fast_assign() and TP_perf_assign() form one TRACE_EVENT argument */
			
 
				+	TP_perf_assign(
			
 
				+		__perf_count(delay);
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("task: %s:%d iowait: %Lu [ns]",
			
 
				+			__entry->comm, __entry->pid,
			
 
				+			(unsigned long long)__entry->delay)	/* cast matches the %Lu conversion */
			
 
				+);
			
 
				+
			
 
				 #endif /* _TRACE_SCHED_H */
			
 
				 
			
 
				 /* This part must be outside protection */
			
--- a/init/main.c
+++ b/init/main.c
@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void)
 
				 	softirq_init();
			
 
				 	timekeeping_init();
			
 
				 	time_init();
			
 
				-	sched_clock_init();
			
 
				 	profile_init();
			
 
				 	if (!irqs_disabled())
			
 
				 		printk(KERN_CRIT "start_kernel(): bug: interrupts were "
			
@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void)
 
				 	numa_policy_init();
			
 
				 	if (late_time_init)
			
 
				 		late_time_init();
			
 
				+	sched_clock_init();
			
 
				 	calibrate_delay();
			
 
				 	pidmap_init();
			
 
				 	anon_vma_init();
			
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
 
				 #include <linux/mutex.h>
			
 
				 #include <trace/events/sched.h>
			
 
				 
			
 
				-#define KTHREAD_NICE_LEVEL (-5)
			
 
				-
			
 
				 static DEFINE_SPINLOCK(kthread_create_lock);
			
 
				 static LIST_HEAD(kthread_create_list);
			
 
				 struct task_struct *kthreadd_task;
			
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 
				 		 * The kernel thread should not inherit these properties.
			
 
				 		 */
			
 
				 		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
			
 
				-		set_user_nice(create.result, KTHREAD_NICE_LEVEL);
			
 
				 		set_cpus_allowed_ptr(create.result, cpu_all_mask);
			
 
				 	}
			
 
				 	return create.result;
			
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
 
				 	/* Setup a clean context for our children to inherit. */
			
 
				 	set_task_comm(tsk, "kthreadd");
			
 
				 	ignore_signals(tsk);
			
 
				-	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
			
 
				 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
			
 
				 	set_mems_allowed(node_possible_map);
			
 
				 
			
--- a/kernel/sched.c
+++ b/kernel/sched.c
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
				 
			
 
				 	/*
			
 
				 	 * If the cpu was currently mapped to a different value, we
			
 
				-	 * first need to unmap the old value
			
 
				+	 * need to map it to the new value then remove the old value.
			
 
				+	 * Note, we must add the new value first, otherwise we risk the
			
 
				+	 * cpu being cleared from pri_active, and this cpu could be
			
 
				+	 * missed for a push or pull.
			
 
				 	 */
			
 
				-	if (likely(oldpri != CPUPRI_INVALID)) {
			
 
				-		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
			
 
				-
			
 
				-		spin_lock_irqsave(&vec->lock, flags);
			
 
				-
			
 
				-		vec->count--;
			
 
				-		if (!vec->count)
			
 
				-			clear_bit(oldpri, cp->pri_active);
			
 
				-		cpumask_clear_cpu(cpu, vec->mask);
			
 
				-
			
 
				-		spin_unlock_irqrestore(&vec->lock, flags);
			
 
				-	}
			
 
				-
			
 
				 	if (likely(newpri != CPUPRI_INVALID)) {
			
 
				 		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
			
 
				 
			
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
				 
			
 
				 		spin_unlock_irqrestore(&vec->lock, flags);
			
 
				 	}
			
 
				+	if (likely(oldpri != CPUPRI_INVALID)) {
			
 
				+		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
			
 
				+
			
 
				+		spin_lock_irqsave(&vec->lock, flags);
			
 
				+
			
 
				+		vec->count--;
			
 
				+		if (!vec->count)
			
 
				+			clear_bit(oldpri, cp->pri_active);
			
 
				+		cpumask_clear_cpu(cpu, vec->mask);
			
 
				+
			
 
				+		spin_unlock_irqrestore(&vec->lock, flags);
			
 
				+	}
			
 
				 
			
 
				 	*currpri = newpri;
			
 
				 }
			
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
				 	PN(se.wait_max);
			
 
				 	PN(se.wait_sum);
			
 
				 	P(se.wait_count);
			
 
				+	PN(se.iowait_sum);
			
 
				+	P(se.iowait_count);
			
 
				 	P(sched_info.bkl_count);
			
 
				 	P(se.nr_migrations);
			
 
				 	P(se.nr_migrations_cold);
			
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
 
				 	p->se.wait_max				= 0;
			
 
				 	p->se.wait_sum				= 0;
			
 
				 	p->se.wait_count			= 0;
			
 
				+	p->se.iowait_sum			= 0;
			
 
				+	p->se.iowait_count			= 0;
			
 
				 	p->se.sleep_max				= 0;
			
 
				 	p->se.sum_sleep_runtime			= 0;
			
 
				 	p->se.block_max				= 0;
			
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
 
				 
			
 
				 /*
			
 
				  * Targeted preemption latency for CPU-bound tasks:
			
 
				- * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
			
 
				+ * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
			
 
				  *
			
 
				  * NOTE: this latency value is not the same as the concept of
			
 
				  * 'timeslice length' - timeslices in CFS are of variable length
			
@@ -34,13 +34,13 @@
 
				  * (to see the precise effective timeslice length of your workload,
			
 
				  *  run vmstat and monitor the context-switches (cs) field)
			
 
				  */
			
 
				-unsigned int sysctl_sched_latency = 20000000ULL;
			
 
				+unsigned int sysctl_sched_latency = 5000000ULL;
			
 
				 
			
 
				 /*
			
 
				  * Minimal preemption granularity for CPU-bound tasks:
			
 
				- * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
			
 
				+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
			
 
				  */
			
 
				-unsigned int sysctl_sched_min_granularity = 4000000ULL;
			
 
				+unsigned int sysctl_sched_min_granularity = 1000000ULL;
			
 
				 
			
 
				 /*
			
 
				  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
			
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
 
				 static unsigned int sched_nr_latency = 5;
			
 
				 
			
 
				 /*
			
 
				- * After fork, child runs first. (default) If set to 0 then
			
 
				+ * After fork, child runs first. If set to 0 (default) then
			
 
				  * parent will (try to) run first.
			
 
				  */
			
 
				-const_debug unsigned int sysctl_sched_child_runs_first = 1;
			
 
				+unsigned int sysctl_sched_child_runs_first __read_mostly;
			
 
				 
			
 
				 /*
			
 
				  * sys_sched_yield() compat mode
			
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
				 
			
 
				 /*
			
 
				  * SCHED_OTHER wake-up granularity.
			
 
				- * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
			
 
				+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
			
 
				  *
			
 
				  * This option delays the preemption effects of decoupled workloads
			
 
				  * and reduces their over-scheduling. Synchronous workloads will still
			
 
				  * have immediate wakeup/sleep latencies.
			
 
				  */
			
 
				-unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
			
 
				+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
			
 
				 
			
 
				 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
			
 
				 
			
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
 
				  * CFS operations on generic schedulable entities:
			
 
				  */
			
 
				 
			
 
				-static inline struct task_struct *task_of(struct sched_entity *se)
			
 
				-{
			
 
				-	return container_of(se, struct task_struct, se);
			
 
				-}
			
 
				-
			
 
				 #ifdef CONFIG_FAIR_GROUP_SCHED
			
 
				 
			
 
				 /* cpu runqueue to which this cfs_rq is attached */
			
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 
				 /* An entity is a task if it doesn't "own" a runqueue */
			
 
				 #define entity_is_task(se)	(!se->my_q)
			
 
				 
			
 
				+static inline struct task_struct *task_of(struct sched_entity *se)
			
 
				+{
			
 
				+#ifdef CONFIG_SCHED_DEBUG
			
 
				+	WARN_ON_ONCE(!entity_is_task(se));	/* entity_is_task(): se does not own a runqueue (my_q unset) */
			
 
				+#endif	/* CONFIG_SCHED_DEBUG */
			
 
				+	return container_of(se, struct task_struct, se);	/* treats se as the 'se' member of a task_struct */
			
 
				+}
			
 
				+
			
 
				 /* Walk up scheduling entities hierarchy */
			
 
				 #define for_each_sched_entity(se) \
			
 
				 		for (; se; se = se->parent)
			
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-#else	/* CONFIG_FAIR_GROUP_SCHED */
			
 
				+#else	/* !CONFIG_FAIR_GROUP_SCHED */
			
 
				+
			
 
				+static inline struct task_struct *task_of(struct sched_entity *se)
			
 
				+{
			
 
				+	return container_of(se, struct task_struct, se);	/* treats se as the 'se' member of a task_struct; no entity_is_task() check in this variant */
			
 
				+}
			
 
				 
			
 
				 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
			
 
				 {
			
@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 	schedstat_set(se->wait_count, se->wait_count + 1);
			
 
				 	schedstat_set(se->wait_sum, se->wait_sum +
			
 
				 			rq_of(cfs_rq)->clock - se->wait_start);
			
 
				+#ifdef CONFIG_SCHEDSTATS
			
 
				+	if (entity_is_task(se)) {
			
 
				+		trace_sched_stat_wait(task_of(se),
			
 
				+			rq_of(cfs_rq)->clock - se->wait_start);
			
 
				+	}
			
 
				+#endif
			
 
				 	schedstat_set(se->wait_start, 0);
			
 
				 }
			
 
				 
			
@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 		se->sleep_start = 0;
			
 
				 		se->sum_sleep_runtime += delta;
			
 
				 
			
 
				-		if (tsk)
			
 
				+		if (tsk) {
			
 
				 			account_scheduler_latency(tsk, delta >> 10, 1);
			
 
				+			trace_sched_stat_sleep(tsk, delta);
			
 
				+		}
			
 
				 	}
			
 
				 	if (se->block_start) {
			
 
				 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
			
@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 		se->sum_sleep_runtime += delta;
			
 
				 
			
 
				 		if (tsk) {
			
 
				+			if (tsk->in_iowait) {
			
 
				+				se->iowait_sum += delta;
			
 
				+				se->iowait_count++;
			
 
				+				trace_sched_stat_iowait(tsk, delta);
			
 
				+			}
			
 
				+
			
 
				 			/*
			
 
				 			 * Blocking time is in units of nanosecs, so shift by
			
 
				 			 * 20 to get a milliseconds-range estimation of the
			
@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
				 
			
 
				 			vruntime -= thresh;
			
 
				 		}
			
 
				-
			
 
				-		/* ensure we never gain time by being placed backwards. */
			
 
				-		vruntime = max_vruntime(se->vruntime, vruntime);
			
 
				 	}
			
 
				 
			
 
				+	/* ensure we never gain time by being placed backwards. */
			
 
				+	vruntime = max_vruntime(se->vruntime, vruntime);
			
 
				+
			
 
				 	se->vruntime = vruntime;
			
 
				 }
			
 
				 
			
@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
 
				  * search starts with cpus closest then further out as needed,
			
 
				  * so we always favor a closer, idle cpu.
			
 
				  * Domains may include CPUs that are not usable for migration,
			
 
				- * hence we need to mask them out (cpu_active_mask)
			
 
				+ * hence we need to mask them out (rq->rd->online)
			
 
				  *
			
 
				  * Returns the CPU we should wake onto.
			
 
				  */
			
 
				 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
			
 
				+
			
 
				+#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
			
 
				+
			
 
				 static int wake_idle(int cpu, struct task_struct *p)
			
 
				 {
			
 
				 	struct sched_domain *sd;
			
 
				 	int i;
			
 
				 	unsigned int chosen_wakeup_cpu;
			
 
				 	int this_cpu;
			
 
				+	struct rq *task_rq = task_rq(p);
			
 
				 
			
 
				 	/*
			
 
				 	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
			
@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
 
				 	for_each_domain(cpu, sd) {
			
 
				 		if ((sd->flags & SD_WAKE_IDLE)
			
 
				 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
			
 
				-			&& !task_hot(p, task_rq(p)->clock, sd))) {
			
 
				+			&& !task_hot(p, task_rq->clock, sd))) {
			
 
				 			for_each_cpu_and(i, sched_domain_span(sd),
			
 
				 					 &p->cpus_allowed) {
			
 
				-				if (cpu_active(i) && idle_cpu(i)) {
			
 
				+				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
			
 
				 					if (i != task_cpu(p)) {
			
 
				 						schedstat_inc(p,
			
 
				 						       se.nr_wakeups_idle);
			
@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 
				 	tg = task_group(p);
			
 
				 	weight = p->se.load.weight;
			
 
				 
			
 
				-	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
			
 
				+	/*
			
 
				+	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
			
 
				+	 * due to the sync cause above having dropped tl to 0, we'll always have
			
 
				+	 * an imbalance, but there's really nothing you can do about that, so
			
 
				+	 * that's good too.
			
 
				+	 *
			
 
				+	 * Otherwise check if either cpus are near enough in load to allow this
			
 
				+	 * task to be woken on this_cpu.
			
 
				+	 */
			
 
				+	balanced = !tl ||
			
 
				+		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
			
 
				 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
			
 
				 
			
 
				 	/*
			
@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 
				 	this_rq		= cpu_rq(this_cpu);
			
 
				 	new_cpu		= prev_cpu;
			
 
				 
			
 
				-	if (prev_cpu == this_cpu)
			
 
				-		goto out;
			
 
				 	/*
			
 
				 	 * 'this_sd' is the first domain that both
			
 
				 	 * this_cpu and prev_cpu are present in:
			
@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 
				 	sched_info_queued(p);
			
 
				 
			
 
				 	update_curr(cfs_rq);
			
 
				+	if (curr)
			
 
				+		se->vruntime = curr->vruntime;
			
 
				 	place_entity(cfs_rq, se, 1);
			
 
				 
			
 
				 	/* 'curr' will be NULL if the child belongs to a different group */
			
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,4 @@
 
				-SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
			
 
				+SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
			
 
				 SCHED_FEAT(NORMALIZED_SLEEPER, 0)
			
 
				 SCHED_FEAT(ADAPTIVE_GRAN, 1)
			
 
				 SCHED_FEAT(WAKEUP_PREEMPT, 1)
			
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
 
				  * policies)
			
 
				  */
			
 
				 
			
 
				+#ifdef CONFIG_RT_GROUP_SCHED
			
 
				+
			
 
				+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
			
 
				+
			
 
				 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
			
 
				 {
			
 
				+#ifdef CONFIG_SCHED_DEBUG
			
 
				+	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
			
 
				+#endif
			
 
				 	return container_of(rt_se, struct task_struct, rt);
			
 
				 }
			
 
				 
			
 
				-#ifdef CONFIG_RT_GROUP_SCHED
			
 
				-
			
 
				-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
			
 
				-
			
 
				 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
			
 
				 {
			
 
				 	return rt_rq->rq;
			
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 
				 
			
 
				 #define rt_entity_is_task(rt_se) (1)
			
 
				 
			
 
				+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
			
 
				+{
			
 
				+	return container_of(rt_se, struct task_struct, rt);
			
 
				+}
			
 
				+
			
 
				 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
			
 
				 {
			
 
				 	return container_of(rt_rq, struct rq, rt);
			
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 
				 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
			
 
				 }
			
 
				 
			
 
				+static inline int has_pushable_tasks(struct rq *rq)
			
 
				+{
			
 
				+	return !plist_head_empty(&rq->rt.pushable_tasks);
			
 
				+}
			
 
				+
			
 
				 #else
			
 
				 
			
 
				 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
			
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
 
				 	curr->se.exec_start = rq->clock;
			
 
				 	cpuacct_charge(curr, delta_exec);
			
 
				 
			
 
				+	sched_rt_avg_update(rq, delta_exec);
			
 
				+
			
 
				 	if (!rt_bandwidth_enabled())
			
 
				 		return;
			
 
				 
			
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 
				 
			
 
				 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
			
 
				 		enqueue_pushable_task(rq, p);
			
 
				-
			
 
				-	inc_cpu_load(rq, p->se.load.weight);
			
 
				 }
			
 
				 
			
 
				 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
			
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
				 	dequeue_rt_entity(rt_se);
			
 
				 
			
 
				 	dequeue_pushable_task(rq, p);
			
 
				-
			
 
				-	dec_cpu_load(rq, p->se.load.weight);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 
				 	if (p)
			
 
				 		dequeue_pushable_task(rq, p);
			
 
				 
			
 
				+#ifdef CONFIG_SMP
			
 
				+	/*
			
 
				+	 * We detect this state here so that we can avoid taking the RQ
			
 
				+	 * lock again later if there is no need to push
			
 
				+	 */
			
 
				+	rq->post_schedule = has_pushable_tasks(rq);
			
 
				+#endif
			
 
				+
			
 
				 	return p;
			
 
				 }
			
 
				 
			
@@ -1161,13 +1180,6 @@ static int find_lowest_rq(struct task_struct *task)
 
				 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
			
 
				 		return -1; /* No targets found */
			
 
				 
			
 
				-	/*
			
 
				-	 * Only consider CPUs that are usable for migration.
			
 
				-	 * I guess we might want to change cpupri_find() to ignore those
			
 
				-	 * in the first place.
			
 
				-	 */
			
 
				-	cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
			
 
				-
			
 
				 	/*
			
 
				 	 * At this point we have built a mask of cpus representing the
			
 
				 	 * lowest priority tasks in the system.  Now we want to elect
			
@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 
				 	return lowest_rq;
			
 
				 }
			
 
				 
			
 
				-static inline int has_pushable_tasks(struct rq *rq)
			
 
				-{
			
 
				-	return !plist_head_empty(&rq->rt.pushable_tasks);
			
 
				-}
			
 
				-
			
 
				 static struct task_struct *pick_next_pushable_task(struct rq *rq)
			
 
				 {
			
 
				 	struct task_struct *p;
			
@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 
				 		pull_rt_task(rq);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * assumes rq->lock is held
			
 
				- */
			
 
				-static int needs_post_schedule_rt(struct rq *rq)
			
 
				-{
			
 
				-	return has_pushable_tasks(rq);
			
 
				-}
			
 
				-
			
 
				 static void post_schedule_rt(struct rq *rq)
			
 
				 {
			
 
				-	/*
			
 
				-	 * This is only called if needs_post_schedule_rt() indicates that
			
 
				-	 * we need to push tasks away
			
 
				-	 */
			
 
				-	spin_lock_irq(&rq->lock);
			
 
				 	push_rt_tasks(rq);
			
 
				-	spin_unlock_irq(&rq->lock);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = {
 
				 	.rq_online              = rq_online_rt,
			
 
				 	.rq_offline             = rq_offline_rt,
			
 
				 	.pre_schedule		= pre_schedule_rt,
			
 
				-	.needs_post_schedule	= needs_post_schedule_rt,
			
 
				 	.post_schedule		= post_schedule_rt,
			
 
				 	.task_wake_up		= task_wake_up_rt,
			
 
				 	.switched_from		= switched_from_rt,
			
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 
				 #endif
			
 
				 
			
 
				 static struct ctl_table kern_table[] = {
			
 
				+	{
			
 
				+		.ctl_name	= CTL_UNNUMBERED,
			
 
				+		.procname	= "sched_child_runs_first",
			
 
				+		.data		= &sysctl_sched_child_runs_first,
			
 
				+		.maxlen		= sizeof(unsigned int),
			
 
				+		.mode		= 0644,
			
 
				+		.proc_handler	= &proc_dointvec,
			
 
				+	},
			
 
				 #ifdef CONFIG_SCHED_DEBUG
			
 
				 	{
			
 
				 		.ctl_name	= CTL_UNNUMBERED,
			
@@ -297,14 +305,6 @@ static struct ctl_table kern_table[] = {
 
				 		.strategy	= &sysctl_intvec,
			
 
				 		.extra1		= &zero,
			
 
				 	},
			
 
				-	{
			
 
				-		.ctl_name	= CTL_UNNUMBERED,
			
 
				-		.procname	= "sched_child_runs_first",
			
 
				-		.data		= &sysctl_sched_child_runs_first,
			
 
				-		.maxlen		= sizeof(unsigned int),
			
 
				-		.mode		= 0644,
			
 
				-		.proc_handler	= &proc_dointvec,
			
 
				-	},
			
 
				 	{
			
 
				 		.ctl_name	= CTL_UNNUMBERED,
			
 
				 		.procname	= "sched_features",
			
@@ -329,6 +329,14 @@ static struct ctl_table kern_table[] = {
 
				 		.mode		= 0644,
			
 
				 		.proc_handler	= &proc_dointvec,
			
 
				 	},
			
 
				+	{
			
 
				+		.ctl_name	= CTL_UNNUMBERED,
			
 
				+		.procname	= "sched_time_avg",
			
 
				+		.data		= &sysctl_sched_time_avg,
			
 
				+		.maxlen		= sizeof(unsigned int),
			
 
				+		.mode		= 0644,
			
 
				+		.proc_handler	= &proc_dointvec,
			
 
				+	},
			
 
				 	{
			
 
				 		.ctl_name	= CTL_UNNUMBERED,
			
 
				 		.procname	= "timer_migration",
			
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
 
				 	if (cwq->wq->freezeable)
			
 
				 		set_freezable();
			
 
				 
			
 
				-	set_user_nice(current, -5);
			
 
				-
			
 
				 	for (;;) {
			
 
				 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
			
 
				 		if (!freezing(current) &&