11 years ago · 39cf275a1a
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -355,6 +355,82 @@ utilize.
 
				 
			
 
				 ==============================================================
			
 
				 
			
 
				+numa_balancing
			
 
				+
			
 
				+Enables/disables automatic page fault based NUMA memory
			
 
				+balancing. Memory is moved automatically to nodes
			
 
				+that access it often.
			
 
				+
			
 
				+Enables/disables automatic NUMA memory balancing. On NUMA machines, there
			
 
				+is a performance penalty if remote memory is accessed by a CPU. When this
			
 
				+feature is enabled the kernel samples which task thread is accessing memory
			
 
				+by periodically unmapping pages and later trapping a page fault. At the
			
 
				+time of the page fault, it is determined if the data being accessed should
			
 
				+be migrated to a local memory node.
			
 
				+
			
 
				+The unmapping of pages and trapping faults incur additional overhead that
			
 
				+ideally is offset by improved memory locality but there is no universal
			
 
				+guarantee. If the target workload is already bound to NUMA nodes then this
			
 
				+feature should be disabled. Otherwise, if the system overhead from the
			
 
				+feature is too high then the rate the kernel samples for NUMA hinting
			
 
				+faults may be controlled by the numa_balancing_scan_period_min_ms,
			
 
				+numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
			
 
				+numa_balancing_scan_size_mb, numa_balancing_settle_count and
			
 
				+numa_balancing_migrate_deferred sysctls.
			
 
				+
			
 
				+==============================================================
			
 
				+
			
 
				+numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
			
 
				+numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
			
 
				+
			
 
				+Automatic NUMA balancing scans a task's address space and unmaps pages to
			
 
				+detect if pages are properly placed or if the data should be migrated to a
			
 
				+memory node local to where the task is running.  Every "scan delay" the task
			
 
				+scans the next "scan size" number of pages in its address space. When the
			
 
				+end of the address space is reached the scanner restarts from the beginning.
			
 
				+
			
 
				+In combination, the "scan delay" and "scan size" determine the scan rate.
			
 
				+When "scan delay" decreases, the scan rate increases.  The scan delay and
			
 
				+hence the scan rate of every task is adaptive and depends on historical
			
 
				+behaviour. If pages are properly placed then the scan delay increases,
			
 
				+otherwise the scan delay decreases.  The "scan size" is not adaptive but
			
 
				+the higher the "scan size", the higher the scan rate.
			
 
				+
			
 
				+Higher scan rates incur higher system overhead as page faults must be
			
 
				+trapped and potentially data must be migrated. However, the higher the scan
			
 
				+rate, the more quickly a task's memory is migrated to a local node if the
			
 
				+workload pattern changes and minimises performance impact due to remote
			
 
				+memory accesses. These sysctls control the thresholds for scan delays and
			
 
				+the number of pages scanned.
			
 
				+
			
 
				+numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
			
 
				+scan a task's virtual memory. It effectively controls the maximum scanning
			
 
				+rate for each task.
			
 
				+
			
 
				+numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
			
 
				+when it initially forks.
			
 
				+
			
 
				+numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
			
 
				+scan a task's virtual memory. It effectively controls the minimum scanning
			
 
				+rate for each task.
			
 
				+
			
 
				+numa_balancing_scan_size_mb is how many megabytes worth of pages are
			
 
				+scanned for a given scan.
			
 
				+
			
 
				+numa_balancing_settle_count is how many scan periods must complete before
			
 
				+the scheduler's load balancer stops pushing the task towards a preferred node. This
			
 
				+gives the scheduler a chance to place the task on an alternative node if the
			
 
				+preferred node is overloaded.
			
 
				+
			
 
				+numa_balancing_migrate_deferred is how many page migrations get skipped
			
 
				+unconditionally, after a page migration is skipped because a page is shared
			
 
				+with other tasks. This reduces page migration overhead, and determines
			
 
				+how much stronger the "move task near its memory" scheduler policy becomes,
			
 
				+versus the "move memory near its task" memory management policy, for workloads
			
 
				+with shared memory.
			
 
				+
			
 
				+==============================================================
			
 
				+
			
 
				 osrelease, ostype & version:
			
 
				 
			
 
				 # cat osrelease
			
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -655,7 +655,11 @@ explains which is which.
 
				 		  read the irq flags variable, an 'X' will always
			
 
				 		  be printed here.
			
 
				 
			
 
				-  need-resched: 'N' task need_resched is set, '.' otherwise.
			
 
				+  need-resched:
			
 
				+	'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED are set,
			
 
				+	'n' only TIF_NEED_RESCHED is set,
			
 
				+	'p' only PREEMPT_NEED_RESCHED is set,
			
 
				+	'.' otherwise.
			
 
				 
			
 
				   hardirq/softirq:
			
 
				 	'H' - hard irq occurred inside a softirq.
			
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7326,6 +7326,8 @@ S:	Maintained
 
				 F:	kernel/sched/
			
 
				 F:	include/linux/sched.h
			
 
				 F:	include/uapi/linux/sched.h
			
 
				+F:	kernel/wait.c
			
 
				+F:	include/linux/wait.h
			
 
				 
			
 
				 SCORE ARCHITECTURE
			
 
				 M:	Chen Liqin <liqin.linux@gmail.com>
			
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 
				 
			
 
				 generic-y += exec.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -46,3 +46,4 @@ generic-y += ucontext.h
 
				 generic-y += user.h
			
 
				 generic-y += vga.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -32,3 +32,4 @@ generic-y += termios.h
 
				 generic-y += timex.h
			
 
				 generic-y += trace_clock.h
			
 
				 generic-y += unaligned.h
			
 
				+generic-y += preempt.h
			
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -50,3 +50,4 @@ generic-y += unaligned.h
 
				 generic-y += user.h
			
 
				 generic-y += vga.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -7,6 +7,7 @@ generic-y       += div64.h
 
				 generic-y       += emergency-restart.h
			
 
				 generic-y	+= exec.h
			
 
				 generic-y       += futex.h
			
 
				+generic-y	+= preempt.h
			
 
				 generic-y       += irq_regs.h
			
 
				 generic-y	+= param.h
			
 
				 generic-y       += local.h
			
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -44,3 +44,4 @@ generic-y += ucontext.h
 
				 generic-y += unaligned.h
			
 
				 generic-y += user.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -56,3 +56,4 @@ generic-y += ucontext.h
 
				 generic-y += user.h
			
 
				 generic-y += vga.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += module.h
 
				 generic-y += trace_clock.h
			
 
				 generic-y += vga.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -2,3 +2,4 @@
 
				 generic-y += clkdev.h
			
 
				 generic-y += exec.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -6,3 +6,4 @@ generic-y += mmu.h
 
				 generic-y += module.h
			
 
				 generic-y += trace_clock.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -53,3 +53,4 @@ generic-y += types.h
 
				 generic-y += ucontext.h
			
 
				 generic-y += unaligned.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -3,4 +3,5 @@ generic-y += clkdev.h
 
				 generic-y += exec.h
			
 
				 generic-y += kvm_para.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
 
				 generic-y += vtime.h
			
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 
				 generic-y += exec.h
			
 
				 generic-y += module.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -31,3 +31,4 @@ generic-y += trace_clock.h
 
				 generic-y += types.h
			
 
				 generic-y += word-at-a-time.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -52,3 +52,4 @@ generic-y += unaligned.h
 
				 generic-y += user.h
			
 
				 generic-y += vga.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/metag/include/asm/topology.h
+++ b/arch/metag/include/asm/topology.h
@@ -26,6 +26,8 @@
 
				 	.last_balance		= jiffies,		\
			
 
				 	.balance_interval	= 1,			\
			
 
				 	.nr_balance_failed	= 0,			\
			
 
				+	.max_newidle_lb_cost	= 0,			\
			
 
				+	.next_decay_max_lb_cost	= jiffies,		\
			
 
				 }
			
 
				 
			
 
				 #define cpu_to_node(cpu)	((void)(cpu), 0)
			
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 
				 generic-y += exec.h
			
 
				 generic-y += trace_clock.h
			
 
				 generic-y += syscalls.h
			
 
				+generic-y += preempt.h
			
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -11,5 +11,6 @@ generic-y += sections.h
 
				 generic-y += segment.h
			
 
				 generic-y += serial.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
 
				 generic-y += ucontext.h
			
 
				 generic-y += xor.h
			
--- a/arch/mips/kernel/rtlx.c
+++ b/arch/mips/kernel/rtlx.c
@@ -172,8 +172,9 @@ int rtlx_open(int index, int can_sleep)
 
				 	if (rtlx == NULL) {
			
 
				 		if( (p = vpe_get_shared(tclimit)) == NULL) {
			
 
				 		    if (can_sleep) {
			
 
				-			__wait_event_interruptible(channel_wqs[index].lx_queue,
			
 
				-				(p = vpe_get_shared(tclimit)), ret);
			
 
				+			ret = __wait_event_interruptible(
			
 
				+					channel_wqs[index].lx_queue,
			
 
				+					(p = vpe_get_shared(tclimit)));
			
 
				 			if (ret)
			
 
				 				goto out_fail;
			
 
				 		    } else {
			
@@ -263,11 +264,10 @@ unsigned int rtlx_read_poll(int index, int can_sleep)
 
				 	/* data available to read? */
			
 
				 	if (chan->lx_read == chan->lx_write) {
			
 
				 		if (can_sleep) {
			
 
				-			int ret = 0;
			
 
				-
			
 
				-			__wait_event_interruptible(channel_wqs[index].lx_queue,
			
 
				+			int ret = __wait_event_interruptible(
			
 
				+				channel_wqs[index].lx_queue,
			
 
				 				(chan->lx_read != chan->lx_write) ||
			
 
				-				sp_stopping, ret);
			
 
				+				sp_stopping);
			
 
				 			if (ret)
			
 
				 				return ret;
			
 
				 
			
@@ -440,14 +440,13 @@ static ssize_t file_write(struct file *file, const char __user * buffer,
 
				 
			
 
				 	/* any space left... */
			
 
				 	if (!rtlx_write_poll(minor)) {
			
 
				-		int ret = 0;
			
 
				+		int ret;
			
 
				 
			
 
				 		if (file->f_flags & O_NONBLOCK)
			
 
				 			return -EAGAIN;
			
 
				 
			
 
				-		__wait_event_interruptible(channel_wqs[minor].rt_queue,
			
 
				-					   rtlx_write_poll(minor),
			
 
				-					   ret);
			
 
				+		ret = __wait_event_interruptible(channel_wqs[minor].rt_queue,
			
 
				+					   rtlx_write_poll(minor));
			
 
				 		if (ret)
			
 
				 			return ret;
			
 
				 	}
			
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr)
 
				 
			
 
				 	BUG_ON(Page_dcache_dirty(page));
			
 
				 
			
 
				-	inc_preempt_count();
			
 
				+	pagefault_disable();
			
 
				 	idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
			
 
				 #ifdef CONFIG_MIPS_MT_SMTC
			
 
				 	idx += FIX_N_COLOURS * smp_processor_id() +
			
@@ -193,8 +193,7 @@ void kunmap_coherent(void)
 
				 	write_c0_entryhi(old_ctx);
			
 
				 	EXIT_CRITICAL(flags);
			
 
				 #endif
			
 
				-	dec_preempt_count();
			
 
				-	preempt_check_resched();
			
 
				+	pagefault_enable();
			
 
				 }
			
 
				 
			
 
				 void copy_user_highpage(struct page *to, struct page *from,
			
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -2,3 +2,4 @@
 
				 generic-y += clkdev.h
			
 
				 generic-y += exec.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -67,3 +67,4 @@ generic-y += ucontext.h
 
				 generic-y += user.h
			
 
				 generic-y += word-at-a-time.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \
 
				 	  div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \
			
 
				 	  poll.h xor.h clkdev.h exec.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -2,4 +2,5 @@
 
				 generic-y += clkdev.h
			
 
				 generic-y += rwsem.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
 
				 generic-y += vtime.h
			
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -2,3 +2,4 @@
 
				 
			
 
				 generic-y += clkdev.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -4,3 +4,4 @@ header-y +=
 
				 generic-y += clkdev.h
			
 
				 generic-y += trace_clock.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -34,3 +34,4 @@ generic-y += termios.h
 
				 generic-y += trace_clock.h
			
 
				 generic-y += ucontext.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -16,3 +16,4 @@ generic-y += serial.h
 
				 generic-y += trace_clock.h
			
 
				 generic-y += types.h
			
 
				 generic-y += word-at-a-time.h
			
 
				+generic-y += preempt.h
			
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -38,3 +38,4 @@ generic-y += termios.h
 
				 generic-y += trace_clock.h
			
 
				 generic-y += types.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h
 
				 generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
			
 
				 generic-y += switch_to.h clkdev.h
			
 
				 generic-y += trace_clock.h
			
 
				+generic-y += preempt.h
			
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -60,3 +60,4 @@ generic-y += unaligned.h
 
				 generic-y += user.h
			
 
				 generic-y += vga.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -6,6 +6,7 @@
 
				 #include <asm/processor.h>
			
 
				 #include <asm/alternative.h>
			
 
				 #include <asm/cmpxchg.h>
			
 
				+#include <asm/rmwcc.h>
			
 
				 
			
 
				 /*
			
 
				  * Atomic operations that C can't guarantee us.  Useful for
			
@@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v)
 
				  */
			
 
				 static inline int atomic_sub_and_test(int i, atomic_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
			
 
				-		     : "+m" (v->counter), "=qm" (c)
			
 
				-		     : "ir" (i) : "memory");
			
 
				-	return c;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v)
 
				  */
			
 
				 static inline int atomic_dec_and_test(atomic_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "decl %0; sete %1"
			
 
				-		     : "+m" (v->counter), "=qm" (c)
			
 
				-		     : : "memory");
			
 
				-	return c != 0;
			
 
				+	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
 
				  */
			
 
				 static inline int atomic_inc_and_test(atomic_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "incl %0; sete %1"
			
 
				-		     : "+m" (v->counter), "=qm" (c)
			
 
				-		     : : "memory");
			
 
				-	return c != 0;
			
 
				+	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
 
				  */
			
 
				 static inline int atomic_add_negative(int i, atomic_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
			
 
				-		     : "+m" (v->counter), "=qm" (c)
			
 
				-		     : "ir" (i) : "memory");
			
 
				-	return c;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s");
			
 
				 }
			
 
				 
			
 
				 /**
			
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
 
				  */
			
 
				 static inline int atomic64_sub_and_test(long i, atomic64_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
			
 
				-		     : "=m" (v->counter), "=qm" (c)
			
 
				-		     : "er" (i), "m" (v->counter) : "memory");
			
 
				-	return c;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v)
 
				  */
			
 
				 static inline int atomic64_dec_and_test(atomic64_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "decq %0; sete %1"
			
 
				-		     : "=m" (v->counter), "=qm" (c)
			
 
				-		     : "m" (v->counter) : "memory");
			
 
				-	return c != 0;
			
 
				+	GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v)
 
				  */
			
 
				 static inline int atomic64_inc_and_test(atomic64_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "incq %0; sete %1"
			
 
				-		     : "=m" (v->counter), "=qm" (c)
			
 
				-		     : "m" (v->counter) : "memory");
			
 
				-	return c != 0;
			
 
				+	GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v)
 
				  */
			
 
				 static inline int atomic64_add_negative(long i, atomic64_t *v)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
			
 
				-		     : "=m" (v->counter), "=qm" (c)
			
 
				-		     : "er" (i), "m" (v->counter) : "memory");
			
 
				-	return c;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s");
			
 
				 }
			
 
				 
			
 
				 /**
			
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -14,6 +14,7 @@
 
				 
			
 
				 #include <linux/compiler.h>
			
 
				 #include <asm/alternative.h>
			
 
				+#include <asm/rmwcc.h>
			
 
				 
			
 
				 #if BITS_PER_LONG == 32
			
 
				 # define _BITOPS_LONG_SHIFT 5
			
@@ -204,12 +205,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
 
				  */
			
 
				 static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
			
 
				 {
			
 
				-	int oldbit;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
			
 
				-		     "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
			
 
				-
			
 
				-	return oldbit;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -255,13 +251,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
 
				  */
			
 
				 static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
			
 
				 {
			
 
				-	int oldbit;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
			
 
				-		     "sbb %0,%0"
			
 
				-		     : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
			
 
				-
			
 
				-	return oldbit;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -314,13 +304,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
 
				  */
			
 
				 static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
			
 
				 {
			
 
				-	int oldbit;
			
 
				-
			
 
				-	asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
			
 
				-		     "sbb %0,%0"
			
 
				-		     : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
			
 
				-
			
 
				-	return oldbit;
			
 
				+	GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c");
			
 
				 }
			
 
				 
			
 
				 static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
			
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with
 
				 
			
 
				 #include <asm/dwarf2.h>
			
 
				 
			
 
				+#ifdef CONFIG_X86_64
			
 
				+
			
 
				 /*
			
 
				  * 64-bit system call stack frame layout defines and helpers,
			
 
				  * for assembly code:
			
@@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with
 
				 	.macro icebp
			
 
				 	.byte 0xf1
			
 
				 	.endm
			
 
				+
			
 
				+#else /* CONFIG_X86_64 */
			
 
				+
			
 
				+/*
			
 
				+ * For 32-bit only: simplified versions of SAVE_ALL/RESTORE_ALL. These
			
 
				+ * are different from the entry_32.S versions in not changing the segment
			
 
				+ * registers. So only suitable for in kernel use, not when transitioning
			
 
				+ * from or to user space. The resulting stack frame is not a standard
			
 
				+ * pt_regs frame. The main use case is calling C code from assembler
			
 
				+ * when all the registers need to be preserved.
			
 
				+ */
			
 
				+
			
 
				+	.macro SAVE_ALL
			
 
				+	pushl_cfi %eax
			
 
				+	CFI_REL_OFFSET eax, 0
			
 
				+	pushl_cfi %ebp
			
 
				+	CFI_REL_OFFSET ebp, 0
			
 
				+	pushl_cfi %edi
			
 
				+	CFI_REL_OFFSET edi, 0
			
 
				+	pushl_cfi %esi
			
 
				+	CFI_REL_OFFSET esi, 0
			
 
				+	pushl_cfi %edx
			
 
				+	CFI_REL_OFFSET edx, 0
			
 
				+	pushl_cfi %ecx
			
 
				+	CFI_REL_OFFSET ecx, 0
			
 
				+	pushl_cfi %ebx
			
 
				+	CFI_REL_OFFSET ebx, 0
			
 
				+	.endm
			
 
				+
			
 
				+	.macro RESTORE_ALL
			
 
				+	popl_cfi %ebx
			
 
				+	CFI_RESTORE ebx
			
 
				+	popl_cfi %ecx
			
 
				+	CFI_RESTORE ecx
			
 
				+	popl_cfi %edx
			
 
				+	CFI_RESTORE edx
			
 
				+	popl_cfi %esi
			
 
				+	CFI_RESTORE esi
			
 
				+	popl_cfi %edi
			
 
				+	CFI_RESTORE edi
			
 
				+	popl_cfi %ebp
			
 
				+	CFI_RESTORE ebp
			
 
				+	popl_cfi %eax
			
 
				+	CFI_RESTORE eax
			
 
				+	.endm
			
 
				+
			
 
				+#endif /* CONFIG_X86_64 */
			
 
				+
			
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l)
 
				  */
			
 
				 static inline int local_sub_and_test(long i, local_t *l)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(_ASM_SUB "%2,%0; sete %1"
			
 
				-		     : "+m" (l->a.counter), "=qm" (c)
			
 
				-		     : "ir" (i) : "memory");
			
 
				-	return c;
			
 
				+	GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l)
 
				  */
			
 
				 static inline int local_dec_and_test(local_t *l)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(_ASM_DEC "%0; sete %1"
			
 
				-		     : "+m" (l->a.counter), "=qm" (c)
			
 
				-		     : : "memory");
			
 
				-	return c != 0;
			
 
				+	GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l)
 
				  */
			
 
				 static inline int local_inc_and_test(local_t *l)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(_ASM_INC "%0; sete %1"
			
 
				-		     : "+m" (l->a.counter), "=qm" (c)
			
 
				-		     : : "memory");
			
 
				-	return c != 0;
			
 
				+	GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e");
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l)
 
				  */
			
 
				 static inline int local_add_negative(long i, local_t *l)
			
 
				 {
			
 
				-	unsigned char c;
			
 
				-
			
 
				-	asm volatile(_ASM_ADD "%2,%0; sets %1"
			
 
				-		     : "+m" (l->a.counter), "=qm" (c)
			
 
				-		     : "ir" (i) : "memory");
			
 
				-	return c;
			
 
				+	GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s");
			
 
				 }
			
 
				 
			
 
				 /**
			
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -0,0 +1,100 @@
 
				+#ifndef __ASM_PREEMPT_H
			
 
				+#define __ASM_PREEMPT_H
			
 
				+
			
 
				+#include <asm/rmwcc.h>
			
 
				+#include <asm/percpu.h>
			
 
				+#include <linux/thread_info.h>
			
 
				+
			
 
				+DECLARE_PER_CPU(int, __preempt_count);
			
 
				+
			
 
				+/*
			
 
				+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
			
 
				+ * that think a non-zero value indicates we cannot preempt.
			
 
				+ */
			
 
				+static __always_inline int preempt_count(void)
			
 
				+{
			
 
				+	return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
			
 
				+}
			
 
				+
			
 
				+static __always_inline void preempt_count_set(int pc)
			
 
				+{
			
 
				+	__this_cpu_write_4(__preempt_count, pc);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * must be macros to avoid header recursion hell
			
 
				+ */
			
 
				+#define task_preempt_count(p) \
			
 
				+	(task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED)
			
 
				+
			
 
				+#define init_task_preempt_count(p) do { \
			
 
				+	task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
			
 
				+} while (0)
			
 
				+
			
 
				+#define init_idle_preempt_count(p, cpu) do { \
			
 
				+	task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
			
 
				+	per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
			
 
				+} while (0)
			
 
				+
			
 
				+/*
			
 
				+ * We fold the NEED_RESCHED bit into the preempt count such that
			
 
				+ * preempt_enable() can decrement and test for needing to reschedule with a
			
 
				+ * single instruction.
			
 
				+ *
			
 
				+ * We invert the actual bit, so that when the decrement hits 0 we know we both
			
 
				+ * need to resched (the bit is cleared) and can resched (no preempt count).
			
 
				+ */
			
 
				+
			
 
				+static __always_inline void set_preempt_need_resched(void)
			
 
				+{
			
 
				+	__this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
			
 
				+}
			
 
				+
			
 
				+static __always_inline void clear_preempt_need_resched(void)
			
 
				+{
			
 
				+	__this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
			
 
				+}
			
 
				+
			
 
				+static __always_inline bool test_preempt_need_resched(void)
			
 
				+{
			
 
				+	return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * The various preempt_count add/sub methods
			
 
				+ */
			
 
				+
			
 
				+static __always_inline void __preempt_count_add(int val)
			
 
				+{
			
 
				+	__this_cpu_add_4(__preempt_count, val);
			
 
				+}
			
 
				+
			
 
				+static __always_inline void __preempt_count_sub(int val)
			
 
				+{
			
 
				+	__this_cpu_add_4(__preempt_count, -val);
			
 
				+}
			
 
				+
			
 
				+static __always_inline bool __preempt_count_dec_and_test(void)
			
 
				+{
			
 
				+	GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Returns true when we need to resched and can (barring IRQ state).
			
 
				+ */
			
 
				+static __always_inline bool should_resched(void)
			
 
				+{
			
 
				+	return unlikely(!__this_cpu_read_4(__preempt_count));
			
 
				+}
			
 
				+
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+  extern asmlinkage void ___preempt_schedule(void);
			
 
				+# define __preempt_schedule() asm ("call ___preempt_schedule")
			
 
				+  extern asmlinkage void preempt_schedule(void);
			
 
				+# ifdef CONFIG_CONTEXT_TRACKING
			
 
				+    extern asmlinkage void ___preempt_schedule_context(void);
			
 
				+#   define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
			
 
				+# endif
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __ASM_PREEMPT_H */
			
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,41 @@
 
				+#ifndef _ASM_X86_RMWcc
			
 
				+#define _ASM_X86_RMWcc
			
 
				+
			
 
				+#ifdef CC_HAVE_ASM_GOTO
			
 
				+
			
 
				+#define __GEN_RMWcc(fullop, var, cc, ...)				\
			
 
				+do {									\
			
 
				+	asm_volatile_goto (fullop "; j" cc " %l[cc_label]"		\
			
 
				+			: : "m" (var), ## __VA_ARGS__ 			\
			
 
				+			: "memory" : cc_label);				\
			
 
				+	return 0;							\
			
 
				+cc_label:								\
			
 
				+	return 1;							\
			
 
				+} while (0)
			
 
				+
			
 
				+#define GEN_UNARY_RMWcc(op, var, arg0, cc) 				\
			
 
				+	__GEN_RMWcc(op " " arg0, var, cc)
			
 
				+
			
 
				+#define GEN_BINARY_RMWcc(op, var, val, arg0, cc)			\
			
 
				+	__GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val))
			
 
				+
			
 
				+#else /* !CC_HAVE_ASM_GOTO */
			
 
				+
			
 
				+#define __GEN_RMWcc(fullop, var, cc, ...)				\
			
 
				+do {									\
			
 
				+	char c;								\
			
 
				+	asm volatile (fullop "; set" cc " %1"				\
			
 
				+			: "+m" (var), "=qm" (c)				\
			
 
				+			: __VA_ARGS__ : "memory");			\
			
 
				+	return c != 0;							\
			
 
				+} while (0)
			
 
				+
			
 
				+#define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
			
 
				+	__GEN_RMWcc(op " " arg0, var, cc)
			
 
				+
			
 
				+#define GEN_BINARY_RMWcc(op, var, val, arg0, cc)			\
			
 
				+	__GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val))
			
 
				+
			
 
				+#endif /* CC_HAVE_ASM_GOTO */
			
 
				+
			
 
				+#endif /* _ASM_X86_RMWcc */
			
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -28,8 +28,7 @@ struct thread_info {
 
				 	__u32			flags;		/* low level flags */
			
 
				 	__u32			status;		/* thread synchronous flags */
			
 
				 	__u32			cpu;		/* current CPU */
			
 
				-	int			preempt_count;	/* 0 => preemptable,
			
 
				-						   <0 => BUG */
			
 
				+	int			saved_preempt_count;
			
 
				 	mm_segment_t		addr_limit;
			
 
				 	struct restart_block    restart_block;
			
 
				 	void __user		*sysenter_return;
			
@@ -49,7 +48,7 @@ struct thread_info {
 
				 	.exec_domain	= &default_exec_domain,	\
			
 
				 	.flags		= 0,			\
			
 
				 	.cpu		= 0,			\
			
 
				-	.preempt_count	= INIT_PREEMPT_COUNT,	\
			
 
				+	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
			
 
				 	.addr_limit	= KERNEL_DS,		\
			
 
				 	.restart_block = {			\
			
 
				 		.fn = do_no_restart_syscall,	\
			
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,6 +36,8 @@ obj-y			+= tsc.o io_delay.o rtc.o
 
				 obj-y			+= pci-iommu_table.o
			
 
				 obj-y			+= resource.o
			
 
				 
			
 
				+obj-$(CONFIG_PREEMPT)	+= preempt.o
			
 
				+
			
 
				 obj-y				+= process.o
			
 
				 obj-y				+= i387.o xsave.o
			
 
				 obj-y				+= ptrace.o
			
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,6 @@ void common(void) {
 
				 	OFFSET(TI_flags, thread_info, flags);
			
 
				 	OFFSET(TI_status, thread_info, status);
			
 
				 	OFFSET(TI_addr_limit, thread_info, addr_limit);
			
 
				-	OFFSET(TI_preempt_count, thread_info, preempt_count);
			
 
				 
			
 
				 	BLANK();
			
 
				 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
			
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
				 
			
 
				 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
			
 
				 
			
 
				+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
			
 
				+EXPORT_PER_CPU_SYMBOL(__preempt_count);
			
 
				+
			
 
				 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
			
 
				 
			
 
				 /*
			
@@ -1169,6 +1172,8 @@ void debug_stack_reset(void)
 
				 
			
 
				 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
			
 
				 EXPORT_PER_CPU_SYMBOL(current_task);
			
 
				+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
			
 
				+EXPORT_PER_CPU_SYMBOL(__preempt_count);
			
 
				 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
			
 
				 
			
 
				 #ifdef CONFIG_CC_STACKPROTECTOR
			
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -362,12 +362,9 @@ END(ret_from_exception)
 
				 #ifdef CONFIG_PREEMPT
			
 
				 ENTRY(resume_kernel)
			
 
				 	DISABLE_INTERRUPTS(CLBR_ANY)
			
 
				-	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
			
 
				-	jnz restore_all
			
 
				 need_resched:
			
 
				-	movl TI_flags(%ebp), %ecx	# need_resched set ?
			
 
				-	testb $_TIF_NEED_RESCHED, %cl
			
 
				-	jz restore_all
			
 
				+	cmpl $0,PER_CPU_VAR(__preempt_count)
			
 
				+	jnz restore_all
			
 
				 	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
			
 
				 	jz restore_all
			
 
				 	call preempt_schedule_irq
			
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1103,10 +1103,8 @@ retint_signal:
 
				 	/* Returning to kernel space. Check if we need preemption */
			
 
				 	/* rcx:	 threadinfo. interrupts off. */
			
 
				 ENTRY(retint_kernel)
			
 
				-	cmpl $0,TI_preempt_count(%rcx)
			
 
				+	cmpl $0,PER_CPU_VAR(__preempt_count)
			
 
				 	jnz  retint_restore_args
			
 
				-	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
			
 
				-	jnc  retint_restore_args
			
 
				 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
			
 
				 	jnc  retint_restore_args
			
 
				 	call preempt_schedule_irq
			
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);
 
				 
			
 
				 EXPORT_SYMBOL(csum_partial);
			
 
				 EXPORT_SYMBOL(empty_zero_page);
			
 
				+
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+EXPORT_SYMBOL(___preempt_schedule);
			
 
				+#ifdef CONFIG_CONTEXT_TRACKING
			
 
				+EXPORT_SYMBOL(___preempt_schedule_context);
			
 
				+#endif
			
 
				+#endif
			
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 
				 	irqctx->tinfo.task = curctx->tinfo.task;
			
 
				 	irqctx->tinfo.previous_esp = current_stack_pointer;
			
 
				 
			
 
				-	/* Copy the preempt_count so that the [soft]irq checks work. */
			
 
				-	irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
			
 
				-
			
 
				 	if (unlikely(overflow))
			
 
				 		call_on_stack(print_stack_overflow, isp);
			
 
				 
			
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu)
 
				 					       THREAD_SIZE_ORDER));
			
 
				 	memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
			
 
				 	irqctx->tinfo.cpu		= cpu;
			
 
				-	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
			
 
				 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
			
 
				 
			
 
				 	per_cpu(hardirq_ctx, cpu) = irqctx;
			
--- a/arch/x86/kernel/preempt.S
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
 
				+
			
 
				+#include <linux/linkage.h>
			
 
				+#include <asm/dwarf2.h>
			
 
				+#include <asm/asm.h>
			
 
				+#include <asm/calling.h>
			
 
				+
			
 
				+ENTRY(___preempt_schedule)
			
 
				+	CFI_STARTPROC
			
 
				+	SAVE_ALL
			
 
				+	call preempt_schedule
			
 
				+	RESTORE_ALL
			
 
				+	ret
			
 
				+	CFI_ENDPROC
			
 
				+
			
 
				+#ifdef CONFIG_CONTEXT_TRACKING
			
 
				+
			
 
				+ENTRY(___preempt_schedule_context)
			
 
				+	CFI_STARTPROC
			
 
				+	SAVE_ALL
			
 
				+	call preempt_schedule_context
			
 
				+	RESTORE_ALL
			
 
				+	ret
			
 
				+	CFI_ENDPROC
			
 
				+
			
 
				+#endif
			
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -391,9 +391,9 @@ static void amd_e400_idle(void)
 
				 		 * The switch back from broadcast mode needs to be
			
 
				 		 * called with interrupts disabled.
			
 
				 		 */
			
 
				-		 local_irq_disable();
			
 
				-		 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
			
 
				-		 local_irq_enable();
			
 
				+		local_irq_disable();
			
 
				+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
			
 
				+		local_irq_enable();
			
 
				 	} else
			
 
				 		default_idle();
			
 
				 }
			
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -291,6 +291,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
				 	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
			
 
				 		set_iopl_mask(next->iopl);
			
 
				 
			
 
				+	/*
			
 
				+	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
			
 
				+	 * preempt_count of all tasks was equal here and this would not be
			
 
				+	 * needed.
			
 
				+	 */
			
 
				+	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
			
 
				+	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
			
 
				+
			
 
				 	/*
			
 
				 	 * Now maybe handle debug registers and/or IO bitmaps
			
 
				 	 */
			
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
				 	this_cpu_write(old_rsp, next->usersp);
			
 
				 	this_cpu_write(current_task, next_p);
			
 
				 
			
 
				+	/*
			
 
				+	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
			
 
				+	 * preempt_count of all tasks was equal here and this would not be
			
 
				+	 * needed.
			
 
				+	 */
			
 
				+	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
			
 
				+	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
			
 
				+
			
 
				 	this_cpu_write(kernel_stack,
			
 
				 		  (unsigned long)task_stack_page(next_p) +
			
 
				 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
			
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs)
 
				 
			
 
				 static inline void preempt_conditional_sti(struct pt_regs *regs)
			
 
				 {
			
 
				-	inc_preempt_count();
			
 
				+	preempt_count_inc();
			
 
				 	if (regs->flags & X86_EFLAGS_IF)
			
 
				 		local_irq_enable();
			
 
				 }
			
@@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 
				 {
			
 
				 	if (regs->flags & X86_EFLAGS_IF)
			
 
				 		local_irq_disable();
			
 
				-	dec_preempt_count();
			
 
				+	preempt_count_dec();
			
 
				 }
			
 
				 
			
 
				 static int __kprobes
			
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page);
 
				 #ifndef CONFIG_PARAVIRT
			
 
				 EXPORT_SYMBOL(native_load_gs_index);
			
 
				 #endif
			
 
				+
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+EXPORT_SYMBOL(___preempt_schedule);
			
 
				+#ifdef CONFIG_CONTEXT_TRACKING
			
 
				+EXPORT_SYMBOL(___preempt_schedule_context);
			
 
				+#endif
			
 
				+#endif
			
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -28,3 +28,4 @@ generic-y += termios.h
 
				 generic-y += topology.h
			
 
				 generic-y += trace_clock.h
			
 
				 generic-y += xor.h
			
 
				+generic-y += preempt.h
			
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = {
 
				  */
			
 
				 static void acpi_safe_halt(void)
			
 
				 {
			
 
				-	current_thread_info()->status &= ~TS_POLLING;
			
 
				-	/*
			
 
				-	 * TS_POLLING-cleared state must be visible before we
			
 
				-	 * test NEED_RESCHED:
			
 
				-	 */
			
 
				-	smp_mb();
			
 
				-	if (!need_resched()) {
			
 
				+	if (!tif_need_resched()) {
			
 
				 		safe_halt();
			
 
				 		local_irq_disable();
			
 
				 	}
			
 
				-	current_thread_info()->status |= TS_POLLING;
			
 
				 }
			
 
				 
			
 
				 #ifdef ARCH_APICTIMER_STOPS_ON_C3
			
@@ -737,6 +730,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 
				 	if (unlikely(!pr))
			
 
				 		return -EINVAL;
			
 
				 
			
 
				+	if (cx->entry_method == ACPI_CSTATE_FFH) {
			
 
				+		if (current_set_polling_and_test())
			
 
				+			return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				 	lapic_timer_state_broadcast(pr, cx, 1);
			
 
				 	acpi_idle_do_entry(cx);
			
 
				 
			
@@ -790,18 +788,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 
				 	if (unlikely(!pr))
			
 
				 		return -EINVAL;
			
 
				 
			
 
				-	if (cx->entry_method != ACPI_CSTATE_FFH) {
			
 
				-		current_thread_info()->status &= ~TS_POLLING;
			
 
				-		/*
			
 
				-		 * TS_POLLING-cleared state must be visible before we test
			
 
				-		 * NEED_RESCHED:
			
 
				-		 */
			
 
				-		smp_mb();
			
 
				-
			
 
				-		if (unlikely(need_resched())) {
			
 
				-			current_thread_info()->status |= TS_POLLING;
			
 
				+	if (cx->entry_method == ACPI_CSTATE_FFH) {
			
 
				+		if (current_set_polling_and_test())
			
 
				 			return -EINVAL;
			
 
				-		}
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -819,9 +808,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 
				 
			
 
				 	sched_clock_idle_wakeup_event(0);
			
 
				 
			
 
				-	if (cx->entry_method != ACPI_CSTATE_FFH)
			
 
				-		current_thread_info()->status |= TS_POLLING;
			
 
				-
			
 
				 	lapic_timer_state_broadcast(pr, cx, 0);
			
 
				 	return index;
			
 
				 }
			
@@ -858,18 +844,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	if (cx->entry_method != ACPI_CSTATE_FFH) {
			
 
				-		current_thread_info()->status &= ~TS_POLLING;
			
 
				-		/*
			
 
				-		 * TS_POLLING-cleared state must be visible before we test
			
 
				-		 * NEED_RESCHED:
			
 
				-		 */
			
 
				-		smp_mb();
			
 
				-
			
 
				-		if (unlikely(need_resched())) {
			
 
				-			current_thread_info()->status |= TS_POLLING;
			
 
				+	if (cx->entry_method == ACPI_CSTATE_FFH) {
			
 
				+		if (current_set_polling_and_test())
			
 
				 			return -EINVAL;
			
 
				-		}
			
 
				 	}
			
 
				 
			
 
				 	acpi_unlazy_tlb(smp_processor_id());
			
@@ -915,9 +892,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
				 
			
 
				 	sched_clock_idle_wakeup_event(0);
			
 
				 
			
 
				-	if (cx->entry_method != ACPI_CSTATE_FFH)
			
 
				-		current_thread_info()->status |= TS_POLLING;
			
 
				-
			
 
				 	lapic_timer_state_broadcast(pr, cx, 0);
			
 
				 	return index;
			
 
				 }
			
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -359,7 +359,7 @@ static int intel_idle(struct cpuidle_device *dev,
 
				 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
			
 
				 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
			
 
				 
			
 
				-	if (!need_resched()) {
			
 
				+	if (!current_set_polling_and_test()) {
			
 
				 
			
 
				 		__monitor((void *)&current_thread_info()->flags, 0, 0);
			
 
				 		smp_mb();
			
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename,
 
				 	current->fs->in_exec = 0;
			
 
				 	current->in_execve = 0;
			
 
				 	acct_update_integrals(current);
			
 
				+	task_numa_free(current);
			
 
				 	free_bprm(bprm);
			
 
				 	if (displaced)
			
 
				 		put_files_struct(displaced);
			
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 
				 	seq_printf(m,
			
 
				 		"State:\t%s\n"
			
 
				 		"Tgid:\t%d\n"
			
 
				+		"Ngid:\t%d\n"
			
 
				 		"Pid:\t%d\n"
			
 
				 		"PPid:\t%d\n"
			
 
				 		"TracerPid:\t%d\n"
			
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 
				 		"Gid:\t%d\t%d\t%d\t%d\n",
			
 
				 		get_task_state(p),
			
 
				 		task_tgid_nr_ns(p, ns),
			
 
				+		task_numa_group_id(p),
			
 
				 		pid_nr_ns(pid, ns),
			
 
				 		ppid, tpid,
			
 
				 		from_kuid_munged(user_ns, cred->uid),
			
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -0,0 +1,105 @@
 
				+#ifndef __ASM_PREEMPT_H
			
 
				+#define __ASM_PREEMPT_H
			
 
				+
			
 
				+#include <linux/thread_info.h>
			
 
				+
			
 
				+/*
			
 
				+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
			
 
				+ * that think a non-zero value indicates we cannot preempt.
			
 
				+ */
			
 
				+static __always_inline int preempt_count(void)
			
 
				+{
			
 
				+	return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
			
 
				+}
			
 
				+
			
 
				+static __always_inline int *preempt_count_ptr(void)
			
 
				+{
			
 
				+	return &current_thread_info()->preempt_count;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
			
 
				+ * alternative is losing a reschedule. Better schedule too often -- also this
			
 
				+ * should be a very rare operation.
			
 
				+ */
			
 
				+static __always_inline void preempt_count_set(int pc)
			
 
				+{
			
 
				+	*preempt_count_ptr() = pc;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * must be macros to avoid header recursion hell
			
 
				+ */
			
 
				+#define task_preempt_count(p) \
			
 
				+	(task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED)
			
 
				+
			
 
				+#define init_task_preempt_count(p) do { \
			
 
				+	task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
			
 
				+} while (0)
			
 
				+
			
 
				+#define init_idle_preempt_count(p, cpu) do { \
			
 
				+	task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
			
 
				+} while (0)
			
 
				+
			
 
				+/*
			
 
				+ * We fold the NEED_RESCHED bit into the preempt count such that
			
 
				+ * preempt_enable() can decrement and test for needing to reschedule with a
			
 
				+ * single instruction.
			
 
				+ *
			
 
				+ * We invert the actual bit, so that when the decrement hits 0 we know we both
			
 
				+ * need to resched (the bit is cleared) and can resched (no preempt count).
			
 
				+ */
			
 
				+
			
 
				+static __always_inline void set_preempt_need_resched(void)
			
 
				+{
			
 
				+	*preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
			
 
				+}
			
 
				+
			
 
				+static __always_inline void clear_preempt_need_resched(void)
			
 
				+{
			
 
				+	*preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
			
 
				+}
			
 
				+
			
 
				+static __always_inline bool test_preempt_need_resched(void)
			
 
				+{
			
 
				+	return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * The various preempt_count add/sub methods
			
 
				+ */
			
 
				+
			
 
				+static __always_inline void __preempt_count_add(int val)
			
 
				+{
			
 
				+	*preempt_count_ptr() += val;
			
 
				+}
			
 
				+
			
 
				+static __always_inline void __preempt_count_sub(int val)
			
 
				+{
			
 
				+	*preempt_count_ptr() -= val;
			
 
				+}
			
 
				+
			
 
				+static __always_inline bool __preempt_count_dec_and_test(void)
			
 
				+{
			
 
				+	return !--*preempt_count_ptr();
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Returns true when we need to resched and can (barring IRQ state).
			
 
				+ */
			
 
				+static __always_inline bool should_resched(void)
			
 
				+{
			
 
				+	return unlikely(!*preempt_count_ptr());
			
 
				+}
			
 
				+
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+extern asmlinkage void preempt_schedule(void);
			
 
				+#define __preempt_schedule() preempt_schedule()
			
 
				+
			
 
				+#ifdef CONFIG_CONTEXT_TRACKING
			
 
				+extern asmlinkage void preempt_schedule_context(void);
			
 
				+#define __preempt_schedule_context() preempt_schedule_context()
			
 
				+#endif
			
 
				+#endif /* CONFIG_PREEMPT */
			
 
				+
			
 
				+#endif /* __ASM_PREEMPT_H */
			
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -5,7 +5,7 @@
 
				  * (C) Copyright 2001 Linus Torvalds
			
 
				  *
			
 
				  * Atomic wait-for-completion handler data structures.
			
 
				- * See kernel/sched/core.c for details.
			
 
				+ * See kernel/sched/completion.c for details.
			
 
				  */
			
 
				 
			
 
				 #include <linux/wait.h>
			
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void);
 
				 #define __irq_enter()					\
			
 
				 	do {						\
			
 
				 		account_irq_enter_time(current);	\
			
 
				-		add_preempt_count(HARDIRQ_OFFSET);	\
			
 
				+		preempt_count_add(HARDIRQ_OFFSET);	\
			
 
				 		trace_hardirq_enter();			\
			
 
				 	} while (0)
			
 
				 
			
@@ -49,7 +49,7 @@ extern void irq_enter(void);
 
				 	do {						\
			
 
				 		trace_hardirq_exit();			\
			
 
				 		account_irq_exit_time(current);		\
			
 
				-		sub_preempt_count(HARDIRQ_OFFSET);	\
			
 
				+		preempt_count_sub(HARDIRQ_OFFSET);	\
			
 
				 	} while (0)
			
 
				 
			
 
				 /*
			
@@ -62,7 +62,7 @@ extern void irq_exit(void);
 
				 		lockdep_off();					\
			
 
				 		ftrace_nmi_enter();				\
			
 
				 		BUG_ON(in_nmi());				\
			
 
				-		add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET);	\
			
 
				+		preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
			
 
				 		rcu_nmi_enter();				\
			
 
				 		trace_hardirq_enter();				\
			
 
				 	} while (0)
			
@@ -72,7 +72,7 @@ extern void irq_exit(void);
 
				 		trace_hardirq_exit();				\
			
 
				 		rcu_nmi_exit();					\
			
 
				 		BUG_ON(!in_nmi());				\
			
 
				-		sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET);	\
			
 
				+		preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
			
 
				 		ftrace_nmi_exit();				\
			
 
				 		lockdep_on();					\
			
 
				 	} while (0)
			
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
				 
			
 
				 struct mempolicy *get_vma_policy(struct task_struct *tsk,
			
 
				 		struct vm_area_struct *vma, unsigned long addr);
			
 
				+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma);
			
 
				 
			
 
				 extern void numa_default_policy(void);
			
 
				 extern void numa_policy_init(void);
			
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 
				 #endif /* CONFIG_MIGRATION */
			
 
				 
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				-extern int migrate_misplaced_page(struct page *page, int node);
			
 
				-extern int migrate_misplaced_page(struct page *page, int node);
			
 
				+extern int migrate_misplaced_page(struct page *page,
			
 
				+				  struct vm_area_struct *vma, int node);
			
 
				 extern bool migrate_ratelimited(int node);
			
 
				 #else
			
 
				-static inline int migrate_misplaced_page(struct page *page, int node)
			
 
				+static inline int migrate_misplaced_page(struct page *page,
			
 
				+					 struct vm_area_struct *vma, int node)
			
 
				 {
			
 
				 	return -EAGAIN; /* can't migrate now */
			
 
				 }
			
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 
				  * sets it, so none of the operations on it need to be atomic.
			
 
				  */
			
 
				 
			
 
				-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
			
 
				+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
			
 
				 #define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
			
 
				 #define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
			
 
				 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
			
 
				-#define LAST_NID_PGOFF		(ZONES_PGOFF - LAST_NID_WIDTH)
			
 
				+#define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
			
 
				 
			
 
				 /*
			
 
				  * Define the bit shifts to access each section.  For non-existent
			
@@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 
				 #define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
			
 
				 #define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
			
 
				 #define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
			
 
				-#define LAST_NID_PGSHIFT	(LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
			
 
				+#define LAST_CPUPID_PGSHIFT	(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
			
 
				 
			
 
				 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
			
 
				 #ifdef NODE_NOT_IN_PAGE_FLAGS
			
@@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 
				 #define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
			
 
				 #define NODES_MASK		((1UL << NODES_WIDTH) - 1)
			
 
				 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
			
 
				-#define LAST_NID_MASK		((1UL << LAST_NID_WIDTH) - 1)
			
 
				+#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_WIDTH) - 1)
			
 
				 #define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
			
 
				 
			
 
				 static inline enum zone_type page_zonenum(const struct page *page)
			
@@ -661,51 +661,117 @@ static inline int page_to_nid(const struct page *page)
 
				 #endif
			
 
				 
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
			
 
				-static inline int page_nid_xchg_last(struct page *page, int nid)
			
 
				+static inline int cpu_pid_to_cpupid(int cpu, int pid)
			
 
				 {
			
 
				-	return xchg(&page->_last_nid, nid);
			
 
				+	return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
			
 
				 }
			
 
				 
			
 
				-static inline int page_nid_last(struct page *page)
			
 
				+static inline int cpupid_to_pid(int cpupid)
			
 
				 {
			
 
				-	return page->_last_nid;
			
 
				+	return cpupid & LAST__PID_MASK;
			
 
				 }
			
 
				-static inline void page_nid_reset_last(struct page *page)
			
 
				+
			
 
				+static inline int cpupid_to_cpu(int cpupid)
			
 
				 {
			
 
				-	page->_last_nid = -1;
			
 
				+	return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
			
 
				 }
			
 
				-#else
			
 
				-static inline int page_nid_last(struct page *page)
			
 
				+
			
 
				+static inline int cpupid_to_nid(int cpupid)
			
 
				 {
			
 
				-	return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
			
 
				+	return cpu_to_node(cpupid_to_cpu(cpupid));
			
 
				 }
			
 
				 
			
 
				-extern int page_nid_xchg_last(struct page *page, int nid);
			
 
				+static inline bool cpupid_pid_unset(int cpupid)
			
 
				+{
			
 
				+	return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
			
 
				+}
			
 
				 
			
 
				-static inline void page_nid_reset_last(struct page *page)
			
 
				+static inline bool cpupid_cpu_unset(int cpupid)
			
 
				 {
			
 
				-	int nid = (1 << LAST_NID_SHIFT) - 1;
			
 
				+	return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
			
 
				+}
			
 
				 
			
 
				-	page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
			
 
				-	page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
			
 
				+static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
			
 
				+{
			
 
				+	return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
			
 
				+}
			
 
				+
			
 
				+#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
			
 
				+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
			
 
				+static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
			
 
				+{
			
 
				+	return xchg(&page->_last_cpupid, cpupid);
			
 
				+}
			
 
				+
			
 
				+static inline int page_cpupid_last(struct page *page)
			
 
				+{
			
 
				+	return page->_last_cpupid;
			
 
				+}
			
 
				+static inline void page_cpupid_reset_last(struct page *page)
			
 
				+{
			
 
				+	page->_last_cpupid = -1;
			
 
				 }
			
 
				-#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
			
 
				 #else
			
 
				-static inline int page_nid_xchg_last(struct page *page, int nid)
			
 
				+static inline int page_cpupid_last(struct page *page)
			
 
				 {
			
 
				-	return page_to_nid(page);
			
 
				+	return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
			
 
				 }
			
 
				 
			
 
				-static inline int page_nid_last(struct page *page)
			
 
				+extern int page_cpupid_xchg_last(struct page *page, int cpupid);
			
 
				+
			
 
				+static inline void page_cpupid_reset_last(struct page *page)
			
 
				 {
			
 
				-	return page_to_nid(page);
			
 
				+	int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
			
 
				+
			
 
				+	page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
			
 
				+	page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
			
 
				+}
			
 
				+#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
			
 
				+#else /* !CONFIG_NUMA_BALANCING */
			
 
				+static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
			
 
				+{
			
 
				+	return page_to_nid(page); /* XXX */
			
 
				 }
			
 
				 
			
 
				-static inline void page_nid_reset_last(struct page *page)
			
 
				+static inline int page_cpupid_last(struct page *page)
			
 
				 {
			
 
				+	return page_to_nid(page); /* XXX */
			
 
				 }
			
 
				-#endif
			
 
				+
			
 
				+static inline int cpupid_to_nid(int cpupid)
			
 
				+{
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static inline int cpupid_to_pid(int cpupid)
			
 
				+{
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static inline int cpupid_to_cpu(int cpupid)
			
 
				+{
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static inline int cpu_pid_to_cpupid(int nid, int pid)
			
 
				+{
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static inline bool cpupid_pid_unset(int cpupid)
			
 
				+{
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+static inline void page_cpupid_reset_last(struct page *page)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
			
 
				+{
			
 
				+	return false;
			
 
				+}
			
 
				+#endif /* CONFIG_NUMA_BALANCING */
			
 
				 
			
 
				 static inline struct zone *page_zone(const struct page *page)
			
 
				 {
			
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -174,8 +174,8 @@ struct page {
 
				 	void *shadow;
			
 
				 #endif
			
 
				 
			
 
				-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
			
 
				-	int _last_nid;
			
 
				+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
			
 
				+	int _last_cpupid;
			
 
				 #endif
			
 
				 }
			
 
				 /*
			
@@ -420,28 +420,15 @@ struct mm_struct {
 
				 	 */
			
 
				 	unsigned long numa_next_scan;
			
 
				 
			
 
				-	/* numa_next_reset is when the PTE scanner period will be reset */
			
 
				-	unsigned long numa_next_reset;
			
 
				-
			
 
				 	/* Restart point for scanning and setting pte_numa */
			
 
				 	unsigned long numa_scan_offset;
			
 
				 
			
 
				 	/* numa_scan_seq prevents two threads setting pte_numa */
			
 
				 	int numa_scan_seq;
			
 
				-
			
 
				-	/*
			
 
				-	 * The first node a task was scheduled on. If a task runs on
			
 
				-	 * a different node than Make PTE Scan Go Now.
			
 
				-	 */
			
 
				-	int first_nid;
			
 
				 #endif
			
 
				 	struct uprobes_state uprobes_state;
			
 
				 };
			
 
				 
			
 
				-/* first nid will either be a valid NID or one of these values */
			
 
				-#define NUMA_PTE_SCAN_INIT	-1
			
 
				-#define NUMA_PTE_SCAN_ACTIVE	-2
			
 
				-
			
 
				 static inline void mm_init_cpumask(struct mm_struct *mm)
			
 
				 {
			
 
				 #ifdef CONFIG_CPUMASK_OFFSTACK
			
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -38,10 +38,10 @@
 
				  * The last is when there is insufficient space in page->flags and a separate
			
 
				  * lookup is necessary.
			
 
				  *
			
 
				- * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |          ... | FLAGS |
			
 
				- *         " plus space for last_nid: |       NODE     | ZONE | LAST_NID ... | FLAGS |
			
 
				- * classic sparse with space for node:| SECTION | NODE | ZONE |          ... | FLAGS |
			
 
				- *         " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
			
 
				+ * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |             ... | FLAGS |
			
 
				+ *      " plus space for last_cpupid: |       NODE     | ZONE | LAST_CPUPID ... | FLAGS |
			
 
				+ * classic sparse with space for node:| SECTION | NODE | ZONE |             ... | FLAGS |
			
 
				+ *      " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
			
 
				  * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
			
 
				  */
			
 
				 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
			
@@ -62,15 +62,21 @@
 
				 #endif
			
 
				 
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				-#define LAST_NID_SHIFT NODES_SHIFT
			
 
				+#define LAST__PID_SHIFT 8
			
 
				+#define LAST__PID_MASK  ((1 << LAST__PID_SHIFT)-1)
			
 
				+
			
 
				+#define LAST__CPU_SHIFT NR_CPUS_BITS
			
 
				+#define LAST__CPU_MASK  ((1 << LAST__CPU_SHIFT)-1)
			
 
				+
			
 
				+#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT)
			
 
				 #else
			
 
				-#define LAST_NID_SHIFT 0
			
 
				+#define LAST_CPUPID_SHIFT 0
			
 
				 #endif
			
 
				 
			
 
				-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
			
 
				-#define LAST_NID_WIDTH LAST_NID_SHIFT
			
 
				+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
			
 
				+#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
			
 
				 #else
			
 
				-#define LAST_NID_WIDTH 0
			
 
				+#define LAST_CPUPID_WIDTH 0
			
 
				 #endif
			
 
				 
			
 
				 /*
			
@@ -81,8 +87,8 @@
 
				 #define NODE_NOT_IN_PAGE_FLAGS
			
 
				 #endif
			
 
				 
			
 
				-#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
			
 
				-#define LAST_NID_NOT_IN_PAGE_FLAGS
			
 
				+#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0
			
 
				+#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
			
 
				 #endif
			
 
				 
			
 
				 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
			
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,106 +6,95 @@
 
				  * preempt_count (used for kernel preemption, interrupt count, etc.)
			
 
				  */
			
 
				 
			
 
				-#include <linux/thread_info.h>
			
 
				 #include <linux/linkage.h>
			
 
				 #include <linux/list.h>
			
 
				 
			
 
				-#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
			
 
				-  extern void add_preempt_count(int val);
			
 
				-  extern void sub_preempt_count(int val);
			
 
				-#else
			
 
				-# define add_preempt_count(val)	do { preempt_count() += (val); } while (0)
			
 
				-# define sub_preempt_count(val)	do { preempt_count() -= (val); } while (0)
			
 
				-#endif
			
 
				-
			
 
				-#define inc_preempt_count() add_preempt_count(1)
			
 
				-#define dec_preempt_count() sub_preempt_count(1)
			
 
				-
			
 
				-#define preempt_count()	(current_thread_info()->preempt_count)
			
 
				-
			
 
				-#ifdef CONFIG_PREEMPT
			
 
				-
			
 
				-asmlinkage void preempt_schedule(void);
			
 
				-
			
 
				-#define preempt_check_resched() \
			
 
				-do { \
			
 
				-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
			
 
				-		preempt_schedule(); \
			
 
				-} while (0)
			
 
				-
			
 
				-#ifdef CONFIG_CONTEXT_TRACKING
			
 
				+/*
			
 
				+ * We use the MSB mostly because it's available; see <linux/preempt_mask.h> for
			
 
				+ * the other bits -- can't include that header due to inclusion hell.
			
 
				+ */
			
 
				+#define PREEMPT_NEED_RESCHED	0x80000000
			
 
				 
			
 
				-void preempt_schedule_context(void);
			
 
				+#include <asm/preempt.h>
			
 
				 
			
 
				-#define preempt_check_resched_context() \
			
 
				-do { \
			
 
				-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
			
 
				-		preempt_schedule_context(); \
			
 
				-} while (0)
			
 
				+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
			
 
				+extern void preempt_count_add(int val);
			
 
				+extern void preempt_count_sub(int val);
			
 
				+#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); })
			
 
				 #else
			
 
				+#define preempt_count_add(val)	__preempt_count_add(val)
			
 
				+#define preempt_count_sub(val)	__preempt_count_sub(val)
			
 
				+#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
			
 
				+#endif
			
 
				 
			
 
				-#define preempt_check_resched_context() preempt_check_resched()
			
 
				-
			
 
				-#endif /* CONFIG_CONTEXT_TRACKING */
			
 
				-
			
 
				-#else /* !CONFIG_PREEMPT */
			
 
				-
			
 
				-#define preempt_check_resched()		do { } while (0)
			
 
				-#define preempt_check_resched_context()	do { } while (0)
			
 
				-
			
 
				-#endif /* CONFIG_PREEMPT */
			
 
				+#define __preempt_count_inc() __preempt_count_add(1)
			
 
				+#define __preempt_count_dec() __preempt_count_sub(1)
			
 
				 
			
 
				+#define preempt_count_inc() preempt_count_add(1)
			
 
				+#define preempt_count_dec() preempt_count_sub(1)
			
 
				 
			
 
				 #ifdef CONFIG_PREEMPT_COUNT
			
 
				 
			
 
				 #define preempt_disable() \
			
 
				 do { \
			
 
				-	inc_preempt_count(); \
			
 
				+	preempt_count_inc(); \
			
 
				 	barrier(); \
			
 
				 } while (0)
			
 
				 
			
 
				 #define sched_preempt_enable_no_resched() \
			
 
				 do { \
			
 
				 	barrier(); \
			
 
				-	dec_preempt_count(); \
			
 
				+	preempt_count_dec(); \
			
 
				 } while (0)
			
 
				 
			
 
				-#define preempt_enable_no_resched()	sched_preempt_enable_no_resched()
			
 
				+#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
			
 
				 
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				 #define preempt_enable() \
			
 
				 do { \
			
 
				-	preempt_enable_no_resched(); \
			
 
				 	barrier(); \
			
 
				-	preempt_check_resched(); \
			
 
				+	if (unlikely(preempt_count_dec_and_test())) \
			
 
				+		__preempt_schedule(); \
			
 
				+} while (0)
			
 
				+
			
 
				+#define preempt_check_resched() \
			
 
				+do { \
			
 
				+	if (should_resched()) \
			
 
				+		__preempt_schedule(); \
			
 
				 } while (0)
			
 
				 
			
 
				-/* For debugging and tracer internals only! */
			
 
				-#define add_preempt_count_notrace(val)			\
			
 
				-	do { preempt_count() += (val); } while (0)
			
 
				-#define sub_preempt_count_notrace(val)			\
			
 
				-	do { preempt_count() -= (val); } while (0)
			
 
				-#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
			
 
				-#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
			
 
				+#else
			
 
				+#define preempt_enable() preempt_enable_no_resched()
			
 
				+#define preempt_check_resched() do { } while (0)
			
 
				+#endif
			
 
				 
			
 
				 #define preempt_disable_notrace() \
			
 
				 do { \
			
 
				-	inc_preempt_count_notrace(); \
			
 
				+	__preempt_count_inc(); \
			
 
				 	barrier(); \
			
 
				 } while (0)
			
 
				 
			
 
				 #define preempt_enable_no_resched_notrace() \
			
 
				 do { \
			
 
				 	barrier(); \
			
 
				-	dec_preempt_count_notrace(); \
			
 
				+	__preempt_count_dec(); \
			
 
				 } while (0)
			
 
				 
			
 
				-/* preempt_check_resched is OK to trace */
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+
			
 
				+#ifndef CONFIG_CONTEXT_TRACKING
			
 
				+#define __preempt_schedule_context() __preempt_schedule()
			
 
				+#endif
			
 
				+
			
 
				 #define preempt_enable_notrace() \
			
 
				 do { \
			
 
				-	preempt_enable_no_resched_notrace(); \
			
 
				 	barrier(); \
			
 
				-	preempt_check_resched_context(); \
			
 
				+	if (unlikely(__preempt_count_dec_and_test())) \
			
 
				+		__preempt_schedule_context(); \
			
 
				 } while (0)
			
 
				+#else
			
 
				+#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
			
 
				+#endif
			
 
				 
			
 
				 #else /* !CONFIG_PREEMPT_COUNT */
			
 
				 
			
@@ -115,10 +104,11 @@ do { \
 
				  * that can cause faults and scheduling migrate into our preempt-protected
			
 
				  * region.
			
 
				  */
			
 
				-#define preempt_disable()		barrier()
			
 
				+#define preempt_disable()			barrier()
			
 
				 #define sched_preempt_enable_no_resched()	barrier()
			
 
				-#define preempt_enable_no_resched()	barrier()
			
 
				-#define preempt_enable()		barrier()
			
 
				+#define preempt_enable_no_resched()		barrier()
			
 
				+#define preempt_enable()			barrier()
			
 
				+#define preempt_check_resched()			do { } while (0)
			
 
				 
			
 
				 #define preempt_disable_notrace()		barrier()
			
 
				 #define preempt_enable_no_resched_notrace()	barrier()
			
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,6 +22,7 @@ struct sched_param {
 
				 #include <linux/errno.h>
			
 
				 #include <linux/nodemask.h>
			
 
				 #include <linux/mm_types.h>
			
 
				+#include <linux/preempt.h>
			
 
				 
			
 
				 #include <asm/page.h>
			
 
				 #include <asm/ptrace.h>
			
@@ -427,6 +428,14 @@ struct task_cputime {
 
				 		.sum_exec_runtime = 0,				\
			
 
				 	}
			
 
				 
			
 
				+#define PREEMPT_ENABLED		(PREEMPT_NEED_RESCHED)
			
 
				+
			
 
				+#ifdef CONFIG_PREEMPT_COUNT
			
 
				+#define PREEMPT_DISABLED	(1 + PREEMPT_ENABLED)
			
 
				+#else
			
 
				+#define PREEMPT_DISABLED	PREEMPT_ENABLED
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  * Disable preemption until the scheduler is running.
			
 
				  * Reset by start_kernel()->sched_init()->init_idle().
			
@@ -434,7 +443,7 @@ struct task_cputime {
 
				  * We include PREEMPT_ACTIVE to avoid cond_resched() from working
			
 
				  * before the scheduler is active -- see should_resched().
			
 
				  */
			
 
				-#define INIT_PREEMPT_COUNT	(1 + PREEMPT_ACTIVE)
			
 
				+#define INIT_PREEMPT_COUNT	(PREEMPT_DISABLED + PREEMPT_ACTIVE)
			
 
				 
			
 
				 /**
			
 
				  * struct thread_group_cputimer - thread group interval timer counts
			
@@ -768,6 +777,7 @@ enum cpu_idle_type {
 
				 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
			
 
				 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
			
 
				 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
			
 
				+#define SD_NUMA			0x4000	/* cross-node balancing */
			
 
				 
			
 
				 extern int __weak arch_sd_sibiling_asym_packing(void);
			
 
				 
			
@@ -811,6 +821,10 @@ struct sched_domain {
 
				 
			
 
				 	u64 last_update;
			
 
				 
			
 
				+	/* idle_balance() stats */
			
 
				+	u64 max_newidle_lb_cost;
			
 
				+	unsigned long next_decay_max_lb_cost;
			
 
				+
			
 
				 #ifdef CONFIG_SCHEDSTATS
			
 
				 	/* load_balance() stats */
			
 
				 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
			
@@ -1029,6 +1043,8 @@ struct task_struct {
 
				 	struct task_struct *last_wakee;
			
 
				 	unsigned long wakee_flips;
			
 
				 	unsigned long wakee_flip_decay_ts;
			
 
				+
			
 
				+	int wake_cpu;
			
 
				 #endif
			
 
				 	int on_rq;
			
 
				 
			
@@ -1324,10 +1340,41 @@ struct task_struct {
 
				 #endif
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				 	int numa_scan_seq;
			
 
				-	int numa_migrate_seq;
			
 
				 	unsigned int numa_scan_period;
			
 
				+	unsigned int numa_scan_period_max;
			
 
				+	int numa_preferred_nid;
			
 
				+	int numa_migrate_deferred;
			
 
				+	unsigned long numa_migrate_retry;
			
 
				 	u64 node_stamp;			/* migration stamp  */
			
 
				 	struct callback_head numa_work;
			
 
				+
			
 
				+	struct list_head numa_entry;
			
 
				+	struct numa_group *numa_group;
			
 
				+
			
 
				+	/*
			
 
				+	 * Exponential decaying average of faults on a per-node basis.
			
 
				+	 * Scheduling placement decisions are made based on these counts.
			
 
				+	 * The values remain static for the duration of a PTE scan
			
 
				+	 */
			
 
				+	unsigned long *numa_faults;
			
 
				+	unsigned long total_numa_faults;
			
 
				+
			
 
				+	/*
			
 
				+	 * numa_faults_buffer records faults per node during the current
			
 
				+	 * scan window. When the scan completes, the counts in numa_faults
			
 
				+	 * decay and these values are copied.
			
 
				+	 */
			
 
				+	unsigned long *numa_faults_buffer;
			
 
				+
			
 
				+	/*
			
 
				+	 * numa_faults_locality tracks if faults recorded during the last
			
 
				+	 * scan window were remote/local. The task scan period is adapted
			
 
				+	 * based on the locality of the faults with different weights
			
 
				+	 * depending on whether they were shared or private faults
			
 
				+	 */
			
 
				+	unsigned long numa_faults_locality[2];
			
 
				+
			
 
				+	unsigned long numa_pages_migrated;
			
 
				 #endif /* CONFIG_NUMA_BALANCING */
			
 
				 
			
 
				 	struct rcu_head rcu;
			
@@ -1412,16 +1459,33 @@ struct task_struct {
 
				 /* Future-safe accessor for struct task_struct's cpus_allowed. */
			
 
				 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
			
 
				 
			
 
				+#define TNF_MIGRATED	0x01
			
 
				+#define TNF_NO_GROUP	0x02
			
 
				+#define TNF_SHARED	0x04
			
 
				+#define TNF_FAULT_LOCAL	0x08
			
 
				+
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				-extern void task_numa_fault(int node, int pages, bool migrated);
			
 
				+extern void task_numa_fault(int last_node, int node, int pages, int flags);
			
 
				+extern pid_t task_numa_group_id(struct task_struct *p);
			
 
				 extern void set_numabalancing_state(bool enabled);
			
 
				+extern void task_numa_free(struct task_struct *p);
			
 
				+
			
 
				+extern unsigned int sysctl_numa_balancing_migrate_deferred;
			
 
				 #else
			
 
				-static inline void task_numa_fault(int node, int pages, bool migrated)
			
 
				+static inline void task_numa_fault(int last_node, int node, int pages,
			
 
				+				   int flags)
			
 
				 {
			
 
				 }
			
 
				+static inline pid_t task_numa_group_id(struct task_struct *p)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				 static inline void set_numabalancing_state(bool enabled)
			
 
				 {
			
 
				 }
			
 
				+static inline void task_numa_free(struct task_struct *p)
			
 
				+{
			
 
				+}
			
 
				 #endif
			
 
				 
			
 
				 static inline struct pid *task_pid(struct task_struct *task)
			
@@ -1974,7 +2038,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
 
				 #else
			
 
				  static inline void kick_process(struct task_struct *tsk) { }
			
 
				 #endif
			
 
				-extern void sched_fork(struct task_struct *p);
			
 
				+extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
			
 
				 extern void sched_dead(struct task_struct *p);
			
 
				 
			
 
				 extern void proc_caches_init(void);
			
@@ -2401,11 +2465,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
 
				 	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
			
 
				 }
			
 
				 
			
 
				-static inline int need_resched(void)
			
 
				-{
			
 
				-	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * cond_resched() and cond_resched_lock(): latency reduction via
			
 
				  * explicit rescheduling in places that are safe. The return
			
@@ -2474,36 +2533,105 @@ static inline int tsk_is_polling(struct task_struct *p)
 
				 {
			
 
				 	return task_thread_info(p)->status & TS_POLLING;
			
 
				 }
			
 
				-static inline void current_set_polling(void)
			
 
				+static inline void __current_set_polling(void)
			
 
				 {
			
 
				 	current_thread_info()->status |= TS_POLLING;
			
 
				 }
			
 
				 
			
 
				-static inline void current_clr_polling(void)
			
 
				+static inline bool __must_check current_set_polling_and_test(void)
			
 
				+{
			
 
				+	__current_set_polling();
			
 
				+
			
 
				+	/*
			
 
				+	 * Polling state must be visible before we test NEED_RESCHED,
			
 
				+	 * paired by resched_task()
			
 
				+	 */
			
 
				+	smp_mb();
			
 
				+
			
 
				+	return unlikely(tif_need_resched());
			
 
				+}
			
 
				+
			
 
				+static inline void __current_clr_polling(void)
			
 
				 {
			
 
				 	current_thread_info()->status &= ~TS_POLLING;
			
 
				-	smp_mb__after_clear_bit();
			
 
				+}
			
 
				+
			
 
				+static inline bool __must_check current_clr_polling_and_test(void)
			
 
				+{
			
 
				+	__current_clr_polling();
			
 
				+
			
 
				+	/*
			
 
				+	 * Polling state must be visible before we test NEED_RESCHED,
			
 
				+	 * paired by resched_task()
			
 
				+	 */
			
 
				+	smp_mb();
			
 
				+
			
 
				+	return unlikely(tif_need_resched());
			
 
				 }
			
 
				 #elif defined(TIF_POLLING_NRFLAG)
			
 
				 static inline int tsk_is_polling(struct task_struct *p)
			
 
				 {
			
 
				 	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
			
 
				 }
			
 
				-static inline void current_set_polling(void)
			
 
				+
			
 
				+static inline void __current_set_polling(void)
			
 
				 {
			
 
				 	set_thread_flag(TIF_POLLING_NRFLAG);
			
 
				 }
			
 
				 
			
 
				-static inline void current_clr_polling(void)
			
 
				+static inline bool __must_check current_set_polling_and_test(void)
			
 
				+{
			
 
				+	__current_set_polling();
			
 
				+
			
 
				+	/*
			
 
				+	 * Polling state must be visible before we test NEED_RESCHED,
			
 
				+	 * paired by resched_task()
			
 
				+	 *
			
 
				+	 * XXX: assumes set/clear bit are identical barrier wise.
			
 
				+	 */
			
 
				+	smp_mb__after_clear_bit();
			
 
				+
			
 
				+	return unlikely(tif_need_resched());
			
 
				+}
			
 
				+
			
 
				+static inline void __current_clr_polling(void)
			
 
				 {
			
 
				 	clear_thread_flag(TIF_POLLING_NRFLAG);
			
 
				 }
			
 
				+
			
 
				+static inline bool __must_check current_clr_polling_and_test(void)
			
 
				+{
			
 
				+	__current_clr_polling();
			
 
				+
			
 
				+	/*
			
 
				+	 * Polling state must be visible before we test NEED_RESCHED,
			
 
				+	 * paired by resched_task()
			
 
				+	 */
			
 
				+	smp_mb__after_clear_bit();
			
 
				+
			
 
				+	return unlikely(tif_need_resched());
			
 
				+}
			
 
				+
			
 
				 #else
			
 
				 static inline int tsk_is_polling(struct task_struct *p) { return 0; }
			
 
				-static inline void current_set_polling(void) { }
			
 
				-static inline void current_clr_polling(void) { }
			
 
				+static inline void __current_set_polling(void) { }
			
 
				+static inline void __current_clr_polling(void) { }
			
 
				+
			
 
				+static inline bool __must_check current_set_polling_and_test(void)
			
 
				+{
			
 
				+	return unlikely(tif_need_resched());
			
 
				+}
			
 
				+static inline bool __must_check current_clr_polling_and_test(void)
			
 
				+{
			
 
				+	return unlikely(tif_need_resched());
			
 
				+}
			
 
				 #endif
			
 
				 
			
 
				+static __always_inline bool need_resched(void)
			
 
				+{
			
 
				+	return unlikely(tif_need_resched());
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Thread group CPU time accounting.
			
 
				  */
			
@@ -2545,6 +2673,11 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 
				 	return task_thread_info(p)->cpu;
			
 
				 }
			
 
				 
			
 
				+static inline int task_node(const struct task_struct *p)
			
 
				+{
			
 
				+	return cpu_to_node(task_cpu(p));
			
 
				+}
			
 
				+
			
 
				 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
			
 
				 
			
 
				 #else
			
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
				 extern unsigned int sysctl_numa_balancing_scan_delay;
			
 
				 extern unsigned int sysctl_numa_balancing_scan_period_min;
			
 
				 extern unsigned int sysctl_numa_balancing_scan_period_max;
			
 
				-extern unsigned int sysctl_numa_balancing_scan_period_reset;
			
 
				 extern unsigned int sysctl_numa_balancing_scan_size;
			
 
				 extern unsigned int sysctl_numa_balancing_settle_count;
			
 
				 
			
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -28,6 +28,7 @@ struct cpu_stop_work {
 
				 };
			
 
				 
			
 
				 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
			
 
				+int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
			
 
				 void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
			
 
				 			 struct cpu_stop_work *work_buf);
			
 
				 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
			
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -104,8 +104,21 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 
				 #define test_thread_flag(flag) \
			
 
				 	test_ti_thread_flag(current_thread_info(), flag)
			
 
				 
			
 
				-#define set_need_resched()	set_thread_flag(TIF_NEED_RESCHED)
			
 
				-#define clear_need_resched()	clear_thread_flag(TIF_NEED_RESCHED)
			
 
				+static inline __deprecated void set_need_resched(void)
			
 
				+{
			
 
				+	/*
			
 
				+	 * Use of this function is deprecated.
			
 
				+	 *
			
 
				+	 * As of this writing there are only a few users in the DRM tree left
			
 
				+	 * all of which are wrong and can be removed without causing too much
			
 
				+	 * grief.
			
 
				+	 *
			
 
				+	 * The DRM people are aware and are working on removing the last few
			
 
				+	 * instances.
			
 
				+	 */
			
 
				+}
			
 
				+
			
 
				+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
			
 
				 
			
 
				 #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
			
 
				 /*
			
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -106,6 +106,8 @@ int arch_update_cpu_topology(void);
 
				 	.last_balance		= jiffies,				\
			
 
				 	.balance_interval	= 1,					\
			
 
				 	.smt_gain		= 1178,	/* 15% */			\
			
 
				+	.max_newidle_lb_cost	= 0,					\
			
 
				+	.next_decay_max_lb_cost	= jiffies,				\
			
 
				 }
			
 
				 #endif
			
 
				 #endif /* CONFIG_SCHED_SMT */
			
@@ -135,6 +137,8 @@ int arch_update_cpu_topology(void);
 
				 				,					\
			
 
				 	.last_balance		= jiffies,				\
			
 
				 	.balance_interval	= 1,					\
			
 
				+	.max_newidle_lb_cost	= 0,					\
			
 
				+	.next_decay_max_lb_cost	= jiffies,				\
			
 
				 }
			
 
				 #endif
			
 
				 #endif /* CONFIG_SCHED_MC */
			
@@ -166,6 +170,8 @@ int arch_update_cpu_topology(void);
 
				 				,					\
			
 
				 	.last_balance		= jiffies,				\
			
 
				 	.balance_interval	= 1,					\
			
 
				+	.max_newidle_lb_cost	= 0,					\
			
 
				+	.next_decay_max_lb_cost	= jiffies,				\
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -671,31 +671,17 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty,
 
				 #define wait_event_interruptible_tty(tty, wq, condition)		\
			
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				-	if (!(condition)) {						\
			
 
				-		__wait_event_interruptible_tty(tty, wq, condition, __ret);	\
			
 
				-	}								\
			
 
				+	if (!(condition))						\
			
 
				+		__ret = __wait_event_interruptible_tty(tty, wq,		\
			
 
				+						       condition);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				-#define __wait_event_interruptible_tty(tty, wq, condition, ret)		\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (!signal_pending(current)) {				\
			
 
				-			tty_unlock(tty);					\
			
 
				+#define __wait_event_interruptible_tty(tty, wq, condition)		\
			
 
				+	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\
			
 
				+			tty_unlock(tty);				\
			
 
				 			schedule();					\
			
 
				-			tty_lock(tty);					\
			
 
				-			continue;					\
			
 
				-		}							\
			
 
				-		ret = -ERESTARTSYS;					\
			
 
				-		break;							\
			
 
				-	}								\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+			tty_lock(tty))
			
 
				 
			
 
				 #ifdef CONFIG_PROC_FS
			
 
				 extern void proc_tty_register_driver(struct tty_driver *);
			
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -15,7 +15,7 @@
 
				  */
			
 
				 static inline void pagefault_disable(void)
			
 
				 {
			
 
				-	inc_preempt_count();
			
 
				+	preempt_count_inc();
			
 
				 	/*
			
 
				 	 * make sure to have issued the store before a pagefault
			
 
				 	 * can hit.
			
@@ -30,11 +30,7 @@ static inline void pagefault_enable(void)
 
				 	 * the pagefault handler again.
			
 
				 	 */
			
 
				 	barrier();
			
 
				-	dec_preempt_count();
			
 
				-	/*
			
 
				-	 * make sure we do..
			
 
				-	 */
			
 
				-	barrier();
			
 
				+	preempt_count_dec();
			
 
				 	preempt_check_resched();
			
 
				 }
			
 
				 
			
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -1,7 +1,8 @@
 
				 #ifndef _LINUX_WAIT_H
			
 
				 #define _LINUX_WAIT_H
			
 
				-
			
 
				-
			
 
				+/*
			
 
				+ * Linux wait queue related types and methods
			
 
				+ */
			
 
				 #include <linux/list.h>
			
 
				 #include <linux/stddef.h>
			
 
				 #include <linux/spinlock.h>
			
@@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
 
				 int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
			
 
				 
			
 
				 struct __wait_queue {
			
 
				-	unsigned int flags;
			
 
				+	unsigned int		flags;
			
 
				 #define WQ_FLAG_EXCLUSIVE	0x01
			
 
				-	void *private;
			
 
				-	wait_queue_func_t func;
			
 
				-	struct list_head task_list;
			
 
				+	void			*private;
			
 
				+	wait_queue_func_t	func;
			
 
				+	struct list_head	task_list;
			
 
				 };
			
 
				 
			
 
				 struct wait_bit_key {
			
 
				-	void *flags;
			
 
				-	int bit_nr;
			
 
				-#define WAIT_ATOMIC_T_BIT_NR -1
			
 
				+	void			*flags;
			
 
				+	int			bit_nr;
			
 
				+#define WAIT_ATOMIC_T_BIT_NR	-1
			
 
				 };
			
 
				 
			
 
				 struct wait_bit_queue {
			
 
				-	struct wait_bit_key key;
			
 
				-	wait_queue_t wait;
			
 
				+	struct wait_bit_key	key;
			
 
				+	wait_queue_t		wait;
			
 
				 };
			
 
				 
			
 
				 struct __wait_queue_head {
			
 
				-	spinlock_t lock;
			
 
				-	struct list_head task_list;
			
 
				+	spinlock_t		lock;
			
 
				+	struct list_head	task_list;
			
 
				 };
			
 
				 typedef struct __wait_queue_head wait_queue_head_t;
			
 
				 
			
@@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct
 
				 
			
 
				 static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
			
 
				 {
			
 
				-	q->flags = 0;
			
 
				-	q->private = p;
			
 
				-	q->func = default_wake_function;
			
 
				+	q->flags	= 0;
			
 
				+	q->private	= p;
			
 
				+	q->func		= default_wake_function;
			
 
				 }
			
 
				 
			
 
				-static inline void init_waitqueue_func_entry(wait_queue_t *q,
			
 
				-					wait_queue_func_t func)
			
 
				+static inline void
			
 
				+init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
			
 
				 {
			
 
				-	q->flags = 0;
			
 
				-	q->private = NULL;
			
 
				-	q->func = func;
			
 
				+	q->flags	= 0;
			
 
				+	q->private	= NULL;
			
 
				+	q->func		= func;
			
 
				 }
			
 
				 
			
 
				 static inline int waitqueue_active(wait_queue_head_t *q)
			
@@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
 
				 /*
			
 
				  * Used for wake-one threads:
			
 
				  */
			
 
				-static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
			
 
				-					      wait_queue_t *wait)
			
 
				+static inline void
			
 
				+__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
			
 
				 {
			
 
				 	wait->flags |= WQ_FLAG_EXCLUSIVE;
			
 
				 	__add_wait_queue(q, wait);
			
@@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head,
 
				 	list_add_tail(&new->task_list, &head->task_list);
			
 
				 }
			
 
				 
			
 
				-static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
			
 
				-					      wait_queue_t *wait)
			
 
				+static inline void
			
 
				+__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
			
 
				 {
			
 
				 	wait->flags |= WQ_FLAG_EXCLUSIVE;
			
 
				 	__add_wait_queue_tail(q, wait);
			
 
				 }
			
 
				 
			
 
				-static inline void __remove_wait_queue(wait_queue_head_t *head,
			
 
				-							wait_queue_t *old)
			
 
				+static inline void
			
 
				+__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
			
 
				 {
			
 
				 	list_del(&old->task_list);
			
 
				 }
			
 
				 
			
 
				 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
			
 
				 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
			
 
				-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
			
 
				-			void *key);
			
 
				+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
			
 
				 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
			
 
				 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
			
 
				 void __wake_up_bit(wait_queue_head_t *, void *, int);
			
@@ -170,27 +170,64 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 
				 /*
			
 
				  * Wakeup macros to be used to report events to the targets.
			
 
				  */
			
 
				-#define wake_up_poll(x, m)				\
			
 
				+#define wake_up_poll(x, m)						\
			
 
				 	__wake_up(x, TASK_NORMAL, 1, (void *) (m))
			
 
				-#define wake_up_locked_poll(x, m)				\
			
 
				+#define wake_up_locked_poll(x, m)					\
			
 
				 	__wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
			
 
				-#define wake_up_interruptible_poll(x, m)			\
			
 
				+#define wake_up_interruptible_poll(x, m)				\
			
 
				 	__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
			
 
				 #define wake_up_interruptible_sync_poll(x, m)				\
			
 
				 	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
			
 
				 
			
 
				-#define __wait_event(wq, condition) 					\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				+#define ___wait_cond_timeout(condition)					\
			
 
				+({									\
			
 
				+	bool __cond = (condition);					\
			
 
				+	if (__cond && !__ret)						\
			
 
				+		__ret = 1;						\
			
 
				+	__cond || !__ret;						\
			
 
				+})
			
 
				+
			
 
				+#define ___wait_is_interruptible(state)					\
			
 
				+	(!__builtin_constant_p(state) ||				\
			
 
				+		state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)	\
			
 
				+
			
 
				+#define ___wait_event(wq, condition, state, exclusive, ret, cmd)	\
			
 
				+({									\
			
 
				+	__label__ __out;						\
			
 
				+	wait_queue_t __wait;						\
			
 
				+	long __ret = ret;						\
			
 
				+									\
			
 
				+	INIT_LIST_HEAD(&__wait.task_list);				\
			
 
				+	if (exclusive)							\
			
 
				+		__wait.flags = WQ_FLAG_EXCLUSIVE;			\
			
 
				+	else								\
			
 
				+		__wait.flags = 0;					\
			
 
				 									\
			
 
				 	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);	\
			
 
				+		long __int = prepare_to_wait_event(&wq, &__wait, state);\
			
 
				+									\
			
 
				 		if (condition)						\
			
 
				 			break;						\
			
 
				-		schedule();						\
			
 
				+									\
			
 
				+		if (___wait_is_interruptible(state) && __int) {		\
			
 
				+			__ret = __int;					\
			
 
				+			if (exclusive) {				\
			
 
				+				abort_exclusive_wait(&wq, &__wait,	\
			
 
				+						     state, NULL);	\
			
 
				+				goto __out;				\
			
 
				+			}						\
			
 
				+			break;						\
			
 
				+		}							\
			
 
				+									\
			
 
				+		cmd;							\
			
 
				 	}								\
			
 
				 	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+__out:	__ret;								\
			
 
				+})
			
 
				+
			
 
				+#define __wait_event(wq, condition)					\
			
 
				+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
			
 
				+			    schedule())
			
 
				 
			
 
				 /**
			
 
				  * wait_event - sleep until a condition gets true
			
@@ -204,29 +241,17 @@ do {									\
 
				  * wake_up() has to be called after changing any variable that could
			
 
				  * change the result of the wait condition.
			
 
				  */
			
 
				-#define wait_event(wq, condition) 					\
			
 
				+#define wait_event(wq, condition)					\
			
 
				 do {									\
			
 
				-	if (condition)	 						\
			
 
				+	if (condition)							\
			
 
				 		break;							\
			
 
				 	__wait_event(wq, condition);					\
			
 
				 } while (0)
			
 
				 
			
 
				-#define __wait_event_timeout(wq, condition, ret)			\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		ret = schedule_timeout(ret);				\
			
 
				-		if (!ret)						\
			
 
				-			break;						\
			
 
				-	}								\
			
 
				-	if (!ret && (condition))					\
			
 
				-		ret = 1;						\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+#define __wait_event_timeout(wq, condition, timeout)			\
			
 
				+	___wait_event(wq, ___wait_cond_timeout(condition),		\
			
 
				+		      TASK_UNINTERRUPTIBLE, 0, timeout,			\
			
 
				+		      __ret = schedule_timeout(__ret))
			
 
				 
			
 
				 /**
			
 
				  * wait_event_timeout - sleep until a condition gets true or a timeout elapses
			
@@ -248,28 +273,14 @@ do {									\
 
				 #define wait_event_timeout(wq, condition, timeout)			\
			
 
				 ({									\
			
 
				 	long __ret = timeout;						\
			
 
				-	if (!(condition)) 						\
			
 
				-		__wait_event_timeout(wq, condition, __ret);		\
			
 
				+	if (!___wait_cond_timeout(condition))				\
			
 
				+		__ret = __wait_event_timeout(wq, condition, timeout);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				-#define __wait_event_interruptible(wq, condition, ret)			\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (!signal_pending(current)) {				\
			
 
				-			schedule();					\
			
 
				-			continue;					\
			
 
				-		}							\
			
 
				-		ret = -ERESTARTSYS;					\
			
 
				-		break;							\
			
 
				-	}								\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+#define __wait_event_interruptible(wq, condition)			\
			
 
				+	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\
			
 
				+		      schedule())
			
 
				 
			
 
				 /**
			
 
				  * wait_event_interruptible - sleep until a condition gets true
			
@@ -290,31 +301,14 @@ do {									\
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				 	if (!(condition))						\
			
 
				-		__wait_event_interruptible(wq, condition, __ret);	\
			
 
				+		__ret = __wait_event_interruptible(wq, condition);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				-#define __wait_event_interruptible_timeout(wq, condition, ret)		\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (!signal_pending(current)) {				\
			
 
				-			ret = schedule_timeout(ret);			\
			
 
				-			if (!ret)					\
			
 
				-				break;					\
			
 
				-			continue;					\
			
 
				-		}							\
			
 
				-		ret = -ERESTARTSYS;					\
			
 
				-		break;							\
			
 
				-	}								\
			
 
				-	if (!ret && (condition))					\
			
 
				-		ret = 1;						\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+#define __wait_event_interruptible_timeout(wq, condition, timeout)	\
			
 
				+	___wait_event(wq, ___wait_cond_timeout(condition),		\
			
 
				+		      TASK_INTERRUPTIBLE, 0, timeout,			\
			
 
				+		      __ret = schedule_timeout(__ret))
			
 
				 
			
 
				 /**
			
 
				  * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
			
@@ -337,15 +331,15 @@ do {									\
 
				 #define wait_event_interruptible_timeout(wq, condition, timeout)	\
			
 
				 ({									\
			
 
				 	long __ret = timeout;						\
			
 
				-	if (!(condition))						\
			
 
				-		__wait_event_interruptible_timeout(wq, condition, __ret); \
			
 
				+	if (!___wait_cond_timeout(condition))				\
			
 
				+		__ret = __wait_event_interruptible_timeout(wq,		\
			
 
				+						condition, timeout);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				 #define __wait_event_hrtimeout(wq, condition, timeout, state)		\
			
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				 	struct hrtimer_sleeper __t;					\
			
 
				 									\
			
 
				 	hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC,		\
			
@@ -356,25 +350,15 @@ do {									\
 
				 				       current->timer_slack_ns,		\
			
 
				 				       HRTIMER_MODE_REL);		\
			
 
				 									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, state);			\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (state == TASK_INTERRUPTIBLE &&			\
			
 
				-		    signal_pending(current)) {				\
			
 
				-			__ret = -ERESTARTSYS;				\
			
 
				-			break;						\
			
 
				-		}							\
			
 
				+	__ret = ___wait_event(wq, condition, state, 0, 0,		\
			
 
				 		if (!__t.task) {					\
			
 
				 			__ret = -ETIME;					\
			
 
				 			break;						\
			
 
				 		}							\
			
 
				-		schedule();						\
			
 
				-	}								\
			
 
				+		schedule());						\
			
 
				 									\
			
 
				 	hrtimer_cancel(&__t.timer);					\
			
 
				 	destroy_hrtimer_on_stack(&__t.timer);				\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
@@ -428,33 +412,15 @@ do {									\
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				-#define __wait_event_interruptible_exclusive(wq, condition, ret)	\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait_exclusive(&wq, &__wait,			\
			
 
				-					TASK_INTERRUPTIBLE);		\
			
 
				-		if (condition) {					\
			
 
				-			finish_wait(&wq, &__wait);			\
			
 
				-			break;						\
			
 
				-		}							\
			
 
				-		if (!signal_pending(current)) {				\
			
 
				-			schedule();					\
			
 
				-			continue;					\
			
 
				-		}							\
			
 
				-		ret = -ERESTARTSYS;					\
			
 
				-		abort_exclusive_wait(&wq, &__wait, 			\
			
 
				-				TASK_INTERRUPTIBLE, NULL);		\
			
 
				-		break;							\
			
 
				-	}								\
			
 
				-} while (0)
			
 
				+#define __wait_event_interruptible_exclusive(wq, condition)		\
			
 
				+	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,		\
			
 
				+		      schedule())
			
 
				 
			
 
				 #define wait_event_interruptible_exclusive(wq, condition)		\
			
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				 	if (!(condition))						\
			
 
				-		__wait_event_interruptible_exclusive(wq, condition, __ret);\
			
 
				+		__ret = __wait_event_interruptible_exclusive(wq, condition);\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
@@ -606,24 +572,8 @@ do {									\
 
				 	 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1))
			
 
				 
			
 
				 
			
 
				-
			
 
				-#define __wait_event_killable(wq, condition, ret)			\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_KILLABLE);		\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (!fatal_signal_pending(current)) {			\
			
 
				-			schedule();					\
			
 
				-			continue;					\
			
 
				-		}							\
			
 
				-		ret = -ERESTARTSYS;					\
			
 
				-		break;							\
			
 
				-	}								\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+#define __wait_event_killable(wq, condition)				\
			
 
				+	___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
			
 
				 
			
 
				 /**
			
 
				  * wait_event_killable - sleep until a condition gets true
			
@@ -644,26 +594,17 @@ do {									\
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				 	if (!(condition))						\
			
 
				-		__wait_event_killable(wq, condition, __ret);		\
			
 
				+		__ret = __wait_event_killable(wq, condition);		\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				 
			
 
				 #define __wait_event_lock_irq(wq, condition, lock, cmd)			\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		spin_unlock_irq(&lock);					\
			
 
				-		cmd;							\
			
 
				-		schedule();						\
			
 
				-		spin_lock_irq(&lock);					\
			
 
				-	}								\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
			
 
				+			    spin_unlock_irq(&lock);			\
			
 
				+			    cmd;					\
			
 
				+			    schedule();					\
			
 
				+			    spin_lock_irq(&lock))
			
 
				 
			
 
				 /**
			
 
				  * wait_event_lock_irq_cmd - sleep until a condition gets true. The
			
@@ -723,26 +664,12 @@ do {									\
 
				 } while (0)
			
 
				 
			
 
				 
			
 
				-#define __wait_event_interruptible_lock_irq(wq, condition,		\
			
 
				-					    lock, ret, cmd)		\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (signal_pending(current)) {				\
			
 
				-			ret = -ERESTARTSYS;				\
			
 
				-			break;						\
			
 
				-		}							\
			
 
				-		spin_unlock_irq(&lock);					\
			
 
				-		cmd;							\
			
 
				-		schedule();						\
			
 
				-		spin_lock_irq(&lock);					\
			
 
				-	}								\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd)	\
			
 
				+	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,		\
			
 
				+		      spin_unlock_irq(&lock);				\
			
 
				+		      cmd;						\
			
 
				+		      schedule();					\
			
 
				+		      spin_lock_irq(&lock))
			
 
				 
			
 
				 /**
			
 
				  * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
			
@@ -772,10 +699,9 @@ do {									\
 
				 #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd)	\
			
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				-									\
			
 
				 	if (!(condition))						\
			
 
				-		__wait_event_interruptible_lock_irq(wq, condition,	\
			
 
				-						    lock, __ret, cmd);	\
			
 
				+		__ret = __wait_event_interruptible_lock_irq(wq,		\
			
 
				+						condition, lock, cmd);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
@@ -804,39 +730,24 @@ do {									\
 
				 #define wait_event_interruptible_lock_irq(wq, condition, lock)		\
			
 
				 ({									\
			
 
				 	int __ret = 0;							\
			
 
				-									\
			
 
				 	if (!(condition))						\
			
 
				-		__wait_event_interruptible_lock_irq(wq, condition,	\
			
 
				-						    lock, __ret, );	\
			
 
				+		__ret = __wait_event_interruptible_lock_irq(wq,		\
			
 
				+						condition, lock,);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
 
				 #define __wait_event_interruptible_lock_irq_timeout(wq, condition,	\
			
 
				-						    lock, ret)		\
			
 
				-do {									\
			
 
				-	DEFINE_WAIT(__wait);						\
			
 
				-									\
			
 
				-	for (;;) {							\
			
 
				-		prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);	\
			
 
				-		if (condition)						\
			
 
				-			break;						\
			
 
				-		if (signal_pending(current)) {				\
			
 
				-			ret = -ERESTARTSYS;				\
			
 
				-			break;						\
			
 
				-		}							\
			
 
				-		spin_unlock_irq(&lock);					\
			
 
				-		ret = schedule_timeout(ret);				\
			
 
				-		spin_lock_irq(&lock);					\
			
 
				-		if (!ret)						\
			
 
				-			break;						\
			
 
				-	}								\
			
 
				-	finish_wait(&wq, &__wait);					\
			
 
				-} while (0)
			
 
				+						    lock, timeout)	\
			
 
				+	___wait_event(wq, ___wait_cond_timeout(condition),		\
			
 
				+		      TASK_INTERRUPTIBLE, 0, timeout,			\
			
 
				+		      spin_unlock_irq(&lock);				\
			
 
				+		      __ret = schedule_timeout(__ret);			\
			
 
				+		      spin_lock_irq(&lock));
			
 
				 
			
 
				 /**
			
 
				- * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses.
			
 
				- *		The condition is checked under the lock. This is expected
			
 
				- *		to be called with the lock taken.
			
 
				+ * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
			
 
				+ *		true or a timeout elapses. The condition is checked under
			
 
				+ *		the lock. This is expected to be called with the lock taken.
			
 
				  * @wq: the waitqueue to wait on
			
 
				  * @condition: a C expression for the event to wait for
			
 
				  * @lock: a locked spinlock_t, which will be released before schedule()
			
@@ -860,11 +771,10 @@ do {									\
 
				 #define wait_event_interruptible_lock_irq_timeout(wq, condition, lock,	\
			
 
				 						  timeout)		\
			
 
				 ({									\
			
 
				-	int __ret = timeout;						\
			
 
				-									\
			
 
				-	if (!(condition))						\
			
 
				-		__wait_event_interruptible_lock_irq_timeout(		\
			
 
				-					wq, condition, lock, __ret);	\
			
 
				+	long __ret = timeout;						\
			
 
				+	if (!___wait_cond_timeout(condition))				\
			
 
				+		__ret = __wait_event_interruptible_lock_irq_timeout(	\
			
 
				+					wq, condition, lock, timeout);	\
			
 
				 	__ret;								\
			
 
				 })
			
 
				 
			
@@ -875,20 +785,18 @@ do {									\
 
				  * We plan to remove these interfaces.
			
 
				  */
			
 
				 extern void sleep_on(wait_queue_head_t *q);
			
 
				-extern long sleep_on_timeout(wait_queue_head_t *q,
			
 
				-				      signed long timeout);
			
 
				+extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
			
 
				 extern void interruptible_sleep_on(wait_queue_head_t *q);
			
 
				-extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
			
 
				-					   signed long timeout);
			
 
				+extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
			
 
				 
			
 
				 /*
			
 
				  * Waitqueues which are removed from the waitqueue_head at wakeup time
			
 
				  */
			
 
				 void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
			
 
				 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
			
 
				+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
			
 
				 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
			
 
				-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
			
 
				-			unsigned int mode, void *key);
			
 
				+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
			
 
				 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
			
 
				 int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
			
 
				 
			
@@ -934,8 +842,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
				  * One uses wait_on_bit() where one is waiting for the bit to clear,
			
 
				  * but has no intention of setting it.
			
 
				  */
			
 
				-static inline int wait_on_bit(void *word, int bit,
			
 
				-				int (*action)(void *), unsigned mode)
			
 
				+static inline int
			
 
				+wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode)
			
 
				 {
			
 
				 	if (!test_bit(bit, word))
			
 
				 		return 0;
			
@@ -958,8 +866,8 @@ static inline int wait_on_bit(void *word, int bit,
 
				  * One uses wait_on_bit_lock() where one is waiting for the bit to
			
 
				  * clear with the intention of setting it, and when done, clearing it.
			
 
				  */
			
 
				-static inline int wait_on_bit_lock(void *word, int bit,
			
 
				-				int (*action)(void *), unsigned mode)
			
 
				+static inline int
			
 
				+wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode)
			
 
				 {
			
 
				 	if (!test_and_set_bit(bit, word))
			
 
				 		return 0;
			
@@ -983,5 +891,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
 
				 		return 0;
			
 
				 	return out_of_line_wait_on_atomic_t(val, action, mode);
			
 
				 }
			
 
				-	
			
 
				-#endif
			
 
				+
			
 
				+#endif /* _LINUX_WAIT_H */
			
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
 
				 	/*
			
 
				 	 * For all intents and purposes a preempted task is a running task.
			
 
				 	 */
			
 
				-	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
			
 
				+	if (task_preempt_count(p) & PREEMPT_ACTIVE)
			
 
				 		state = TASK_RUNNING | TASK_STATE_MAX;
			
 
				 #endif
			
 
				 
			
--- a/init/main.c
+++ b/init/main.c
@@ -693,7 +693,7 @@ int __init_or_module do_one_initcall(initcall_t fn)
 
				 
			
 
				 	if (preempt_count() != count) {
			
 
				 		sprintf(msgbuf, "preemption imbalance ");
			
 
				-		preempt_count() = count;
			
 
				+		preempt_count_set(count);
			
 
				 	}
			
 
				 	if (irqs_disabled()) {
			
 
				 		strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
			
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 
				 	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
			
 
				 	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
			
 
				 	    extable.o params.o posix-timers.o \
			
 
				-	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
			
 
				+	    kthread.o sys_ni.o posix-cpu-timers.o mutex.o \
			
 
				 	    hrtimer.o rwsem.o nsproxy.o semaphore.o \
			
 
				 	    notifier.o ksysfs.o cred.o reboot.o \
			
 
				 	    async.o range.o groups.o lglock.o smpboot.o
			
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
 
				 #include <linux/mmzone.h>
			
 
				 #include <linux/kbuild.h>
			
 
				 #include <linux/page_cgroup.h>
			
 
				+#include <linux/log2.h>
			
 
				 
			
 
				 void foo(void)
			
 
				 {
			
@@ -17,5 +18,8 @@ void foo(void)
 
				 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
			
 
				 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
			
 
				 	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
			
 
				+#ifdef CONFIG_SMP
			
 
				+	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
			
 
				+#endif
			
 
				 	/* End of constants */
			
 
				 }
			
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
 
				  * instead of preempt_schedule() to exit user context if needed before
			
 
				  * calling the scheduler.
			
 
				  */
			
 
				-void __sched notrace preempt_schedule_context(void)
			
 
				+asmlinkage void __sched notrace preempt_schedule_context(void)
			
 
				 {
			
 
				 	enum ctx_state prev_ctx;
			
 
				 
			
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
				 	}
			
 
				 	smpboot_park_threads(cpu);
			
 
				 
			
 
				+	/*
			
 
				+	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
			
 
				+	 * and RCU users of this state to go away such that all new such users
			
 
				+	 * will observe it.
			
 
				+	 *
			
 
				+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
			
 
				+	 * not imply sync_sched(), so explicitly call both.
			
 
				+	 */
			
 
				+#ifdef CONFIG_PREEMPT
			
 
				+	synchronize_sched();
			
 
				+#endif
			
 
				+	synchronize_rcu();
			
 
				+
			
 
				+	/*
			
 
				+	 * So now all preempt/rcu users must observe !cpu_active().
			
 
				+	 */
			
 
				+
			
 
				 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
			
 
				 	if (err) {
			
 
				 		/* CPU didn't die: tell everyone.  Can't complain. */
			
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
 
				 	rcu_idle_enter();
			
 
				 	trace_cpu_idle_rcuidle(0, smp_processor_id());
			
 
				 	local_irq_enable();
			
 
				-	while (!need_resched())
			
 
				+	while (!tif_need_resched())
			
 
				 		cpu_relax();
			
 
				 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
			
 
				 	rcu_idle_exit();
			
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
 
				 			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
			
 
				 				cpu_idle_poll();
			
 
				 			} else {
			
 
				-				current_clr_polling();
			
 
				-				if (!need_resched()) {
			
 
				+				if (!current_clr_polling_and_test()) {
			
 
				 					stop_critical_timings();
			
 
				 					rcu_idle_enter();
			
 
				 					arch_cpu_idle();
			
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
 
				 				} else {
			
 
				 					local_irq_enable();
			
 
				 				}
			
 
				-				current_set_polling();
			
 
				+				__current_set_polling();
			
 
				 			}
			
 
				 			arch_cpu_idle_exit();
			
 
				+			/*
			
 
				+			 * We need to test and propagate the TIF_NEED_RESCHED
			
 
				+			 * bit here because we might not have sent the
			
 
				+			 * reschedule IPI to idle tasks.
			
 
				+			 */
			
 
				+			if (tif_need_resched())
			
 
				+				set_preempt_need_resched();
			
 
				 		}
			
 
				 		tick_nohz_idle_exit();
			
 
				 		schedule_preempt_disabled();
			
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
 
				 	 */
			
 
				 	boot_init_stack_canary();
			
 
				 #endif
			
 
				-	current_set_polling();
			
 
				+	__current_set_polling();
			
 
				 	arch_cpu_idle_prepare();
			
 
				 	cpu_idle_loop();
			
 
				 }
			
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -816,9 +816,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 
				 
			
 
				 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				 	mm->pmd_huge_pte = NULL;
			
 
				-#endif
			
 
				-#ifdef CONFIG_NUMA_BALANCING
			
 
				-	mm->first_nid = NUMA_PTE_SCAN_INIT;
			
 
				 #endif
			
 
				 	if (!mm_init(mm, tsk))
			
 
				 		goto fail_nomem;
			
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
				 #endif
			
 
				 
			
 
				 	/* Perform scheduler related setup. Assign this task to a CPU. */
			
 
				-	sched_fork(p);
			
 
				+	sched_fork(clone_flags, p);
			
 
				 
			
 
				 	retval = perf_event_init_task(p);
			
 
				 	if (retval)
			
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -916,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 
				 	force_quiescent_state(rsp);  /* Kick them all. */
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * This function really isn't for public consumption, but RCU is special in
			
 
				+ * that context switches can allow the state machine to make progress.
			
 
				+ */
			
 
				+extern void resched_cpu(int cpu);
			
 
				+
			
 
				 static void print_cpu_stall(struct rcu_state *rsp)
			
 
				 {
			
 
				 	int cpu;
			
@@ -945,7 +951,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
 
				 				     3 * rcu_jiffies_till_stall_check() + 3;
			
 
				 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 
			
 
				-	set_need_resched();  /* kick ourselves to get things going. */
			
 
				+	/*
			
 
				+	 * Attempt to revive the RCU machinery by forcing a context switch.
			
 
				+	 *
			
 
				+	 * A context switch would normally allow the RCU state machine to make
			
 
				+	 * progress and it could be we're stuck in kernel space without context
			
 
				+	 * switches for an entirely unreasonable amount of time.
			
 
				+	 */
			
 
				+	resched_cpu(smp_processor_id());
			
 
				 }
			
 
				 
			
 
				 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
			
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 
				 endif
			
 
				 
			
 
				 obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
			
 
				+obj-y += wait.o completion.o
			
 
				 obj-$(CONFIG_SMP) += cpupri.o
			
 
				 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
			
 
				 obj-$(CONFIG_SCHEDSTATS) += stats.o
			
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
 
				+/*
			
 
				+ * Generic wait-for-completion handler;
			
 
				+ *
			
 
				+ * It differs from semaphores in that their default case is the opposite,
			
 
				+ * wait_for_completion default blocks whereas semaphore default non-block. The
			
 
				+ * interface also makes it easy to 'complete' multiple waiting threads,
			
 
				+ * something which isn't entirely natural for semaphores.
			
 
				+ *
			
 
				+ * But more importantly, the primitive documents the usage. Semaphores would
			
 
				+ * typically be used for exclusion which gives rise to priority inversion.
			
 
				+ * Waiting for completion is typically a sync point, but not an exclusion point.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/completion.h>
			
 
				+
			
 
				+/**
			
 
				+ * complete: - signals a single thread waiting on this completion
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ *
			
 
				+ * This will wake up a single thread waiting on this completion. Threads will be
			
 
				+ * awakened in the same order in which they were queued.
			
 
				+ *
			
 
				+ * See also complete_all(), wait_for_completion() and related routines.
			
 
				+ *
			
 
				+ * It may be assumed that this function implies a write memory barrier before
			
 
				+ * changing the task state if and only if any tasks are woken up.
			
 
				+ */
			
 
				+void complete(struct completion *x)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&x->wait.lock, flags);
			
 
				+	x->done++;
			
 
				+	__wake_up_locked(&x->wait, TASK_NORMAL, 1);
			
 
				+	spin_unlock_irqrestore(&x->wait.lock, flags);
			
 
				+}
			
 
				+EXPORT_SYMBOL(complete);
			
 
				+
			
 
				+/**
			
 
				+ * complete_all: - signals all threads waiting on this completion
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ *
			
 
				+ * This will wake up all threads waiting on this particular completion event.
			
 
				+ *
			
 
				+ * It may be assumed that this function implies a write memory barrier before
			
 
				+ * changing the task state if and only if any tasks are woken up.
			
 
				+ */
			
 
				+void complete_all(struct completion *x)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&x->wait.lock, flags);
			
 
				+	x->done += UINT_MAX/2;
			
 
				+	__wake_up_locked(&x->wait, TASK_NORMAL, 0);
			
 
				+	spin_unlock_irqrestore(&x->wait.lock, flags);
			
 
				+}
			
 
				+EXPORT_SYMBOL(complete_all);
			
 
				+
			
 
				+static inline long __sched
			
 
				+do_wait_for_common(struct completion *x,
			
 
				+		   long (*action)(long), long timeout, int state)
			
 
				+{
			
 
				+	if (!x->done) {
			
 
				+		DECLARE_WAITQUEUE(wait, current);
			
 
				+
			
 
				+		__add_wait_queue_tail_exclusive(&x->wait, &wait);
			
 
				+		do {
			
 
				+			if (signal_pending_state(state, current)) {
			
 
				+				timeout = -ERESTARTSYS;
			
 
				+				break;
			
 
				+			}
			
 
				+			__set_current_state(state);
			
 
				+			spin_unlock_irq(&x->wait.lock);
			
 
				+			timeout = action(timeout);
			
 
				+			spin_lock_irq(&x->wait.lock);
			
 
				+		} while (!x->done && timeout);
			
 
				+		__remove_wait_queue(&x->wait, &wait);
			
 
				+		if (!x->done)
			
 
				+			return timeout;
			
 
				+	}
			
 
				+	x->done--;
			
 
				+	return timeout ?: 1;
			
 
				+}
			
 
				+
			
 
				+static inline long __sched
			
 
				+__wait_for_common(struct completion *x,
			
 
				+		  long (*action)(long), long timeout, int state)
			
 
				+{
			
 
				+	might_sleep();
			
 
				+
			
 
				+	spin_lock_irq(&x->wait.lock);
			
 
				+	timeout = do_wait_for_common(x, action, timeout, state);
			
 
				+	spin_unlock_irq(&x->wait.lock);
			
 
				+	return timeout;
			
 
				+}
			
 
				+
			
 
				+static long __sched
			
 
				+wait_for_common(struct completion *x, long timeout, int state)
			
 
				+{
			
 
				+	return __wait_for_common(x, schedule_timeout, timeout, state);
			
 
				+}
			
 
				+
			
 
				+static long __sched
			
 
				+wait_for_common_io(struct completion *x, long timeout, int state)
			
 
				+{
			
 
				+	return __wait_for_common(x, io_schedule_timeout, timeout, state);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion: - waits for completion of a task
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ *
			
 
				+ * This waits to be signaled for completion of a specific task. It is NOT
			
 
				+ * interruptible and there is no timeout.
			
 
				+ *
			
 
				+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
			
 
				+ * and interrupt capability. Also see complete().
			
 
				+ */
			
 
				+void __sched wait_for_completion(struct completion *x)
			
 
				+{
			
 
				+	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ * @timeout:  timeout value in jiffies
			
 
				+ *
			
 
				+ * This waits for either a completion of a specific task to be signaled or for a
			
 
				+ * specified timeout to expire. The timeout is in jiffies. It is not
			
 
				+ * interruptible.
			
 
				+ *
			
 
				+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
			
 
				+ * till timeout) if completed.
			
 
				+ */
			
 
				+unsigned long __sched
			
 
				+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
			
 
				+{
			
 
				+	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_timeout);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_io: - waits for completion of a task
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ *
			
 
				+ * This waits to be signaled for completion of a specific task. It is NOT
			
 
				+ * interruptible and there is no timeout. The caller is accounted as waiting
			
 
				+ * for IO.
			
 
				+ */
			
 
				+void __sched wait_for_completion_io(struct completion *x)
			
 
				+{
			
 
				+	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_io);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ * @timeout:  timeout value in jiffies
			
 
				+ *
			
 
				+ * This waits for either a completion of a specific task to be signaled or for a
			
 
				+ * specified timeout to expire. The timeout is in jiffies. It is not
			
 
				+ * interruptible. The caller is accounted as waiting for IO.
			
 
				+ *
			
 
				+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
			
 
				+ * till timeout) if completed.
			
 
				+ */
			
 
				+unsigned long __sched
			
 
				+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
			
 
				+{
			
 
				+	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_io_timeout);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ *
			
 
				+ * This waits for completion of a specific task to be signaled. It is
			
 
				+ * interruptible.
			
 
				+ *
			
 
				+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
			
 
				+ */
			
 
				+int __sched wait_for_completion_interruptible(struct completion *x)
			
 
				+{
			
 
				+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
			
 
				+	if (t == -ERESTARTSYS)
			
 
				+		return t;
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_interruptible);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ * @timeout:  timeout value in jiffies
			
 
				+ *
			
 
				+ * This waits for either a completion of a specific task to be signaled or for a
			
 
				+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
			
 
				+ *
			
 
				+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
			
 
				+ * or number of jiffies left till timeout) if completed.
			
 
				+ */
			
 
				+long __sched
			
 
				+wait_for_completion_interruptible_timeout(struct completion *x,
			
 
				+					  unsigned long timeout)
			
 
				+{
			
 
				+	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_killable: - waits for completion of a task (killable)
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ *
			
 
				+ * This waits to be signaled for completion of a specific task. It can be
			
 
				+ * interrupted by a kill signal.
			
 
				+ *
			
 
				+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
			
 
				+ */
			
 
				+int __sched wait_for_completion_killable(struct completion *x)
			
 
				+{
			
 
				+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
			
 
				+	if (t == -ERESTARTSYS)
			
 
				+		return t;
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_killable);
			
 
				+
			
 
				+/**
			
 
				+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
			
 
				+ * @x:  holds the state of this particular completion
			
 
				+ * @timeout:  timeout value in jiffies
			
 
				+ *
			
 
				+ * This waits for either a completion of a specific task to be
			
 
				+ * signaled or for a specified timeout to expire. It can be
			
 
				+ * interrupted by a kill signal. The timeout is in jiffies.
			
 
				+ *
			
 
				+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
			
 
				+ * or number of jiffies left till timeout) if completed.
			
 
				+ */
			
 
				+long __sched
			
 
				+wait_for_completion_killable_timeout(struct completion *x,
			
 
				+				     unsigned long timeout)
			
 
				+{
			
 
				+	return wait_for_common(x, timeout, TASK_KILLABLE);
			
 
				+}
			
 
				+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
			
 
				+
			
 
				+/**
			
 
				+ *	try_wait_for_completion - try to decrement a completion without blocking
			
 
				+ *	@x:	completion structure
			
 
				+ *
			
 
				+ *	Return: 0 if a decrement cannot be done without blocking
			
 
				+ *		 1 if a decrement succeeded.
			
 
				+ *
			
 
				+ *	If a completion is being used as a counting completion,
			
 
				+ *	attempt to decrement the counter without blocking. This
			
 
				+ *	enables us to avoid waiting if the resource the completion
			
 
				+ *	is protecting is not available.
			
 
				+ */
			
 
				+bool try_wait_for_completion(struct completion *x)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	int ret = 1;
			
 
				+
			
 
				+	spin_lock_irqsave(&x->wait.lock, flags);
			
 
				+	if (!x->done)
			
 
				+		ret = 0;
			
 
				+	else
			
 
				+		x->done--;
			
 
				+	spin_unlock_irqrestore(&x->wait.lock, flags);
			
 
				+	return ret;
			
 
				+}
			
 
				+EXPORT_SYMBOL(try_wait_for_completion);
			
 
				+
			
 
				+/**
			
 
				+ *	completion_done - Test to see if a completion has any waiters
			
 
				+ *	@x:	completion structure
			
 
				+ *
			
 
				+ *	Return: 0 if there are waiters (wait_for_completion() in progress)
			
 
				+ *		 1 if there are no waiters.
			
 
				+ *
			
 
				+ */
			
 
				+bool completion_done(struct completion *x)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	int ret = 1;
			
 
				+
			
 
				+	spin_lock_irqsave(&x->wait.lock, flags);
			
 
				+	if (!x->done)
			
 
				+		ret = 0;
			
 
				+	spin_unlock_irqrestore(&x->wait.lock, flags);
			
 
				+	return ret;
			
 
				+}
			
 
				+EXPORT_SYMBOL(completion_done);
			
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
 
				 #include <linux/seq_file.h>
			
 
				 #include <linux/kallsyms.h>
			
 
				 #include <linux/utsname.h>
			
 
				+#include <linux/mempolicy.h>
			
 
				 
			
 
				 #include "sched.h"
			
 
				 
			
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 
				 	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
			
 
				 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
			
 
				 #endif
			
 
				+#ifdef CONFIG_NUMA_BALANCING
			
 
				+	SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
			
 
				+#endif
			
 
				 #ifdef CONFIG_CGROUP_SCHED
			
 
				 	SEQ_printf(m, " %s", task_group_path(task_group(p)));
			
 
				 #endif
			
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
				 	read_lock_irqsave(&tasklist_lock, flags);
			
 
				 
			
 
				 	do_each_thread(g, p) {
			
 
				-		if (!p->on_rq || task_cpu(p) != rq_cpu)
			
 
				+		if (task_cpu(p) != rq_cpu)
			
 
				 			continue;
			
 
				 
			
 
				 		print_task(m, rq, p);
			
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
				 			atomic_read(&cfs_rq->tg->runnable_avg));
			
 
				 #endif
			
 
				 #endif
			
 
				+#ifdef CONFIG_CFS_BANDWIDTH
			
 
				+	SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
			
 
				+			cfs_rq->tg->cfs_bandwidth.timer_active);
			
 
				+	SEQ_printf(m, "  .%-30s: %d\n", "throttled",
			
 
				+			cfs_rq->throttled);
			
 
				+	SEQ_printf(m, "  .%-30s: %d\n", "throttle_count",
			
 
				+			cfs_rq->throttle_count);
			
 
				+#endif
			
 
				 
			
 
				 #ifdef CONFIG_FAIR_GROUP_SCHED
			
 
				 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
			
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
 
				 	cpu_clk = local_clock();
			
 
				 	local_irq_restore(flags);
			
 
				 
			
 
				-	SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
			
 
				+	SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
			
 
				 		init_utsname()->release,
			
 
				 		(int)strcspn(init_utsname()->version, " "),
			
 
				 		init_utsname()->version);
			
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
 
				 
			
 
				 __initcall(init_sched_debug_procfs);
			
 
				 
			
 
				+#define __P(F) \
			
 
				+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
			
 
				+#define P(F) \
			
 
				+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
			
 
				+#define __PN(F) \
			
 
				+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
			
 
				+#define PN(F) \
			
 
				+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
			
 
				+
			
 
				+
			
 
				+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
			
 
				+{
			
 
				+#ifdef CONFIG_NUMA_BALANCING
			
 
				+	struct mempolicy *pol;
			
 
				+	int node, i;
			
 
				+
			
 
				+	if (p->mm)
			
 
				+		P(mm->numa_scan_seq);
			
 
				+
			
 
				+	task_lock(p);
			
 
				+	pol = p->mempolicy;
			
 
				+	if (pol && !(pol->flags & MPOL_F_MORON))
			
 
				+		pol = NULL;
			
 
				+	mpol_get(pol);
			
 
				+	task_unlock(p);
			
 
				+
			
 
				+	SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
			
 
				+
			
 
				+	for_each_online_node(node) {
			
 
				+		for (i = 0; i < 2; i++) {
			
 
				+			unsigned long nr_faults = -1;
			
 
				+			int cpu_current, home_node;
			
 
				+
			
 
				+			if (p->numa_faults)
			
 
				+				nr_faults = p->numa_faults[2*node + i];
			
 
				+
			
 
				+			cpu_current = !i ? (task_node(p) == node) :
			
 
				+				(pol && node_isset(node, pol->v.nodes));
			
 
				+
			
 
				+			home_node = (p->numa_preferred_nid == node);
			
 
				+
			
 
				+			SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
			
 
				+				i, node, cpu_current, home_node, nr_faults);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	mpol_put(pol);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
			
 
				 {
			
 
				 	unsigned long nr_switches;
			
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
				 		SEQ_printf(m, "%-45s:%21Ld\n",
			
 
				 			   "clock-delta", (long long)(t1-t0));
			
 
				 	}
			
 
				+
			
 
				+	sched_show_numa(p, m);
			
 
				 }
			
 
				 
			
 
				 void proc_sched_set_task(struct task_struct *p)
			
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
 
				 /*
			
 
				  * Apply the automatic NUMA scheduling policy. Enabled automatically
			
 
				  * at runtime if running on a NUMA machine. Can be controlled via
			
 
				- * numa_balancing=. Allow PTE scanning to be forced on UMA machines
			
 
				- * for debugging the core machinery.
			
 
				+ * numa_balancing=
			
 
				  */
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				 SCHED_FEAT(NUMA,	false)
			
 
				-SCHED_FEAT(NUMA_FORCE,	false)
			
 
				+
			
 
				+/*
			
 
				+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
			
 
				+ * higher number of hinting faults are recorded during active load
			
 
				+ * balancing.
			
 
				+ */
			
 
				+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
			
 
				+
			
 
				+/*
			
 
				+ * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
			
 
				+ * lower number of hinting faults have been recorded. As this has
			
 
				+ * the potential to prevent a task ever migrating to a new node
			
 
				+ * due to CPU overload it is disabled by default.
			
 
				+ */
			
 
				+SCHED_FEAT(NUMA_RESIST_LOWER, false)
			
 
				 #endif
			
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
 
				 
			
 
				 #ifdef CONFIG_SMP
			
 
				 static int
			
 
				-select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
			
 
				+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
			
 
				 {
			
 
				 	return task_cpu(p); /* IDLE tasks as never migrated */
			
 
				 }
			
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
 
				 	 * if we should look at the mask. It would be a shame
			
 
				 	 * if we looked at the mask, but the mask was not
			
 
				 	 * updated yet.
			
 
				+	 *
			
 
				+	 * Matched by the barrier in pull_rt_task().
			
 
				 	 */
			
 
				-	wmb();
			
 
				+	smp_wmb();
			
 
				 	atomic_inc(&rq->rd->rto_count);
			
 
				 }
			
 
				 
			
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
 
				 static int find_lowest_rq(struct task_struct *task);
			
 
				 
			
 
				 static int
			
 
				-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
			
 
				+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
			
 
				 {
			
 
				 	struct task_struct *curr;
			
 
				 	struct rq *rq;
			
 
				-	int cpu;
			
 
				-
			
 
				-	cpu = task_cpu(p);
			
 
				 
			
 
				 	if (p->nr_cpus_allowed == 1)
			
 
				 		goto out;
			
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 
				 	 */
			
 
				 	if (curr && unlikely(rt_task(curr)) &&
			
 
				 	    (curr->nr_cpus_allowed < 2 ||
			
 
				-	     curr->prio <= p->prio) &&
			
 
				-	    (p->nr_cpus_allowed > 1)) {
			
 
				+	     curr->prio <= p->prio)) {
			
 
				 		int target = find_lowest_rq(p);
			
 
				 
			
 
				 		if (target != -1)
			
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
 
				 	if (likely(!rt_overloaded(this_rq)))
			
 
				 		return 0;
			
 
				 
			
 
				+	/*
			
 
				+	 * Match the barrier from rt_set_overload(); this guarantees that if we
			
 
				+	 * see overloaded we must also see the rto_mask bit.
			
 
				+	 */
			
 
				+	smp_rmb();
			
 
				+
			
 
				 	for_each_cpu(cpu, this_rq->rd->rto_mask) {
			
 
				 		if (this_cpu == cpu)
			
 
				 			continue;
			
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 
				 	p->rt.time_slice = sched_rr_timeslice;
			
 
				 
			
 
				 	/*
			
 
				-	 * Requeue to the end of queue if we (and all of our ancestors) are the
			
 
				-	 * only element on the queue
			
 
				+	 * Requeue to the end of queue if we (and all of our ancestors) are not
			
 
				+	 * the only element on the queue
			
 
				 	 */
			
 
				 	for_each_sched_rt_entity(rt_se) {
			
 
				 		if (rt_se->run_list.prev != rt_se->run_list.next) {
			
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
 
				 #include <linux/spinlock.h>
			
 
				 #include <linux/stop_machine.h>
			
 
				 #include <linux/tick.h>
			
 
				+#include <linux/slab.h>
			
 
				 
			
 
				 #include "cpupri.h"
			
 
				 #include "cpuacct.h"
			
@@ -408,6 +409,10 @@ struct rq {
 
				 	 * remote CPUs use both these fields when doing load calculation.
			
 
				 	 */
			
 
				 	unsigned int nr_running;
			
 
				+#ifdef CONFIG_NUMA_BALANCING
			
 
				+	unsigned int nr_numa_running;
			
 
				+	unsigned int nr_preferred_running;
			
 
				+#endif
			
 
				 	#define CPU_LOAD_IDX_MAX 5
			
 
				 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
			
 
				 	unsigned long last_load_update_tick;
			
@@ -476,6 +481,9 @@ struct rq {
 
				 	u64 age_stamp;
			
 
				 	u64 idle_stamp;
			
 
				 	u64 avg_idle;
			
 
				+
			
 
				+	/* This is used to determine avg_idle's max value */
			
 
				+	u64 max_idle_balance_cost;
			
 
				 #endif
			
 
				 
			
 
				 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
			
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
 
				 	return rq->clock_task;
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_NUMA_BALANCING
			
 
				+extern void sched_setnuma(struct task_struct *p, int node);
			
 
				+extern int migrate_task_to(struct task_struct *p, int cpu);
			
 
				+extern int migrate_swap(struct task_struct *, struct task_struct *);
			
 
				+#endif /* CONFIG_NUMA_BALANCING */
			
 
				+
			
 
				 #ifdef CONFIG_SMP
			
 
				 
			
 
				 #define rcu_dereference_check_sched_domain(p) \
			
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 
				 	return hsd;
			
 
				 }
			
 
				 
			
 
				+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
			
 
				+{
			
 
				+	struct sched_domain *sd;
			
 
				+
			
 
				+	for_each_domain(cpu, sd) {
			
 
				+		if (sd->flags & flag)
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	return sd;
			
 
				+}
			
 
				+
			
 
				 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
			
 
				 DECLARE_PER_CPU(int, sd_llc_size);
			
 
				 DECLARE_PER_CPU(int, sd_llc_id);
			
 
				+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
			
 
				+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
			
 
				+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
			
 
				 
			
 
				 struct sched_group_power {
			
 
				 	atomic_t ref;
			
@@ -605,6 +634,7 @@ struct sched_group_power {
 
				 	 */
			
 
				 	unsigned int power, power_orig;
			
 
				 	unsigned long next_update;
			
 
				+	int imbalance; /* XXX unrelated to power but shared group state */
			
 
				 	/*
			
 
				 	 * Number of busy cpus in this group.
			
 
				 	 */
			
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
				 	 */
			
 
				 	smp_wmb();
			
 
				 	task_thread_info(p)->cpu = cpu;
			
 
				+	p->wake_cpu = cpu;
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -974,7 +1005,7 @@ struct sched_class {
 
				 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
			
 
				 
			
 
				 #ifdef CONFIG_SMP
			
 
				-	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
			
 
				+	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
			
 
				 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
			
 
				 
			
 
				 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
			
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 
				 	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
			
 
				 }
			
 
				 
			
 
				+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
			
 
				+{
			
 
				+	if (l1 > l2)
			
 
				+		swap(l1, l2);
			
 
				+
			
 
				+	spin_lock(l1);
			
 
				+	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
			
 
				+}
			
 
				+
			
 
				+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
			
 
				+{
			
 
				+	if (l1 > l2)
			
 
				+		swap(l1, l2);
			
 
				+
			
 
				+	raw_spin_lock(l1);
			
 
				+	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * double_rq_lock - safely lock two runqueues
			
 
				  *
			
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 
				 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
			
 
				 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
			
 
				 
			
 
				-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
			
 
				+extern void cfs_bandwidth_usage_inc(void);
			
 
				+extern void cfs_bandwidth_usage_dec(void);
			
 
				 
			
 
				 #ifdef CONFIG_NO_HZ_COMMON
			
 
				 enum rq_nohz_flag_bits {
			
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
 
				  * from dequeue_task() to account for possible rq->clock skew across cpus. The
			
 
				  * delta taken on each cpu would annul the skew.
			
 
				  */
			
 
				-static inline void sched_info_dequeued(struct task_struct *t)
			
 
				+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
			
 
				 {
			
 
				-	unsigned long long now = rq_clock(task_rq(t)), delta = 0;
			
 
				+	unsigned long long now = rq_clock(rq), delta = 0;
			
 
				 
			
 
				 	if (unlikely(sched_info_on()))
			
 
				 		if (t->sched_info.last_queued)
			
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
 
				 	sched_info_reset_dequeued(t);
			
 
				 	t->sched_info.run_delay += delta;
			
 
				 
			
 
				-	rq_sched_info_dequeued(task_rq(t), delta);
			
 
				+	rq_sched_info_dequeued(rq, delta);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
 
				  * long it was waiting to run.  We also note when it began so that we
			
 
				  * can keep stats on how long its timeslice is.
			
 
				  */
			
 
				-static void sched_info_arrive(struct task_struct *t)
			
 
				+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
			
 
				 {
			
 
				-	unsigned long long now = rq_clock(task_rq(t)), delta = 0;
			
 
				+	unsigned long long now = rq_clock(rq), delta = 0;
			
 
				 
			
 
				 	if (t->sched_info.last_queued)
			
 
				 		delta = now - t->sched_info.last_queued;
			
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
 
				 	t->sched_info.last_arrival = now;
			
 
				 	t->sched_info.pcount++;
			
 
				 
			
 
				-	rq_sched_info_arrive(task_rq(t), delta);
			
 
				+	rq_sched_info_arrive(rq, delta);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
 
				  * the timestamp if it is already not set.  It's assumed that
			
 
				  * sched_info_dequeued() will clear that stamp when appropriate.
			
 
				  */
			
 
				-static inline void sched_info_queued(struct task_struct *t)
			
 
				+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
			
 
				 {
			
 
				 	if (unlikely(sched_info_on()))
			
 
				 		if (!t->sched_info.last_queued)
			
 
				-			t->sched_info.last_queued = rq_clock(task_rq(t));
			
 
				+			t->sched_info.last_queued = rq_clock(rq);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
 
				  * sched_info_queued() to mark that it has now again started waiting on
			
 
				  * the runqueue.
			
 
				  */
			
 
				-static inline void sched_info_depart(struct task_struct *t)
			
 
				+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
			
 
				 {
			
 
				-	unsigned long long delta = rq_clock(task_rq(t)) -
			
 
				+	unsigned long long delta = rq_clock(rq) -
			
 
				 					t->sched_info.last_arrival;
			
 
				 
			
 
				-	rq_sched_info_depart(task_rq(t), delta);
			
 
				+	rq_sched_info_depart(rq, delta);
			
 
				 
			
 
				 	if (t->state == TASK_RUNNING)
			
 
				-		sched_info_queued(t);
			
 
				+		sched_info_queued(rq, t);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
 
				  * the idle task.)  We are only called when prev != next.
			
 
				  */
			
 
				 static inline void
			
 
				-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
			
 
				+__sched_info_switch(struct rq *rq,
			
 
				+		    struct task_struct *prev, struct task_struct *next)
			
 
				 {
			
 
				-	struct rq *rq = task_rq(prev);
			
 
				-
			
 
				 	/*
			
 
				 	 * prev now departs the cpu.  It's not interesting to record
			
 
				 	 * stats about how efficient we were at scheduling the idle
			
 
				 	 * process, however.
			
 
				 	 */
			
 
				 	if (prev != rq->idle)
			
 
				-		sched_info_depart(prev);
			
 
				+		sched_info_depart(rq, prev);
			
 
				 
			
 
				 	if (next != rq->idle)
			
 
				-		sched_info_arrive(next);
			
 
				+		sched_info_arrive(rq, next);
			
 
				 }
			
 
				 static inline void
			
 
				-sched_info_switch(struct task_struct *prev, struct task_struct *next)
			
 
				+sched_info_switch(struct rq *rq,
			
 
				+		  struct task_struct *prev, struct task_struct *next)
			
 
				 {
			
 
				 	if (unlikely(sched_info_on()))
			
 
				-		__sched_info_switch(prev, next);
			
 
				+		__sched_info_switch(rq, prev, next);
			
 
				 }
			
 
				 #else
			
 
				-#define sched_info_queued(t)			do { } while (0)
			
 
				+#define sched_info_queued(rq, t)		do { } while (0)
			
 
				 #define sched_info_reset_dequeued(t)	do { } while (0)
			
 
				-#define sched_info_dequeued(t)			do { } while (0)
			
 
				-#define sched_info_switch(t, next)		do { } while (0)
			
 
				+#define sched_info_dequeued(rq, t)		do { } while (0)
			
 
				+#define sched_info_depart(rq, t)		do { } while (0)
			
 
				+#define sched_info_arrive(rq, next)		do { } while (0)
			
 
				+#define sched_info_switch(rq, t, next)		do { } while (0)
			
 
				 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
			
 
				 
			
 
				 /*
			
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
 
				 
			
 
				 #ifdef CONFIG_SMP
			
 
				 static int
			
 
				-select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
			
 
				+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
			
 
				 {
			
 
				 	return task_cpu(p); /* stop tasks as never migrate */
			
 
				 }
			
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -52,6 +52,109 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
 
				 EXPORT_SYMBOL(remove_wait_queue);
			
 
				 
			
 
				 
			
 
				+/*
			
 
				+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
			
 
				+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
			
 
				+ * number) then we wake all the non-exclusive tasks and one exclusive task.
			
 
				+ *
			
 
				+ * There are circumstances in which we can try to wake a task which has already
			
 
				+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
			
 
				+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
			
 
				+ */
			
 
				+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			
 
				+			int nr_exclusive, int wake_flags, void *key)
			
 
				+{
			
 
				+	wait_queue_t *curr, *next;
			
 
				+
			
 
				+	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
			
 
				+		unsigned flags = curr->flags;
			
 
				+
			
 
				+		if (curr->func(curr, mode, wake_flags, key) &&
			
 
				+				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * __wake_up - wake up threads blocked on a waitqueue.
			
 
				+ * @q: the waitqueue
			
 
				+ * @mode: which threads
			
 
				+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
			
 
				+ * @key: is directly passed to the wakeup function
			
 
				+ *
			
 
				+ * It may be assumed that this function implies a write memory barrier before
			
 
				+ * changing the task state if and only if any tasks are woken up.
			
 
				+ */
			
 
				+void __wake_up(wait_queue_head_t *q, unsigned int mode,
			
 
				+			int nr_exclusive, void *key)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&q->lock, flags);
			
 
				+	__wake_up_common(q, mode, nr_exclusive, 0, key);
			
 
				+	spin_unlock_irqrestore(&q->lock, flags);
			
 
				+}
			
 
				+EXPORT_SYMBOL(__wake_up);
			
 
				+
			
 
				+/*
			
 
				+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
			
 
				+ */
			
 
				+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
			
 
				+{
			
 
				+	__wake_up_common(q, mode, nr, 0, NULL);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(__wake_up_locked);
			
 
				+
			
 
				+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
			
 
				+{
			
 
				+	__wake_up_common(q, mode, 1, 0, key);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
			
 
				+
			
 
				+/**
			
 
				+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
			
 
				+ * @q: the waitqueue
			
 
				+ * @mode: which threads
			
 
				+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
			
 
				+ * @key: opaque value to be passed to wakeup targets
			
 
				+ *
			
 
				+ * The sync wakeup differs in that the waker knows that it will schedule
			
 
				+ * away soon, so while the target thread will be woken up, it will not
			
 
				+ * be migrated to another CPU - ie. the two threads are 'synchronized'
			
 
				+ * with each other. This can prevent needless bouncing between CPUs.
			
 
				+ *
			
 
				+ * On UP it can prevent extra preemption.
			
 
				+ *
			
 
				+ * It may be assumed that this function implies a write memory barrier before
			
 
				+ * changing the task state if and only if any tasks are woken up.
			
 
				+ */
			
 
				+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
			
 
				+			int nr_exclusive, void *key)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	int wake_flags = 1; /* XXX WF_SYNC */
			
 
				+
			
 
				+	if (unlikely(!q))
			
 
				+		return;
			
 
				+
			
 
				+	if (unlikely(nr_exclusive != 1))
			
 
				+		wake_flags = 0;
			
 
				+
			
 
				+	spin_lock_irqsave(&q->lock, flags);
			
 
				+	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
			
 
				+	spin_unlock_irqrestore(&q->lock, flags);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
			
 
				+
			
 
				+/*
			
 
				+ * __wake_up_sync - see __wake_up_sync_key()
			
 
				+ */
			
 
				+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
			
 
				+{
			
 
				+	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
			
 
				+
			
 
				 /*
			
 
				  * Note: we use "set_current_state()" _after_ the wait-queue add,
			
 
				  * because we need a memory barrier there on SMP, so that any
			
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 
				 }
			
 
				 EXPORT_SYMBOL(prepare_to_wait_exclusive);
			
 
				 
			
 
				+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	if (signal_pending_state(state, current))
			
 
				+		return -ERESTARTSYS;
			
 
				+
			
 
				+	wait->private = current;
			
 
				+	wait->func = autoremove_wake_function;
			
 
				+
			
 
				+	spin_lock_irqsave(&q->lock, flags);
			
 
				+	if (list_empty(&wait->task_list)) {
			
 
				+		if (wait->flags & WQ_FLAG_EXCLUSIVE)
			
 
				+			__add_wait_queue_tail(q, wait);
			
 
				+		else
			
 
				+			__add_wait_queue(q, wait);
			
 
				+	}
			
 
				+	set_current_state(state);
			
 
				+	spin_unlock_irqrestore(&q->lock, flags);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL(prepare_to_wait_event);
			
 
				+
			
 
				 /**
			
 
				  * finish_wait - clean up after waiting in a queue
			
 
				  * @q: waitqueue waited on
			
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -99,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 
				 
			
 
				 	raw_local_irq_save(flags);
			
 
				 	/*
			
 
				-	 * The preempt tracer hooks into add_preempt_count and will break
			
 
				+	 * The preempt tracer hooks into preempt_count_add and will break
			
 
				 	 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
			
 
				 	 * is set and before current->softirq_enabled is cleared.
			
 
				 	 * We must manually increment preempt_count here and manually
			
 
				 	 * call the trace_preempt_off later.
			
 
				 	 */
			
 
				-	preempt_count() += cnt;
			
 
				+	__preempt_count_add(cnt);
			
 
				 	/*
			
 
				 	 * Were softirqs turned off above:
			
 
				 	 */
			
@@ -119,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 
				 #else /* !CONFIG_TRACE_IRQFLAGS */
			
 
				 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
			
 
				 {
			
 
				-	add_preempt_count(cnt);
			
 
				+	preempt_count_add(cnt);
			
 
				 	barrier();
			
 
				 }
			
 
				 #endif /* CONFIG_TRACE_IRQFLAGS */
			
@@ -137,7 +137,7 @@ static void __local_bh_enable(unsigned int cnt)
 
				 
			
 
				 	if (softirq_count() == cnt)
			
 
				 		trace_softirqs_on(_RET_IP_);
			
 
				-	sub_preempt_count(cnt);
			
 
				+	preempt_count_sub(cnt);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -168,7 +168,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
 
				 	 * Keep preemption disabled until we are done with
			
 
				 	 * softirq processing:
			
 
				  	 */
			
 
				-	sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
			
 
				+	preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
			
 
				 
			
 
				 	if (unlikely(!in_interrupt() && local_softirq_pending())) {
			
 
				 		/*
			
@@ -178,7 +178,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
 
				 		do_softirq();
			
 
				 	}
			
 
				 
			
 
				-	dec_preempt_count();
			
 
				+	preempt_count_dec();
			
 
				 #ifdef CONFIG_TRACE_IRQFLAGS
			
 
				 	local_irq_enable();
			
 
				 #endif
			
@@ -260,7 +260,7 @@ restart:
 
				 				       " exited with %08x?\n", vec_nr,
			
 
				 				       softirq_to_name[vec_nr], h->action,
			
 
				 				       prev_count, preempt_count());
			
 
				-				preempt_count() = prev_count;
			
 
				+				preempt_count_set(prev_count);
			
 
				 			}
			
 
				 
			
 
				 			rcu_bh_qs(cpu);
			
@@ -378,7 +378,7 @@ void irq_exit(void)
 
				 
			
 
				 	account_irq_exit_time(current);
			
 
				 	trace_hardirq_exit();
			
 
				-	sub_preempt_count(HARDIRQ_OFFSET);
			
 
				+	preempt_count_sub(HARDIRQ_OFFSET);
			
 
				 	if (!in_interrupt() && local_softirq_pending())
			
 
				 		invoke_softirq();
			
 
				 
			
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
 
				 #include <linux/kallsyms.h>
			
 
				 #include <linux/smpboot.h>
			
 
				 #include <linux/atomic.h>
			
 
				+#include <linux/lglock.h>
			
 
				 
			
 
				 /*
			
 
				  * Structure to determine completion condition and record errors.  May
			
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 
				 static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
			
 
				 static bool stop_machine_initialized = false;
			
 
				 
			
 
				+/*
			
 
				+ * Avoids a race between stop_two_cpus and global stop_cpus, where
			
 
				+ * the stoppers could get queued up in reverse order, leading to
			
 
				+ * system deadlock. Using an lglock means stop_two_cpus remains
			
 
				+ * relatively cheap.
			
 
				+ */
			
 
				+DEFINE_STATIC_LGLOCK(stop_cpus_lock);
			
 
				+
			
 
				 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
			
 
				 {
			
 
				 	memset(done, 0, sizeof(*done));
			
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 
				 	return done.executed ? done.ret : -ENOENT;
			
 
				 }
			
 
				 
			
 
				+/* This controls the threads on each CPU. */
			
 
				+enum multi_stop_state {
			
 
				+	/* Dummy starting state for thread. */
			
 
				+	MULTI_STOP_NONE,
			
 
				+	/* Awaiting everyone to be scheduled. */
			
 
				+	MULTI_STOP_PREPARE,
			
 
				+	/* Disable interrupts. */
			
 
				+	MULTI_STOP_DISABLE_IRQ,
			
 
				+	/* Run the function */
			
 
				+	MULTI_STOP_RUN,
			
 
				+	/* Exit */
			
 
				+	MULTI_STOP_EXIT,
			
 
				+};
			
 
				+
			
 
				+struct multi_stop_data {
			
 
				+	int			(*fn)(void *);
			
 
				+	void			*data;
			
 
				+	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
			
 
				+	unsigned int		num_threads;
			
 
				+	const struct cpumask	*active_cpus;
			
 
				+
			
 
				+	enum multi_stop_state	state;
			
 
				+	atomic_t		thread_ack;
			
 
				+};
			
 
				+
			
 
				+static void set_state(struct multi_stop_data *msdata,
			
 
				+		      enum multi_stop_state newstate)
			
 
				+{
			
 
				+	/* Reset ack counter. */
			
 
				+	atomic_set(&msdata->thread_ack, msdata->num_threads);
			
 
				+	smp_wmb();
			
 
				+	msdata->state = newstate;
			
 
				+}
			
 
				+
			
 
				+/* Last one to ack a state moves to the next state. */
			
 
				+static void ack_state(struct multi_stop_data *msdata)
			
 
				+{
			
 
				+	if (atomic_dec_and_test(&msdata->thread_ack))
			
 
				+		set_state(msdata, msdata->state + 1);
			
 
				+}
			
 
				+
			
 
				+/* This is the cpu_stop function which stops the CPU. */
			
 
				+static int multi_cpu_stop(void *data)
			
 
				+{
			
 
				+	struct multi_stop_data *msdata = data;
			
 
				+	enum multi_stop_state curstate = MULTI_STOP_NONE;
			
 
				+	int cpu = smp_processor_id(), err = 0;
			
 
				+	unsigned long flags;
			
 
				+	bool is_active;
			
 
				+
			
 
				+	/*
			
 
				+	 * When called from stop_machine_from_inactive_cpu(), irq might
			
 
				+	 * already be disabled.  Save the state and restore it on exit.
			
 
				+	 */
			
 
				+	local_save_flags(flags);
			
 
				+
			
 
				+	if (!msdata->active_cpus)
			
 
				+		is_active = cpu == cpumask_first(cpu_online_mask);
			
 
				+	else
			
 
				+		is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
			
 
				+
			
 
				+	/* Simple state machine */
			
 
				+	do {
			
 
				+		/* Chill out and ensure we re-read multi_stop_state. */
			
 
				+		cpu_relax();
			
 
				+		if (msdata->state != curstate) {
			
 
				+			curstate = msdata->state;
			
 
				+			switch (curstate) {
			
 
				+			case MULTI_STOP_DISABLE_IRQ:
			
 
				+				local_irq_disable();
			
 
				+				hard_irq_disable();
			
 
				+				break;
			
 
				+			case MULTI_STOP_RUN:
			
 
				+				if (is_active)
			
 
				+					err = msdata->fn(msdata->data);
			
 
				+				break;
			
 
				+			default:
			
 
				+				break;
			
 
				+			}
			
 
				+			ack_state(msdata);
			
 
				+		}
			
 
				+	} while (curstate != MULTI_STOP_EXIT);
			
 
				+
			
 
				+	local_irq_restore(flags);
			
 
				+	return err;
			
 
				+}
			
 
				+
			
 
				+struct irq_cpu_stop_queue_work_info {
			
 
				+	int cpu1;
			
 
				+	int cpu2;
			
 
				+	struct cpu_stop_work *work1;
			
 
				+	struct cpu_stop_work *work2;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * This function is always run with irqs and preemption disabled.
			
 
				+ * This guarantees that both work1 and work2 get queued, before
			
 
				+ * our local migrate thread gets the chance to preempt us.
			
 
				+ */
			
 
				+static void irq_cpu_stop_queue_work(void *arg)
			
 
				+{
			
 
				+	struct irq_cpu_stop_queue_work_info *info = arg;
			
 
				+	cpu_stop_queue_work(info->cpu1, info->work1);
			
 
				+	cpu_stop_queue_work(info->cpu2, info->work2);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * stop_two_cpus - stops two cpus
			
 
				+ * @cpu1: the cpu to stop
			
 
				+ * @cpu2: the other cpu to stop
			
 
				+ * @fn: function to execute
			
 
				+ * @arg: argument to @fn
			
 
				+ *
			
 
				+ * Stops both @cpu1 and @cpu2 and runs @fn on @cpu1.
			
 
				+ *
			
 
				+ * Returns once both CPUs have completed; fails with -ENOENT if either CPU is not active.
			
 
				+ */
			
 
				+int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
			
 
				+{
			
 
				+	struct cpu_stop_done done;
			
 
				+	struct cpu_stop_work work1, work2;
			
 
				+	struct irq_cpu_stop_queue_work_info call_args;
			
 
				+	struct multi_stop_data msdata;
			
 
				+
			
 
				+	preempt_disable();
			
 
				+	msdata = (struct multi_stop_data){
			
 
				+		.fn = fn,
			
 
				+		.data = arg,
			
 
				+		.num_threads = 2,
			
 
				+		.active_cpus = cpumask_of(cpu1),
			
 
				+	};
			
 
				+
			
 
				+	work1 = work2 = (struct cpu_stop_work){
			
 
				+		.fn = multi_cpu_stop,
			
 
				+		.arg = &msdata,
			
 
				+		.done = &done
			
 
				+	};
			
 
				+
			
 
				+	call_args = (struct irq_cpu_stop_queue_work_info){
			
 
				+		.cpu1 = cpu1,
			
 
				+		.cpu2 = cpu2,
			
 
				+		.work1 = &work1,
			
 
				+		.work2 = &work2,
			
 
				+	};
			
 
				+
			
 
				+	cpu_stop_init_done(&done, 2);
			
 
				+	set_state(&msdata, MULTI_STOP_PREPARE);
			
 
				+
			
 
				+	/*
			
 
				+	 * If we observe both CPUs active we know _cpu_down() cannot yet have
			
 
				+	 * queued its stop_machine works and therefore ours will get executed
			
 
				+	 * first. Or it's not either one of our CPUs that's getting unplugged,
			
 
				+	 * in which case we don't care.
			
 
				+	 *
			
 
				+	 * This relies on the stopper workqueues to be FIFO.
			
 
				+	 */
			
 
				+	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
			
 
				+		preempt_enable();
			
 
				+		return -ENOENT;
			
 
				+	}
			
 
				+
			
 
				+	lg_local_lock(&stop_cpus_lock);
			
 
				+	/*
			
 
				+	 * Queuing needs to be done by the lowest numbered CPU, to ensure
			
 
				+	 * that works are always queued in the same order on every CPU.
			
 
				+	 * This prevents deadlocks.
			
 
				+	 */
			
 
				+	smp_call_function_single(min(cpu1, cpu2),
			
 
				+				 &irq_cpu_stop_queue_work,
			
 
				+				 &call_args, 0);
			
 
				+	lg_local_unlock(&stop_cpus_lock);
			
 
				+	preempt_enable();
			
 
				+
			
 
				+	wait_for_completion(&done.completion);
			
 
				+
			
 
				+	return done.executed ? done.ret : -ENOENT;
			
 
				+}
			
 
				+
			
 
				 /**
			
 
				  * stop_one_cpu_nowait - stop a cpu but don't wait for completion
			
 
				  * @cpu: cpu to stop
			
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
 
				 	 * preempted by a stopper which might wait for other stoppers
			
 
				 	 * to enter @fn which can lead to deadlock.
			
 
				 	 */
			
 
				-	preempt_disable();
			
 
				+	lg_global_lock(&stop_cpus_lock);
			
 
				 	for_each_cpu(cpu, cpumask)
			
 
				 		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
			
 
				-	preempt_enable();
			
 
				+	lg_global_unlock(&stop_cpus_lock);
			
 
				 }
			
 
				 
			
 
				 static int __stop_cpus(const struct cpumask *cpumask,
			
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
 
				 
			
 
				 #ifdef CONFIG_STOP_MACHINE
			
 
				 
			
 
				-/* This controls the threads on each CPU. */
			
 
				-enum stopmachine_state {
			
 
				-	/* Dummy starting state for thread. */
			
 
				-	STOPMACHINE_NONE,
			
 
				-	/* Awaiting everyone to be scheduled. */
			
 
				-	STOPMACHINE_PREPARE,
			
 
				-	/* Disable interrupts. */
			
 
				-	STOPMACHINE_DISABLE_IRQ,
			
 
				-	/* Run the function */
			
 
				-	STOPMACHINE_RUN,
			
 
				-	/* Exit */
			
 
				-	STOPMACHINE_EXIT,
			
 
				-};
			
 
				-
			
 
				-struct stop_machine_data {
			
 
				-	int			(*fn)(void *);
			
 
				-	void			*data;
			
 
				-	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
			
 
				-	unsigned int		num_threads;
			
 
				-	const struct cpumask	*active_cpus;
			
 
				-
			
 
				-	enum stopmachine_state	state;
			
 
				-	atomic_t		thread_ack;
			
 
				-};
			
 
				-
			
 
				-static void set_state(struct stop_machine_data *smdata,
			
 
				-		      enum stopmachine_state newstate)
			
 
				-{
			
 
				-	/* Reset ack counter. */
			
 
				-	atomic_set(&smdata->thread_ack, smdata->num_threads);
			
 
				-	smp_wmb();
			
 
				-	smdata->state = newstate;
			
 
				-}
			
 
				-
			
 
				-/* Last one to ack a state moves to the next state. */
			
 
				-static void ack_state(struct stop_machine_data *smdata)
			
 
				-{
			
 
				-	if (atomic_dec_and_test(&smdata->thread_ack))
			
 
				-		set_state(smdata, smdata->state + 1);
			
 
				-}
			
 
				-
			
 
				-/* This is the cpu_stop function which stops the CPU. */
			
 
				-static int stop_machine_cpu_stop(void *data)
			
 
				-{
			
 
				-	struct stop_machine_data *smdata = data;
			
 
				-	enum stopmachine_state curstate = STOPMACHINE_NONE;
			
 
				-	int cpu = smp_processor_id(), err = 0;
			
 
				-	unsigned long flags;
			
 
				-	bool is_active;
			
 
				-
			
 
				-	/*
			
 
				-	 * When called from stop_machine_from_inactive_cpu(), irq might
			
 
				-	 * already be disabled.  Save the state and restore it on exit.
			
 
				-	 */
			
 
				-	local_save_flags(flags);
			
 
				-
			
 
				-	if (!smdata->active_cpus)
			
 
				-		is_active = cpu == cpumask_first(cpu_online_mask);
			
 
				-	else
			
 
				-		is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
			
 
				-
			
 
				-	/* Simple state machine */
			
 
				-	do {
			
 
				-		/* Chill out and ensure we re-read stopmachine_state. */
			
 
				-		cpu_relax();
			
 
				-		if (smdata->state != curstate) {
			
 
				-			curstate = smdata->state;
			
 
				-			switch (curstate) {
			
 
				-			case STOPMACHINE_DISABLE_IRQ:
			
 
				-				local_irq_disable();
			
 
				-				hard_irq_disable();
			
 
				-				break;
			
 
				-			case STOPMACHINE_RUN:
			
 
				-				if (is_active)
			
 
				-					err = smdata->fn(smdata->data);
			
 
				-				break;
			
 
				-			default:
			
 
				-				break;
			
 
				-			}
			
 
				-			ack_state(smdata);
			
 
				-		}
			
 
				-	} while (curstate != STOPMACHINE_EXIT);
			
 
				-
			
 
				-	local_irq_restore(flags);
			
 
				-	return err;
			
 
				-}
			
 
				-
			
 
				 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
			
 
				 {
			
 
				-	struct stop_machine_data smdata = { .fn = fn, .data = data,
			
 
				-					    .num_threads = num_online_cpus(),
			
 
				-					    .active_cpus = cpus };
			
 
				+	struct multi_stop_data msdata = {
			
 
				+		.fn = fn,
			
 
				+		.data = data,
			
 
				+		.num_threads = num_online_cpus(),
			
 
				+		.active_cpus = cpus,
			
 
				+	};
			
 
				 
			
 
				 	if (!stop_machine_initialized) {
			
 
				 		/*
			
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 
				 		unsigned long flags;
			
 
				 		int ret;
			
 
				 
			
 
				-		WARN_ON_ONCE(smdata.num_threads != 1);
			
 
				+		WARN_ON_ONCE(msdata.num_threads != 1);
			
 
				 
			
 
				 		local_irq_save(flags);
			
 
				 		hard_irq_disable();
			
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 
				 	}
			
 
				 
			
 
				 	/* Set the initial state and stop all online cpus. */
			
 
				-	set_state(&smdata, STOPMACHINE_PREPARE);
			
 
				-	return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
			
 
				+	set_state(&msdata, MULTI_STOP_PREPARE);
			
 
				+	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
			
 
				 }
			
 
				 
			
 
				 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
			
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
 
				 int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
			
 
				 				  const struct cpumask *cpus)
			
 
				 {
			
 
				-	struct stop_machine_data smdata = { .fn = fn, .data = data,
			
 
				+	struct multi_stop_data msdata = { .fn = fn, .data = data,
			
 
				 					    .active_cpus = cpus };
			
 
				 	struct cpu_stop_done done;
			
 
				 	int ret;
			
 
				 
			
 
				 	/* Local CPU must be inactive and CPU hotplug in progress. */
			
 
				 	BUG_ON(cpu_active(raw_smp_processor_id()));
			
 
				-	smdata.num_threads = num_active_cpus() + 1;	/* +1 for local */
			
 
				+	msdata.num_threads = num_active_cpus() + 1;	/* +1 for local */
			
 
				 
			
 
				 	/* No proper task established and can't sleep - busy wait for lock. */
			
 
				 	while (!mutex_trylock(&stop_cpus_mutex))
			
 
				 		cpu_relax();
			
 
				 
			
 
				 	/* Schedule work on other CPUs and execute directly for local CPU */
			
 
				-	set_state(&smdata, STOPMACHINE_PREPARE);
			
 
				+	set_state(&msdata, MULTI_STOP_PREPARE);
			
 
				 	cpu_stop_init_done(&done, num_active_cpus());
			
 
				-	queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
			
 
				+	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
			
 
				 			     &done);
			
 
				-	ret = stop_machine_cpu_stop(&smdata);
			
 
				+	ret = multi_cpu_stop(&msdata);
			
 
				 
			
 
				 	/* Busy wait for completion. */
			
 
				 	while (!completion_done(&done.completion))