17 years ago · ace7f1b796
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -84,10 +84,9 @@
 
				     runs an instance of gdb against the vmlinux file which contains
			
 
				     the symbols (not boot image such as bzImage, zImage, uImage...).
			
 
				     In gdb the developer specifies the connection parameters and
			
 
				-    connects to kgdb.  Depending on which kgdb I/O modules exist in
			
 
				-    the kernel for a given architecture, it may be possible to debug
			
 
				-    the test machine's kernel with the development machine using a
			
 
				-    rs232 or ethernet connection.
			
 
				+    connects to kgdb.  The type of connection a developer makes with
			
 
				+    gdb depends on the availability of kgdb I/O modules compiled as
			
 
				+    built-ins or kernel modules in the test machine's kernel.
			
 
				     </para>
			
 
				   </chapter>
			
 
				   <chapter id="CompilingAKernel">
			
@@ -223,7 +222,7 @@
 
				   </para>
			
 
				   <para>
			
 
				   IMPORTANT NOTE: Using this option with kgdb over the console
			
 
				-  (kgdboc) or kgdb over ethernet (kgdboe) is not supported.
			
 
				+  (kgdboc) is not supported.
			
 
				   </para>
			
 
				   </sect1>
			
 
				   </chapter>
			
@@ -249,18 +248,11 @@
 
				     (gdb) target remote /dev/ttyS0
			
 
				     </programlisting>
			
 
				     <para>
			
 
				-    Example (kgdb to a terminal server):
			
 
				+    Example (kgdb to a terminal server on tcp port 2012):
			
 
				     </para>
			
 
				     <programlisting>
			
 
				     % gdb ./vmlinux
			
 
				-    (gdb) target remote udp:192.168.2.2:6443
			
 
				-    </programlisting>
			
 
				-    <para>
			
 
				-    Example (kgdb over ethernet):
			
 
				-    </para>
			
 
				-    <programlisting>
			
 
				-    % gdb ./vmlinux
			
 
				-    (gdb) target remote udp:192.168.2.2:6443
			
 
				+    (gdb) target remote 192.168.2.2:2012
			
 
				     </programlisting>
			
 
				     <para>
			
 
				     Once connected, you can debug a kernel the way you would debug an
			
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 
				 VERSION = 2
			
 
				 PATCHLEVEL = 6
			
 
				 SUBLEVEL = 26
			
 
				-EXTRAVERSION = -rc7
			
 
				+EXTRAVERSION = -rc8
			
 
				 NAME = Rotary Wombat
			
 
				 
			
 
				 # *DOCUMENTATION*
			
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void)
 
				 	if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {
			
 
				 		rte = alloc_bootmem(sizeof(struct iosapic_rte_info) *
			
 
				 				    NR_PREALLOCATE_RTE_ENTRIES);
			
 
				-		if (!rte)
			
 
				-			return NULL;
			
 
				 		for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)
			
 
				 			list_add(&rte->rte_list, &free_rte_list);
			
 
				 	}
			
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -578,8 +578,6 @@ setup_arch (char **cmdline_p)
 
				 	cpu_init();	/* initialize the bootstrap CPU */
			
 
				 	mmu_context_init();	/* initialize context_id bitmap */
			
 
				 
			
 
				-	check_sal_cache_flush();
			
 
				-
			
 
				 #ifdef CONFIG_ACPI
			
 
				 	acpi_boot_init();
			
 
				 #endif
			
@@ -607,6 +605,7 @@ setup_arch (char **cmdline_p)
 
				 		ia64_mca_init();
			
 
				 
			
 
				 	platform_setup(cmdline_p);
			
 
				+	check_sal_cache_flush();
			
 
				 	paging_init();
			
 
				 }
			
 
				 
			
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si
 
				 	int cpu;
			
 
				 	char optstr[64];
			
 
				 
			
 
				-	if (count > sizeof(optstr))
			
 
				+	if (count == 0 || count > sizeof(optstr))
			
 
				 		return -EINVAL;
			
 
				 	if (copy_from_user(optstr, user, count))
			
 
				 		return -EFAULT;
			
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -383,6 +383,7 @@ config VMI
 
				 config KVM_CLOCK
			
 
				 	bool "KVM paravirtualized clock"
			
 
				 	select PARAVIRT
			
 
				+	select PARAVIRT_CLOCK
			
 
				 	depends on !(X86_VISWS || X86_VOYAGER)
			
 
				 	help
			
 
				 	  Turning on this option will allow you to run a paravirtualized clock
			
@@ -410,6 +411,10 @@ config PARAVIRT
 
				 	  over full virtualization.  However, when run without a hypervisor
			
 
				 	  the kernel is theoretically slower and slightly larger.
			
 
				 
			
 
				+config PARAVIRT_CLOCK
			
 
				+	bool
			
 
				+	default n
			
 
				+
			
 
				 endif
			
 
				 
			
 
				 config MEMTEST_BOOTPARAM
			
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
 
				 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
			
 
				 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
			
 
				 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
			
 
				+obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
			
 
				 
			
 
				 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
			
 
				 
			
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
 
				 
			
 
				 #include <linux/clocksource.h>
			
 
				 #include <linux/kvm_para.h>
			
 
				+#include <asm/pvclock.h>
			
 
				 #include <asm/arch_hooks.h>
			
 
				 #include <asm/msr.h>
			
 
				 #include <asm/apic.h>
			
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
 
				 early_param("no-kvmclock", parse_no_kvmclock);
			
 
				 
			
 
				 /* The hypervisor will put information about time periodically here */
			
 
				-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
			
 
				-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
			
 
				+static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
			
 
				+static struct pvclock_wall_clock wall_clock;
			
 
				 
			
 
				-static inline u64 kvm_get_delta(u64 last_tsc)
			
 
				-{
			
 
				-	int cpu = smp_processor_id();
			
 
				-	u64 delta = native_read_tsc() - last_tsc;
			
 
				-	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
			
 
				-}
			
 
				-
			
 
				-static struct kvm_wall_clock wall_clock;
			
 
				-static cycle_t kvm_clock_read(void);
			
 
				 /*
			
 
				  * The wallclock is the time of day when we booted. Since then, some time may
			
 
				  * have elapsed since the hypervisor wrote the data. So we try to account for
			
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
 
				  */
			
 
				 static unsigned long kvm_get_wallclock(void)
			
 
				 {
			
 
				-	u32 wc_sec, wc_nsec;
			
 
				-	u64 delta;
			
 
				+	struct pvclock_vcpu_time_info *vcpu_time;
			
 
				 	struct timespec ts;
			
 
				-	int version, nsec;
			
 
				 	int low, high;
			
 
				 
			
 
				 	low = (int)__pa(&wall_clock);
			
 
				 	high = ((u64)__pa(&wall_clock) >> 32);
			
 
				+	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
			
 
				 
			
 
				-	delta = kvm_clock_read();
			
 
				+	vcpu_time = &get_cpu_var(hv_clock);
			
 
				+	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
			
 
				+	put_cpu_var(hv_clock);
			
 
				 
			
 
				-	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
			
 
				-	do {
			
 
				-		version = wall_clock.wc_version;
			
 
				-		rmb();
			
 
				-		wc_sec = wall_clock.wc_sec;
			
 
				-		wc_nsec = wall_clock.wc_nsec;
			
 
				-		rmb();
			
 
				-	} while ((wall_clock.wc_version != version) || (version & 1));
			
 
				-
			
 
				-	delta = kvm_clock_read() - delta;
			
 
				-	delta += wc_nsec;
			
 
				-	nsec = do_div(delta, NSEC_PER_SEC);
			
 
				-	set_normalized_timespec(&ts, wc_sec + delta, nsec);
			
 
				-	/*
			
 
				-	 * Of all mechanisms of time adjustment I've tested, this one
			
 
				-	 * was the champion!
			
 
				-	 */
			
 
				-	return ts.tv_sec + 1;
			
 
				+	return ts.tv_sec;
			
 
				 }
			
 
				 
			
 
				 static int kvm_set_wallclock(unsigned long now)
			
 
				 {
			
 
				-	return 0;
			
 
				+	return -1;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * This is our read_clock function. The host puts an tsc timestamp each time
			
 
				- * it updates a new time. Without the tsc adjustment, we can have a situation
			
 
				- * in which a vcpu starts to run earlier (smaller system_time), but probes
			
 
				- * time later (compared to another vcpu), leading to backwards time
			
 
				- */
			
 
				 static cycle_t kvm_clock_read(void)
			
 
				 {
			
 
				-	u64 last_tsc, now;
			
 
				-	int cpu;
			
 
				+	struct pvclock_vcpu_time_info *src;
			
 
				+	cycle_t ret;
			
 
				 
			
 
				-	preempt_disable();
			
 
				-	cpu = smp_processor_id();
			
 
				-
			
 
				-	last_tsc = get_clock(cpu, tsc_timestamp);
			
 
				-	now = get_clock(cpu, system_time);
			
 
				-
			
 
				-	now += kvm_get_delta(last_tsc);
			
 
				-	preempt_enable();
			
 
				-
			
 
				-	return now;
			
 
				+	src = &get_cpu_var(hv_clock);
			
 
				+	ret = pvclock_clocksource_read(src);
			
 
				+	put_cpu_var(hv_clock);
			
 
				+	return ret;
			
 
				 }
			
 
				+
			
 
				 static struct clocksource kvm_clock = {
			
 
				 	.name = "kvm-clock",
			
 
				 	.read = kvm_clock_read,
			
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
 
				 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
			
 
				 };
			
 
				 
			
 
				-static int kvm_register_clock(void)
			
 
				+static int kvm_register_clock(char *txt)
			
 
				 {
			
 
				 	int cpu = smp_processor_id();
			
 
				 	int low, high;
			
 
				 	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
			
 
				 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
			
 
				-
			
 
				+	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
			
 
				+	       cpu, high, low, txt);
			
 
				 	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
			
 
				 }
			
 
				 
			
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
 
				 	 * Now that the first cpu already had this clocksource initialized,
			
 
				 	 * we shouldn't fail.
			
 
				 	 */
			
 
				-	WARN_ON(kvm_register_clock());
			
 
				+	WARN_ON(kvm_register_clock("secondary cpu clock"));
			
 
				 	/* ok, done with our trickery, call native */
			
 
				 	setup_secondary_APIC_clock();
			
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+#ifdef CONFIG_SMP
			
 
				+void __init kvm_smp_prepare_boot_cpu(void)
			
 
				+{
			
 
				+	WARN_ON(kvm_register_clock("primary cpu clock"));
			
 
				+	native_smp_prepare_boot_cpu();
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  * After the clock is registered, the host will keep writing to the
			
 
				  * registered memory location. If the guest happens to shutdown, this memory
			
@@ -174,13 +148,16 @@ void __init kvmclock_init(void)
 
				 		return;
			
 
				 
			
 
				 	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
			
 
				-		if (kvm_register_clock())
			
 
				+		if (kvm_register_clock("boot clock"))
			
 
				 			return;
			
 
				 		pv_time_ops.get_wallclock = kvm_get_wallclock;
			
 
				 		pv_time_ops.set_wallclock = kvm_set_wallclock;
			
 
				 		pv_time_ops.sched_clock = kvm_clock_read;
			
 
				 #ifdef CONFIG_X86_LOCAL_APIC
			
 
				 		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
			
 
				+#endif
			
 
				+#ifdef CONFIG_SMP
			
 
				+		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
			
 
				 #endif
			
 
				 		machine_ops.shutdown  = kvm_shutdown;
			
 
				 #ifdef CONFIG_KEXEC
			
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
 
				+/*  paravirtual clock -- common code used by kvm/xen
			
 
				+
			
 
				+    This program is free software; you can redistribute it and/or modify
			
 
				+    it under the terms of the GNU General Public License as published by
			
 
				+    the Free Software Foundation; either version 2 of the License, or
			
 
				+    (at your option) any later version.
			
 
				+
			
 
				+    This program is distributed in the hope that it will be useful,
			
 
				+    but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+    GNU General Public License for more details.
			
 
				+
			
 
				+    You should have received a copy of the GNU General Public License
			
 
				+    along with this program; if not, write to the Free Software
			
 
				+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
			
 
				+*/
			
 
				+
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/percpu.h>
			
 
				+#include <asm/pvclock.h>
			
 
				+
			
 
				+/*
			
 
				+ * These are periodically updated
			
 
				+ *    xen: magic shared_info page
			
 
				+ *    kvm: gpa registered via msr
			
 
				+ * and then copied here.
			
 
				+ */
			
 
				+struct pvclock_shadow_time {
			
 
				+	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
			
 
				+	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
			
 
				+	u32 tsc_to_nsec_mul;
			
 
				+	int tsc_shift;
			
 
				+	u32 version;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
			
 
				+ * yielding a 64-bit result.
			
 
				+ */
			
 
				+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
			
 
				+{
			
 
				+	u64 product;
			
 
				+#ifdef __i386__
			
 
				+	u32 tmp1, tmp2;
			
 
				+#endif
			
 
				+
			
 
				+	if (shift < 0)
			
 
				+		delta >>= -shift;
			
 
				+	else
			
 
				+		delta <<= shift;
			
 
				+
			
 
				+#ifdef __i386__
			
 
				+	__asm__ (
			
 
				+		"mul  %5       ; "
			
 
				+		"mov  %4,%%eax ; "
			
 
				+		"mov  %%edx,%4 ; "
			
 
				+		"mul  %5       ; "
			
 
				+		"xor  %5,%5    ; "
			
 
				+		"add  %4,%%eax ; "
			
 
				+		"adc  %5,%%edx ; "
			
 
				+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
			
 
				+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
			
 
				+#elif __x86_64__
			
 
				+	__asm__ (
			
 
				+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
			
 
				+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
			
 
				+#else
			
 
				+#error implement me!
			
 
				+#endif
			
 
				+
			
 
				+	return product;
			
 
				+}
			
 
				+
			
 
				+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
			
 
				+{
			
 
				+	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
			
 
				+	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Reads a consistent set of time-base values from hypervisor,
			
 
				+ * into a shadow data area.
			
 
				+ */
			
 
				+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
			
 
				+					struct pvclock_vcpu_time_info *src)
			
 
				+{
			
 
				+	do {
			
 
				+		dst->version = src->version;
			
 
				+		rmb();		/* fetch version before data */
			
 
				+		dst->tsc_timestamp     = src->tsc_timestamp;
			
 
				+		dst->system_timestamp  = src->system_time;
			
 
				+		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
			
 
				+		dst->tsc_shift         = src->tsc_shift;
			
 
				+		rmb();		/* test version after fetching data */
			
 
				+	} while ((src->version & 1) || (dst->version != src->version));
			
 
				+
			
 
				+	return dst->version;
			
 
				+}
			
 
				+
			
 
				+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
			
 
				+{
			
 
				+	struct pvclock_shadow_time shadow;
			
 
				+	unsigned version;
			
 
				+	cycle_t ret, offset;
			
 
				+
			
 
				+	do {
			
 
				+		version = pvclock_get_time_values(&shadow, src);
			
 
				+		barrier();
			
 
				+		offset = pvclock_get_nsec_offset(&shadow);
			
 
				+		ret = shadow.system_timestamp + offset;
			
 
				+		barrier();
			
 
				+	} while (version != src->version);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			
 
				+			    struct pvclock_vcpu_time_info *vcpu_time,
			
 
				+			    struct timespec *ts)
			
 
				+{
			
 
				+	u32 version;
			
 
				+	u64 delta;
			
 
				+	struct timespec now;
			
 
				+
			
 
				+	/* get wallclock at system boot */
			
 
				+	do {
			
 
				+		version = wall_clock->version;
			
 
				+		rmb();		/* fetch version before time */
			
 
				+		now.tv_sec  = wall_clock->sec;
			
 
				+		now.tv_nsec = wall_clock->nsec;
			
 
				+		rmb();		/* fetch time before checking version */
			
 
				+	} while ((wall_clock->version & 1) || (version != wall_clock->version));
			
 
				+
			
 
				+	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
			
 
				+	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
			
 
				+
			
 
				+	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
			
 
				+	now.tv_sec = delta;
			
 
				+
			
 
				+	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
			
 
				+}
			
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
 
				 
			
 
				 	atomic_inc(&pt->pending);
			
 
				 	smp_mb__after_atomic_inc();
			
 
				-	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
			
 
				-		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
			
 
				-		wake_up_interruptible(&vcpu0->wq);
			
 
				+	if (vcpu0) {
			
 
				+		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
			
 
				+		if (waitqueue_active(&vcpu0->wq)) {
			
 
				+			vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
			
 
				+			wake_up_interruptible(&vcpu0->wq);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
			
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 
				 	wait_queue_head_t *q = &apic->vcpu->wq;
			
 
				 
			
 
				 	atomic_inc(&apic->timer.pending);
			
 
				+	set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
			
 
				 	if (waitqueue_active(q)) {
			
 
				 		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
			
 
				 		wake_up_interruptible(q);
			
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 
				 			rmap_remove(kvm, spte);
			
 
				 			--kvm->stat.lpages;
			
 
				 			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
			
 
				+			spte = NULL;
			
 
				 			write_protected = 1;
			
 
				 		}
			
 
				 		spte = rmap_next(kvm, rmapp, spte);
			
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
				 		struct kvm_mmu_page *shadow;
			
 
				 
			
 
				 		spte |= PT_WRITABLE_MASK;
			
 
				-		if (user_fault) {
			
 
				-			mmu_unshadow(vcpu->kvm, gfn);
			
 
				-			goto unshadowed;
			
 
				-		}
			
 
				 
			
 
				 		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
			
 
				 		if (shadow ||
			
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-unshadowed:
			
 
				-
			
 
				 	if (pte_access & ACC_WRITE_MASK)
			
 
				 		mark_page_dirty(vcpu->kvm, gfn);
			
 
				 
			
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 
				 				  u64 *spte,
			
 
				 				  const void *new)
			
 
				 {
			
 
				-	if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
			
 
				-	    && !vcpu->arch.update_pte.largepage) {
			
 
				-		++vcpu->kvm->stat.mmu_pde_zapped;
			
 
				-		return;
			
 
				-	}
			
 
				+	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
			
 
				+		if (!vcpu->arch.update_pte.largepage ||
			
 
				+		    sp->role.glevels == PT32_ROOT_LEVEL) {
			
 
				+			++vcpu->kvm->stat.mmu_pde_zapped;
			
 
				+			return;
			
 
				+		}
			
 
				+        }
			
 
				 
			
 
				 	++vcpu->kvm->stat.mmu_pte_updated;
			
 
				 	if (sp->role.glevels == PT32_ROOT_LEVEL)
			
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 
				 	load_transition_efer(vmx);
			
 
				 }
			
 
				 
			
 
				-static void vmx_load_host_state(struct vcpu_vmx *vmx)
			
 
				+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
			
 
				 {
			
 
				 	unsigned long flags;
			
 
				 
			
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 
				 	reload_host_efer(vmx);
			
 
				 }
			
 
				 
			
 
				+static void vmx_load_host_state(struct vcpu_vmx *vmx)
			
 
				+{
			
 
				+	preempt_disable();
			
 
				+	__vmx_load_host_state(vmx);
			
 
				+	preempt_enable();
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
			
 
				  * vcpu mutex is already taken.
			
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
				 
			
 
				 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				-	vmx_load_host_state(to_vmx(vcpu));
			
 
				+	__vmx_load_host_state(to_vmx(vcpu));
			
 
				 }
			
 
				 
			
 
				 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
			
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
				 	switch (msr_index) {
			
 
				 #ifdef CONFIG_X86_64
			
 
				 	case MSR_EFER:
			
 
				+		vmx_load_host_state(vmx);
			
 
				 		ret = kvm_set_msr_common(vcpu, msr_index, data);
			
 
				-		if (vmx->host_state.loaded) {
			
 
				-			reload_host_efer(vmx);
			
 
				-			load_transition_efer(vmx);
			
 
				-		}
			
 
				 		break;
			
 
				 	case MSR_FS_BASE:
			
 
				 		vmcs_writel(GUEST_FS_BASE, data);
			
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
				 		guest_write_tsc(data);
			
 
				 		break;
			
 
				 	default:
			
 
				+		vmx_load_host_state(vmx);
			
 
				 		msr = find_msr_entry(vmx, msr_index);
			
 
				 		if (msr) {
			
 
				 			msr->data = data;
			
 
				-			if (vmx->host_state.loaded)
			
 
				-				load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
			
 
				 			break;
			
 
				 		}
			
 
				 		ret = kvm_set_msr_common(vcpu, msr_index, data);
			
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 
				 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
			
 
				 {
			
 
				 	static int version;
			
 
				-	struct kvm_wall_clock wc;
			
 
				-	struct timespec wc_ts;
			
 
				+	struct pvclock_wall_clock wc;
			
 
				+	struct timespec now, sys, boot;
			
 
				 
			
 
				 	if (!wall_clock)
			
 
				 		return;
			
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
				 
			
 
				 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
			
 
				 
			
 
				-	wc_ts = current_kernel_time();
			
 
				-	wc.wc_sec = wc_ts.tv_sec;
			
 
				-	wc.wc_nsec = wc_ts.tv_nsec;
			
 
				-	wc.wc_version = version;
			
 
				+	/*
			
 
				+	 * The guest calculates current wall clock time by adding
			
 
				+	 * system time (updated by kvm_write_guest_time below) to the
			
 
				+	 * wall clock specified here.  guest system time equals host
			
 
				+	 * system time for us, thus we must fill in host boot time here.
			
 
				+	 */
			
 
				+	now = current_kernel_time();
			
 
				+	ktime_get_ts(&sys);
			
 
				+	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
			
 
				+
			
 
				+	wc.sec = boot.tv_sec;
			
 
				+	wc.nsec = boot.tv_nsec;
			
 
				+	wc.version = version;
			
 
				 
			
 
				 	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
			
 
				 
			
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
				 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
			
 
				 }
			
 
				 
			
 
				+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
			
 
				+{
			
 
				+	uint32_t quotient, remainder;
			
 
				+
			
 
				+	/* Don't try to replace with do_div(), this one calculates
			
 
				+	 * "(dividend << 32) / divisor" */
			
 
				+	__asm__ ( "divl %4"
			
 
				+		  : "=a" (quotient), "=d" (remainder)
			
 
				+		  : "0" (0), "1" (dividend), "r" (divisor) );
			
 
				+	return quotient;
			
 
				+}
			
 
				+
			
 
				+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
			
 
				+{
			
 
				+	uint64_t nsecs = 1000000000LL;
			
 
				+	int32_t  shift = 0;
			
 
				+	uint64_t tps64;
			
 
				+	uint32_t tps32;
			
 
				+
			
 
				+	tps64 = tsc_khz * 1000LL;
			
 
				+	while (tps64 > nsecs*2) {
			
 
				+		tps64 >>= 1;
			
 
				+		shift--;
			
 
				+	}
			
 
				+
			
 
				+	tps32 = (uint32_t)tps64;
			
 
				+	while (tps32 <= (uint32_t)nsecs) {
			
 
				+		tps32 <<= 1;
			
 
				+		shift++;
			
 
				+	}
			
 
				+
			
 
				+	hv_clock->tsc_shift = shift;
			
 
				+	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
			
 
				+
			
 
				+	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
			
 
				+		 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
			
 
				+		 hv_clock->tsc_to_system_mul);
			
 
				+}
			
 
				+
			
 
				 static void kvm_write_guest_time(struct kvm_vcpu *v)
			
 
				 {
			
 
				 	struct timespec ts;
			
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 
				 	if ((!vcpu->time_page))
			
 
				 		return;
			
 
				 
			
 
				+	if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
			
 
				+		kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
			
 
				+		vcpu->hv_clock_tsc_khz = tsc_khz;
			
 
				+	}
			
 
				+
			
 
				 	/* Keep irq disabled to prevent changes to the clock */
			
 
				 	local_irq_save(flags);
			
 
				 	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
			
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 
				 	/*
			
 
				 	 * The interface expects us to write an even number signaling that the
			
 
				 	 * update is finished. Since the guest won't see the intermediate
			
 
				-	 * state, we just write "2" at the end
			
 
				+	 * state, we just increase by 2 at the end.
			
 
				 	 */
			
 
				-	vcpu->hv_clock.version = 2;
			
 
				+	vcpu->hv_clock.version += 2;
			
 
				 
			
 
				 	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
			
 
				 
			
 
				 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
			
 
				-		sizeof(vcpu->hv_clock));
			
 
				+	       sizeof(vcpu->hv_clock));
			
 
				 
			
 
				 	kunmap_atomic(shared_kaddr, KM_USER0);
			
 
				 
			
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 
				 		/* ...but clean it before doing the actual write */
			
 
				 		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
			
 
				 
			
 
				-		vcpu->arch.hv_clock.tsc_to_system_mul =
			
 
				-					clocksource_khz2mult(tsc_khz, 22);
			
 
				-		vcpu->arch.hv_clock.tsc_shift = 22;
			
 
				-
			
 
				 		down_read(&current->mm->mmap_sem);
			
 
				 		vcpu->arch.time_page =
			
 
				 				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
			
@@ -2759,6 +2808,8 @@ again:
 
				 	if (vcpu->requests) {
			
 
				 		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
			
 
				 			__kvm_migrate_timers(vcpu);
			
 
				+		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			
 
				+			kvm_x86_ops->tlb_flush(vcpu);
			
 
				 		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
			
 
				 				       &vcpu->requests)) {
			
 
				 			kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
			
@@ -2772,6 +2823,7 @@ again:
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
			
 
				 	kvm_inject_pending_timer_irqs(vcpu);
			
 
				 
			
 
				 	preempt_disable();
			
@@ -2781,21 +2833,13 @@ again:
 
				 
			
 
				 	local_irq_disable();
			
 
				 
			
 
				-	if (need_resched()) {
			
 
				+	if (vcpu->requests || need_resched()) {
			
 
				 		local_irq_enable();
			
 
				 		preempt_enable();
			
 
				 		r = 1;
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	if (vcpu->requests)
			
 
				-		if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
			
 
				-			local_irq_enable();
			
 
				-			preempt_enable();
			
 
				-			r = 1;
			
 
				-			goto out;
			
 
				-		}
			
 
				-
			
 
				 	if (signal_pending(current)) {
			
 
				 		local_irq_enable();
			
 
				 		preempt_enable();
			
@@ -2825,9 +2869,6 @@ again:
 
				 
			
 
				 	kvm_guest_enter();
			
 
				 
			
 
				-	if (vcpu->requests)
			
 
				-		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			
 
				-			kvm_x86_ops->tlb_flush(vcpu);
			
 
				 
			
 
				 	KVMTRACE_0D(VMENTRY, vcpu, entryexit);
			
 
				 	kvm_x86_ops->run(vcpu, kvm_run);
			
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
 
				 config XEN
			
 
				 	bool "Xen guest support"
			
 
				 	select PARAVIRT
			
 
				+	select PARAVIRT_CLOCK
			
 
				 	depends on X86_32
			
 
				-	depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
			
 
				+	depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
			
 
				 	help
			
 
				 	  This is the Linux Xen port.  Enabling this will allow the
			
 
				 	  kernel to boot in a paravirtualized environment under the
			
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 
				 static __init void xen_pagetable_setup_start(pgd_t *base)
			
 
				 {
			
 
				 	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
			
 
				+	int i;
			
 
				 
			
 
				 	/* special set_pte for pagetable initialization */
			
 
				 	pv_mmu_ops.set_pte = xen_set_pte_init;
			
 
				 
			
 
				 	init_mm.pgd = base;
			
 
				 	/*
			
 
				-	 * copy top-level of Xen-supplied pagetable into place.	 For
			
 
				-	 * !PAE we can use this as-is, but for PAE it is a stand-in
			
 
				-	 * while we copy the pmd pages.
			
 
				+	 * copy top-level of Xen-supplied pagetable into place.  This
			
 
				+	 * is a stand-in while we copy the pmd pages.
			
 
				 	 */
			
 
				 	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
			
 
				 
			
 
				-	if (PTRS_PER_PMD > 1) {
			
 
				-		int i;
			
 
				-		/*
			
 
				-		 * For PAE, need to allocate new pmds, rather than
			
 
				-		 * share Xen's, since Xen doesn't like pmd's being
			
 
				-		 * shared between address spaces.
			
 
				-		 */
			
 
				-		for (i = 0; i < PTRS_PER_PGD; i++) {
			
 
				-			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
			
 
				-				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
			
 
				+	/*
			
 
				+	 * For PAE, need to allocate new pmds, rather than
			
 
				+	 * share Xen's, since Xen doesn't like pmd's being
			
 
				+	 * shared between address spaces.
			
 
				+	 */
			
 
				+	for (i = 0; i < PTRS_PER_PGD; i++) {
			
 
				+		if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
			
 
				+			pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
			
 
				 
			
 
				-				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
			
 
				-				       PAGE_SIZE);
			
 
				+			memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
			
 
				+			       PAGE_SIZE);
			
 
				 
			
 
				-				make_lowmem_page_readonly(pmd);
			
 
				+			make_lowmem_page_readonly(pmd);
			
 
				 
			
 
				-				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
			
 
				-			} else
			
 
				-				pgd_clear(&base[i]);
			
 
				-		}
			
 
				+			set_pgd(&base[i], __pgd(1 + __pa(pmd)));
			
 
				+		} else
			
 
				+			pgd_clear(&base[i]);
			
 
				 	}
			
 
				 
			
 
				 	/* make sure zero_page is mapped RO so we can use it in pagetables */
			
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 
				 
			
 
				 	/* Actually pin the pagetable down, but we can't set PG_pinned
			
 
				 	   yet because the page structures don't exist yet. */
			
 
				-	{
			
 
				-		unsigned level;
			
 
				-
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				-		level = MMUEXT_PIN_L3_TABLE;
			
 
				-#else
			
 
				-		level = MMUEXT_PIN_L2_TABLE;
			
 
				-#endif
			
 
				-
			
 
				-		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
			
 
				-	}
			
 
				+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
			
 
				 }
			
 
				 
			
 
				 /* This is called once we have the cpu_possible_map */
			
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
				 	.make_pte = xen_make_pte,
			
 
				 	.make_pgd = xen_make_pgd,
			
 
				 
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				 	.set_pte_atomic = xen_set_pte_atomic,
			
 
				 	.set_pte_present = xen_set_pte_at,
			
 
				 	.set_pud = xen_set_pud,
			
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
				 
			
 
				 	.make_pmd = xen_make_pmd,
			
 
				 	.pmd_val = xen_pmd_val,
			
 
				-#endif	/* PAE */
			
 
				 
			
 
				 	.activate_mm = xen_activate_mm,
			
 
				 	.dup_mmap = xen_dup_mmap,
			
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)
 
				 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
			
 
				 		pv_info.kernel_rpl = 0;
			
 
				 
			
 
				+	/* Prevent unwanted bits from being set in PTEs. */
			
 
				+	__supported_pte_mask &= ~_PAGE_GLOBAL;
			
 
				+	if (!is_initial_xendomain())
			
 
				+		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
			
 
				+
			
 
				 	/* set the limit of our address space */
			
 
				 	xen_reserve_top();
			
 
				 
			
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -179,50 +179,56 @@ out:
 
				 		preempt_enable();
			
 
				 }
			
 
				 
			
 
				-pteval_t xen_pte_val(pte_t pte)
			
 
				+/* Assume pteval_t is equivalent to all the other *val_t types. */
			
 
				+static pteval_t pte_mfn_to_pfn(pteval_t val)
			
 
				+{
			
 
				+	if (val & _PAGE_PRESENT) {
			
 
				+		unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
			
 
				+		pteval_t flags = val & ~PTE_MASK;
			
 
				+		val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
			
 
				+	}
			
 
				+
			
 
				+	return val;
			
 
				+}
			
 
				+
			
 
				+static pteval_t pte_pfn_to_mfn(pteval_t val)
			
 
				 {
			
 
				-	pteval_t ret = pte.pte;
			
 
				+	if (val & _PAGE_PRESENT) {
			
 
				+		unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
			
 
				+		pteval_t flags = val & ~PTE_MASK;
			
 
				+		val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
			
 
				+	}
			
 
				 
			
 
				-	if (ret & _PAGE_PRESENT)
			
 
				-		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
			
 
				+	return val;
			
 
				+}
			
 
				 
			
 
				-	return ret;
			
 
				+pteval_t xen_pte_val(pte_t pte)
			
 
				+{
			
 
				+	return pte_mfn_to_pfn(pte.pte);
			
 
				 }
			
 
				 
			
 
				 pgdval_t xen_pgd_val(pgd_t pgd)
			
 
				 {
			
 
				-	pgdval_t ret = pgd.pgd;
			
 
				-	if (ret & _PAGE_PRESENT)
			
 
				-		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
			
 
				-	return ret;
			
 
				+	return pte_mfn_to_pfn(pgd.pgd);
			
 
				 }
			
 
				 
			
 
				 pte_t xen_make_pte(pteval_t pte)
			
 
				 {
			
 
				-	if (pte & _PAGE_PRESENT) {
			
 
				-		pte = phys_to_machine(XPADDR(pte)).maddr;
			
 
				-		pte &= ~(_PAGE_PCD | _PAGE_PWT);
			
 
				-	}
			
 
				-
			
 
				-	return (pte_t){ .pte = pte };
			
 
				+	pte = pte_pfn_to_mfn(pte);
			
 
				+	return native_make_pte(pte);
			
 
				 }
			
 
				 
			
 
				 pgd_t xen_make_pgd(pgdval_t pgd)
			
 
				 {
			
 
				-	if (pgd & _PAGE_PRESENT)
			
 
				-		pgd = phys_to_machine(XPADDR(pgd)).maddr;
			
 
				-
			
 
				-	return (pgd_t){ pgd };
			
 
				+	pgd = pte_pfn_to_mfn(pgd);
			
 
				+	return native_make_pgd(pgd);
			
 
				 }
			
 
				 
			
 
				 pmdval_t xen_pmd_val(pmd_t pmd)
			
 
				 {
			
 
				-	pmdval_t ret = native_pmd_val(pmd);
			
 
				-	if (ret & _PAGE_PRESENT)
			
 
				-		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
			
 
				-	return ret;
			
 
				+	return pte_mfn_to_pfn(pmd.pmd);
			
 
				 }
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				+
			
 
				 void xen_set_pud(pud_t *ptr, pud_t val)
			
 
				 {
			
 
				 	struct multicall_space mcs;
			
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
 
				 
			
 
				 pmd_t xen_make_pmd(pmdval_t pmd)
			
 
				 {
			
 
				-	if (pmd & _PAGE_PRESENT)
			
 
				-		pmd = phys_to_machine(XPADDR(pmd)).maddr;
			
 
				-
			
 
				+	pmd = pte_pfn_to_mfn(pmd);
			
 
				 	return native_make_pmd(pmd);
			
 
				 }
			
 
				-#else  /* !PAE */
			
 
				-void xen_set_pte(pte_t *ptep, pte_t pte)
			
 
				-{
			
 
				-	*ptep = pte;
			
 
				-}
			
 
				-#endif	/* CONFIG_X86_PAE */
			
 
				 
			
 
				 /*
			
 
				   (Yet another) pagetable walker.  This one is intended for pinning a
			
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
 
				    read-only, and can be pinned. */
			
 
				 void xen_pgd_pin(pgd_t *pgd)
			
 
				 {
			
 
				-	unsigned level;
			
 
				-
			
 
				 	xen_mc_batch();
			
 
				 
			
 
				 	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
			
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
 
				 		xen_mc_batch();
			
 
				 	}
			
 
				 
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				-	level = MMUEXT_PIN_L3_TABLE;
			
 
				-#else
			
 
				-	level = MMUEXT_PIN_L2_TABLE;
			
 
				-#endif
			
 
				-
			
 
				-	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
			
 
				-
			
 
				+	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
			
 
				 	xen_mc_issue(0);
			
 
				 }
			
 
				 
			
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
 
				 void xen_pgd_pin(pgd_t *pgd);
			
 
				 //void xen_pgd_unpin(pgd_t *pgd);
			
 
				 
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				-unsigned long long xen_pte_val(pte_t);
			
 
				-unsigned long long xen_pmd_val(pmd_t);
			
 
				-unsigned long long xen_pgd_val(pgd_t);
			
 
				+pteval_t xen_pte_val(pte_t);
			
 
				+pmdval_t xen_pmd_val(pmd_t);
			
 
				+pgdval_t xen_pgd_val(pgd_t);
			
 
				 
			
 
				-pte_t xen_make_pte(unsigned long long);
			
 
				-pmd_t xen_make_pmd(unsigned long long);
			
 
				-pgd_t xen_make_pgd(unsigned long long);
			
 
				+pte_t xen_make_pte(pteval_t);
			
 
				+pmd_t xen_make_pmd(pmdval_t);
			
 
				+pgd_t xen_make_pgd(pgdval_t);
			
 
				 
			
 
				 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			
 
				 		    pte_t *ptep, pte_t pteval);
			
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
 
				 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
			
 
				 void xen_pmd_clear(pmd_t *pmdp);
			
 
				 
			
 
				-
			
 
				-#else
			
 
				-unsigned long xen_pte_val(pte_t);
			
 
				-unsigned long xen_pmd_val(pmd_t);
			
 
				-unsigned long xen_pgd_val(pgd_t);
			
 
				-
			
 
				-pte_t xen_make_pte(unsigned long);
			
 
				-pmd_t xen_make_pmd(unsigned long);
			
 
				-pgd_t xen_make_pgd(unsigned long);
			
 
				-#endif
			
 
				-
			
 
				 #endif	/* _XEN_MMU_H */
			
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,7 @@
 
				 #include <linux/kernel_stat.h>
			
 
				 #include <linux/math64.h>
			
 
				 
			
 
				+#include <asm/pvclock.h>
			
 
				 #include <asm/xen/hypervisor.h>
			
 
				 #include <asm/xen/hypercall.h>
			
 
				 
			
@@ -31,17 +32,6 @@
 
				 
			
 
				 static cycle_t xen_clocksource_read(void);
			
 
				 
			
 
				-/* These are perodically updated in shared_info, and then copied here. */
			
 
				-struct shadow_time_info {
			
 
				-	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
			
 
				-	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
			
 
				-	u32 tsc_to_nsec_mul;
			
 
				-	int tsc_shift;
			
 
				-	u32 version;
			
 
				-};
			
 
				-
			
 
				-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
			
 
				-
			
 
				 /* runstate info updated by Xen */
			
 
				 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
			
 
				 
			
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
 
				 unsigned long xen_cpu_khz(void)
			
 
				 {
			
 
				 	u64 xen_khz = 1000000ULL << 32;
			
 
				-	const struct vcpu_time_info *info =
			
 
				+	const struct pvclock_vcpu_time_info *info =
			
 
				 		&HYPERVISOR_shared_info->vcpu_info[0].time;
			
 
				 
			
 
				 	do_div(xen_khz, info->tsc_to_system_mul);
			
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
 
				 	return xen_khz;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Reads a consistent set of time-base values from Xen, into a shadow data
			
 
				- * area.
			
 
				- */
			
 
				-static unsigned get_time_values_from_xen(void)
			
 
				-{
			
 
				-	struct vcpu_time_info   *src;
			
 
				-	struct shadow_time_info *dst;
			
 
				-
			
 
				-	/* src is shared memory with the hypervisor, so we need to
			
 
				-	   make sure we get a consistent snapshot, even in the face of
			
 
				-	   being preempted. */
			
 
				-	src = &__get_cpu_var(xen_vcpu)->time;
			
 
				-	dst = &__get_cpu_var(shadow_time);
			
 
				-
			
 
				-	do {
			
 
				-		dst->version = src->version;
			
 
				-		rmb();		/* fetch version before data */
			
 
				-		dst->tsc_timestamp     = src->tsc_timestamp;
			
 
				-		dst->system_timestamp  = src->system_time;
			
 
				-		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
			
 
				-		dst->tsc_shift         = src->tsc_shift;
			
 
				-		rmb();		/* test version after fetching data */
			
 
				-	} while ((src->version & 1) | (dst->version ^ src->version));
			
 
				-
			
 
				-	return dst->version;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
			
 
				- * yielding a 64-bit result.
			
 
				- */
			
 
				-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
			
 
				-{
			
 
				-	u64 product;
			
 
				-#ifdef __i386__
			
 
				-	u32 tmp1, tmp2;
			
 
				-#endif
			
 
				-
			
 
				-	if (shift < 0)
			
 
				-		delta >>= -shift;
			
 
				-	else
			
 
				-		delta <<= shift;
			
 
				-
			
 
				-#ifdef __i386__
			
 
				-	__asm__ (
			
 
				-		"mul  %5       ; "
			
 
				-		"mov  %4,%%eax ; "
			
 
				-		"mov  %%edx,%4 ; "
			
 
				-		"mul  %5       ; "
			
 
				-		"xor  %5,%5    ; "
			
 
				-		"add  %4,%%eax ; "
			
 
				-		"adc  %5,%%edx ; "
			
 
				-		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
			
 
				-		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
			
 
				-#elif __x86_64__
			
 
				-	__asm__ (
			
 
				-		"mul %%rdx ; shrd $32,%%rdx,%%rax"
			
 
				-		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
			
 
				-#else
			
 
				-#error implement me!
			
 
				-#endif
			
 
				-
			
 
				-	return product;
			
 
				-}
			
 
				-
			
 
				-static u64 get_nsec_offset(struct shadow_time_info *shadow)
			
 
				-{
			
 
				-	u64 now, delta;
			
 
				-	now = native_read_tsc();
			
 
				-	delta = now - shadow->tsc_timestamp;
			
 
				-	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
			
 
				-}
			
 
				-
			
 
				 static cycle_t xen_clocksource_read(void)
			
 
				 {
			
 
				-	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
			
 
				+        struct pvclock_vcpu_time_info *src;
			
 
				 	cycle_t ret;
			
 
				-	unsigned version;
			
 
				-
			
 
				-	do {
			
 
				-		version = get_time_values_from_xen();
			
 
				-		barrier();
			
 
				-		ret = shadow->system_timestamp + get_nsec_offset(shadow);
			
 
				-		barrier();
			
 
				-	} while (version != __get_cpu_var(xen_vcpu)->time.version);
			
 
				-
			
 
				-	put_cpu_var(shadow_time);
			
 
				 
			
 
				+	src = &get_cpu_var(xen_vcpu)->time;
			
 
				+	ret = pvclock_clocksource_read(src);
			
 
				+	put_cpu_var(xen_vcpu);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				 static void xen_read_wallclock(struct timespec *ts)
			
 
				 {
			
 
				-	const struct shared_info *s = HYPERVISOR_shared_info;
			
 
				-	u32 version;
			
 
				-	u64 delta;
			
 
				-	struct timespec now;
			
 
				-
			
 
				-	/* get wallclock at system boot */
			
 
				-	do {
			
 
				-		version = s->wc_version;
			
 
				-		rmb();		/* fetch version before time */
			
 
				-		now.tv_sec  = s->wc_sec;
			
 
				-		now.tv_nsec = s->wc_nsec;
			
 
				-		rmb();		/* fetch time before checking version */
			
 
				-	} while ((s->wc_version & 1) | (version ^ s->wc_version));
			
 
				+	struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	struct pvclock_wall_clock *wall_clock = &(s->wc);
			
 
				+        struct pvclock_vcpu_time_info *vcpu_time;
			
 
				 
			
 
				-	delta = xen_clocksource_read();	/* time since system boot */
			
 
				-	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
			
 
				-
			
 
				-	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
			
 
				-	now.tv_sec = delta;
			
 
				-
			
 
				-	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
			
 
				+	vcpu_time = &get_cpu_var(xen_vcpu)->time;
			
 
				+	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
			
 
				+	put_cpu_var(xen_vcpu);
			
 
				 }
			
 
				 
			
 
				 unsigned long xen_get_wallclock(void)
			
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
 
				 	struct timespec ts;
			
 
				 
			
 
				 	xen_read_wallclock(&ts);
			
 
				-
			
 
				 	return ts.tv_sec;
			
 
				 }
			
 
				 
			
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
 
				 {
			
 
				 	int cpu = smp_processor_id();
			
 
				 
			
 
				-	get_time_values_from_xen();
			
 
				-
			
 
				 	clocksource_register(&xen_clocksource);
			
 
				 
			
 
				 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
			
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
 
				 
			
 
				 	__FINIT
			
 
				 
			
 
				-.pushsection .bss.page_aligned
			
 
				+.pushsection .text
			
 
				 	.align PAGE_SIZE_asm
			
 
				 ENTRY(hypercall_page)
			
 
				 	.skip 0x1000
			
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
 
				 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
			
 
				 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
			
 
				 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
			
 
				-#else
			
 
				-	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
			
 
				-#endif
			
 
				 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
			
 
				 
			
 
				 #endif /*CONFIG_XEN */
			
--- a/drivers/char/drm/i915_drv.c
+++ b/drivers/char/drm/i915_drv.c
@@ -389,6 +389,7 @@ static int i915_resume(struct drm_device *dev)
 
				 	pci_restore_state(dev->pdev);
			
 
				 	if (pci_enable_device(dev->pdev))
			
 
				 		return -1;
			
 
				+	pci_set_master(dev->pdev);
			
 
				 
			
 
				 	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
			
 
				 
			
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush);
 
				 int n_tty_ioctl(struct tty_struct *tty, struct file *file,
			
 
				 		       unsigned int cmd, unsigned long arg)
			
 
				 {
			
 
				-	struct tty_struct *real_tty;
			
 
				 	unsigned long flags;
			
 
				 	int retval;
			
 
				 
			
 
				-	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
			
 
				-	    tty->driver->subtype == PTY_TYPE_MASTER)
			
 
				-		real_tty = tty->link;
			
 
				-	else
			
 
				-		real_tty = tty;
			
 
				-
			
 
				 	switch (cmd) {
			
 
				 	case TCXONC:
			
 
				 		retval = tty_check_change(tty);
			
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m
 
				 {
			
 
				 	struct page *page;
			
 
				 
			
 
				-	page = alloc_pages(gfp_mask, order);
			
 
				+	/*
			
 
				+	 * Use __GFP_ZERO because buggy firmware assumes ICM pages are
			
 
				+	 * cleared, and subtle failures are seen if they aren't.
			
 
				+	 */
			
 
				+	page = alloc_pages(gfp_mask | __GFP_ZERO, order);
			
 
				 	if (!page)
			
 
				 		return -ENOMEM;
			
 
				 
			
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 
				 	 * we set it now, so we can trap and pass that trap to the Guest if it
			
 
				 	 * uses the FPU. */
			
 
				 	if (cpu->ts)
			
 
				-		lguest_set_ts();
			
 
				+		unlazy_fpu(current);
			
 
				 
			
 
				 	/* SYSENTER is an optimized way of doing system calls.  We can't allow
			
 
				 	 * it because it always jumps to privilege level 0.  A normal Guest
			
@@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 
				 	 * trap made the switcher code come back, and an error code which some
			
 
				 	 * traps set.  */
			
 
				 
			
 
				+	 /* Restore SYSENTER if it's supposed to be on. */
			
 
				+	 if (boot_cpu_has(X86_FEATURE_SEP))
			
 
				+		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
			
 
				+
			
 
				 	/* If the Guest page faulted, then the cr2 register will tell us the
			
 
				 	 * bad virtual address.  We have to grab this now, because once we
			
 
				 	 * re-enable interrupts an interrupt could fault and thus overwrite
			
@@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 
				 	if (cpu->regs->trapnum == 14)
			
 
				 		cpu->arch.last_pagefault = read_cr2();
			
 
				 	/* Similarly, if we took a trap because the Guest used the FPU,
			
 
				-	 * we have to restore the FPU it expects to see. */
			
 
				+	 * we have to restore the FPU it expects to see.
			
 
				+	 * math_state_restore() may sleep and we may even move off to
			
 
				+	 * a different CPU. So all the critical stuff should be done
			
 
				+	 * before this.  */
			
 
				 	else if (cpu->regs->trapnum == 7)
			
 
				 		math_state_restore();
			
 
				-
			
 
				-	/* Restore SYSENTER if it's supposed to be on. */
			
 
				-	if (boot_cpu_has(X86_FEATURE_SEP))
			
 
				-		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
			
 
				 }
			
 
				 
			
 
				 /*H:130 Now we've examined the hypercall code; our Guest can make requests.
			
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o
 
				 obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o
			
 
				 obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o
			
 
				 obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o
			
 
				-CFLAGS_hpwdt.o += -O
			
 
				 obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o
			
 
				 obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o
			
 
				 obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o
			
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 
				 
			
 
				 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
			
 
				 		/* Clear master flag /before/ clearing selector flag. */
			
 
				-		rmb();
			
 
				+		wmb();
			
 
				 #endif
			
 
				 		pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
			
 
				 		while (pending_words != 0) {
			
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 
				 
			
 
				 }
			
 
				 
			
 
				-static inline unsigned int zero_metapath_length(const struct metapath *mp,
			
 
				-						unsigned height)
			
 
				+static inline unsigned int metapath_branch_start(const struct metapath *mp)
			
 
				 {
			
 
				-	unsigned int i;
			
 
				-	for (i = 0; i < height - 1; i++) {
			
 
				-		if (mp->mp_list[i] != 0)
			
 
				-			return i;
			
 
				-	}
			
 
				-	return height;
			
 
				+	if (mp->mp_list[0] == 0)
			
 
				+		return 2;
			
 
				+	return 1;
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 
				 	struct gfs2_sbd *sdp = GFS2_SB(inode);
			
 
				 	struct buffer_head *dibh = mp->mp_bh[0];
			
 
				 	u64 bn, dblock = 0;
			
 
				-	unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
			
 
				+	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
			
 
				 	unsigned dblks = 0;
			
 
				 	unsigned ptrs_per_blk;
			
 
				 	const unsigned end_of_metadata = height - 1;
			
@@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 
				 			/* Building up tree height */
			
 
				 			state = ALLOC_GROW_HEIGHT;
			
 
				 			iblks = height - ip->i_height;
			
 
				-			zmpl = zero_metapath_length(mp, height);
			
 
				-			iblks -= zmpl;
			
 
				-			iblks += height;
			
 
				+			branch_start = metapath_branch_start(mp);
			
 
				+			iblks += (height - branch_start);
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 
				 					sizeof(struct gfs2_meta_header));
			
 
				 				*ptr = zero_bn;
			
 
				 				state = ALLOC_GROW_DEPTH;
			
 
				-				for(i = zmpl; i < height; i++) {
			
 
				+				for(i = branch_start; i < height; i++) {
			
 
				 					if (mp->mp_bh[i] == NULL)
			
 
				 						break;
			
 
				 					brelse(mp->mp_bh[i]);
			
 
				 					mp->mp_bh[i] = NULL;
			
 
				 				}
			
 
				-				i = zmpl;
			
 
				+				i = branch_start;
			
 
				 			}
			
 
				 			if (n == 0)
			
 
				 				break;
			
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -195,7 +195,7 @@ ulong_aligned:
 
				 	   depending on architecture.  I've experimented with several ways
			
 
				 	   of writing this section such as using an else before the goto
			
 
				 	   but this one seems to be the fastest. */
			
 
				-	while ((unsigned char *)plong < end - 1) {
			
 
				+	while ((unsigned char *)plong < end - sizeof(unsigned long)) {
			
 
				 		prefetch(plong + 1);
			
 
				 		if (((*plong) & LBITMASK) != lskipval)
			
 
				 			break;
			
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
 
				 				struct mnt_fhstatus *res)
			
 
				 {
			
 
				 	struct nfs_fh *fh = res->fh;
			
 
				+	unsigned size;
			
 
				 
			
 
				 	if ((res->status = ntohl(*p++)) == 0) {
			
 
				-		int size = ntohl(*p++);
			
 
				-		if (size <= NFS3_FHSIZE) {
			
 
				+		size = ntohl(*p++);
			
 
				+		if (size <= NFS3_FHSIZE && size != 0) {
			
 
				 			fh->size = size;
			
 
				 			memcpy(fh->data, p, size);
			
 
				 		} else
			
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options,
 
				 {
			
 
				 	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
			
 
				 
			
 
				-	memset(args, 0, sizeof(*args));
			
 
				-
			
 
				 	if (data == NULL)
			
 
				 		goto out_no_data;
			
 
				 
			
@@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options,
 
				 	case 5:
			
 
				 		memset(data->context, 0, sizeof(data->context));
			
 
				 	case 6:
			
 
				-		if (data->flags & NFS_MOUNT_VER3)
			
 
				+		if (data->flags & NFS_MOUNT_VER3) {
			
 
				+			if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
			
 
				+				goto out_invalid_fh;
			
 
				 			mntfh->size = data->root.size;
			
 
				-		else
			
 
				+		} else
			
 
				 			mntfh->size = NFS2_FHSIZE;
			
 
				 
			
 
				-		if (mntfh->size > sizeof(mntfh->data))
			
 
				-			goto out_invalid_fh;
			
 
				 
			
 
				 		memcpy(mntfh->data, data->root.data, mntfh->size);
			
 
				 		if (mntfh->size < sizeof(mntfh->data))
			
@@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
				 {
			
 
				 	struct nfs_server *server = NULL;
			
 
				 	struct super_block *s;
			
 
				-	struct nfs_fh mntfh;
			
 
				-	struct nfs_parsed_mount_data data;
			
 
				+	struct nfs_parsed_mount_data *data;
			
 
				+	struct nfs_fh *mntfh;
			
 
				 	struct dentry *mntroot;
			
 
				 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
			
 
				 	struct nfs_sb_mountdata sb_mntdata = {
			
 
				 		.mntflags = flags,
			
 
				 	};
			
 
				-	int error;
			
 
				+	int error = -ENOMEM;
			
 
				+
			
 
				+	data = kzalloc(sizeof(*data), GFP_KERNEL);
			
 
				+	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
			
 
				+	if (data == NULL || mntfh == NULL)
			
 
				+		goto out_free_fh;
			
 
				 
			
 
				-	security_init_mnt_opts(&data.lsm_opts);
			
 
				+	security_init_mnt_opts(&data->lsm_opts);
			
 
				 
			
 
				 	/* Validate the mount data */
			
 
				-	error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
			
 
				+	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
			
 
				 	if (error < 0)
			
 
				 		goto out;
			
 
				 
			
 
				 	/* Get a volume representation */
			
 
				-	server = nfs_create_server(&data, &mntfh);
			
 
				+	server = nfs_create_server(data, mntfh);
			
 
				 	if (IS_ERR(server)) {
			
 
				 		error = PTR_ERR(server);
			
 
				 		goto out;
			
@@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
				 
			
 
				 	if (!s->s_root) {
			
 
				 		/* initial superblock/root creation */
			
 
				-		nfs_fill_super(s, &data);
			
 
				+		nfs_fill_super(s, data);
			
 
				 	}
			
 
				 
			
 
				-	mntroot = nfs_get_root(s, &mntfh);
			
 
				+	mntroot = nfs_get_root(s, mntfh);
			
 
				 	if (IS_ERR(mntroot)) {
			
 
				 		error = PTR_ERR(mntroot);
			
 
				 		goto error_splat_super;
			
 
				 	}
			
 
				 
			
 
				-	error = security_sb_set_mnt_opts(s, &data.lsm_opts);
			
 
				+	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
			
 
				 	if (error)
			
 
				 		goto error_splat_root;
			
 
				 
			
@@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
				 	error = 0;
			
 
				 
			
 
				 out:
			
 
				-	kfree(data.nfs_server.hostname);
			
 
				-	kfree(data.mount_server.hostname);
			
 
				-	security_free_mnt_opts(&data.lsm_opts);
			
 
				+	kfree(data->nfs_server.hostname);
			
 
				+	kfree(data->mount_server.hostname);
			
 
				+	security_free_mnt_opts(&data->lsm_opts);
			
 
				+out_free_fh:
			
 
				+	kfree(mntfh);
			
 
				+	kfree(data);
			
 
				 	return error;
			
 
				 
			
 
				 out_err_nosb:
			
@@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options,
 
				 	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
			
 
				 	char *c;
			
 
				 
			
 
				-	memset(args, 0, sizeof(*args));
			
 
				-
			
 
				 	if (data == NULL)
			
 
				 		goto out_no_data;
			
 
				 
			
@@ -1959,26 +1963,31 @@ out_no_client_address:
 
				 static int nfs4_get_sb(struct file_system_type *fs_type,
			
 
				 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
			
 
				 {
			
 
				-	struct nfs_parsed_mount_data data;
			
 
				+	struct nfs_parsed_mount_data *data;
			
 
				 	struct super_block *s;
			
 
				 	struct nfs_server *server;
			
 
				-	struct nfs_fh mntfh;
			
 
				+	struct nfs_fh *mntfh;
			
 
				 	struct dentry *mntroot;
			
 
				 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
			
 
				 	struct nfs_sb_mountdata sb_mntdata = {
			
 
				 		.mntflags = flags,
			
 
				 	};
			
 
				-	int error;
			
 
				+	int error = -ENOMEM;
			
 
				 
			
 
				-	security_init_mnt_opts(&data.lsm_opts);
			
 
				+	data = kzalloc(sizeof(*data), GFP_KERNEL);
			
 
				+	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
			
 
				+	if (data == NULL || mntfh == NULL)
			
 
				+		goto out_free_fh;
			
 
				+
			
 
				+	security_init_mnt_opts(&data->lsm_opts);
			
 
				 
			
 
				 	/* Validate the mount data */
			
 
				-	error = nfs4_validate_mount_data(raw_data, &data, dev_name);
			
 
				+	error = nfs4_validate_mount_data(raw_data, data, dev_name);
			
 
				 	if (error < 0)
			
 
				 		goto out;
			
 
				 
			
 
				 	/* Get a volume representation */
			
 
				-	server = nfs4_create_server(&data, &mntfh);
			
 
				+	server = nfs4_create_server(data, mntfh);
			
 
				 	if (IS_ERR(server)) {
			
 
				 		error = PTR_ERR(server);
			
 
				 		goto out;
			
@@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 
				 		nfs4_fill_super(s);
			
 
				 	}
			
 
				 
			
 
				-	mntroot = nfs4_get_root(s, &mntfh);
			
 
				+	mntroot = nfs4_get_root(s, mntfh);
			
 
				 	if (IS_ERR(mntroot)) {
			
 
				 		error = PTR_ERR(mntroot);
			
 
				 		goto error_splat_super;
			
 
				 	}
			
 
				 
			
 
				-	error = security_sb_set_mnt_opts(s, &data.lsm_opts);
			
 
				+	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
			
 
				 	if (error)
			
 
				 		goto error_splat_root;
			
 
				 
			
@@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 
				 	error = 0;
			
 
				 
			
 
				 out:
			
 
				-	kfree(data.client_address);
			
 
				-	kfree(data.nfs_server.export_path);
			
 
				-	kfree(data.nfs_server.hostname);
			
 
				-	security_free_mnt_opts(&data.lsm_opts);
			
 
				+	kfree(data->client_address);
			
 
				+	kfree(data->nfs_server.export_path);
			
 
				+	kfree(data->nfs_server.hostname);
			
 
				+	security_free_mnt_opts(&data->lsm_opts);
			
 
				+out_free_fh:
			
 
				+	kfree(mntfh);
			
 
				+	kfree(data);
			
 
				 	return error;
			
 
				 
			
 
				 out_free:
			
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page,
 
				 	}
			
 
				 
			
 
				 	status = nfs_writepage_setup(ctx, page, offset, count);
			
 
				-	__set_page_dirty_nobuffers(page);
			
 
				+	if (status < 0)
			
 
				+		nfs_set_pageerror(page);
			
 
				+	else
			
 
				+		__set_page_dirty_nobuffers(page);
			
 
				 
			
 
				         dprintk("NFS:      nfs_updatepage returns %d (isize %Ld)\n",
			
 
				 			status, (long long)i_size_read(inode));
			
 
				-	if (status < 0)
			
 
				-		nfs_set_pageerror(page);
			
 
				 	return status;
			
 
				 }
			
 
				 
			
--- a/fs/select.c
+++ b/fs/select.c
@@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 
				 						retval++;
			
 
				 					}
			
 
				 				}
			
 
				-				cond_resched();
			
 
				 			}
			
 
				 			if (res_in)
			
 
				 				*rinp = res_in;
			
@@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 
				 				*routp = res_out;
			
 
				 			if (res_ex)
			
 
				 				*rexp = res_ex;
			
 
				+			cond_resched();
			
 
				 		}
			
 
				 		wait = NULL;
			
 
				 		if (retval || !*timeout || signal_pending(current))
			
--- a/include/asm-alpha/percpu.h
+++ b/include/asm-alpha/percpu.h
@@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 
				 #define __get_cpu_var(var)		per_cpu_var(var)
			
 
				 #define __raw_get_cpu_var(var)		per_cpu_var(var)
			
 
				 
			
 
				+#define PER_CPU_ATTRIBUTES
			
 
				+
			
 
				 #endif /* SMP */
			
 
				 
			
 
				 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name)
			
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -18,6 +18,7 @@
 
				 #include <linux/kvm_para.h>
			
 
				 #include <linux/kvm_types.h>
			
 
				 
			
 
				+#include <asm/pvclock-abi.h>
			
 
				 #include <asm/desc.h>
			
 
				 
			
 
				 #define KVM_MAX_VCPUS 16
			
@@ -282,7 +283,8 @@ struct kvm_vcpu_arch {
 
				 	struct x86_emulate_ctxt emulate_ctxt;
			
 
				 
			
 
				 	gpa_t time;
			
 
				-	struct kvm_vcpu_time_info hv_clock;
			
 
				+	struct pvclock_vcpu_time_info hv_clock;
			
 
				+	unsigned int hv_clock_tsc_khz;
			
 
				 	unsigned int time_offset;
			
 
				 	struct page *time_page;
			
 
				 };
			
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt {
 
				 #ifdef __KERNEL__
			
 
				 #include <asm/processor.h>
			
 
				 
			
 
				-/* xen binary-compatible interface. See xen headers for details */
			
 
				-struct kvm_vcpu_time_info {
			
 
				-	uint32_t version;
			
 
				-	uint32_t pad0;
			
 
				-	uint64_t tsc_timestamp;
			
 
				-	uint64_t system_time;
			
 
				-	uint32_t tsc_to_system_mul;
			
 
				-	int8_t   tsc_shift;
			
 
				-	int8_t	 pad[3];
			
 
				-} __attribute__((__packed__)); /* 32 bytes */
			
 
				-
			
 
				-struct kvm_wall_clock {
			
 
				-	uint32_t wc_version;
			
 
				-	uint32_t wc_sec;
			
 
				-	uint32_t wc_nsec;
			
 
				-} __attribute__((__packed__));
			
 
				-
			
 
				-
			
 
				 extern void kvmclock_init(void);
			
 
				 
			
 
				 
			
--- a/include/asm-x86/pvclock-abi.h
+++ b/include/asm-x86/pvclock-abi.h
@@ -0,0 +1,42 @@
 
				+#ifndef _ASM_X86_PVCLOCK_ABI_H_
			
 
				+#define _ASM_X86_PVCLOCK_ABI_H_
			
 
				+#ifndef __ASSEMBLY__
			
 
				+
			
 
				+/*
			
 
				+ * These structs MUST NOT be changed.
			
 
				+ * They are the ABI between hypervisor and guest OS.
			
 
				+ * Both Xen and KVM are using this.
			
 
				+ *
			
 
				+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
			
 
				+ * of the last update. So the guest can use the tsc delta to get a
			
 
				+ * more precise system time.  There is one per virtual cpu.
			
 
				+ *
			
 
				+ * pvclock_wall_clock references the point in time when the system
			
 
				+ * time was zero (usually boot time), thus the guest calculates the
			
 
				+ * current wall clock by adding the system time.
			
 
				+ *
			
 
				+ * Protocol for the "version" fields is: hypervisor raises it (making
			
 
				+ * it odd) before it starts updating the fields and raises it again
			
 
				+ * (making it even) when it is done.  Thus the guest can make sure the
			
 
				+ * time values it got are consistent by checking the version before
			
 
				+ * and after reading them.
			
 
				+ */
			
 
				+
			
 
				+struct pvclock_vcpu_time_info {
			
 
				+	u32   version;
			
 
				+	u32   pad0;
			
 
				+	u64   tsc_timestamp;
			
 
				+	u64   system_time;
			
 
				+	u32   tsc_to_system_mul;
			
 
				+	s8    tsc_shift;
			
 
				+	u8    pad[3];
			
 
				+} __attribute__((__packed__)); /* 32 bytes */
			
 
				+
			
 
				+struct pvclock_wall_clock {
			
 
				+	u32   version;
			
 
				+	u32   sec;
			
 
				+	u32   nsec;
			
 
				+} __attribute__((__packed__));
			
 
				+
			
 
				+#endif /* __ASSEMBLY__ */
			
 
				+#endif /* _ASM_X86_PVCLOCK_ABI_H_ */
			
--- a/include/asm-x86/pvclock.h
+++ b/include/asm-x86/pvclock.h
@@ -0,0 +1,13 @@
 
				+#ifndef _ASM_X86_PVCLOCK_H_
			
 
				+#define _ASM_X86_PVCLOCK_H_
			
 
				+
			
 
				+#include <linux/clocksource.h>
			
 
				+#include <asm/pvclock-abi.h>
			
 
				+
			
 
				+/* some helper functions for xen and kvm pv clock sources */
			
 
				+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
			
 
				+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
			
 
				+			    struct pvclock_vcpu_time_info *vcpu,
			
 
				+			    struct timespec *ts);
			
 
				+
			
 
				+#endif /* _ASM_X86_PVCLOCK_H_ */
			
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x)
 
				 	return (pte_t) { .pte = x };
			
 
				 }
			
 
				 
			
 
				-#ifdef CONFIG_X86_PAE
			
 
				 #define pmd_val_ma(v) ((v).pmd)
			
 
				 #define pud_val_ma(v) ((v).pgd.pgd)
			
 
				 #define __pmd_ma(x)	((pmd_t) { (x) } )
			
 
				-#else  /* !X86_PAE */
			
 
				-#define pmd_val_ma(v)	((v).pud.pgd.pgd)
			
 
				-#endif	/* CONFIG_X86_PAE */
			
 
				 
			
 
				 #define pgd_val_ma(x)	((x).pgd)
			
 
				 
			
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -33,6 +33,7 @@
 
				 #define KVM_REQ_REPORT_TPR_ACCESS  2
			
 
				 #define KVM_REQ_MMU_RELOAD         3
			
 
				 #define KVM_REQ_TRIPLE_FAULT       4
			
 
				+#define KVM_REQ_PENDING_TIMER      5
			
 
				 
			
 
				 struct kvm_vcpu;
			
 
				 extern struct kmem_cache *kvm_vcpu_cache;
			
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -27,8 +27,7 @@
 
				  * 	This routine is called by the kernel to write a series of
			
 
				  * 	characters to the tty device.  The characters may come from
			
 
				  * 	user space or kernel space.  This routine will return the
			
 
				- *	number of characters actually accepted for writing.  This
			
 
				- *	routine is mandatory.
			
 
				+ *	number of characters actually accepted for writing.
			
 
				  *
			
 
				  *	Optional: Required for writable devices.
			
 
				  *
			
@@ -134,7 +133,7 @@
 
				  * 	This routine notifies the tty driver that it should hangup the
			
 
				  * 	tty device.
			
 
				  *
			
 
				- *	Required:
			
 
				+ *	Optional:
			
 
				  *
			
 
				  * void (*break_ctl)(struct tty_stuct *tty, int state);
			
 
				  *
			
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -10,6 +10,7 @@
 
				 #define __XEN_PUBLIC_XEN_H__
			
 
				 
			
 
				 #include <asm/xen/interface.h>
			
 
				+#include <asm/pvclock-abi.h>
			
 
				 
			
 
				 /*
			
 
				  * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
			
@@ -336,7 +337,7 @@ struct vcpu_info {
 
				 	uint8_t evtchn_upcall_mask;
			
 
				 	unsigned long evtchn_pending_sel;
			
 
				 	struct arch_vcpu_info arch;
			
 
				-	struct vcpu_time_info time;
			
 
				+	struct pvclock_vcpu_time_info time;
			
 
				 }; /* 64 bytes (x86) */
			
 
				 
			
 
				 /*
			
@@ -384,9 +385,7 @@ struct shared_info {
 
				 	 * Wallclock time: updated only by control software. Guests should base
			
 
				 	 * their gettimeofday() syscall on this wallclock-base value.
			
 
				 	 */
			
 
				-	uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
			
 
				-	uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
			
 
				-	uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
			
 
				+	struct pvclock_wall_clock wc;
			
 
				 
			
 
				 	struct arch_shared_info arch;
			
 
				 
			
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q)
 
				  * private futexes.
			
 
				  */
			
 
				 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
			
 
				-				struct task_struct *newowner)
			
 
				+				struct task_struct *newowner,
			
 
				+				struct rw_semaphore *fshared)
			
 
				 {
			
 
				 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
			
 
				 	struct futex_pi_state *pi_state = q->pi_state;
			
 
				+	struct task_struct *oldowner = pi_state->owner;
			
 
				 	u32 uval, curval, newval;
			
 
				-	int ret;
			
 
				+	int ret, attempt = 0;
			
 
				 
			
 
				 	/* Owner died? */
			
 
				+	if (!pi_state->owner)
			
 
				+		newtid |= FUTEX_OWNER_DIED;
			
 
				+
			
 
				+	/*
			
 
				+	 * We are here either because we stole the rtmutex from the
			
 
				+	 * pending owner or we are the pending owner which failed to
			
 
				+	 * get the rtmutex. We have to replace the pending owner TID
			
 
				+	 * in the user space variable. This must be atomic as we have
			
 
				+	 * to preserve the owner died bit here.
			
 
				+	 *
			
 
				+	 * Note: We write the user space value _before_ changing the
			
 
				+	 * pi_state because we can fault here. Imagine swapped out
			
 
				+	 * pages or a fork, which was running right before we acquired
			
 
				+	 * mmap_sem, that marked all the anonymous memory readonly for
			
 
				+	 * cow.
			
 
				+	 *
			
 
				+	 * Modifying pi_state _before_ the user space value would
			
 
				+	 * leave the pi_state in an inconsistent state when we fault
			
 
				+	 * here, because we need to drop the hash bucket lock to
			
 
				+	 * handle the fault. This might be observed in the PID check
			
 
				+	 * in lookup_pi_state.
			
 
				+	 */
			
 
				+retry:
			
 
				+	if (get_futex_value_locked(&uval, uaddr))
			
 
				+		goto handle_fault;
			
 
				+
			
 
				+	while (1) {
			
 
				+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
			
 
				+
			
 
				+		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
			
 
				+
			
 
				+		if (curval == -EFAULT)
			
 
				+			goto handle_fault;
			
 
				+		if (curval == uval)
			
 
				+			break;
			
 
				+		uval = curval;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * We fixed up user space. Now we need to fix the pi_state
			
 
				+	 * itself.
			
 
				+	 */
			
 
				 	if (pi_state->owner != NULL) {
			
 
				 		spin_lock_irq(&pi_state->owner->pi_lock);
			
 
				 		WARN_ON(list_empty(&pi_state->list));
			
 
				 		list_del_init(&pi_state->list);
			
 
				 		spin_unlock_irq(&pi_state->owner->pi_lock);
			
 
				-	} else
			
 
				-		newtid |= FUTEX_OWNER_DIED;
			
 
				+	}
			
 
				 
			
 
				 	pi_state->owner = newowner;
			
 
				 
			
@@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
				 	WARN_ON(!list_empty(&pi_state->list));
			
 
				 	list_add(&pi_state->list, &newowner->pi_state_list);
			
 
				 	spin_unlock_irq(&newowner->pi_lock);
			
 
				+	return 0;
			
 
				 
			
 
				 	/*
			
 
				-	 * We own it, so we have to replace the pending owner
			
 
				-	 * TID. This must be atomic as we have preserve the
			
 
				-	 * owner died bit here.
			
 
				+	 * To handle the page fault we need to drop the hash bucket
			
 
				+	 * lock here. That gives the other task (either the pending
			
 
				+	 * owner itself or the task which stole the rtmutex) the
			
 
				+	 * chance to try the fixup of the pi_state. So once we are
			
 
				+	 * back from handling the fault we need to check the pi_state
			
 
				+	 * after reacquiring the hash bucket lock and before trying to
			
 
				+	 * do another fixup. When the fixup has been done already we
			
 
				+	 * simply return.
			
 
				 	 */
			
 
				-	ret = get_futex_value_locked(&uval, uaddr);
			
 
				+handle_fault:
			
 
				+	spin_unlock(q->lock_ptr);
			
 
				 
			
 
				-	while (!ret) {
			
 
				-		newval = (uval & FUTEX_OWNER_DIED) | newtid;
			
 
				+	ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
			
 
				 
			
 
				-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
			
 
				+	spin_lock(q->lock_ptr);
			
 
				 
			
 
				-		if (curval == -EFAULT)
			
 
				-			ret = -EFAULT;
			
 
				-		if (curval == uval)
			
 
				-			break;
			
 
				-		uval = curval;
			
 
				-	}
			
 
				-	return ret;
			
 
				+	/*
			
 
				+	 * Check if someone else fixed it for us:
			
 
				+	 */
			
 
				+	if (pi_state->owner != oldowner)
			
 
				+		return 0;
			
 
				+
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	goto retry;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
				 		 * that case:
			
 
				 		 */
			
 
				 		if (q.pi_state->owner != curr)
			
 
				-			ret = fixup_pi_state_owner(uaddr, &q, curr);
			
 
				+			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
			
 
				 	} else {
			
 
				 		/*
			
 
				 		 * Catch the rare case, where the lock was released
			
@@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
				 				int res;
			
 
				 
			
 
				 				owner = rt_mutex_owner(&q.pi_state->pi_mutex);
			
 
				-				res = fixup_pi_state_owner(uaddr, &q, owner);
			
 
				+				res = fixup_pi_state_owner(uaddr, &q, owner,
			
 
				+							   fshared);
			
 
				 
			
 
				 				/* propagate -EFAULT, if the fixup failed */
			
 
				 				if (res)
			
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs)
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-void kgdb_console_write(struct console *co, const char *s, unsigned count)
			
 
				+static void kgdb_console_write(struct console *co, const char *s,
			
 
				+   unsigned count)
			
 
				 {
			
 
				 	unsigned long flags;
			
 
				 
			
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4398,22 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 
				 			     signal_pending(current)) ||
			
 
				 			    (state == TASK_KILLABLE &&
			
 
				 			     fatal_signal_pending(current))) {
			
 
				-				__remove_wait_queue(&x->wait, &wait);
			
 
				-				return -ERESTARTSYS;
			
 
				+				timeout = -ERESTARTSYS;
			
 
				+				break;
			
 
				 			}
			
 
				 			__set_current_state(state);
			
 
				 			spin_unlock_irq(&x->wait.lock);
			
 
				 			timeout = schedule_timeout(timeout);
			
 
				 			spin_lock_irq(&x->wait.lock);
			
 
				-			if (!timeout) {
			
 
				-				__remove_wait_queue(&x->wait, &wait);
			
 
				-				return timeout;
			
 
				-			}
			
 
				-		} while (!x->done);
			
 
				+		} while (!x->done && timeout);
			
 
				 		__remove_wait_queue(&x->wait, &wait);
			
 
				+		if (!x->done)
			
 
				+			return timeout;
			
 
				 	}
			
 
				 	x->done--;
			
 
				-	return timeout;
			
 
				+	return timeout ?: 1;
			
 
				 }
			
 
				 
			
 
				 static long __sched
			
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 
				 			if (rt_rq->rt_time || rt_rq->rt_nr_running)
			
 
				 				idle = 0;
			
 
				 			spin_unlock(&rt_rq->rt_runtime_lock);
			
 
				-		}
			
 
				+		} else if (rt_rq->rt_nr_running)
			
 
				+			idle = 0;
			
 
				 
			
 
				 		if (enqueue)
			
 
				 			sched_rt_rq_enqueue(rt_rq);
			
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1045,6 +1045,26 @@ no_page_table:
 
				 	return page;
			
 
				 }
			
 
				 
			
 
				+/* Can we do the FOLL_ANON optimization? */
			
 
				+static inline int use_zero_page(struct vm_area_struct *vma)
			
 
				+{
			
 
				+	/*
			
 
				+	 * We don't want to optimize FOLL_ANON for make_pages_present()
			
 
				+	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
			
 
				+	 * we want to get the page from the page tables to make sure
			
 
				+	 * that we serialize and update with any other user of that
			
 
				+	 * mapping.
			
 
				+	 */
			
 
				+	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
			
 
				+		return 0;
			
 
				+	/*
			
 
				+	 * And if we have a fault or a nopfn routine, it's not an
			
 
				+	 * anonymous region.
			
 
				+	 */
			
 
				+	return !vma->vm_ops ||
			
 
				+		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
			
 
				+}
			
 
				+
			
 
				 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
			
 
				 		unsigned long start, int len, int write, int force,
			
 
				 		struct page **pages, struct vm_area_struct **vmas)
			
@@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
				 		foll_flags = FOLL_TOUCH;
			
 
				 		if (pages)
			
 
				 			foll_flags |= FOLL_GET;
			
 
				-		if (!write && !(vma->vm_flags & VM_LOCKED) &&
			
 
				-		    (!vma->vm_ops || !vma->vm_ops->fault))
			
 
				+		if (!write && use_zero_page(vma))
			
 
				 			foll_flags |= FOLL_ANON;
			
 
				 
			
 
				 		do {
			
@@ -1766,7 +1785,6 @@ gotten:
 
				 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			
 
				 	if (likely(pte_same(*page_table, orig_pte))) {
			
 
				 		if (old_page) {
			
 
				-			page_remove_rmap(old_page, vma);
			
 
				 			if (!PageAnon(old_page)) {
			
 
				 				dec_mm_counter(mm, file_rss);
			
 
				 				inc_mm_counter(mm, anon_rss);
			
@@ -1788,6 +1806,32 @@ gotten:
 
				 		lru_cache_add_active(new_page);
			
 
				 		page_add_new_anon_rmap(new_page, vma, address);
			
 
				 
			
 
				+		if (old_page) {
			
 
				+			/*
			
 
				+			 * Only after switching the pte to the new page may
			
 
				+			 * we remove the mapcount here. Otherwise another
			
 
				+			 * process may come and find the rmap count decremented
			
 
				+			 * before the pte is switched to the new page, and
			
 
				+			 * "reuse" the old page writing into it while our pte
			
 
				+			 * here still points into it and can be read by other
			
 
				+			 * threads.
			
 
				+			 *
			
 
				+			 * The critical issue is to order this
			
 
				+			 * page_remove_rmap with the ptep_clear_flush above.
			
 
				+			 * Those stores are ordered by (if nothing else)
			
 
				+			 * the barrier present in the atomic_add_negative
			
 
				+			 * in page_remove_rmap.
			
 
				+			 *
			
 
				+			 * Then the TLB flush in ptep_clear_flush ensures that
			
 
				+			 * no process can access the old page before the
			
 
				+			 * decremented mapcount is visible. And the old page
			
 
				+			 * cannot be reused until after the decremented
			
 
				+			 * mapcount is visible. So transitively, TLB entries for
			
 
				+			 * the old page will be flushed before it can be reused.
			
 
				+			 */
			
 
				+			page_remove_rmap(old_page, vma);
			
 
				+		}
			
 
				+
			
 
				 		/* Free the old page.. */
			
 
				 		new_page = old_page;
			
 
				 		ret |= VM_FAULT_WRITE;
			
--- a/sound/isa/sb/sb_mixer.c
+++ b/sound/isa/sb/sb_mixer.c
@@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = {
 
				 static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
			
 
				 {
			
 
				 	unsigned char *val = chip->saved_regs;
			
 
				-	snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return);
			
 
				+	snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
			
 
				 	for (; num_regs; num_regs--)
			
 
				 		*val++ = snd_sbmixer_read(chip, *regs++);
			
 
				 }
			
@@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
 
				 static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
			
 
				 {
			
 
				 	unsigned char *val = chip->saved_regs;
			
 
				-	snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return);
			
 
				+	snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
			
 
				 	for (; num_regs; num_regs--)
			
 
				 		snd_sbmixer_write(chip, *regs++, *val++);
			
 
				 }
			
--- a/sound/pci/aw2/aw2-alsa.c
+++ b/sound/pci/aw2/aw2-alsa.c
@@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card,
 
				 		return -ENOMEM;
			
 
				 	}
			
 
				 
			
 
				+	/* (2) initialization of the chip hardware */
			
 
				+	snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
			
 
				 
			
 
				 	if (request_irq(pci->irq, snd_aw2_saa7146_interrupt,
			
 
				 			IRQF_SHARED, "Audiowerk2", chip)) {
			
@@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card,
 
				 	}
			
 
				 	chip->irq = pci->irq;
			
 
				 
			
 
				-	/* (2) initialization of the chip hardware */
			
 
				-	snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
			
 
				 	err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
			
 
				 	if (err < 0) {
			
 
				 		free_irq(chip->irq, (void *)chip);
			
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
			
 
				+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
			
 
				 {
			
 
				-	int i;
			
 
				-
			
 
				-	for (i = 0; i < IOAPIC_NUM_PINS; i++)
			
 
				-		if (ioapic->redirtbl[i].fields.vector == vector)
			
 
				-			return i;
			
 
				-	return -1;
			
 
				-}
			
 
				-
			
 
				-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
			
 
				-{
			
 
				-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
			
 
				 	union ioapic_redir_entry *ent;
			
 
				-	int gsi;
			
 
				-
			
 
				-	gsi = get_eoi_gsi(ioapic, vector);
			
 
				-	if (gsi == -1) {
			
 
				-		printk(KERN_WARNING "Can't find redir item for %d EOI\n",
			
 
				-		       vector);
			
 
				-		return;
			
 
				-	}
			
 
				 
			
 
				 	ent = &ioapic->redirtbl[gsi];
			
 
				 	ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
			
@@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
 
				 		ioapic_deliver(ioapic, gsi);
			
 
				 }
			
 
				 
			
 
				+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
			
 
				+{
			
 
				+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 0; i < IOAPIC_NUM_PINS; i++)
			
 
				+		if (ioapic->redirtbl[i].fields.vector == vector)
			
 
				+			__kvm_ioapic_update_eoi(ioapic, i);
			
 
				+}
			
 
				+
			
 
				 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
			
 
				 {
			
 
				 	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;