18 years ago · 5cc97bf2d8
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -222,6 +222,8 @@ config PARAVIRT
 
				 	  However, when run without a hypervisor the kernel is
			
 
				 	  theoretically slower.  If in doubt, say N.
			
 
				 
			
 
				+source "arch/i386/xen/Kconfig"
			
 
				+
			
 
				 config VMI
			
 
				 	bool "VMI Paravirt-ops support"
			
 
				 	depends on PARAVIRT
			
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000)	:= -Iinclude/asm-i386/mach-es7000
 
				 mcore-$(CONFIG_X86_ES7000)	:= mach-default
			
 
				 core-$(CONFIG_X86_ES7000)	:= arch/i386/mach-es7000/
			
 
				 
			
 
				+# Xen paravirtualization support
			
 
				+core-$(CONFIG_XEN)		+= arch/i386/xen/
			
 
				+
			
 
				 # default subarch .h files
			
 
				 mflags-y += -Iinclude/asm-i386/mach-default
			
 
				 
			
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
 
				 		"__kernel_rt_sigreturn",
			
 
				 		"__kernel_sigreturn",
			
 
				 		"SYSENTER_RETURN",
			
 
				+		"xen_irq_disable_direct_reloc",
			
 
				+		"xen_save_fl_direct_reloc",
			
 
				 };
			
 
				 
			
 
				 static int is_safe_abs_reloc(const char* sym_name)
			
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
 
				 #include <asm/thread_info.h>
			
 
				 #include <asm/elf.h>
			
 
				 
			
 
				+#include <xen/interface/xen.h>
			
 
				+
			
 
				 #define DEFINE(sym, val) \
			
 
				         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
			
 
				 
			
@@ -59,6 +61,7 @@ void foo(void)
 
				 	OFFSET(TI_addr_limit, thread_info, addr_limit);
			
 
				 	OFFSET(TI_restart_block, thread_info, restart_block);
			
 
				 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
			
 
				+	OFFSET(TI_cpu, thread_info, cpu);
			
 
				 	BLANK();
			
 
				 
			
 
				 	OFFSET(GDS_size, Xgt_desc_struct, size);
			
@@ -115,4 +118,10 @@ void foo(void)
 
				 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
			
 
				 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
			
 
				 #endif
			
 
				+
			
 
				+#ifdef CONFIG_XEN
			
 
				+	BLANK();
			
 
				+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
			
 
				+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
			
 
				+#endif
			
 
				 }
			
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
 
				 	CFI_ENDPROC
			
 
				 ENDPROC(kernel_thread_helper)
			
 
				 
			
 
				+#ifdef CONFIG_XEN
			
 
				+ENTRY(xen_hypervisor_callback)
			
 
				+	CFI_STARTPROC
			
 
				+	pushl $0
			
 
				+	CFI_ADJUST_CFA_OFFSET 4
			
 
				+	SAVE_ALL
			
 
				+	TRACE_IRQS_OFF
			
 
				+
			
 
				+	/* Check to see if we got the event in the critical
			
 
				+	   region in xen_iret_direct, after we've reenabled
			
 
				+	   events and checked for pending events.  This simulates
			
 
				+	   iret instruction's behaviour where it delivers a
			
 
				+	   pending interrupt when enabling interrupts. */
			
 
				+	movl PT_EIP(%esp),%eax
			
 
				+	cmpl $xen_iret_start_crit,%eax
			
 
				+	jb   1f
			
 
				+	cmpl $xen_iret_end_crit,%eax
			
 
				+	jae  1f
			
 
				+
			
 
				+	call xen_iret_crit_fixup
			
 
				+
			
 
				+1:	mov %esp, %eax
			
 
				+	call xen_evtchn_do_upcall
			
 
				+	jmp  ret_from_intr
			
 
				+	CFI_ENDPROC
			
 
				+ENDPROC(xen_hypervisor_callback)
			
 
				+
			
 
				+# Hypervisor uses this for application faults while it executes.
			
 
				+# We get here for two reasons:
			
 
				+#  1. Fault while reloading DS, ES, FS or GS
			
 
				+#  2. Fault while executing IRET
			
 
				+# Category 1 we fix up by reattempting the load, and zeroing the segment
			
 
				+# register if the load fails.
			
 
				+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
			
 
				+# normal Linux return path in this case because if we use the IRET hypercall
			
 
				+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
			
 
				+# We distinguish between categories by maintaining a status value in EAX.
			
 
				+ENTRY(xen_failsafe_callback)
			
 
				+	CFI_STARTPROC
			
 
				+	pushl %eax
			
 
				+	CFI_ADJUST_CFA_OFFSET 4
			
 
				+	movl $1,%eax
			
 
				+1:	mov 4(%esp),%ds
			
 
				+2:	mov 8(%esp),%es
			
 
				+3:	mov 12(%esp),%fs
			
 
				+4:	mov 16(%esp),%gs
			
 
				+	testl %eax,%eax
			
 
				+	popl %eax
			
 
				+	CFI_ADJUST_CFA_OFFSET -4
			
 
				+	lea 16(%esp),%esp
			
 
				+	CFI_ADJUST_CFA_OFFSET -16
			
 
				+	jz 5f
			
 
				+	addl $16,%esp
			
 
				+	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
			
 
				+5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
			
 
				+	CFI_ADJUST_CFA_OFFSET 4
			
 
				+	SAVE_ALL
			
 
				+	jmp ret_from_exception
			
 
				+	CFI_ENDPROC
			
 
				+
			
 
				+.section .fixup,"ax"
			
 
				+6:	xorl %eax,%eax
			
 
				+	movl %eax,4(%esp)
			
 
				+	jmp 1b
			
 
				+7:	xorl %eax,%eax
			
 
				+	movl %eax,8(%esp)
			
 
				+	jmp 2b
			
 
				+8:	xorl %eax,%eax
			
 
				+	movl %eax,12(%esp)
			
 
				+	jmp 3b
			
 
				+9:	xorl %eax,%eax
			
 
				+	movl %eax,16(%esp)
			
 
				+	jmp 4b
			
 
				+.previous
			
 
				+.section __ex_table,"a"
			
 
				+	.align 4
			
 
				+	.long 1b,6b
			
 
				+	.long 2b,7b
			
 
				+	.long 3b,8b
			
 
				+	.long 4b,9b
			
 
				+.previous
			
 
				+ENDPROC(xen_failsafe_callback)
			
 
				+
			
 
				+#endif	/* CONFIG_XEN */
			
 
				+
			
 
				 .section .rodata,"a"
			
 
				 #include "syscall_table.S"
			
 
				 
			
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,7 +510,8 @@ ENTRY(_stext)
 
				 /*
			
 
				  * BSS section
			
 
				  */
			
 
				-.section ".bss.page_aligned","w"
			
 
				+.section ".bss.page_aligned","wa"
			
 
				+	.align PAGE_SIZE_asm
			
 
				 ENTRY(swapper_pg_dir)
			
 
				 	.fill 1024,4,0
			
 
				 ENTRY(swapper_pg_pmd)
			
@@ -538,6 +539,8 @@ fault_msg:
 
				 	.ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
			
 
				 	.asciz "Stack: %p %p %p %p %p %p %p %p\n"
			
 
				 
			
 
				+#include "../xen/xen-head.S"
			
 
				+
			
 
				 /*
			
 
				  * The IDT and GDT 'descriptors' are a strange 48-bit object
			
 
				  * only used by the lidt and lgdt instructions. They are not
			
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -228,6 +228,41 @@ static int __init print_banner(void)
 
				 }
			
 
				 core_initcall(print_banner);
			
 
				 
			
 
				+static struct resource reserve_ioports = {
			
 
				+	.start = 0,
			
 
				+	.end = IO_SPACE_LIMIT,
			
 
				+	.name = "paravirt-ioport",
			
 
				+	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
			
 
				+};
			
 
				+
			
 
				+static struct resource reserve_iomem = {
			
 
				+	.start = 0,
			
 
				+	.end = -1,
			
 
				+	.name = "paravirt-iomem",
			
 
				+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Reserve the whole legacy IO space to prevent any legacy drivers
			
 
				+ * from wasting time probing for their hardware.  This is a fairly
			
 
				+ * brute-force approach to disabling all non-virtual drivers.
			
 
				+ *
			
 
				+ * Note that this must be called very early to have any effect.
			
 
				+ */
			
 
				+int paravirt_disable_iospace(void)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = request_resource(&ioport_resource, &reserve_ioports);
			
 
				+	if (ret == 0) {
			
 
				+		ret = request_resource(&iomem_resource, &reserve_iomem);
			
 
				+		if (ret)
			
 
				+			release_resource(&reserve_ioports);
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 struct paravirt_ops paravirt_ops = {
			
 
				 	.name = "bare hardware",
			
 
				 	.paravirt_enabled = 0,
			
@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
 
				 	.write_msr = native_write_msr_safe,
			
 
				 	.read_tsc = native_read_tsc,
			
 
				 	.read_pmc = native_read_pmc,
			
 
				-	.get_scheduled_cycles = native_read_tsc,
			
 
				+	.sched_clock = native_sched_clock,
			
 
				 	.get_cpu_khz = native_calculate_cpu_khz,
			
 
				 	.load_tr_desc = native_load_tr_desc,
			
 
				 	.set_ldt = native_set_ldt,
			
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
 
				 	 * NOTE: at this point the bootmem allocator is fully available.
			
 
				 	 */
			
 
				 
			
 
				+	paravirt_post_allocator_init();
			
 
				+
			
 
				 	dmi_scan_machine();
			
 
				 
			
 
				 #ifdef CONFIG_X86_GENERICARCH
			
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
 
				 
			
 
				 #include <asm/mtrr.h>
			
 
				 #include <asm/tlbflush.h>
			
 
				+#include <asm/mmu_context.h>
			
 
				 #include <mach_apic.h>
			
 
				 
			
 
				 /*
			
@@ -249,13 +250,13 @@ static unsigned long flush_va;
 
				 static DEFINE_SPINLOCK(tlbstate_lock);
			
 
				 
			
 
				 /*
			
 
				- * We cannot call mmdrop() because we are in interrupt context, 
			
 
				+ * We cannot call mmdrop() because we are in interrupt context,
			
 
				  * instead update mm->cpu_vm_mask.
			
 
				  *
			
 
				  * We need to reload %cr3 since the page tables may be going
			
 
				  * away from under us..
			
 
				  */
			
 
				-static inline void leave_mm (unsigned long cpu)
			
 
				+void leave_mm(unsigned long cpu)
			
 
				 {
			
 
				 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
			
 
				 		BUG();
			
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
 
				  * a given CPU
			
 
				  */
			
 
				 
			
 
				-static void __cpuinit smp_store_cpu_info(int id)
			
 
				+void __cpuinit smp_store_cpu_info(int id)
			
 
				 {
			
 
				 	struct cpuinfo_x86 *c = cpu_data + id;
			
 
				 
			
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
 
				 /* representing cpus for which sibling maps can be computed */
			
 
				 static cpumask_t cpu_sibling_setup_map;
			
 
				 
			
 
				-static inline void
			
 
				-set_cpu_sibling_map(int cpu)
			
 
				+void set_cpu_sibling_map(int cpu)
			
 
				 {
			
 
				 	int i;
			
 
				 	struct cpuinfo_x86 *c = cpu_data;
			
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_HOTPLUG_CPU
			
 
				-static void
			
 
				-remove_siblinginfo(int cpu)
			
 
				+void remove_siblinginfo(int cpu)
			
 
				 {
			
 
				 	int sibling;
			
 
				 	struct cpuinfo_x86 *c = cpu_data;
			
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
 
				  *
			
 
				  *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
			
 
				  */
			
 
				-static unsigned long cyc2ns_scale __read_mostly;
			
 
				+unsigned long cyc2ns_scale __read_mostly;
			
 
				 
			
 
				 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
			
 
				 
			
@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 
				 	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
			
 
				 }
			
 
				 
			
 
				-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
			
 
				-{
			
 
				-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * Scheduler clock - returns current time in nanosec units.
			
 
				  */
			
 
				-unsigned long long sched_clock(void)
			
 
				+unsigned long long native_sched_clock(void)
			
 
				 {
			
 
				 	unsigned long long this_offset;
			
 
				 
			
@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
 
				 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
			
 
				 
			
 
				 	/* read the Time Stamp Counter: */
			
 
				-	get_scheduled_cycles(this_offset);
			
 
				+	rdtscll(this_offset);
			
 
				 
			
 
				 	/* return the value in ns */
			
 
				 	return cycles_2_ns(this_offset);
			
 
				 }
			
 
				 
			
 
				+/* We need to define a real function for sched_clock, to override the
			
 
				+   weak default version */
			
 
				+#ifdef CONFIG_PARAVIRT
			
 
				+unsigned long long sched_clock(void)
			
 
				+{
			
 
				+	return paravirt_sched_clock();
			
 
				+}
			
 
				+#else
			
 
				+unsigned long long sched_clock(void)
			
 
				+	__attribute__((alias("native_sched_clock")));
			
 
				+#endif
			
 
				+
			
 
				 unsigned long native_calculate_cpu_khz(void)
			
 
				 {
			
 
				 	unsigned long long start, end;
			
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static void vmi_allocate_pt(u32 pfn)
			
 
				+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
			
 
				 {
			
 
				 	vmi_set_page_type(pfn, VMI_PAGE_L1);
			
 
				 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
			
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
 
				 		paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
			
 
				 		paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
			
 
				 #endif
			
 
				-		paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
			
 
				+		paravirt_ops.sched_clock = vmi_sched_clock;
			
 
				  		paravirt_ops.get_cpu_khz = vmi_cpu_khz;
			
 
				 
			
 
				 		/* We have true wallclock functions; disable CMOS clock sync */
			
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
			
 
				-unsigned long long vmi_get_sched_cycles(void)
			
 
				+/* paravirt_ops.sched_clock = vmi_sched_clock */
			
 
				+unsigned long long vmi_sched_clock(void)
			
 
				 {
			
 
				-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
			
 
				+	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
			
 
				 }
			
 
				 
			
 
				 /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
			
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -88,6 +88,7 @@ SECTIONS
 
				 
			
 
				   . = ALIGN(4096);
			
 
				   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
			
 
				+	*(.data.page_aligned)
			
 
				 	*(.data.idt)
			
 
				   }
			
 
				 
			
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,40 @@
 
				  * Here we can supply some information useful to userland.
			
 
				  */
			
 
				 
			
 
				-#include <linux/uts.h>
			
 
				 #include <linux/version.h>
			
 
				+#include <linux/elfnote.h>
			
 
				 
			
 
				-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)			      \
			
 
				-	.section name, flags;						      \
			
 
				-	.balign 4;							      \
			
 
				-	.long 1f - 0f;		/* name length */			      \
			
 
				-	.long 3f - 2f;		/* data length */			      \
			
 
				-	.long type;		/* note type */				      \
			
 
				-0:	.asciz vendor;		/* vendor name */			      \
			
 
				-1:	.balign 4;							      \
			
 
				-2:
			
 
				+/* Ideally this would use UTS_NAME, but using a quoted string here
			
 
				+   doesn't work. Remember to change this when changing the
			
 
				+   kernel's name. */
			
 
				+ELFNOTE_START(Linux, 0, "a")
			
 
				+	.long LINUX_VERSION_CODE
			
 
				+ELFNOTE_END
			
 
				 
			
 
				-#define ASM_ELF_NOTE_END						      \
			
 
				-3:	.balign 4;		/* pad out section */			      \
			
 
				-	.previous
			
 
				+#ifdef CONFIG_XEN
			
 
				 
			
 
				-	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
			
 
				-	.long LINUX_VERSION_CODE
			
 
				-	ASM_ELF_NOTE_END
			
 
				+/*
			
 
				+ * Add a special note telling glibc's dynamic linker a fake hardware
			
 
				+ * flavor that it will use to choose the search path for libraries in the
			
 
				+ * same way it uses real hardware capabilities like "mmx".
			
 
				+ * We supply "nosegneg" as the fake capability, to indicate that we
			
 
				+ * do not like negative offsets in instructions using segment overrides,
			
 
				+ * since we implement those inefficiently.  This makes it possible to
			
 
				+ * install libraries optimized to avoid those access patterns in someplace
			
 
				+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
			
 
				+ * corresponding to the bits here is needed to make ldconfig work right.
			
 
				+ * It should contain:
			
 
				+ *	hwcap 1 nosegneg
			
 
				+ * to match the mapping of bit to name that we give here.
			
 
				+ */
			
 
				+
			
 
				+/* Bit used for the pseudo-hwcap for non-negative segments.  We use
			
 
				+   bit 1 to avoid bugs in some versions of glibc when bit 0 is
			
 
				+   used; the choice is otherwise arbitrary. */
			
 
				+#define VDSO_NOTE_NONEGSEG_BIT	1
			
 
				+
			
 
				+ELFNOTE_START(GNU, 2, "a")
			
 
				+	.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT		/* ncaps, mask */
			
 
				+	.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg"	/* bit, name */
			
 
				+ELFNOTE_END
			
 
				+#endif
			
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
 
				 		NULL,
			
 
				 	};
			
 
				 
			
 
				-	if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
			
 
				+	if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
			
 
				 		printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
			
 
				 		       string, ret);
			
 
				 	}
			
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 
				 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
			
 
				 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
			
 
				 
			
 
				-		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
			
 
				+		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
			
 
				 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
			
 
				 		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
			
 
				 	}
			
@@ -473,6 +473,7 @@ void zap_low_mappings (void)
 
				 
			
 
				 static int disable_nx __initdata = 0;
			
 
				 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
			
 
				+EXPORT_SYMBOL_GPL(__supported_pte_mask);
			
 
				 
			
 
				 /*
			
 
				  * noexec = on|off
			
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 
				 	address = __pa(address);
			
 
				 	addr = address & LARGE_PAGE_MASK; 
			
 
				 	pbase = (pte_t *)page_address(base);
			
 
				-	paravirt_alloc_pt(page_to_pfn(base));
			
 
				+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
			
 
				 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
			
 
				                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
			
 
				                                           addr == address ? prot : ref_prot));
			
--- a/arch/i386/xen/Kconfig
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
 
				+#
			
 
				+# This Kconfig describes xen options
			
 
				+#
			
 
				+
			
 
				+config XEN
			
 
				+	bool "Enable support for Xen hypervisor"
			
 
				+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
			
 
				+	help
			
 
				+	  This is the Linux Xen port.  Enabling this will allow the
			
 
				+	  kernel to boot in a paravirtualized environment under the
			
 
				+	  Xen hypervisor.
			
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
 
				+obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
			
 
				+			events.o time.o manage.o xen-asm.o
			
 
				+
			
 
				+obj-$(CONFIG_SMP)	+= smp.o
			
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
 
				+/*
			
 
				+ * Core of Xen paravirt_ops implementation.
			
 
				+ *
			
 
				+ * This file contains the xen_paravirt_ops structure itself, and the
			
 
				+ * implementations for:
			
 
				+ * - privileged instructions
			
 
				+ * - interrupt flags
			
 
				+ * - segment operations
			
 
				+ * - booting and setup
			
 
				+ *
			
 
				+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
			
 
				+ */
			
 
				+
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/init.h>
			
 
				+#include <linux/smp.h>
			
 
				+#include <linux/preempt.h>
			
 
				+#include <linux/hardirq.h>
			
 
				+#include <linux/percpu.h>
			
 
				+#include <linux/delay.h>
			
 
				+#include <linux/start_kernel.h>
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/bootmem.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/mm.h>
			
 
				+#include <linux/page-flags.h>
			
 
				+#include <linux/highmem.h>
			
 
				+#include <linux/smp.h>
			
 
				+
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/physdev.h>
			
 
				+#include <xen/interface/vcpu.h>
			
 
				+#include <xen/interface/sched.h>
			
 
				+#include <xen/features.h>
			
 
				+#include <xen/page.h>
			
 
				+
			
 
				+#include <asm/paravirt.h>
			
 
				+#include <asm/page.h>
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <asm/fixmap.h>
			
 
				+#include <asm/processor.h>
			
 
				+#include <asm/setup.h>
			
 
				+#include <asm/desc.h>
			
 
				+#include <asm/pgtable.h>
			
 
				+#include <asm/tlbflush.h>
			
 
				+#include <asm/reboot.h>
			
 
				+
			
 
				+#include "xen-ops.h"
			
 
				+#include "mmu.h"
			
 
				+#include "multicalls.h"
			
 
				+
			
 
				+EXPORT_SYMBOL_GPL(hypercall_page);
			
 
				+
			
 
				+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
			
 
				+
			
 
				+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
			
 
				+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
			
 
				+DEFINE_PER_CPU(unsigned long, xen_cr3);
			
 
				+
			
 
				+struct start_info *xen_start_info;
			
 
				+EXPORT_SYMBOL_GPL(xen_start_info);
			
 
				+
			
 
				+static /* __initdata */ struct shared_info dummy_shared_info;
			
 
				+
			
 
				+/*
			
 
				+ * Point at some empty memory to start with. We map the real shared_info
			
 
				+ * page as soon as fixmap is up and running.
			
 
				+ */
			
 
				+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
			
 
				+
			
 
				+/*
			
 
				+ * Flag to determine whether vcpu info placement is available on all
			
 
				+ * VCPUs.  We assume it is to start with, and then set it to zero on
			
 
				+ * the first failure.  This is because it can succeed on some VCPUs
			
 
				+ * and not others, since it can involve hypervisor memory allocation,
			
 
				+ * or because the guest failed to guarantee all the appropriate
			
 
				+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
			
 
				+ *
			
 
				+ * Note that any particular CPU may be using a placed vcpu structure,
			
 
				+ * but we can only optimise if they all are.
			
 
				+ *
			
 
				+ * 0: not available, 1: available
			
 
				+ */
			
 
				+static int have_vcpu_info_placement = 1;
			
 
				+
			
 
				+static void __init xen_vcpu_setup(int cpu)
			
 
				+{
			
 
				+	struct vcpu_register_vcpu_info info;
			
 
				+	int err;
			
 
				+	struct vcpu_info *vcpup;
			
 
				+
			
 
				+	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
			
 
				+
			
 
				+	if (!have_vcpu_info_placement)
			
 
				+		return;		/* already tested, not available */
			
 
				+
			
 
				+	vcpup = &per_cpu(xen_vcpu_info, cpu);
			
 
				+
			
 
				+	info.mfn = virt_to_mfn(vcpup);
			
 
				+	info.offset = offset_in_page(vcpup);
			
 
				+
			
 
				+	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
			
 
				+	       cpu, vcpup, info.mfn, info.offset);
			
 
				+
			
 
				+	/* Check to see if the hypervisor will put the vcpu_info
			
 
				+	   structure where we want it, which allows direct access via
			
 
				+	   a percpu-variable. */
			
 
				+	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
			
 
				+
			
 
				+	if (err) {
			
 
				+		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
			
 
				+		have_vcpu_info_placement = 0;
			
 
				+	} else {
			
 
				+		/* This cpu is using the registered vcpu info, even if
			
 
				+		   later ones fail to. */
			
 
				+		per_cpu(xen_vcpu, cpu) = vcpup;
			
 
				+
			
 
				+		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
			
 
				+		       cpu, vcpup);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void __init xen_banner(void)
			
 
				+{
			
 
				+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
			
 
				+	       paravirt_ops.name);
			
 
				+	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
			
 
				+}
			
 
				+
			
 
				+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
			
 
				+		      unsigned int *ecx, unsigned int *edx)
			
 
				+{
			
 
				+	unsigned maskedx = ~0;
			
 
				+
			
 
				+	/*
			
 
				+	 * Mask out inconvenient features, to try and disable as many
			
 
				+	 * unsupported kernel subsystems as possible.
			
 
				+	 */
			
 
				+	if (*eax == 1)
			
 
				+		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
			
 
				+			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
			
 
				+			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
			
 
				+
			
 
				+	asm(XEN_EMULATE_PREFIX "cpuid"
			
 
				+		: "=a" (*eax),
			
 
				+		  "=b" (*ebx),
			
 
				+		  "=c" (*ecx),
			
 
				+		  "=d" (*edx)
			
 
				+		: "0" (*eax), "2" (*ecx));
			
 
				+	*edx &= maskedx;
			
 
				+}
			
 
				+
			
 
				+static void xen_set_debugreg(int reg, unsigned long val)
			
 
				+{
			
 
				+	HYPERVISOR_set_debugreg(reg, val);
			
 
				+}
			
 
				+
			
 
				+static unsigned long xen_get_debugreg(int reg)
			
 
				+{
			
 
				+	return HYPERVISOR_get_debugreg(reg);
			
 
				+}
			
 
				+
			
 
				+static unsigned long xen_save_fl(void)
			
 
				+{
			
 
				+	struct vcpu_info *vcpu;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	vcpu = x86_read_percpu(xen_vcpu);
			
 
				+
			
 
				+	/* flag has opposite sense of mask */
			
 
				+	flags = !vcpu->evtchn_upcall_mask;
			
 
				+
			
 
				+	/* convert to IF type flag
			
 
				+	   -0 -> 0x00000000
			
 
				+	   -1 -> 0xffffffff
			
 
				+	*/
			
 
				+	return (-flags) & X86_EFLAGS_IF;
			
 
				+}
			
 
				+
			
 
				+static void xen_restore_fl(unsigned long flags)
			
 
				+{
			
 
				+	struct vcpu_info *vcpu;
			
 
				+
			
 
				+	/* convert from IF type flag */
			
 
				+	flags = !(flags & X86_EFLAGS_IF);
			
 
				+
			
 
				+	/* There's a one instruction preempt window here.  We need to
			
 
				+	   make sure we don't switch CPUs between getting the vcpu
			
 
				+	   pointer and updating the mask. */
			
 
				+	preempt_disable();
			
 
				+	vcpu = x86_read_percpu(xen_vcpu);
			
 
				+	vcpu->evtchn_upcall_mask = flags;
			
 
				+	preempt_enable_no_resched();
			
 
				+
			
 
				+	/* Doesn't matter if we get preempted here, because any
			
 
				+	   pending event will get dealt with anyway. */
			
 
				+
			
 
				+	if (flags == 0) {
			
 
				+		preempt_check_resched();
			
 
				+		barrier(); /* unmask then check (avoid races) */
			
 
				+		if (unlikely(vcpu->evtchn_upcall_pending))
			
 
				+			force_evtchn_callback();
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void xen_irq_disable(void)
			
 
				+{
			
 
				+	/* There's a one instruction preempt window here.  We need to
			
 
				+	   make sure we don't switch CPUs between getting the vcpu
			
 
				+	   pointer and updating the mask. */
			
 
				+	preempt_disable();
			
 
				+	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
			
 
				+	preempt_enable_no_resched();
			
 
				+}
			
 
				+
			
 
				+static void xen_irq_enable(void)
			
 
				+{
			
 
				+	struct vcpu_info *vcpu;
			
 
				+
			
 
				+	/* There's a one instruction preempt window here.  We need to
			
 
				+	   make sure we don't switch CPUs between getting the vcpu
			
 
				+	   pointer and updating the mask. */
			
 
				+	preempt_disable();
			
 
				+	vcpu = x86_read_percpu(xen_vcpu);
			
 
				+	vcpu->evtchn_upcall_mask = 0;
			
 
				+	preempt_enable_no_resched();
			
 
				+
			
 
				+	/* Doesn't matter if we get preempted here, because any
			
 
				+	   pending event will get dealt with anyway. */
			
 
				+
			
 
				+	barrier(); /* unmask then check (avoid races) */
			
 
				+	if (unlikely(vcpu->evtchn_upcall_pending))
			
 
				+		force_evtchn_callback();
			
 
				+}
			
 
				+
			
 
				+static void xen_safe_halt(void)
			
 
				+{
			
 
				+	/* Blocking includes an implicit local_irq_enable(). */
			
 
				+	if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
			
 
				+		BUG();
			
 
				+}
			
 
				+
			
 
				+static void xen_halt(void)
			
 
				+{
			
 
				+	if (irqs_disabled())
			
 
				+		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
			
 
				+	else
			
 
				+		xen_safe_halt();
			
 
				+}
			
 
				+
			
 
				+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
			
 
				+{
			
 
				+	BUG_ON(preemptible());
			
 
				+
			
 
				+	switch (mode) {
			
 
				+	case PARAVIRT_LAZY_NONE:
			
 
				+		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
			
 
				+		break;
			
 
				+
			
 
				+	case PARAVIRT_LAZY_MMU:
			
 
				+	case PARAVIRT_LAZY_CPU:
			
 
				+		BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
			
 
				+		break;
			
 
				+
			
 
				+	case PARAVIRT_LAZY_FLUSH:
			
 
				+		/* flush if necessary, but don't change state */
			
 
				+		if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
			
 
				+			xen_mc_flush();
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	xen_mc_flush();
			
 
				+	x86_write_percpu(xen_lazy_mode, mode);
			
 
				+}
			
 
				+
			
 
				+static unsigned long xen_store_tr(void)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void xen_set_ldt(const void *addr, unsigned entries)
			
 
				+{
			
 
				+	unsigned long linear_addr = (unsigned long)addr;
			
 
				+	struct mmuext_op *op;
			
 
				+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
			
 
				+
			
 
				+	op = mcs.args;
			
 
				+	op->cmd = MMUEXT_SET_LDT;
			
 
				+	if (linear_addr) {
			
 
				+		/* ldt may be vmalloced, use arbitrary_virt_to_machine */
			
 
				+		xmaddr_t maddr;
			
 
				+		maddr = arbitrary_virt_to_machine((unsigned long)addr);
			
 
				+		linear_addr = (unsigned long)maddr.maddr;
			
 
				+	}
			
 
				+	op->arg1.linear_addr = linear_addr;
			
 
				+	op->arg2.nr_ents = entries;
			
 
				+
			
 
				+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
			
 
				+
			
 
				+	xen_mc_issue(PARAVIRT_LAZY_CPU);
			
 
				+}
			
 
				+
			
 
				+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
			
 
				+{
			
 
				+	unsigned long *frames;
			
 
				+	unsigned long va = dtr->address;
			
 
				+	unsigned int size = dtr->size + 1;
			
 
				+	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
			
 
				+	int f;
			
 
				+	struct multicall_space mcs;
			
 
				+
			
 
				+	/* A GDT can be up to 64k in size, which corresponds to 8192
			
 
				+	   8-byte entries, or 16 4k pages. */
			
 
				+
			
 
				+	BUG_ON(size > 65536);
			
 
				+	BUG_ON(va & ~PAGE_MASK);
			
 
				+
			
 
				+	mcs = xen_mc_entry(sizeof(*frames) * pages);
			
 
				+	frames = mcs.args;
			
 
				+
			
 
				+	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
			
 
				+		frames[f] = virt_to_mfn(va);
			
 
				+		make_lowmem_page_readonly((void *)va);
			
 
				+	}
			
 
				+
			
 
				+	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
			
 
				+
			
 
				+	xen_mc_issue(PARAVIRT_LAZY_CPU);
			
 
				+}
			
 
				+/* Queue one update_descriptor multicall that writes t->tls_array[i]
				+   into slot GDT_ENTRY_TLS_MIN+i of `cpu`'s GDT.  Nothing is issued
				+   here; xen_load_tls brackets the calls with xen_mc_batch() and
				+   xen_mc_issue(). */
				+static void load_TLS_descriptor(struct thread_struct *t,
				+				unsigned int cpu, unsigned int i)
				+{
				+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
				+	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
				+	struct multicall_space mc = __xen_mc_entry(0);
				+
				+	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
				+}
			
 
				+
			
 
				+/* load_tls hook: push the three TLS descriptors of thread `t` into
				+   `cpu`'s GDT as a single multicall batch. */
				+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
				+{
				+	unsigned int slot;
				+
				+	xen_mc_batch();
				+
				+	for (slot = 0; slot < 3; slot++)
				+		load_TLS_descriptor(t, cpu, slot);
				+
				+	xen_mc_issue(PARAVIRT_LAZY_CPU);
				+
				+	/*
				+	 * XXX sleazy hack: being called inside a lazy-cpu section
				+	 * means a context switch is in progress and %gs has already
				+	 * been saved.  Zeroing it here prevents faults on exit from
				+	 * the hypervisor when the next process has no %gs; the real
				+	 * value is loaded properly later either way.  This can go
				+	 * once Xen stops saving/restoring %gs for normal hypercalls.
				+	 */
				+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU)
				+		return;
				+
				+	loadsegment(gs, 0);
				+}
			
 
				+
			
 
				+/* write_ldt_entry hook: store the descriptor (high:low) into slot
				+   `entrynum` of the LDT at `dt` via the update_descriptor hypercall.
				+   Pending multicalls are flushed first, and preemption is disabled
				+   around the flush and the hypercall.  BUGs if Xen rejects the
				+   update. */
				+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				+				u32 low, u32 high)
				+{
				+	unsigned long lp = (unsigned long)&dt[entrynum];
				+	/* The LDT may be vmalloced (see xen_set_ldt), so translate the
				+	   slot address with arbitrary_virt_to_machine rather than
				+	   virt_to_machine. */
				+	xmaddr_t mach_lp = arbitrary_virt_to_machine(lp);
				+	u64 entry = (u64)high << 32 | low;
				+
				+	preempt_disable();
				+
				+	xen_mc_flush();
				+	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
				+		BUG();
				+
				+	preempt_enable();
				+}
			
 
				+
			
 
				+/* Translate one IDT gate (low/high descriptor words) for `vector`
				+   into a Xen trap_info.  Only gate types 0xe and 0xf are accepted.
				+   Returns 1 and fills *info on success; returns 0 and leaves *info
				+   untouched for any other type. */
				+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
				+			    struct trap_info *info)
				+{
				+	const u8 gate_type = (high >> 8) & 0x1f;
				+	const u8 gate_dpl = (high >> 13) & 3;
				+	const int is_intr_gate = (gate_type == 0xe);
				+
				+	if (!is_intr_gate && gate_type != 0xf)
				+		return 0;
				+
				+	info->vector = vector;
				+	info->cs = low >> 16;
				+	info->address = (high & 0xffff0000) | (low & 0x0000ffff);
				+
				+	/* interrupt gates clear IF, which is flagged with bit 2 */
				+	info->flags = is_intr_gate ? (gate_dpl | 4) : gate_dpl;
				+
				+	return 1;
				+}
			
 
				+
			
 
				+/* Locations of each CPU's IDT: a copy of the descriptor most recently
				+   passed to xen_load_idt on that CPU */
				+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
			
 
				+
			
 
				+/* Set an IDT entry.  If the entry is part of the current IDT, then
				+   also update Xen.
				+
				+   The table in memory is always written with write_dt_entry().  Xen
				+   is only told about the entry when it lies inside the range this
				+   CPU recorded in idt_desc and cvt_gate_to_trap() accepts its type.
				+   Runs with preemption disabled while this CPU's idt_desc is
				+   consulted, and flushes pending multicalls before writing. */
				+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
				+				u32 low, u32 high)
				+{
				+	unsigned long p = (unsigned long)&dt[entrynum];
				+	unsigned long start, end;
				+
				+	preempt_disable();
				+
				+	start = __get_cpu_var(idt_desc).address;
				+	end = start + __get_cpu_var(idt_desc).size + 1;
				+
				+	xen_mc_flush();
				+
				+	write_dt_entry(dt, entrynum, low, high);
				+
				+	if (p >= start && (p + 8) <= end) {
				+		struct trap_info info[2];
				+
				+		info[1].address = 0;	/* presumably the end-of-table marker, as in xen_convert_trap_info */
				+
				+		if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
				+			if (HYPERVISOR_set_trap_table(info))
				+				BUG();
				+	}
				+
				+	preempt_enable();
				+}
			
 
				+
			
 
				+/* Convert the IDT described by `desc` into a trap_info table.  Each
				+   8-byte gate that cvt_gate_to_trap() accepts takes the next slot of
				+   `traps`; the slot after the last converted gate gets address 0.
				+   The IDT may hold at most 256 gates, so `traps` needs room for up
				+   to 257 entries. */
				+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
				+				  struct trap_info *traps)
				+{
				+	const unsigned nr_gates = (desc->size + 1) / 8;
				+	const u32 *gate = (u32 *)desc->address;
				+	unsigned vec, filled = 0;
				+
				+	BUG_ON(nr_gates > 256);
				+
				+	/* each gate is two 32-bit words: low then high */
				+	for (vec = 0; vec < nr_gates; vec++, gate += 2) {
				+		if (cvt_gate_to_trap(vec, gate[0], gate[1], &traps[filled]))
				+			filled++;
				+	}
				+
				+	traps[filled].address = 0;
				+}
			
 
				+
			
 
				+/* Fill `traps` with the trap_info form of the IDT recorded for the
				+   current CPU (idt_desc), including the terminating entry written
				+   by xen_convert_trap_info(). */
				+void xen_copy_trap_info(struct trap_info *traps)
				+{
				+	xen_convert_trap_info(&__get_cpu_var(idt_desc), traps);
				+}
			
 
				+
			
 
				+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
				+   hold a spinlock to protect the static traps[] array (static because
				+   it avoids allocation, and saves stack space).
				+
				+   The descriptor is also remembered in this CPU's idt_desc, which
				+   xen_write_idt_entry and xen_copy_trap_info read later.  traps[]
				+   has 257 slots: up to 256 gates plus the terminating entry.  BUGs
				+   if the set_trap_table hypercall fails. */
				+static void xen_load_idt(const struct Xgt_desc_struct *desc)
				+{
				+	static DEFINE_SPINLOCK(lock);
				+	static struct trap_info traps[257];
				+
				+	spin_lock(&lock);
				+
				+	__get_cpu_var(idt_desc) = *desc;
				+
				+	xen_convert_trap_info(desc, traps);
				+
				+	xen_mc_flush();
				+	if (HYPERVISOR_set_trap_table(traps))
				+		BUG();
				+
				+	spin_unlock(&lock);
				+}
			
 
				+
			
 
				+/* Write a GDT descriptor entry.  LDT and TSS descriptors are
				+   silently skipped, since they're handled differently; every other
				+   descriptor is handed to Xen with the update_descriptor hypercall
				+   after flushing pending multicalls.  BUGs if Xen rejects it. */
				+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				+				u32 low, u32 high)
				+{
				+	const u32 desc_type = (high >> 8) & 0xff;
				+
				+	preempt_disable();
				+
				+	if (desc_type != DESCTYPE_LDT && desc_type != DESCTYPE_TSS) {
				+		xmaddr_t maddr = virt_to_machine(&dt[entry]);
				+		u64 desc = ((u64)high << 32) | low;
				+
				+		xen_mc_flush();
				+		if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
				+			BUG();
				+	}
				+
				+	preempt_enable();
				+}
			
 
				+/* load_esp0 hook: queue a stack_switch multicall selecting
				+   __KERNEL_DS:thread->esp0, issued according to the
				+   PARAVIRT_LAZY_CPU batching state.  `tss` is not used. */
				+static void xen_load_esp0(struct tss_struct *tss,
				+			  struct thread_struct *thread)
				+{
				+	struct multicall_space mcs = xen_mc_entry(0);
				+	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
				+	xen_mc_issue(PARAVIRT_LAZY_CPU);
				+}
			
 
				+
			
 
				+/* set_iopl_mask hook: ask Xen to set the I/O privilege level.  The
				+   level is taken from bits 12-13 of `mask`; a zero mask requests
				+   level 1.  The hypercall's return value is not checked. */
				+static void xen_set_iopl_mask(unsigned mask)
				+{
				+	struct physdev_set_iopl set_iopl;
				+
				+	/* Force the change at ring 0. */
				+	if (mask == 0)
				+		set_iopl.iopl = 1;
				+	else
				+		set_iopl.iopl = (mask >> 12) & 3;
				+
				+	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
				+}
			
 
				+/* io_delay hook for Xen (installed as .io_delay in
				+   xen_paravirt_ops): does nothing. */
				+static void xen_io_delay(void)
				+{
				+}
			
 
				+/* Local APIC accessors installed in xen_paravirt_ops when
				+   CONFIG_X86_LOCAL_APIC is set.  Reads always return 0. */
				+#ifdef CONFIG_X86_LOCAL_APIC
				+static unsigned long xen_apic_read(unsigned long reg)
				+{
				+	return 0;
				+}
				+/* Used for both .apic_write and .apic_write_atomic.  Nothing is
				+   written; the call only triggers a warning. */
				+static void xen_apic_write(unsigned long reg, unsigned long val)
				+{
				+	/* Warn to see if there's any stray references */
				+	WARN_ON(1);
				+}
				+#endif
			
 
				+/* Queue an MMUEXT_TLB_FLUSH_LOCAL operation as a multicall, issued
				+   according to the PARAVIRT_LAZY_MMU batching state.  Installed as
				+   both .flush_tlb_user and .flush_tlb_kernel, and also used by
				+   xen_write_cr3 when cr3 is unchanged. */
				+static void xen_flush_tlb(void)
				+{
				+	struct mmuext_op *op;
				+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
				+
				+	op = mcs.args;
				+	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
				+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				+
				+	xen_mc_issue(PARAVIRT_LAZY_MMU);
				+}
			
 
				+/* flush_tlb_single hook: queue an MMUEXT_INVLPG_LOCAL operation for
				+   the page containing `addr` (the address is rounded down with
				+   PAGE_MASK), issued according to the PARAVIRT_LAZY_MMU state. */
				+static void xen_flush_tlb_single(unsigned long addr)
				+{
				+	struct mmuext_op *op;
				+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
				+
				+	op = mcs.args;
				+	op->cmd = MMUEXT_INVLPG_LOCAL;
				+	op->arg1.linear_addr = addr & PAGE_MASK;
				+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				+
				+	xen_mc_issue(PARAVIRT_LAZY_MMU);
				+}
			
 
				+/* flush_tlb_others hook: ask Xen to flush TLB state on the cpus in
				+   *cpus, restricted to those currently online (returns without a
				+   hypercall if none remain).  va == TLB_FLUSH_ALL requests a full
				+   flush (MMUEXT_TLB_FLUSH_MULTI); any other value invalidates that
				+   one address (MMUEXT_INVLPG_MULTI).  The cpumask is copied into
				+   the multicall argument area next to the op, and op.arg2.vcpumask
				+   points at that copy.  `mm` is only checked for being non-NULL. */
				+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
				+				 unsigned long va)
				+{
				+	struct {
				+		struct mmuext_op op;
				+		cpumask_t mask;
				+	} *args;
				+	cpumask_t cpumask = *cpus;
				+	struct multicall_space mcs;
				+
				+	/*
				+	 * A couple of (to be removed) sanity checks:
				+	 *
				+	 * - current CPU must not be in mask
				+	 * - mask must exist :)
				+	 */
				+	BUG_ON(cpus_empty(cpumask));
				+	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
				+	BUG_ON(!mm);
				+
				+	/* If a CPU which we ran on has gone down, OK. */
				+	cpus_and(cpumask, cpumask, cpu_online_map);
				+	if (cpus_empty(cpumask))
				+		return;
				+
				+	mcs = xen_mc_entry(sizeof(*args));
				+	args = mcs.args;
				+	args->mask = cpumask;
				+	args->op.arg2.vcpumask = &args->mask;
				+
				+	if (va == TLB_FLUSH_ALL) {
				+		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
				+	} else {
				+		args->op.cmd = MMUEXT_INVLPG_MULTI;
				+		args->op.arg1.linear_addr = va;
				+	}
				+
				+	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
				+
				+	xen_mc_issue(PARAVIRT_LAZY_MMU);
				+}
			
 
				+/* write_cr2 hook: store `cr2` in the arch.cr2 field of the vcpu
				+   structure this CPU's xen_vcpu pointer refers to. */
				+static void xen_write_cr2(unsigned long cr2)
				+{
				+	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
				+}
			
 
				+/* read_cr2 hook: return the arch.cr2 field of the vcpu structure
				+   this CPU's xen_vcpu pointer refers to. */
				+static unsigned long xen_read_cr2(void)
				+{
				+	return x86_read_percpu(xen_vcpu)->arch.cr2;
				+}
			
 
				+/* Variant of xen_read_cr2 that reads arch.cr2 straight from the
				+   per-cpu xen_vcpu_info, without going through the xen_vcpu pointer.
				+   xen_setup_vcpu_info_placement installs it as .read_cr2 when
				+   have_vcpu_info_placement is set. */
				+static unsigned long xen_read_cr2_direct(void)
				+{
				+	return x86_read_percpu(xen_vcpu_info.arch.cr2);
				+}
			
 
				+/* write_cr4 hook: write `cr4` natively, but with X86_CR4_TSD always
				+   masked off. */
				+static void xen_write_cr4(unsigned long cr4)
				+{
				+	/* never allow TSC to be disabled */
				+	native_write_cr4(cr4 & ~X86_CR4_TSD);
				+}
			
 
				+/* read_cr3 hook: return the per-cpu xen_cr3 value, which
				+   xen_write_cr3 updates whenever it switches pagetables. */
				+static unsigned long xen_read_cr3(void)
				+{
				+	return x86_read_percpu(xen_cr3);
				+}
			
 
				+
			
 
				+/* write_cr3 hook.  Must be called with preemption disabled.  Writing
				+   the value already recorded in the per-cpu xen_cr3 is treated as a
				+   plain TLB flush.  Otherwise the new value is recorded and an
				+   MMUEXT_NEW_BASEPTR multicall is queued for the machine frame that
				+   backs `cr3`. */
				+static void xen_write_cr3(unsigned long cr3)
				+{
				+	struct multicall_space mcs;
				+	struct mmuext_op *op;
				+
				+	BUG_ON(preemptible());
				+
				+	if (x86_read_percpu(xen_cr3) == cr3) {
				+		/* just a simple tlb flush */
				+		xen_flush_tlb();
				+		return;
				+	}
				+
				+	x86_write_percpu(xen_cr3, cr3);
				+
				+	mcs = xen_mc_entry(sizeof(*op));
				+	op = mcs.args;
				+	op->cmd = MMUEXT_NEW_BASEPTR;
				+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(cr3));
				+
				+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
				+
				+	xen_mc_issue(PARAVIRT_LAZY_CPU);
				+}
			
 
				+
			
 
				+/* Early in boot, while setting up the initial pagetable, assume
				+   everything is pinned.  The lowmem mapping of the new pagetable
				+   page `pfn` is simply made read-only; `mm` is not used.  This is
				+   the initial .alloc_pt in xen_paravirt_ops, replaced by
				+   xen_alloc_pt in xen_pagetable_setup_done. */
				+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
				+{
				+	BUG_ON(mem_map);	/* should only be used early */
				+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
				+}
			
 
				+
			
 
				+/* alloc_pt hook used after boot-time pagetable setup.  The new pte
				+   page `pfn` is pinned iff it's being attached to a pinned
				+   pagetable, i.e. iff the page holding mm->pgd is marked pinned.
				+   Nothing is done for an unpinned pagetable. */
				+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
				+{
				+	struct page *pt_page = pfn_to_page(pfn);
				+
				+	if (!PagePinned(virt_to_page(mm->pgd)))
				+		return;
				+
				+	SetPagePinned(pt_page);
				+
				+	if (PageHighMem(pt_page)) {
				+		/* make sure there are no stray mappings of
				+		   this page */
				+		kmap_flush_unused();
				+	} else {
				+		make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
				+	}
				+}
			
 
				+
			
 
				+/* release_pt hook: if the pagetable page `pfn` is marked pinned and
				+   lives in lowmem, make its lowmem mapping writable again.  This
				+   should never happen until we're OK to use struct page. */
				+static void xen_release_pt(u32 pfn)
				+{
				+	struct page *pt_page = pfn_to_page(pfn);
				+
				+	if (PagePinned(pt_page) && !PageHighMem(pt_page))
				+		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
				+}
			
 
				+/* kmap_atomic_pte hook (CONFIG_HIGHPTE only): map a pte page with
				+   kmap_atomic_prot(), using PAGE_KERNEL_RO if the page is marked
				+   pinned and PAGE_KERNEL otherwise.  The printk is debugging code
				+   that is compiled but never runs, since its condition starts with
				+   a constant 0. */
				+#ifdef CONFIG_HIGHPTE
				+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
				+{
				+	pgprot_t prot = PAGE_KERNEL;
				+
				+	if (PagePinned(page))
				+		prot = PAGE_KERNEL_RO;
				+
				+	if (0 && PageHighMem(page))
				+		printk("mapping highpte %lx type %d prot %s\n",
				+		       page_to_pfn(page), type,
				+		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
				+
				+	return kmap_atomic_prot(page, type, prot);
				+}
				+#endif
			
 
				+/* Return the pte value that may be installed at `ptep`.  If the slot
				+   is empty (not present) this is `pte` unchanged.  If it already
				+   holds a present entry, _PAGE_RW survives in the result only when
				+   it is also set in the existing entry; all other bits of `pte` are
				+   kept. */
				+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
				+{
				+	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
				+	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
				+		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
				+			       pte_val_ma(pte));
				+
				+	return pte;
				+}
			
 
				+
			
 
				+/* Init-time set_pte, used while the initial pagetables are being
				+   constructed.  The new value is filtered through mask_rw_pte()
				+   first, so pagetable pages already mapped RO cannot be remapped
				+   RW. */
				+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
				+{
				+	xen_set_pte(ptep, mask_rw_pte(ptep, pte));
				+}
			
 
				+/* pagetable_setup_start hook: start building the kernel pagetable
				+   at `base` from the one Xen supplied (xen_start_info->pt_base).
				+   It routes set_pte through xen_set_pte_init, copies the top level
				+   (and, when PTRS_PER_PMD > 1, gives every present entry a private
				+   read-only copy of its pmd page), makes the zero page and `base`
				+   read-only, and finally switches cr3 to `base`. */
				+static __init void xen_pagetable_setup_start(pgd_t *base)
				+{
				+	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
				+
				+	/* special set_pte for pagetable initialization */
				+	paravirt_ops.set_pte = xen_set_pte_init;
				+
				+	init_mm.pgd = base;
				+	/*
				+	 * copy top-level of Xen-supplied pagetable into place.	 For
				+	 * !PAE we can use this as-is, but for PAE it is a stand-in
				+	 * while we copy the pmd pages.
				+	 */
				+	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
				+
				+	if (PTRS_PER_PMD > 1) {
				+		int i;
				+		/*
				+		 * For PAE, need to allocate new pmds, rather than
				+		 * share Xen's, since Xen doesn't like pmd's being
				+		 * shared between address spaces.
				+		 */
				+		for (i = 0; i < PTRS_PER_PGD; i++) {
				+			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
				+				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
				+
				+				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
				+				       PAGE_SIZE);
				+
				+				make_lowmem_page_readonly(pmd);
				+
				+				set_pgd(&base[i], __pgd(1 + __pa(pmd)));	/* the added 1 is presumably _PAGE_PRESENT */
				+			} else
				+				pgd_clear(&base[i]);
				+		}
				+	}
				+
				+	/* make sure zero_page is mapped RO so we can use it in pagetables */
				+	make_lowmem_page_readonly(empty_zero_page);
				+	make_lowmem_page_readonly(base);
				+	/*
				+	 * Switch to new pagetable.  This is done before
				+	 * pagetable_init has done anything so that the new pages
				+	 * added to the table can be prepared properly for Xen.
				+	 */
				+	xen_write_cr3(__pa(base));
				+}
			
 
				+
			
 
				+/* pagetable_setup_done hook: called once the kernel pagetable at
				+   `base` has been built.  It restores the run-time alloc_pt and
				+   set_pte operations, sets up HYPERVISOR_shared_info (through a
				+   fixmap slot unless the physmap is auto-translated), and pins the
				+   new top-level pagetable.  BUGs if the pin hypercall fails. */
				+static __init void xen_pagetable_setup_done(pgd_t *base)
				+{
				+	/* This will work as long as patching hasn't happened yet
				+	   (which it hasn't) */
				+	paravirt_ops.alloc_pt = xen_alloc_pt;
				+	paravirt_ops.set_pte = xen_set_pte;
				+
				+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
				+		/*
				+		 * Create a mapping for the shared info page.
				+		 * Should be set_fixmap(), but shared_info is a machine
				+		 * address with no corresponding pseudo-phys address.
				+		 */
				+		set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
				+			    PFN_DOWN(xen_start_info->shared_info),
				+			    PAGE_KERNEL);
				+
				+		HYPERVISOR_shared_info =
				+			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
				+
				+	} else
				+		HYPERVISOR_shared_info =
				+			(struct shared_info *)__va(xen_start_info->shared_info);
				+
				+	/* Actually pin the pagetable down, but we can't set PG_pinned
				+	   yet because the page structures don't exist yet. */
				+	{
				+		struct mmuext_op op;
				+#ifdef CONFIG_X86_PAE
				+		op.cmd = MMUEXT_PIN_L3_TABLE;
				+#else
				+		/* without PAE the top-level pagetable is an L2 table */
				+		op.cmd = MMUEXT_PIN_L2_TABLE;
				+#endif
				+		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
				+		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
				+			BUG();
				+	}
				+}
			
 
				+
			
 
				+/* This is called once we have the cpu_possible_map.  It runs
				+   xen_vcpu_setup() for every possible cpu; if
				+   have_vcpu_info_placement is then set, the interrupt-flag
				+   operations, read_cr2 and iret in paravirt_ops are switched to
				+   their *_direct variants. */
				+void __init xen_setup_vcpu_info_placement(void)
				+{
				+	int cpu;
				+
				+	for_each_possible_cpu(cpu)
				+		xen_vcpu_setup(cpu);
				+
				+	/* xen_vcpu_setup managed to place the vcpu_info within the
				+	   percpu area for all cpus, so make use of it */
				+	if (have_vcpu_info_placement) {
				+		printk(KERN_INFO "Xen: using vcpu_info placement\n");
				+
				+		paravirt_ops.save_fl = xen_save_fl_direct;
				+		paravirt_ops.restore_fl = xen_restore_fl_direct;
				+		paravirt_ops.irq_disable = xen_irq_disable_direct;
				+		paravirt_ops.irq_enable = xen_irq_enable_direct;
				+		paravirt_ops.read_cr2 = xen_read_cr2_direct;
				+		paravirt_ops.iret = xen_iret_direct;
				+	}
				+}
			
 
				+/* patch hook: patch one paravirt call site of `len` bytes at `insns`.
				+   For the irq_enable, irq_disable, save_fl and restore_fl sites,
				+   when have_vcpu_info_placement is set and the matching
				+   xen_*_direct routine fits in `len` bytes, the routine is copied
				+   inline with paravirt_patch_insns().  If that routine's *_reloc
				+   label lies strictly inside the copied range, the long at that
				+   offset in the copy is adjusted by the distance between the
				+   routine and the patch site.  All other cases go through
				+   paravirt_patch_default().  Returns the value of whichever patch
				+   helper was used. */
				+static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
				+{
				+	char *start, *end, *reloc;
				+	unsigned ret;
				+
				+	start = end = reloc = NULL;
				+
				+#define SITE(x)								\
				+	case PARAVIRT_PATCH(x):						\
				+	if (have_vcpu_info_placement) {					\
				+		start = (char *)xen_##x##_direct;			\
				+		end = xen_##x##_direct_end;				\
				+		reloc = xen_##x##_direct_reloc;				\
				+	}								\
				+	goto patch_site
				+
				+	switch (type) {
				+		SITE(irq_enable);
				+		SITE(irq_disable);
				+		SITE(save_fl);
				+		SITE(restore_fl);
				+#undef SITE
				+
				+	patch_site:
				+		if (start == NULL || (end-start) > len)
				+			goto default_patch;
				+
				+		ret = paravirt_patch_insns(insns, len, start, end);
				+
				+		/* Note: because reloc is assigned from something that
				+		   appears to be an array, gcc assumes it's non-null,
				+		   but doesn't know its relationship with start and
				+		   end. */
				+		if (reloc > start && reloc < end) {
				+			int reloc_off = reloc - start;
				+			long *relocp = (long *)(insns + reloc_off);
				+			long delta = start - (char *)insns;
				+
				+			*relocp += delta;
				+		}
				+		break;
				+
				+	default_patch:
				+	default:
				+		ret = paravirt_patch_default(type, clobbers, insns, len);
				+		break;
				+	}
				+
				+	return ret;
				+}
			
 
				+/* The paravirt_ops table for Xen.  xen_start_kernel() copies it by
				+   value into the global paravirt_ops; the table itself is
				+   __initdata.  Some entries are only placeholders: .set_pte and
				+   .alloc_pt are switched by xen_pagetable_setup_start/_done, and
				+   the irq-flag ops, .read_cr2 and .iret may be replaced by
				+   xen_setup_vcpu_info_placement(). */
				+static const struct paravirt_ops xen_paravirt_ops __initdata = {
				+	.paravirt_enabled = 1,
				+	.shared_kernel_pmd = 0,
				+
				+	.name = "Xen",
				+	.banner = xen_banner,
				+
				+	.patch = xen_patch,
				+
				+	.memory_setup = xen_memory_setup,
				+	.arch_setup = xen_arch_setup,
				+	.init_IRQ = xen_init_IRQ,
				+	.post_allocator_init = xen_mark_init_mm_pinned,
				+
				+	.time_init = xen_time_init,
				+	.set_wallclock = xen_set_wallclock,
				+	.get_wallclock = xen_get_wallclock,
				+	.get_cpu_khz = xen_cpu_khz,
				+	.sched_clock = xen_sched_clock,
				+
				+	.cpuid = xen_cpuid,
				+
				+	.set_debugreg = xen_set_debugreg,
				+	.get_debugreg = xen_get_debugreg,
				+
				+	.clts = native_clts,
				+
				+	.read_cr0 = native_read_cr0,
				+	.write_cr0 = native_write_cr0,
				+
				+	.read_cr2 = xen_read_cr2,
				+	.write_cr2 = xen_write_cr2,
				+
				+	.read_cr3 = xen_read_cr3,
				+	.write_cr3 = xen_write_cr3,
				+
				+	.read_cr4 = native_read_cr4,
				+	.read_cr4_safe = native_read_cr4_safe,
				+	.write_cr4 = xen_write_cr4,
				+
				+	.save_fl = xen_save_fl,
				+	.restore_fl = xen_restore_fl,
				+	.irq_disable = xen_irq_disable,
				+	.irq_enable = xen_irq_enable,
				+	.safe_halt = xen_safe_halt,
				+	.halt = xen_halt,
				+	.wbinvd = native_wbinvd,
				+
				+	.read_msr = native_read_msr_safe,
				+	.write_msr = native_write_msr_safe,
				+	.read_tsc = native_read_tsc,
				+	.read_pmc = native_read_pmc,
				+
				+	.iret = (void *)&hypercall_page[__HYPERVISOR_iret],
				+	.irq_enable_sysexit = NULL,  /* never called */
				+
				+	.load_tr_desc = paravirt_nop,
				+	.set_ldt = xen_set_ldt,
				+	.load_gdt = xen_load_gdt,
				+	.load_idt = xen_load_idt,
				+	.load_tls = xen_load_tls,
				+
				+	.store_gdt = native_store_gdt,
				+	.store_idt = native_store_idt,
				+	.store_tr = xen_store_tr,
				+
				+	.write_ldt_entry = xen_write_ldt_entry,
				+	.write_gdt_entry = xen_write_gdt_entry,
				+	.write_idt_entry = xen_write_idt_entry,
				+	.load_esp0 = xen_load_esp0,
				+
				+	.set_iopl_mask = xen_set_iopl_mask,
				+	.io_delay = xen_io_delay,
				+
				+#ifdef CONFIG_X86_LOCAL_APIC
				+	.apic_write = xen_apic_write,
				+	.apic_write_atomic = xen_apic_write,
				+	.apic_read = xen_apic_read,
				+	.setup_boot_clock = paravirt_nop,
				+	.setup_secondary_clock = paravirt_nop,
				+	.startup_ipi_hook = paravirt_nop,
				+#endif
				+
				+	.flush_tlb_user = xen_flush_tlb,
				+	.flush_tlb_kernel = xen_flush_tlb,
				+	.flush_tlb_single = xen_flush_tlb_single,
				+	.flush_tlb_others = xen_flush_tlb_others,
				+
				+	.pte_update = paravirt_nop,
				+	.pte_update_defer = paravirt_nop,
				+
				+	.pagetable_setup_start = xen_pagetable_setup_start,
				+	.pagetable_setup_done = xen_pagetable_setup_done,
				+
				+	.alloc_pt = xen_alloc_pt_init,
				+	.release_pt = xen_release_pt,
				+	.alloc_pd = paravirt_nop,
				+	.alloc_pd_clone = paravirt_nop,
				+	.release_pd = paravirt_nop,
				+
				+#ifdef CONFIG_HIGHPTE
				+	.kmap_atomic_pte = xen_kmap_atomic_pte,
				+#endif
				+
				+	.set_pte = NULL,	/* see xen_pagetable_setup_* */
				+	.set_pte_at = xen_set_pte_at,
				+	.set_pmd = xen_set_pmd,
				+
				+	.pte_val = xen_pte_val,
				+	.pgd_val = xen_pgd_val,
				+
				+	.make_pte = xen_make_pte,
				+	.make_pgd = xen_make_pgd,
				+
				+#ifdef CONFIG_X86_PAE
				+	.set_pte_atomic = xen_set_pte_atomic,
				+	.set_pte_present = xen_set_pte_at,
				+	.set_pud = xen_set_pud,
				+	.pte_clear = xen_pte_clear,
				+	.pmd_clear = xen_pmd_clear,
				+
				+	.make_pmd = xen_make_pmd,
				+	.pmd_val = xen_pmd_val,
				+#endif	/* PAE */
				+
				+	.activate_mm = xen_activate_mm,
				+	.dup_mmap = xen_dup_mmap,
				+	.exit_mmap = xen_exit_mmap,
				+
				+	.set_lazy_mode = xen_set_lazy_mode,
				+};
			
 
				+/* SMP operations for Xen (CONFIG_SMP only).  xen_start_kernel()
				+   copies this table by value into the global smp_ops. */
				+#ifdef CONFIG_SMP
				+static const struct smp_ops xen_smp_ops __initdata = {
				+	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
				+	.smp_prepare_cpus = xen_smp_prepare_cpus,
				+	.cpu_up = xen_cpu_up,
				+	.smp_cpus_done = xen_smp_cpus_done,
				+
				+	.smp_send_stop = xen_smp_send_stop,
				+	.smp_send_reschedule = xen_smp_send_reschedule,
				+	.smp_call_function_mask = xen_smp_call_function_mask,
				+};
				+#endif	/* CONFIG_SMP */
			
 
				+/* Common shutdown path: on SMP first call smp_send_stop(), then ask
				+   Xen to shut the domain down with SCHEDOP_shutdown and the given
				+   `reason` (callers pass a SHUTDOWN_* code).  BUGs if the hypercall
				+   returns nonzero. */
				+static void xen_reboot(int reason)
				+{
				+#ifdef CONFIG_SMP
				+	smp_send_stop();
				+#endif
				+
				+	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
				+		BUG();
				+}
			
 
				+/* machine_ops.restart: request a reboot from Xen.  `msg` is not
				+   used. */
				+static void xen_restart(char *msg)
				+{
				+	xen_reboot(SHUTDOWN_reboot);
				+}
			
 
				+/* machine_ops.emergency_restart: same reboot request as
				+   xen_restart. */
				+static void xen_emergency_restart(void)
				+{
				+	xen_reboot(SHUTDOWN_reboot);
				+}
			
 
				+/* Request a power-off from Xen.  Used for the .halt, .power_off and
				+   .shutdown entries of xen_machine_ops. */
				+static void xen_machine_halt(void)
				+{
				+	xen_reboot(SHUTDOWN_poweroff);
				+}
			
 
				+/* machine_ops.crash_shutdown: report a crash to Xen with
				+   SHUTDOWN_crash.  `regs` is not used. */
				+static void xen_crash_shutdown(struct pt_regs *regs)
				+{
				+	xen_reboot(SHUTDOWN_crash);
				+}
			
 
				+/* Machine (restart/halt/crash) operations for Xen; every entry ends
				+   up in xen_reboot().  xen_start_kernel() copies this table by
				+   value into the global machine_ops. */
				+static const struct machine_ops __initdata xen_machine_ops = {
				+	.restart = xen_restart,
				+	.halt = xen_machine_halt,
				+	.power_off = xen_machine_halt,
				+	.shutdown = xen_machine_halt,
				+	.crash_shutdown = xen_crash_shutdown,
				+	.emergency_restart = xen_emergency_restart,
				+};
			
 
				+
			
 
				+
			
 
				+/* First C function to be called on Xen boot.  Returns immediately if
				+   xen_start_info is not set.  Otherwise it checks the start-info
				+   magic, installs the Xen paravirt/machine/smp operation tables,
				+   adopts the pagetable and mfn list supplied by Xen, sets up the
				+   boot cpu's vcpu pointer (or full vcpu_info placement on UP),
				+   fills in basic cpu data and boot parameters, and finally calls
				+   start_kernel(). */
				+asmlinkage void __init xen_start_kernel(void)
				+{
				+	pgd_t *pgd;
				+
				+	if (!xen_start_info)
				+		return;
				+
				+	BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
				+
				+	/* Install Xen paravirt ops */
				+	paravirt_ops = xen_paravirt_ops;
				+	machine_ops = xen_machine_ops;
				+
				+#ifdef CONFIG_SMP
				+	smp_ops = xen_smp_ops;
				+#endif
				+
				+	xen_setup_features();
				+
				+	/* Get mfn list */
				+	if (!xen_feature(XENFEAT_auto_translated_physmap))
				+		phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
				+
				+	pgd = (pgd_t *)xen_start_info->pt_base;
				+
				+	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
				+
				+	init_mm.pgd = pgd; /* use the Xen pagetables to start */
				+
				+	/* keep using Xen gdt for now; no urgent need to change it */
				+
				+	x86_write_percpu(xen_cr3, __pa(pgd));
				+
				+#ifdef CONFIG_SMP
				+	/* Don't do the full vcpu_info placement stuff until we have a
				+	   possible map. */
				+	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
				+#else
				+	/* May as well do it now, since there's no good time to call
				+	   it later on UP. */
				+	xen_setup_vcpu_info_placement();
				+#endif
				+
				+	paravirt_ops.kernel_rpl = 1;
				+	if (xen_feature(XENFEAT_supervisor_mode_kernel))
				+		paravirt_ops.kernel_rpl = 0;
				+
				+	/* set the limit of our address space */
				+	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
				+
				+	/* set up basic CPUID stuff */
				+	cpu_detect(&new_cpu_data);
				+	new_cpu_data.hard_math = 1;
				+	new_cpu_data.x86_capability[0] = cpuid_edx(1);
				+
				+	/* Poke various useful things into boot_params */
				+	LOADER_TYPE = (9 << 4) | 0;
				+	INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
				+	INITRD_SIZE = xen_start_info->mod_len;
				+
				+	/* Start the world */
				+	start_kernel();
				+}
			
--- a/arch/i386/xen/events.c
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,590 @@
 
				+/*
			
 
				+ * Xen event channels
			
 
				+ *
			
 
				+ * Xen models interrupts with abstract event channels.  Because each
			
 
				+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
			
 
				+ * must dynamically map irqs<->event channels.  The event channels
			
 
				+ * interface with the rest of the kernel by defining a xen interrupt
			
 
				+ * chip.  When an event is received, it is mapped to an irq and sent
			
 
				+ * through the normal interrupt processing path.
			
 
				+ *
			
 
				+ * There are four kinds of events which can be mapped to an event
			
 
				+ * channel:
			
 
				+ *
			
 
				+ * 1. Inter-domain notifications.  This includes all the virtual
			
 
				+ *    device events, since they're driven by front-ends in another domain
			
 
				+ *    (typically dom0).
			
 
				+ * 2. VIRQs, typically used for timers.  These are per-cpu events.
			
 
				+ * 3. IPIs.
			
 
				+ * 4. Hardware interrupts. Not supported at present.
			
 
				+ *
			
 
				+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
			
 
				+ */
			
 
				+
			
 
				+#include <linux/linkage.h>
			
 
				+#include <linux/interrupt.h>
			
 
				+#include <linux/irq.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/string.h>
			
 
				+
			
 
				+#include <asm/ptrace.h>
			
 
				+#include <asm/irq.h>
			
 
				+#include <asm/sync_bitops.h>
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/event_channel.h>
			
 
				+
			
 
				+#include "xen-ops.h"
			
 
				+
			
 
				+/*
			
 
				+ * This lock protects updates to the following mapping and reference-count
			
 
				+ * arrays. The lock does not need to be acquired to read the mapping tables.
			
 
				+ */
			
 
				+static DEFINE_SPINLOCK(irq_mapping_update_lock);
			
 
				+
			
 
				+/* IRQ <-> VIRQ mapping. */
			
 
				+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
			
 
				+
			
 
				+/* IRQ <-> IPI mapping */
			
 
				+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
			
 
				+
			
 
				+/* Packed IRQ information: binding type, sub-type index, and event channel. */
			
 
				+struct packed_irq
			
 
				+{
			
 
				+	unsigned short evtchn;
			
 
				+	unsigned char index;
			
 
				+	unsigned char type;
			
 
				+};
			
 
				+
			
 
				+static struct packed_irq irq_info[NR_IRQS];
			
 
				+
			
 
				+/* Binding types. */
			
 
				+enum {
			
 
				+	IRQT_UNBOUND,
			
 
				+	IRQT_PIRQ,
			
 
				+	IRQT_VIRQ,
			
 
				+	IRQT_IPI,
			
 
				+	IRQT_EVTCHN
			
 
				+};
			
 
				+
			
 
				+/* Convenient shorthand for packed representation of an unbound IRQ. */
			
 
				+#define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
			
 
				+
			
 
				+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
			
 
				+	[0 ... NR_EVENT_CHANNELS-1] = -1
			
 
				+};
			
 
				+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
			
 
				+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
			
 
				+
			
 
				+/* Reference counts for bindings to IRQs. */
			
 
				+static int irq_bindcount[NR_IRQS];
			
 
				+
			
 
				+/* Xen will never allocate port zero for any purpose. */
			
 
				+#define VALID_EVTCHN(chn)	((chn) != 0)
			
 
				+
			
 
				+/*
			
 
				+ * Force a proper event-channel callback from Xen after clearing the
			
 
				+ * callback mask. We do this in a very simple manner, by making a call
			
 
				+ * down into Xen. The pending flag will be checked by Xen on return.
			
 
				+ */
			
 
				+void force_evtchn_callback(void)
			
 
				+{
			
 
				+	(void)HYPERVISOR_xen_version(0, NULL);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(force_evtchn_callback);
			
 
				+
			
 
				+static struct irq_chip xen_dynamic_chip;
			
 
				+
			
 
				+/* Constructor for packed IRQ information. */
			
 
				+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
			
 
				+{
			
 
				+	return (struct packed_irq) { evtchn, index, type };
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Accessors for packed IRQ information.
			
 
				+ */
			
 
				+static inline unsigned int evtchn_from_irq(int irq)
			
 
				+{
			
 
				+	return irq_info[irq].evtchn;
			
 
				+}
			
 
				+
			
 
				+static inline unsigned int index_from_irq(int irq)
			
 
				+{
			
 
				+	return irq_info[irq].index;
			
 
				+}
			
 
				+
			
 
				+static inline unsigned int type_from_irq(int irq)
			
 
				+{
			
 
				+	return irq_info[irq].type;
			
 
				+}
			
 
				+
			
 
				+static inline unsigned long active_evtchns(unsigned int cpu,
			
 
				+					   struct shared_info *sh,
			
 
				+					   unsigned int idx)
			
 
				+{
			
 
				+	return (sh->evtchn_pending[idx] &
			
 
				+		cpu_evtchn_mask[cpu][idx] &
			
 
				+		~sh->evtchn_mask[idx]);
			
 
				+}
			
 
				+
			
 
				+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
			
 
				+{
			
 
				+	int irq = evtchn_to_irq[chn];
			
 
				+
			
 
				+	BUG_ON(irq == -1);
			
 
				+#ifdef CONFIG_SMP
			
 
				+	irq_desc[irq].affinity = cpumask_of_cpu(cpu);
			
 
				+#endif
			
 
				+
			
 
				+	__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
			
 
				+	__set_bit(chn, cpu_evtchn_mask[cpu]);
			
 
				+
			
 
				+	cpu_evtchn[chn] = cpu;
			
 
				+}
			
 
				+
			
 
				+static void init_evtchn_cpu_bindings(void)
			
 
				+{
			
 
				+#ifdef CONFIG_SMP
			
 
				+	int i;
			
 
				+	/* By default all event channels notify CPU#0. */
			
 
				+	for (i = 0; i < NR_IRQS; i++)
			
 
				+		irq_desc[i].affinity = cpumask_of_cpu(0);
			
 
				+#endif
			
 
				+
			
 
				+	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
			
 
				+	memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
			
 
				+}
			
 
				+
			
 
				+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
			
 
				+{
			
 
				+	return cpu_evtchn[evtchn];
			
 
				+}
			
 
				+
			
 
				+static inline void clear_evtchn(int port)
			
 
				+{
			
 
				+	struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	sync_clear_bit(port, &s->evtchn_pending[0]);
			
 
				+}
			
 
				+
			
 
				+static inline void set_evtchn(int port)
			
 
				+{
			
 
				+	struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	sync_set_bit(port, &s->evtchn_pending[0]);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/**
			
 
				+ * notify_remote_via_irq - send event to remote end of event channel via irq
			
 
				+ * @irq: irq of event channel to send event to
			
 
				+ *
			
 
				+ * Unlike notify_remote_via_evtchn(), this is safe to use across
			
 
				+ * save/restore. Notifications on a broken connection are silently
			
 
				+ * dropped.
			
 
				+ */
			
 
				+void notify_remote_via_irq(int irq)
			
 
				+{
			
 
				+	int evtchn = evtchn_from_irq(irq);
			
 
				+
			
 
				+	if (VALID_EVTCHN(evtchn))
			
 
				+		notify_remote_via_evtchn(evtchn);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
			
 
				+
			
 
				+static void mask_evtchn(int port)
			
 
				+{
			
 
				+	struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	sync_set_bit(port, &s->evtchn_mask[0]);
			
 
				+}
			
 
				+
			
 
				+static void unmask_evtchn(int port)
			
 
				+{
			
 
				+	struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	unsigned int cpu = get_cpu();
			
 
				+
			
 
				+	BUG_ON(!irqs_disabled());
			
 
				+
			
 
				+	/* Slow path (hypercall) if this is a non-local port. */
			
 
				+	if (unlikely(cpu != cpu_from_evtchn(port))) {
			
 
				+		struct evtchn_unmask unmask = { .port = port };
			
 
				+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
			
 
				+	} else {
			
 
				+		struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
			
 
				+
			
 
				+		sync_clear_bit(port, &s->evtchn_mask[0]);
			
 
				+
			
 
				+		/*
			
 
				+		 * The following is basically the equivalent of
			
 
				+		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
			
 
				+		 * the interrupt edge' if the channel is masked.
			
 
				+		 */
			
 
				+		if (sync_test_bit(port, &s->evtchn_pending[0]) &&
			
 
				+		    !sync_test_and_set_bit(port / BITS_PER_LONG,
			
 
				+					   &vcpu_info->evtchn_pending_sel))
			
 
				+			vcpu_info->evtchn_upcall_pending = 1;
			
 
				+	}
			
 
				+
			
 
				+	put_cpu();
			
 
				+}
			
 
				+
			
 
				+static int find_unbound_irq(void)	/* returns the lowest-numbered irq whose bind count is zero */
			
 
				+{
			
 
				+	int irq;
			
 
				+
			
 
				+	/* Linear scan over the whole irq space; NOTE(review): no separate "dynirq" sub-range is applied here */
			
 
				+	for (irq = 0; irq < NR_IRQS; irq++)
			
 
				+		if (irq_bindcount[irq] == 0)	/* zero bind count means the irq is unbound */
			
 
				+			break;
			
 
				+
			
 
				+	if (irq == NR_IRQS)	/* loop ran to completion: every irq is in use */
			
 
				+		panic("No available IRQ to bind to: increase NR_IRQS!\n");
			
 
				+
			
 
				+	return irq;
			
 
				+}
			
 
				+
			
 
				+int bind_evtchn_to_irq(unsigned int evtchn)
			
 
				+{
			
 
				+	int irq;
			
 
				+
			
 
				+	spin_lock(&irq_mapping_update_lock);
			
 
				+
			
 
				+	irq = evtchn_to_irq[evtchn];
			
 
				+
			
 
				+	if (irq == -1) {
			
 
				+		irq = find_unbound_irq();
			
 
				+
			
 
				+		dynamic_irq_init(irq);
			
 
				+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
			
 
				+					      handle_level_irq, "event");
			
 
				+
			
 
				+		evtchn_to_irq[evtchn] = irq;
			
 
				+		irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
			
 
				+	}
			
 
				+
			
 
				+	irq_bindcount[irq]++;
			
 
				+
			
 
				+	spin_unlock(&irq_mapping_update_lock);
			
 
				+
			
 
				+	return irq;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
			
 
				+
			
 
				+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
			
 
				+{
			
 
				+	struct evtchn_bind_ipi bind_ipi;
			
 
				+	int evtchn, irq;
			
 
				+
			
 
				+	spin_lock(&irq_mapping_update_lock);
			
 
				+
			
 
				+	irq = per_cpu(ipi_to_irq, cpu)[ipi];
			
 
				+	if (irq == -1) {
			
 
				+		irq = find_unbound_irq();
			
 
				+		if (irq < 0)
			
 
				+			goto out;
			
 
				+
			
 
				+		dynamic_irq_init(irq);
			
 
				+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
			
 
				+					      handle_level_irq, "ipi");
			
 
				+
			
 
				+		bind_ipi.vcpu = cpu;
			
 
				+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
			
 
				+						&bind_ipi) != 0)
			
 
				+			BUG();
			
 
				+		evtchn = bind_ipi.port;
			
 
				+
			
 
				+		evtchn_to_irq[evtchn] = irq;
			
 
				+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
			
 
				+
			
 
				+		per_cpu(ipi_to_irq, cpu)[ipi] = irq;
			
 
				+
			
 
				+		bind_evtchn_to_cpu(evtchn, cpu);
			
 
				+	}
			
 
				+
			
 
				+	irq_bindcount[irq]++;
			
 
				+
			
 
				+ out:
			
 
				+	spin_unlock(&irq_mapping_update_lock);
			
 
				+	return irq;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
			
 
				+{
			
 
				+	struct evtchn_bind_virq bind_virq;
			
 
				+	int evtchn, irq;
			
 
				+
			
 
				+	spin_lock(&irq_mapping_update_lock);
			
 
				+
			
 
				+	irq = per_cpu(virq_to_irq, cpu)[virq];
			
 
				+
			
 
				+	if (irq == -1) {
			
 
				+		bind_virq.virq = virq;
			
 
				+		bind_virq.vcpu = cpu;
			
 
				+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
			
 
				+						&bind_virq) != 0)
			
 
				+			BUG();
			
 
				+		evtchn = bind_virq.port;
			
 
				+
			
 
				+		irq = find_unbound_irq();
			
 
				+
			
 
				+		dynamic_irq_init(irq);
			
 
				+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
			
 
				+					      handle_level_irq, "virq");
			
 
				+
			
 
				+		evtchn_to_irq[evtchn] = irq;
			
 
				+		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
			
 
				+
			
 
				+		per_cpu(virq_to_irq, cpu)[virq] = irq;
			
 
				+
			
 
				+		bind_evtchn_to_cpu(evtchn, cpu);
			
 
				+	}
			
 
				+
			
 
				+	irq_bindcount[irq]++;
			
 
				+
			
 
				+	spin_unlock(&irq_mapping_update_lock);
			
 
				+
			
 
				+	return irq;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Drop one reference on the binding behind @irq.
				+ *
				+ * When the last reference goes away the event channel is closed in the
				+ * hypervisor, the per-cpu VIRQ/IPI lookup entry (if any) is reset to -1,
				+ * the port is re-associated with cpu 0 in the local tables, and the irq
				+ * is returned to the unbound state so find_unbound_irq() can hand it
				+ * out again.  Does nothing if @irq has no valid event channel.
				+ */
				+static void unbind_from_irq(unsigned int irq)
				+{
				+	struct evtchn_close close;
				+	int evtchn;
				+
				+	spin_lock(&irq_mapping_update_lock);
				+
				+	/* Look the port up under the lock that guards mapping updates. */
				+	evtchn = evtchn_from_irq(irq);
				+
				+	if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
				+		close.port = evtchn;
				+		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
				+			BUG();
				+
				+		switch (type_from_irq(irq)) {
				+		case IRQT_VIRQ:
				+			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
				+				[index_from_irq(irq)] = -1;
				+			break;
				+		case IRQT_IPI:
				+			/*
				+			 * Forget the per-cpu IPI mapping as well; otherwise a
				+			 * later bind_ipi_to_irq() for the same (ipi, cpu) would
				+			 * find this stale irq and skip the hypervisor bind.
				+			 */
				+			per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
				+				[index_from_irq(irq)] = -1;
				+			break;
				+		default:
				+			break;
				+		}
				+
				+		/* Closed ports are implicitly re-bound to VCPU0. */
				+		bind_evtchn_to_cpu(evtchn, 0);
				+
				+		evtchn_to_irq[evtchn] = -1;
				+		irq_info[irq] = IRQ_UNBOUND;
				+
				+		dynamic_irq_init(irq);
				+	}
				+
				+	spin_unlock(&irq_mapping_update_lock);
				+}
			
 
				+
			
 
				+int bind_evtchn_to_irqhandler(unsigned int evtchn,
			
 
				+			      irqreturn_t (*handler)(int, void *),
			
 
				+			      unsigned long irqflags,
			
 
				+			      const char *devname, void *dev_id)
			
 
				+{
			
 
				+	unsigned int irq;
			
 
				+	int retval;
			
 
				+
			
 
				+	irq = bind_evtchn_to_irq(evtchn);
			
 
				+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
			
 
				+	if (retval != 0) {
			
 
				+		unbind_from_irq(irq);
			
 
				+		return retval;
			
 
				+	}
			
 
				+
			
 
				+	return irq;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
			
 
				+
			
 
				+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
			
 
				+			    irqreturn_t (*handler)(int, void *),
			
 
				+			    unsigned long irqflags, const char *devname, void *dev_id)
			
 
				+{
			
 
				+	unsigned int irq;
			
 
				+	int retval;
			
 
				+
			
 
				+	irq = bind_virq_to_irq(virq, cpu);
			
 
				+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
			
 
				+	if (retval != 0) {
			
 
				+		unbind_from_irq(irq);
			
 
				+		return retval;
			
 
				+	}
			
 
				+
			
 
				+	return irq;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
			
 
				+
			
 
				+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
			
 
				+			   unsigned int cpu,
			
 
				+			   irq_handler_t handler,
			
 
				+			   unsigned long irqflags,
			
 
				+			   const char *devname,
			
 
				+			   void *dev_id)
			
 
				+{
			
 
				+	int irq, retval;
			
 
				+
			
 
				+	irq = bind_ipi_to_irq(ipi, cpu);
			
 
				+	if (irq < 0)
			
 
				+		return irq;
			
 
				+
			
 
				+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
			
 
				+	if (retval != 0) {
			
 
				+		unbind_from_irq(irq);
			
 
				+		return retval;
			
 
				+	}
			
 
				+
			
 
				+	return irq;
			
 
				+}
			
 
				+
			
 
				+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
			
 
				+{
			
 
				+	free_irq(irq, dev_id);
			
 
				+	unbind_from_irq(irq);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
			
 
				+
			
 
				+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
			
 
				+{
			
 
				+	int irq = per_cpu(ipi_to_irq, cpu)[vector];
			
 
				+	BUG_ON(irq < 0);
			
 
				+	notify_remote_via_irq(irq);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Search the CPU's pending event bitmasks.  For each one found, map
			
 
				+ * the event number to an irq, and feed it into do_IRQ() for
			
 
				+ * handling.
			
 
				+ *
			
 
				+ * Xen uses a two-level bitmap to speed searching.  The first level is
			
 
				+ * a bitset of words which contain pending event bits.  The second
			
 
				+ * level is a bitset of pending events themselves.
			
 
				+ */
			
 
				+fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
			
 
				+{
			
 
				+	int cpu = get_cpu();
			
 
				+	struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
			
 
				+	unsigned long pending_words;
			
 
				+
			
 
				+	vcpu_info->evtchn_upcall_pending = 0;
			
 
				+
			
 
				+	/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
			
 
				+	pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
			
 
				+	while (pending_words != 0) {
			
 
				+		unsigned long pending_bits;
			
 
				+		int word_idx = __ffs(pending_words);
			
 
				+		pending_words &= ~(1UL << word_idx);
			
 
				+
			
 
				+		while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
			
 
				+			int bit_idx = __ffs(pending_bits);
			
 
				+			int port = (word_idx * BITS_PER_LONG) + bit_idx;
			
 
				+			int irq = evtchn_to_irq[port];
			
 
				+
			
 
				+			if (irq != -1) {
			
 
				+				regs->orig_eax = ~irq;
			
 
				+				do_IRQ(regs);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	put_cpu();
			
 
				+}
			
 
				+
			
 
				+/* Rebind an evtchn so that it gets delivered to a specific cpu */
			
 
				+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
			
 
				+{
			
 
				+	struct evtchn_bind_vcpu bind_vcpu;
			
 
				+	int evtchn = evtchn_from_irq(irq);
			
 
				+
			
 
				+	if (!VALID_EVTCHN(evtchn))
			
 
				+		return;
			
 
				+
			
 
				+	/* Send future instances of this interrupt to other vcpu. */
			
 
				+	bind_vcpu.port = evtchn;
			
 
				+	bind_vcpu.vcpu = tcpu;
			
 
				+
			
 
				+	/*
			
 
				+	 * If this fails, it usually just indicates that we're dealing with a
			
 
				+	 * virq or IPI channel, which don't actually need to be rebound. Ignore
			
 
				+	 * it, but don't do the xenlinux-level rebind in that case.
			
 
				+	 */
			
 
				+	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
			
 
				+		bind_evtchn_to_cpu(evtchn, tcpu);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void set_affinity_irq(unsigned irq, cpumask_t dest)
			
 
				+{
			
 
				+	unsigned tcpu = first_cpu(dest);
			
 
				+	rebind_irq_to_cpu(irq, tcpu);
			
 
				+}
			
 
				+
			
 
				+static void enable_dynirq(unsigned int irq)
			
 
				+{
			
 
				+	int evtchn = evtchn_from_irq(irq);
			
 
				+
			
 
				+	if (VALID_EVTCHN(evtchn))
			
 
				+		unmask_evtchn(evtchn);
			
 
				+}
			
 
				+
			
 
				+static void disable_dynirq(unsigned int irq)
			
 
				+{
			
 
				+	int evtchn = evtchn_from_irq(irq);
			
 
				+
			
 
				+	if (VALID_EVTCHN(evtchn))
			
 
				+		mask_evtchn(evtchn);
			
 
				+}
			
 
				+
			
 
				+static void ack_dynirq(unsigned int irq)
			
 
				+{
			
 
				+	int evtchn = evtchn_from_irq(irq);
			
 
				+
			
 
				+	move_native_irq(irq);
			
 
				+
			
 
				+	if (VALID_EVTCHN(evtchn))
			
 
				+		clear_evtchn(evtchn);
			
 
				+}
			
 
				+
			
 
				+static int retrigger_dynirq(unsigned int irq)
			
 
				+{
			
 
				+	int evtchn = evtchn_from_irq(irq);
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	if (VALID_EVTCHN(evtchn)) {
			
 
				+		set_evtchn(evtchn);
			
 
				+		ret = 1;
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static struct irq_chip xen_dynamic_chip __read_mostly = {
			
 
				+	.name		= "xen-dyn",
			
 
				+	.mask		= disable_dynirq,
			
 
				+	.unmask		= enable_dynirq,
			
 
				+	.ack		= ack_dynirq,
			
 
				+	.set_affinity	= set_affinity_irq,
			
 
				+	.retrigger	= retrigger_dynirq,
			
 
				+};
			
 
				+
			
 
				+void __init xen_init_IRQ(void)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	init_evtchn_cpu_bindings();
			
 
				+
			
 
				+	/* No event channels are 'live' right now. */
			
 
				+	for (i = 0; i < NR_EVENT_CHANNELS; i++)
			
 
				+		mask_evtchn(i);
			
 
				+
			
 
				+	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
			
 
				+	for (i = 0; i < NR_IRQS; i++)
			
 
				+		irq_bindcount[i] = 0;
			
 
				+
			
 
				+	irq_ctx_init(smp_processor_id());
			
 
				+}
			
--- a/arch/i386/xen/features.c
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
 
				+/******************************************************************************
			
 
				+ * features.c
			
 
				+ *
			
 
				+ * Xen feature flags.
			
 
				+ *
			
 
				+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
			
 
				+ */
			
 
				+#include <linux/types.h>
			
 
				+#include <linux/cache.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <xen/features.h>
			
 
				+
			
 
				+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
			
 
				+EXPORT_SYMBOL_GPL(xen_features);
			
 
				+
			
 
				+/*
				+ * Query the hypervisor for its feature submaps and expand them into
				+ * xen_features[], one byte (0 or 1) per feature bit, so that feature
				+ * tests become a plain array lookup.
				+ *
				+ * Stops at the first submap the hypervisor refuses to report; entries
				+ * for that and any later submap are left untouched.
				+ */
				+void xen_setup_features(void)
				+{
				+	struct xen_feature_info fi;
				+	int i, j;
				+
				+	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
				+		fi.submap_idx = i;
				+		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
				+			break;
				+		/*
				+		 * Use an unsigned constant: shifting a signed 1 by 31
				+		 * would move it into the sign bit, which is undefined.
				+		 */
				+		for (j = 0; j < 32; j++)
				+			xen_features[i * 32 + j] = !!(fi.submap & (1U << j));
				+	}
				+}
			
--- a/arch/i386/xen/manage.c
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
 
				+/*
			
 
				+ * Handle external requests for shutdown, reboot and sysrq
			
 
				+ */
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/err.h>
			
 
				+#include <linux/reboot.h>
			
 
				+#include <linux/sysrq.h>
			
 
				+
			
 
				+#include <xen/xenbus.h>
			
 
				+
			
 
				+#define SHUTDOWN_INVALID  -1
			
 
				+#define SHUTDOWN_POWEROFF  0
			
 
				+#define SHUTDOWN_SUSPEND   2
			
 
				+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
			
 
				+ * report a crash, not be instructed to crash!
			
 
				+ * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
			
 
				+ * the distinction when we return the reason code to them.
			
 
				+ */
			
 
				+#define SHUTDOWN_HALT      4
			
 
				+
			
 
				+/* Ignore multiple shutdown requests. */
			
 
				+static int shutting_down = SHUTDOWN_INVALID;
			
 
				+
			
 
				+static void shutdown_handler(struct xenbus_watch *watch,
			
 
				+			     const char **vec, unsigned int len)
			
 
				+{
			
 
				+	char *str;
			
 
				+	struct xenbus_transaction xbt;
			
 
				+	int err;
			
 
				+
			
 
				+	if (shutting_down != SHUTDOWN_INVALID)
			
 
				+		return;
			
 
				+
			
 
				+ again:
			
 
				+	err = xenbus_transaction_start(&xbt);
			
 
				+	if (err)
			
 
				+		return;
			
 
				+
			
 
				+	str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
			
 
				+	/* Ignore read errors and empty reads. */
			
 
				+	if (XENBUS_IS_ERR_READ(str)) {
			
 
				+		xenbus_transaction_end(xbt, 1);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	xenbus_write(xbt, "control", "shutdown", "");
			
 
				+
			
 
				+	err = xenbus_transaction_end(xbt, 0);
			
 
				+	if (err == -EAGAIN) {
			
 
				+		kfree(str);
			
 
				+		goto again;
			
 
				+	}
			
 
				+
			
 
				+	if (strcmp(str, "poweroff") == 0 ||
			
 
				+	    strcmp(str, "halt") == 0)
			
 
				+		orderly_poweroff(false);
			
 
				+	else if (strcmp(str, "reboot") == 0)
			
 
				+		ctrl_alt_del();
			
 
				+	else {
			
 
				+		printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
			
 
				+		shutting_down = SHUTDOWN_INVALID;
			
 
				+	}
			
 
				+
			
 
				+	kfree(str);
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Xenbus watch callback for "control/sysrq".
				+ *
				+ * Reads the single-character sysrq code written by the toolstack,
				+ * acknowledges it by overwriting the node with '\0' inside the same
				+ * transaction, and then injects the key via handle_sysrq().  The
				+ * transaction is retried for as long as it ends with -EAGAIN.
				+ *
				+ * @watch, @vec and @len are part of the watch callback signature and
				+ * are not used here.
				+ */
				+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
				+			  unsigned int len)
				+{
				+	char sysrq_key = '\0';
				+	struct xenbus_transaction xbt;
				+	int err;
				+
				+ again:
				+	err = xenbus_transaction_start(&xbt);
				+	if (err)
				+		return;
				+
				+	/*
				+	 * xenbus_scanf() reports failure as a negative errno rather than
				+	 * 0, so a plain '!' test would never notice a failed read.
				+	 */
				+	err = xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key);
				+	if (err <= 0) {
				+		/*
				+		 * A missing node (-ENOENT) or an empty value (-ERANGE) is
				+		 * expected, e.g. when the watch fires for our own '\0'
				+		 * acknowledgement below; only complain about other errors.
				+		 */
				+		if (err != -ENOENT && err != -ERANGE)
				+			printk(KERN_ERR "Unable to read sysrq code in "
				+			       "control/sysrq\n");
				+		xenbus_transaction_end(xbt, 1);
				+		return;
				+	}
				+
				+	if (sysrq_key != '\0')
				+		xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
				+
				+	err = xenbus_transaction_end(xbt, 0);
				+	if (err == -EAGAIN)
				+		goto again;
				+
				+	if (sysrq_key != '\0')
				+		handle_sysrq(sysrq_key, NULL);
				+}
			
 
				+
			
 
				+static struct xenbus_watch shutdown_watch = {
			
 
				+	.node = "control/shutdown",
			
 
				+	.callback = shutdown_handler
			
 
				+};
			
 
				+
			
 
				+static struct xenbus_watch sysrq_watch = {
			
 
				+	.node = "control/sysrq",
			
 
				+	.callback = sysrq_handler
			
 
				+};
			
 
				+
			
 
				+static int setup_shutdown_watcher(void)
			
 
				+{
			
 
				+	int err;
			
 
				+
			
 
				+	err = register_xenbus_watch(&shutdown_watch);
			
 
				+	if (err) {
			
 
				+		printk(KERN_ERR "Failed to set shutdown watcher\n");
			
 
				+		return err;
			
 
				+	}
			
 
				+
			
 
				+	err = register_xenbus_watch(&sysrq_watch);
			
 
				+	if (err) {
			
 
				+		printk(KERN_ERR "Failed to set sysrq watcher\n");
			
 
				+		return err;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int shutdown_event(struct notifier_block *notifier,
			
 
				+			  unsigned long event,
			
 
				+			  void *data)
			
 
				+{
			
 
				+	setup_shutdown_watcher();
			
 
				+	return NOTIFY_DONE;
			
 
				+}
			
 
				+
			
 
				+static int __init setup_shutdown_event(void)
			
 
				+{
			
 
				+	static struct notifier_block xenstore_notifier = {
			
 
				+		.notifier_call = shutdown_event
			
 
				+	};
			
 
				+	register_xenstore_notifier(&xenstore_notifier);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+subsys_initcall(setup_shutdown_event);
			
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,564 @@
 
				+/*
			
 
				+ * Xen mmu operations
			
 
				+ *
			
 
				+ * This file contains the various mmu fetch and update operations.
			
 
				+ * The most important job they must perform is the mapping between the
			
 
				+ * domain's pfn and the overall machine mfns.
			
 
				+ *
			
 
				+ * Xen allows guests to directly update the pagetable, in a controlled
			
 
				+ * fashion.  In other words, the guest modifies the same pagetable
			
 
				+ * that the CPU actually uses, which eliminates the overhead of having
			
 
				+ * a separate shadow pagetable.
			
 
				+ *
			
 
				+ * In order to allow this, it falls on the guest domain to map its
			
 
				+ * notion of a "physical" pfn - which is just a domain-local linear
			
 
				+ * address - into a real "machine address" which the CPU's MMU can
			
 
				+ * use.
			
 
				+ *
			
 
				+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
			
 
				+ * inserted directly into the pagetable.  When creating a new
			
 
				+ * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
			
 
				+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
			
 
				+ * the mfn back into a pfn.
			
 
				+ *
			
 
				+ * The other constraint is that all pages which make up a pagetable
			
 
				+ * must be mapped read-only in the guest.  This prevents uncontrolled
			
 
				+ * guest updates to the pagetable.  Xen strictly enforces this, and
			
 
				+ * will disallow any pagetable update which will end up mapping a
			
 
				+ * pagetable page RW, and will disallow using any writable page as a
			
 
				+ * pagetable.
			
 
				+ *
			
 
				+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
			
 
				+ * would need to validate the whole pagetable before going on.
			
 
				+ * Naturally, this is quite slow.  The solution is to "pin" a
			
 
				+ * pagetable, which enforces all the constraints on the pagetable even
			
 
				+ * when it is not actively in use.  This means that Xen can be assured
			
 
				+ * that it is still valid when you do load it into %cr3, and doesn't
			
 
				+ * need to revalidate it.
			
 
				+ *
			
 
				+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
			
 
				+ */
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/highmem.h>
			
 
				+#include <linux/bug.h>
			
 
				+#include <linux/sched.h>
			
 
				+
			
 
				+#include <asm/pgtable.h>
			
 
				+#include <asm/tlbflush.h>
			
 
				+#include <asm/mmu_context.h>
			
 
				+#include <asm/paravirt.h>
			
 
				+
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+
			
 
				+#include <xen/page.h>
			
 
				+#include <xen/interface/xen.h>
			
 
				+
			
 
				+#include "multicalls.h"
			
 
				+#include "mmu.h"
			
 
				+
			
 
				+/*
				+ * Translate an arbitrary kernel virtual address into a machine
				+ * address: look up the pte mapping the address and combine that pte's
				+ * mfn with the offset of the address inside its page.  BUGs if the
				+ * address has no pte.
				+ */
				+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
				+{
				+	pte_t *pte = lookup_address(address);
				+	/* PAGE_MASK selects the frame bits; invert it to keep the offset */
				+	unsigned offset = address & ~PAGE_MASK;
				+
				+	BUG_ON(pte == NULL);
				+
				+	/* widen before shifting so a high mfn is not truncated to 32 bits */
				+	return XMADDR(((unsigned long long)pte_mfn(*pte) << PAGE_SHIFT) + offset);
				+}
			
 
				+
			
 
				+/*
				+ * Write-protect the kernel mapping of the page at vaddr: take its
				+ * current pte, clear write permission, and have the hypervisor install
				+ * the result.  BUGs if there is no pte or the hypercall fails.
				+ */
				+void make_lowmem_page_readonly(void *vaddr)
				+{
				+	unsigned long address = (unsigned long)vaddr;
				+	pte_t *pte = lookup_address(address);
				+	pte_t ro_pte;
				+
				+	BUG_ON(!pte);
				+
				+	ro_pte = pte_wrprotect(*pte);
				+	if (HYPERVISOR_update_va_mapping(address, ro_pte, 0) != 0)
				+		BUG();
				+}
			
 
				+
			
 
				+/*
				+ * Make the kernel mapping of the page at vaddr writable again: take
				+ * its current pte, set write permission, and have the hypervisor
				+ * install the result.  BUGs if there is no pte or the hypercall fails.
				+ */
				+void make_lowmem_page_readwrite(void *vaddr)
				+{
				+	unsigned long address = (unsigned long)vaddr;
				+	pte_t *pte = lookup_address(address);
				+	pte_t rw_pte;
				+
				+	BUG_ON(!pte);
				+
				+	rw_pte = pte_mkwrite(*pte);
				+	if (HYPERVISOR_update_va_mapping(address, rw_pte, 0) != 0)
				+		BUG();
				+}
			
 
				+
			
 
				+
			
 
				+void xen_set_pmd(pmd_t *ptr, pmd_t val)
			
 
				+{
			
 
				+	struct multicall_space mcs;
			
 
				+	struct mmu_update *u;
			
 
				+
			
 
				+	preempt_disable();
			
 
				+
			
 
				+	mcs = xen_mc_entry(sizeof(*u));
			
 
				+	u = mcs.args;
			
 
				+	u->ptr = virt_to_machine(ptr).maddr;
			
 
				+	u->val = pmd_val_ma(val);
			
 
				+	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
			
 
				+
			
 
				+	xen_mc_issue(PARAVIRT_LAZY_MMU);
			
 
				+
			
 
				+	preempt_enable();
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Associate a virtual page frame with a given physical page frame
			
 
				+ * and protection flags for that frame.
			
 
				+ */
			
 
				+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
			
 
				+{
			
 
				+	pgd_t *pgd;
			
 
				+	pud_t *pud;
			
 
				+	pmd_t *pmd;
			
 
				+	pte_t *pte;
			
 
				+
			
 
				+	pgd = swapper_pg_dir + pgd_index(vaddr);
			
 
				+	if (pgd_none(*pgd)) {
			
 
				+		BUG();
			
 
				+		return;
			
 
				+	}
			
 
				+	pud = pud_offset(pgd, vaddr);
			
 
				+	if (pud_none(*pud)) {
			
 
				+		BUG();
			
 
				+		return;
			
 
				+	}
			
 
				+	pmd = pmd_offset(pud, vaddr);
			
 
				+	if (pmd_none(*pmd)) {
			
 
				+		BUG();
			
 
				+		return;
			
 
				+	}
			
 
				+	pte = pte_offset_kernel(pmd, vaddr);
			
 
				+	/* <mfn,flags> stored as-is, to permit clearing entries */
			
 
				+	xen_set_pte(pte, mfn_pte(mfn, flags));
			
 
				+
			
 
				+	/*
			
 
				+	 * It's enough to flush this one mapping.
			
 
				+	 * (PGE mappings get flushed as well)
			
 
				+	 */
			
 
				+	__flush_tlb_one(vaddr);
			
 
				+}
			
 
				+
			
 
				+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			
 
				+		    pte_t *ptep, pte_t pteval)
			
 
				+{
			
 
				+	if (mm == current->mm || mm == &init_mm) {
			
 
				+		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			
 
				+			struct multicall_space mcs;
			
 
				+			mcs = xen_mc_entry(0);
			
 
				+
			
 
				+			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			
 
				+			xen_mc_issue(PARAVIRT_LAZY_MMU);
			
 
				+			return;
			
 
				+		} else
			
 
				+			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
			
 
				+				return;
			
 
				+	}
			
 
				+	xen_set_pte(ptep, pteval);
			
 
				+}
			
 
				+
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+void xen_set_pud(pud_t *ptr, pud_t val)
			
 
				+{
			
 
				+	struct multicall_space mcs;
			
 
				+	struct mmu_update *u;
			
 
				+
			
 
				+	preempt_disable();
			
 
				+
			
 
				+	mcs = xen_mc_entry(sizeof(*u));
			
 
				+	u = mcs.args;
			
 
				+	u->ptr = virt_to_machine(ptr).maddr;
			
 
				+	u->val = pud_val_ma(val);
			
 
				+	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
			
 
				+
			
 
				+	xen_mc_issue(PARAVIRT_LAZY_MMU);
			
 
				+
			
 
				+	preempt_enable();
			
 
				+}
			
 
				+
			
 
				+void xen_set_pte(pte_t *ptep, pte_t pte)
			
 
				+{
			
 
				+	ptep->pte_high = pte.pte_high;
			
 
				+	smp_wmb();
			
 
				+	ptep->pte_low = pte.pte_low;
			
 
				+}
			
 
				+
			
 
				+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
			
 
				+{
			
 
				+	set_64bit((u64 *)ptep, pte_val_ma(pte));
			
 
				+}
			
 
				+
			
 
				+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
			
 
				+{
			
 
				+	ptep->pte_low = 0;
			
 
				+	smp_wmb();		/* make sure low gets written first */
			
 
				+	ptep->pte_high = 0;
			
 
				+}
			
 
				+
			
 
				+void xen_pmd_clear(pmd_t *pmdp)
			
 
				+{
			
 
				+	xen_set_pmd(pmdp, __pmd(0));
			
 
				+}
			
 
				+
			
 
				+unsigned long long xen_pte_val(pte_t pte)
			
 
				+{
			
 
				+	unsigned long long ret = 0;
			
 
				+
			
 
				+	if (pte.pte_low) {
			
 
				+		ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
			
 
				+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+unsigned long long xen_pmd_val(pmd_t pmd)
			
 
				+{
			
 
				+	unsigned long long ret = pmd.pmd;
			
 
				+	if (ret)
			
 
				+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+unsigned long long xen_pgd_val(pgd_t pgd)
			
 
				+{
			
 
				+	unsigned long long ret = pgd.pgd;
			
 
				+	if (ret)
			
 
				+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Build a PAE pte from a 64-bit value.  When the present bit is set
				+ * the value is run through phys_to_machine(); otherwise it is stored
				+ * unchanged.
				+ */
				+pte_t xen_make_pte(unsigned long long pte)
				+{
				+	/* use the named flag, as xen_make_pgd() does, not a bare 1 */
				+	if (pte & _PAGE_PRESENT)
				+		pte = phys_to_machine(XPADDR(pte)).maddr;
				+
				+	/* low word first, then the high 32 bits */
				+	return (pte_t){ pte, pte >> 32 };
				+}
			
 
				+
			
 
				+/*
				+ * Build a PAE pmd from a 64-bit value.  When the present bit is set
				+ * the value is run through phys_to_machine(); otherwise it is stored
				+ * unchanged.
				+ */
				+pmd_t xen_make_pmd(unsigned long long pmd)
				+{
				+	/* use the named flag, as xen_make_pgd() does, not a bare 1 */
				+	if (pmd & _PAGE_PRESENT)
				+		pmd = phys_to_machine(XPADDR(pmd)).maddr;
				+
				+	return (pmd_t){ pmd };
				+}
			
 
				+
			
 
				+pgd_t xen_make_pgd(unsigned long long pgd)
			
 
				+{
			
 
				+	if (pgd & _PAGE_PRESENT)
			
 
				+		pgd = phys_to_machine(XPADDR(pgd)).maddr;
			
 
				+
			
 
				+	return (pgd_t){ pgd };
			
 
				+}
			
 
				+#else  /* !PAE */
			
 
				+void xen_set_pte(pte_t *ptep, pte_t pte)
			
 
				+{
			
 
				+	*ptep = pte;
			
 
				+}
			
 
				+
			
 
				+unsigned long xen_pte_val(pte_t pte)
			
 
				+{
			
 
				+	unsigned long ret = pte.pte_low;
			
 
				+
			
 
				+	if (ret & _PAGE_PRESENT)
			
 
				+		ret = machine_to_phys(XMADDR(ret)).paddr;
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+unsigned long xen_pgd_val(pgd_t pgd)
			
 
				+{
			
 
				+	unsigned long ret = pgd.pgd;
			
 
				+	if (ret)
			
 
				+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+pte_t xen_make_pte(unsigned long pte)
			
 
				+{
			
 
				+	if (pte & _PAGE_PRESENT)
			
 
				+		pte = phys_to_machine(XPADDR(pte)).maddr;
			
 
				+
			
 
				+	return (pte_t){ pte };
			
 
				+}
			
 
				+
			
 
				+pgd_t xen_make_pgd(unsigned long pgd)
			
 
				+{
			
 
				+	if (pgd & _PAGE_PRESENT)
			
 
				+		pgd = phys_to_machine(XPADDR(pgd)).maddr;
			
 
				+
			
 
				+	return (pgd_t){ pgd };
			
 
				+}
			
 
				+#endif	/* CONFIG_X86_PAE */
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+  (Yet another) pagetable walker.  This one is intended for pinning a
			
 
				+  pagetable.  This means that it walks a pagetable and calls the
			
 
				+  callback function on each page it finds making up the page table,
			
 
				+  at every level.  It walks the entire pagetable, but it only bothers
			
 
				+  pinning pte pages which are below limit.  In the normal case
			
 
				+  this will be TASK_SIZE, but at boot we need to pin up to
			
 
				+  FIXADDR_TOP.  But the important bit is that we don't pin beyond
			
 
				+  there, because then we start getting into Xen's ptes.
			
 
				+*/
			
 
				+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
			
 
				+		    unsigned long limit)
			
 
				+{
			
 
				+	pgd_t *pgd = pgd_base;
			
 
				+	int flush = 0;
			
 
				+	unsigned long addr = 0;
			
 
				+	unsigned long pgd_next;
			
 
				+
			
 
				+	BUG_ON(limit > FIXADDR_TOP);
			
 
				+
			
 
				+	if (xen_feature(XENFEAT_auto_translated_physmap))
			
 
				+		return 0;
			
 
				+
			
 
				+	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
			
 
				+		pud_t *pud;
			
 
				+		unsigned long pud_limit, pud_next;
			
 
				+
			
 
				+		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
			
 
				+
			
 
				+		if (!pgd_val(*pgd))
			
 
				+			continue;
			
 
				+
			
 
				+		pud = pud_offset(pgd, 0);
			
 
				+
			
 
				+		if (PTRS_PER_PUD > 1) /* not folded */
			
 
				+			flush |= (*func)(virt_to_page(pud), 0);
			
 
				+
			
 
				+		for (; addr != pud_limit; pud++, addr = pud_next) {
			
 
				+			pmd_t *pmd;
			
 
				+			unsigned long pmd_limit;
			
 
				+
			
 
				+			pud_next = pud_addr_end(addr, pud_limit);
			
 
				+
			
 
				+			if (pud_next < limit)
			
 
				+				pmd_limit = pud_next;
			
 
				+			else
			
 
				+				pmd_limit = limit;
			
 
				+
			
 
				+			if (pud_none(*pud))
			
 
				+				continue;
			
 
				+
			
 
				+			pmd = pmd_offset(pud, 0);
			
 
				+
			
 
				+			if (PTRS_PER_PMD > 1) /* not folded */
			
 
				+				flush |= (*func)(virt_to_page(pmd), 0);
			
 
				+
			
 
				+			for (; addr != pmd_limit; pmd++) {
			
 
				+				addr += (PAGE_SIZE * PTRS_PER_PTE);
			
 
				+				if ((pmd_limit-1) < (addr-1)) {
			
 
				+					addr = pmd_limit;
			
 
				+					break;
			
 
				+				}
			
 
				+
			
 
				+				if (pmd_none(*pmd))
			
 
				+					continue;
			
 
				+
			
 
				+				flush |= (*func)(pmd_page(*pmd), 0);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
			
 
				+
			
 
				+	return flush;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * pgd_walk() callback used while pinning.  Sets PG_pinned on the page
				+ * and, for a lowmem page that was not already pinned, queues a
				+ * multicall that remaps it with PAGE_KERNEL_RO.  Returns 1 when an
				+ * unpinned highmem page was found (kmaps then need flushing), else 0.
				+ */
				+static int pin_page(struct page *page, unsigned flags)
				+{
				+	void *pt;
				+	unsigned long pfn;
				+	struct multicall_space mcs;
				+
				+	if (test_and_set_bit(PG_pinned, &page->flags))
				+		return 0;		/* already pinned */
				+
				+	if (PageHighMem(page))
				+		return 1;		/* unpinned highpage: flush kmaps */
				+
				+	pt = lowmem_page_address(page);
				+	pfn = page_to_pfn(page);
				+	mcs = __xen_mc_entry(0);
				+
				+	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				+				pfn_pte(pfn, PAGE_KERNEL_RO),
				+				flags);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* This is called just after a mm has been created, but it has not
			
 
				+   been used yet.  We need to make sure that its pagetable is all
			
 
				+   read-only, and can be pinned. */
			
 
				+void xen_pgd_pin(pgd_t *pgd)
			
 
				+{
			
 
				+	struct multicall_space mcs;
			
 
				+	struct mmuext_op *op;
			
 
				+
			
 
				+	xen_mc_batch();
			
 
				+
			
 
				+	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
			
 
				+		/* re-enable interrupts for kmap_flush_unused */
			
 
				+		xen_mc_issue(0);
			
 
				+		kmap_flush_unused();
			
 
				+		xen_mc_batch();
			
 
				+	}
			
 
				+
			
 
				+	mcs = __xen_mc_entry(sizeof(*op));
			
 
				+	op = mcs.args;
			
 
				+
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+	op->cmd = MMUEXT_PIN_L3_TABLE;
			
 
				+#else
			
 
				+	op->cmd = MMUEXT_PIN_L2_TABLE;
			
 
				+#endif
			
 
				+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
			
 
				+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
			
 
				+
			
 
				+	xen_mc_issue(0);
			
 
				+}
			
 
				+
			
 
				+/* The init_mm pagetable is really pinned as soon as it's created, but
			
 
				+   that's before we have page structures to store the bits.  So do all
			
 
				+   the book-keeping now. */
			
 
				+static __init int mark_pinned(struct page *page, unsigned flags)
			
 
				+{
			
 
				+	SetPagePinned(page);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+void __init xen_mark_init_mm_pinned(void)
			
 
				+{
			
 
				+	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
			
 
				+}
			
 
				+
			
 
				+static int unpin_page(struct page *page, unsigned flags)
			
 
				+{
			
 
				+	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
			
 
				+
			
 
				+	if (pgfl && !PageHighMem(page)) {
			
 
				+		void *pt = lowmem_page_address(page);
			
 
				+		unsigned long pfn = page_to_pfn(page);
			
 
				+		struct multicall_space mcs = __xen_mc_entry(0);
			
 
				+
			
 
				+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
			
 
				+					pfn_pte(pfn, PAGE_KERNEL),
			
 
				+					flags);
			
 
				+	}
			
 
				+
			
 
				+	return 0;		/* never need to flush on unpin */
			
 
				+}
			
 
				+
			
 
				+/* Release a pagetable's pages back as normal RW */
			
 
				+static void xen_pgd_unpin(pgd_t *pgd)
			
 
				+{
			
 
				+	struct mmuext_op *op;
			
 
				+	struct multicall_space mcs;
			
 
				+
			
 
				+	xen_mc_batch();
			
 
				+
			
 
				+	mcs = __xen_mc_entry(sizeof(*op));
			
 
				+
			
 
				+	op = mcs.args;
			
 
				+	op->cmd = MMUEXT_UNPIN_TABLE;
			
 
				+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
			
 
				+
			
 
				+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
			
 
				+
			
 
				+	pgd_walk(pgd, unpin_page, TASK_SIZE);
			
 
				+
			
 
				+	xen_mc_issue(0);
			
 
				+}
			
 
				+
			
 
				+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
			
 
				+{
			
 
				+	spin_lock(&next->page_table_lock);
			
 
				+	xen_pgd_pin(next->pgd);
			
 
				+	spin_unlock(&next->page_table_lock);
			
 
				+}
			
 
				+
			
 
				+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
			
 
				+{
			
 
				+	spin_lock(&mm->page_table_lock);
			
 
				+	xen_pgd_pin(mm->pgd);
			
 
				+	spin_unlock(&mm->page_table_lock);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#ifdef CONFIG_SMP
			
 
				+/* Another cpu may still have their %cr3 pointing at the pagetable, so
			
 
				+   we need to repoint it somewhere else before we can unpin it. */
			
 
				+static void drop_other_mm_ref(void *info)
			
 
				+{
			
 
				+	struct mm_struct *mm = info;
			
 
				+
			
 
				+	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
			
 
				+		leave_mm(smp_processor_id());
			
 
				+}
			
 
				+
			
 
				+static void drop_mm_ref(struct mm_struct *mm)
			
 
				+{
			
 
				+	if (current->active_mm == mm) {
			
 
				+		if (current->mm == mm)
			
 
				+			load_cr3(swapper_pg_dir);
			
 
				+		else
			
 
				+			leave_mm(smp_processor_id());
			
 
				+	}
			
 
				+
			
 
				+	if (!cpus_empty(mm->cpu_vm_mask))
			
 
				+		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
			
 
				+					   mm, 1);
			
 
				+}
			
 
				+#else
			
 
				+static void drop_mm_ref(struct mm_struct *mm)
			
 
				+{
			
 
				+	if (current->active_mm == mm)
			
 
				+		load_cr3(swapper_pg_dir);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ * While a process runs, Xen pins its pagetables, which means that the
			
 
				+ * hypervisor forces it to be read-only, and it controls all updates
			
 
				+ * to it.  This means that all pagetable updates have to go via the
			
 
				+ * hypervisor, which is moderately expensive.
			
 
				+ *
			
 
				+ * Since we're pulling the pagetable down, we switch to use init_mm,
			
 
				+ * unpin old process pagetable and mark it all read-write, which
			
 
				+ * allows further operations on it to be simple memory accesses.
			
 
				+ *
			
 
				+ * The only subtle point is that another CPU may be still using the
			
 
				+ * pagetable because of lazy tlb flushing.  This means we need to
			
 
				+ * switch all CPUs off this pagetable before we can unpin it.
			
 
				+ */
			
 
				+void xen_exit_mmap(struct mm_struct *mm)
			
 
				+{
			
 
				+	get_cpu();		/* make sure we don't move around */
			
 
				+	drop_mm_ref(mm);
			
 
				+	put_cpu();
			
 
				+
			
 
				+	spin_lock(&mm->page_table_lock);
			
 
				+	xen_pgd_unpin(mm->pgd);
			
 
				+	spin_unlock(&mm->page_table_lock);
			
 
				+}
			
--- a/arch/i386/xen/mmu.h
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,60 @@
 
				+#ifndef _XEN_MMU_H
				+#define _XEN_MMU_H	/* without this the guard never takes effect */
			
 
				+
			
 
				+#include <linux/linkage.h>
			
 
				+#include <asm/page.h>
			
 
				+
			
 
				+/*
			
 
				+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
			
 
				+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
			
 
				+ * must use the following accessor macros to pack/unpack valid MFNs.
			
 
				+ *
			
 
				+ * Note that Xen is using the fact that the pagetable base is always
			
 
				+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
			
 
				+ * of cr3.
			
 
				+ */
			
 
				+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
			
 
				+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
			
 
				+
			
 
				+
			
 
				+/* The second argument is a machine frame number, as in the definition. */
				+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);
			
 
				+
			
 
				+void xen_set_pte(pte_t *ptep, pte_t pteval);
			
 
				+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			
 
				+		    pte_t *ptep, pte_t pteval);
			
 
				+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
			
 
				+
			
 
				+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
			
 
				+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
			
 
				+void xen_exit_mmap(struct mm_struct *mm);
			
 
				+
			
 
				+void xen_pgd_pin(pgd_t *pgd);
			
 
				+/* xen_pgd_unpin() is static in mmu.c, so it has no prototype here. */
			
 
				+
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+unsigned long long xen_pte_val(pte_t);
			
 
				+unsigned long long xen_pmd_val(pmd_t);
			
 
				+unsigned long long xen_pgd_val(pgd_t);
			
 
				+
			
 
				+pte_t xen_make_pte(unsigned long long);
			
 
				+pmd_t xen_make_pmd(unsigned long long);
			
 
				+pgd_t xen_make_pgd(unsigned long long);
			
 
				+
			
 
				+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			
 
				+		    pte_t *ptep, pte_t pteval);
			
 
				+void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
			
 
				+void xen_set_pud(pud_t *ptr, pud_t val);
			
 
				+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
			
 
				+void xen_pmd_clear(pmd_t *pmdp);
			
 
				+
			
 
				+
			
 
				+#else
			
 
				+unsigned long xen_pte_val(pte_t);
			
 
				+unsigned long xen_pmd_val(pmd_t);
			
 
				+unsigned long xen_pgd_val(pgd_t);
			
 
				+
			
 
				+pte_t xen_make_pte(unsigned long);
			
 
				+pmd_t xen_make_pmd(unsigned long);
			
 
				+pgd_t xen_make_pgd(unsigned long);
			
 
				+#endif
			
 
				+
			
 
				+#endif	/* _XEN_MMU_H */
			
--- a/arch/i386/xen/multicalls.c
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,90 @@
 
				+/*
			
 
				+ * Xen hypercall batching.
			
 
				+ *
			
 
				+ * Xen allows multiple hypercalls to be issued at once, using the
			
 
				+ * multicall interface.  This allows the cost of trapping into the
			
 
				+ * hypervisor to be amortized over several calls.
			
 
				+ *
			
 
				+ * This file implements a simple interface for multicalls.  There's a
			
 
				+ * per-cpu buffer of outstanding multicalls.  When you want to queue a
			
 
				+ * multicall for issuing, you can allocate a multicall slot for the
			
 
				+ * call and its arguments, along with storage for space which is
			
 
				+ * pointed to by the arguments (for passing pointers to structures,
			
 
				+ * etc).  When the multicall is actually issued, all the space for the
			
 
				+ * commands and allocated memory is freed for reuse.
			
 
				+ *
			
 
				+ * Multicalls are flushed whenever any of the buffers get full, or
			
 
				+ * when explicitly requested.  There's no way to get per-multicall
			
 
				+ * return results back.  It will BUG if any of the multicalls fail.
			
 
				+ *
			
 
				+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
			
 
				+ */
			
 
				+#include <linux/percpu.h>
			
 
				+#include <linux/hardirq.h>
			
 
				+
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+#include "multicalls.h"
			
 
				+
			
 
				+#define MC_BATCH	32
			
 
				+#define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))
			
 
				+
			
 
				+struct mc_buffer {
			
 
				+	struct multicall_entry entries[MC_BATCH];
			
 
				+	u64 args[MC_ARGS];
			
 
				+	unsigned mcidx, argidx;
			
 
				+};
			
 
				+
			
 
				+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
			
 
				+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
			
 
				+
			
 
				+/*
				+ * Issue every multicall queued in this cpu's buffer with a single
				+ * HYPERVISOR_multicall, then reset the buffer indices.  The caller
				+ * must not be preemptible.  BUGs if the hypercall itself fails, if any
				+ * entry reports a negative result, or if argument space is in use
				+ * while no entries are queued.
				+ */
				+void xen_mc_flush(void)
				+{
				+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
				+	int ret = 0;
				+	unsigned long flags;
				+
				+	BUG_ON(preemptible());
				+
				+	/* Disable interrupts in case someone comes in and queues
				+	   something in the middle */
				+	local_irq_save(flags);
				+
				+	if (b->mcidx) {
				+		unsigned i;	/* unsigned, to match b->mcidx */
				+
				+		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
				+			BUG();
				+		/* count failed entries; checked once irqs are restored */
				+		for (i = 0; i < b->mcidx; i++)
				+			if (b->entries[i].result < 0)
				+				ret++;
				+		b->mcidx = 0;
				+		b->argidx = 0;
				+	} else
				+		BUG_ON(b->argidx != 0);
				+
				+	local_irq_restore(flags);
				+
				+	BUG_ON(ret);
				+}
			
 
				+
			
 
				+struct multicall_space __xen_mc_entry(size_t args)
			
 
				+{
			
 
				+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
			
 
				+	struct multicall_space ret;
			
 
				+	unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
			
 
				+
			
 
				+	BUG_ON(preemptible());
			
 
				+	BUG_ON(argspace > MC_ARGS);
			
 
				+
			
 
				+	if (b->mcidx == MC_BATCH ||
			
 
				+	    (b->argidx + argspace) > MC_ARGS)
			
 
				+		xen_mc_flush();
			
 
				+
			
 
				+	ret.mc = &b->entries[b->mcidx];
			
 
				+	b->mcidx++;
			
 
				+	ret.args = &b->args[b->argidx];
			
 
				+	b->argidx += argspace;
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
--- a/arch/i386/xen/multicalls.h
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
 
				+#ifndef _XEN_MULTICALLS_H
			
 
				+#define _XEN_MULTICALLS_H
			
 
				+
			
 
				+#include "xen-ops.h"
			
 
				+
			
 
				+/* Multicalls */
			
 
				+struct multicall_space
			
 
				+{
			
 
				+	struct multicall_entry *mc;
			
 
				+	void *args;
			
 
				+};
			
 
				+
			
 
				+/* Allocate room for a multicall and its args */
			
 
				+struct multicall_space __xen_mc_entry(size_t args);
			
 
				+
			
 
				+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
			
 
				+
			
 
				+/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
			
 
				+   paired with xen_mc_issue() */
			
 
				+static inline void xen_mc_batch(void)
			
 
				+{
			
 
				+	/* need to disable interrupts until this entry is complete */
			
 
				+	local_irq_save(__get_cpu_var(xen_mc_irq_flags));
			
 
				+}
			
 
				+
			
 
				+static inline struct multicall_space xen_mc_entry(size_t args)
			
 
				+{
			
 
				+	xen_mc_batch();
			
 
				+	return __xen_mc_entry(args);
			
 
				+}
			
 
				+
			
 
				+/* Flush all pending multicalls */
			
 
				+void xen_mc_flush(void);
			
 
				+
			
 
				+/* Issue a multicall if we're not in a lazy mode */
			
 
				+static inline void xen_mc_issue(unsigned mode)
			
 
				+{
			
 
				+	if ((xen_get_lazy_mode() & mode) == 0)
			
 
				+		xen_mc_flush();
			
 
				+
			
 
				+	/* restore flags saved in xen_mc_batch */
			
 
				+	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
			
 
				+}
			
 
				+
			
 
				+#endif /* _XEN_MULTICALLS_H */
			
--- a/arch/i386/xen/setup.c
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,96 @@
 
				+/*
			
 
				+ * Machine specific setup for xen
			
 
				+ *
			
 
				+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
			
 
				+ */
			
 
				+
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/mm.h>
			
 
				+#include <linux/pm.h>
			
 
				+
			
 
				+#include <asm/elf.h>
			
 
				+#include <asm/e820.h>
			
 
				+#include <asm/setup.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+#include <xen/interface/physdev.h>
			
 
				+#include <xen/features.h>
			
 
				+
			
 
				+#include "xen-ops.h"
			
 
				+
			
 
				+/* These are code, but not functions.  Defined in entry.S */
			
 
				+extern const char xen_hypervisor_callback[];
			
 
				+extern const char xen_failsafe_callback[];
			
 
				+
			
 
				+unsigned long *phys_to_machine_mapping;
			
 
				+EXPORT_SYMBOL(phys_to_machine_mapping);
			
 
				+
			
 
				+/**
			
 
				+ * xen_memory_setup - Hook for machine specific memory setup.
			
 
				+ **/
			
 
				+
			
 
				+char * __init xen_memory_setup(void)
			
 
				+{
			
 
				+	unsigned long max_pfn = xen_start_info->nr_pages;
			
 
				+
			
 
				+	e820.nr_map = 0;
			
 
				+	add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
			
 
				+
			
 
				+	return "Xen";
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Idle routine installed as pm_idle by xen_arch_setup().  With
				+ * interrupts disabled it checks for a pending reschedule; if there is
				+ * none it clears TS_POLLING, calls safe_halt(), and sets TS_POLLING
				+ * again afterwards.
				+ */
				+static void xen_idle(void)
				+{
				+	local_irq_disable();
				+
				+	if (need_resched()) {
				+		/* work is pending: just re-enable interrupts and leave */
				+		local_irq_enable();
				+		return;
				+	}
				+
				+	current_thread_info()->status &= ~TS_POLLING;
				+	smp_mb__after_clear_bit();
				+	safe_halt();
				+	current_thread_info()->status |= TS_POLLING;
				+}
			
 
				+
			
 
				+void __init xen_arch_setup(void)
			
 
				+{
			
 
				+	struct physdev_set_iopl set_iopl;
			
 
				+	int rc;
			
 
				+
			
 
				+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
			
 
				+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
			
 
				+
			
 
				+	if (!xen_feature(XENFEAT_auto_translated_physmap))
			
 
				+		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
			
 
				+
			
 
				+	HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
			
 
				+				 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
			
 
				+
			
 
				+	set_iopl.iopl = 1;
			
 
				+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
			
 
				+	if (rc != 0)
			
 
				+		printk(KERN_INFO "physdev_op failed %d\n", rc);
			
 
				+
			
 
				+#ifdef CONFIG_ACPI
			
 
				+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
			
 
				+		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
			
 
				+		disable_acpi();
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				+	memcpy(boot_command_line, xen_start_info->cmd_line,
			
 
				+	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
			
 
				+	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
			
 
				+
			
 
				+	pm_idle = xen_idle;
			
 
				+
			
 
				+#ifdef CONFIG_SMP
			
 
				+	/* fill cpus_possible with all available cpus */
			
 
				+	xen_fill_possible_map();
			
 
				+#endif
			
 
				+
			
 
				+	paravirt_disable_iospace();
			
 
				+}
			
--- a/arch/i386/xen/smp.c
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,404 @@
 
				+/*
			
 
				+ * Xen SMP support
			
 
				+ *
			
 
				+ * This file implements the Xen versions of smp_ops.  SMP under Xen is
			
 
				+ * very straightforward.  Bringing a CPU up is simply a matter of
			
 
				+ * loading its initial context and setting it running.
			
 
				+ *
			
 
				+ * IPIs are handled through the Xen event mechanism.
			
 
				+ *
			
 
				+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
			
 
				+ * useful topology information for the kernel to make use of.  As a
			
 
				+ * result, all CPUs are treated as if they're single-core and
			
 
				+ * single-threaded.
			
 
				+ *
			
 
				+ * This does not handle HOTPLUG_CPU yet.
			
 
				+ */
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/err.h>
			
 
				+#include <linux/smp.h>
			
 
				+
			
 
				+#include <asm/paravirt.h>
			
 
				+#include <asm/desc.h>
			
 
				+#include <asm/pgtable.h>
			
 
				+#include <asm/cpu.h>
			
 
				+
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/vcpu.h>
			
 
				+
			
 
				+#include <asm/xen/interface.h>
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+#include <xen/page.h>
			
 
				+#include <xen/events.h>
			
 
				+
			
 
				+#include "xen-ops.h"
			
 
				+#include "mmu.h"
			
 
				+
			
 
				+static cpumask_t cpu_initialized_map;
			
 
				+static DEFINE_PER_CPU(int, resched_irq);
			
 
				+static DEFINE_PER_CPU(int, callfunc_irq);
			
 
				+
			
 
				+/*
			
 
				+ * Structure and data for smp_call_function(). This is designed to minimise
			
 
				+ * static memory requirements. It also looks cleaner.
			
 
				+ */
			
 
				+static DEFINE_SPINLOCK(call_lock);
			
 
				+
			
 
				+struct call_data_struct {
			
 
				+	void (*func) (void *info);
			
 
				+	void *info;
			
 
				+	atomic_t started;
			
 
				+	atomic_t finished;
			
 
				+	int wait;
			
 
				+};
			
 
				+
			
 
				+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
			
 
				+
			
 
				+static struct call_data_struct *call_data;
			
 
				+
			
 
				+/*
			
 
				+ * Reschedule call back. Nothing to do,
			
 
				+ * all the work is done automatically when
			
 
				+ * we return from the interrupt.
			
 
				+ */
			
 
				+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
			
 
				+{
			
 
				+	return IRQ_HANDLED;
			
 
				+}
			
 
				+
			
 
				+static __cpuinit void cpu_bringup_and_idle(void)
			
 
				+{
			
 
				+	int cpu = smp_processor_id();
			
 
				+
			
 
				+	cpu_init();
			
 
				+
			
 
				+	preempt_disable();
			
 
				+	per_cpu(cpu_state, cpu) = CPU_ONLINE;
			
 
				+
			
 
				+	xen_setup_cpu_clockevents();
			
 
				+
			
 
				+	/* We can take interrupts now: we're officially "up". */
			
 
				+	local_irq_enable();
			
 
				+
			
 
				+	wmb();			/* make sure everything is out */
			
 
				+	cpu_idle();
			
 
				+}
			
 
				+
			
 
				+static int xen_smp_intr_init(unsigned int cpu)
			
 
				+{
			
 
				+	int rc;
			
 
				+	const char *resched_name, *callfunc_name;
			
 
				+
			
 
				+	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
			
 
				+
			
 
				+	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
			
 
				+	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
			
 
				+				    cpu,
			
 
				+				    xen_reschedule_interrupt,
			
 
				+				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
			
 
				+				    resched_name,
			
 
				+				    NULL);
			
 
				+	if (rc < 0)
			
 
				+		goto fail;
			
 
				+	per_cpu(resched_irq, cpu) = rc;
			
 
				+
			
 
				+	callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
			
 
				+	rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
			
 
				+				    cpu,
			
 
				+				    xen_call_function_interrupt,
			
 
				+				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
			
 
				+				    callfunc_name,
			
 
				+				    NULL);
			
 
				+	if (rc < 0)
			
 
				+		goto fail;
			
 
				+	per_cpu(callfunc_irq, cpu) = rc;
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+ fail:
			
 
				+	if (per_cpu(resched_irq, cpu) >= 0)
			
 
				+		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
			
 
				+	if (per_cpu(callfunc_irq, cpu) >= 0)
			
 
				+		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
			
 
				+	return rc;
			
 
				+}
			
 
				+
			
 
				+void __init xen_fill_possible_map(void)
			
 
				+{
			
 
				+	int i, rc;
			
 
				+
			
 
				+	for (i = 0; i < NR_CPUS; i++) {
			
 
				+		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
			
 
				+		if (rc >= 0)
			
 
				+			cpu_set(i, cpu_possible_map);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void __init xen_smp_prepare_boot_cpu(void)
			
 
				+{
			
 
				+	int cpu;
			
 
				+
			
 
				+	BUG_ON(smp_processor_id() != 0);
			
 
				+	native_smp_prepare_boot_cpu();
			
 
				+
			
 
				+	/* We've switched to the "real" per-cpu gdt, so make sure the
			
 
				+	   old memory can be recycled */
			
 
				+	make_lowmem_page_readwrite(&per_cpu__gdt_page);
			
 
				+
			
 
				+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
			
 
				+		cpus_clear(cpu_sibling_map[cpu]);
			
 
				+		cpus_clear(cpu_core_map[cpu]);
			
 
				+	}
			
 
				+
			
 
				+	xen_setup_vcpu_info_placement();
			
 
				+}
			
 
				+
			
 
				+void __init xen_smp_prepare_cpus(unsigned int max_cpus)
			
 
				+{
			
 
				+	unsigned cpu;
			
 
				+
			
 
				+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
			
 
				+		cpus_clear(cpu_sibling_map[cpu]);
			
 
				+		cpus_clear(cpu_core_map[cpu]);
			
 
				+	}
			
 
				+
			
 
				+	smp_store_cpu_info(0);
			
 
				+	set_cpu_sibling_map(0);
			
 
				+
			
 
				+	if (xen_smp_intr_init(0))
			
 
				+		BUG();
			
 
				+
			
 
				+	cpu_initialized_map = cpumask_of_cpu(0);
			
 
				+
			
 
				+	/* Restrict the possible_map according to max_cpus. */
			
 
				+	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
			
 
				+		for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
			
 
				+			continue;
			
 
				+		cpu_clear(cpu, cpu_possible_map);
			
 
				+	}
			
 
				+
			
 
				+	for_each_possible_cpu (cpu) {
			
 
				+		struct task_struct *idle;
			
 
				+
			
 
				+		if (cpu == 0)
			
 
				+			continue;
			
 
				+
			
 
				+		idle = fork_idle(cpu);
			
 
				+		if (IS_ERR(idle))
			
 
				+			panic("failed fork for CPU %d", cpu);
			
 
				+
			
 
				+		cpu_set(cpu, cpu_present_map);
			
 
				+	}
			
 
				+
			
 
				+	//init_xenbus_allowed_cpumask();
			
 
				+}
			
 
				+
			
 
				+static __cpuinit int
			
 
				+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
			
 
				+{
			
 
				+	struct vcpu_guest_context *ctxt;
			
 
				+	struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
			
 
				+
			
 
				+	if (cpu_test_and_set(cpu, cpu_initialized_map))
			
 
				+		return 0;
			
 
				+
			
 
				+	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
			
 
				+	if (ctxt == NULL)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	ctxt->flags = VGCF_IN_KERNEL;
			
 
				+	ctxt->user_regs.ds = __USER_DS;
			
 
				+	ctxt->user_regs.es = __USER_DS;
			
 
				+	ctxt->user_regs.fs = __KERNEL_PERCPU;
			
 
				+	ctxt->user_regs.gs = 0;
			
 
				+	ctxt->user_regs.ss = __KERNEL_DS;
			
 
				+	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
			
 
				+	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
			
 
				+
			
 
				+	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
			
 
				+
			
 
				+	xen_copy_trap_info(ctxt->trap_ctxt);
			
 
				+
			
 
				+	ctxt->ldt_ents = 0;
			
 
				+
			
 
				+	BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
			
 
				+	make_lowmem_page_readonly(gdt->gdt);
			
 
				+
			
 
				+	ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
			
 
				+	ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
			
 
				+
			
 
				+	ctxt->user_regs.cs = __KERNEL_CS;
			
 
				+	ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
			
 
				+
			
 
				+	ctxt->kernel_ss = __KERNEL_DS;
			
 
				+	ctxt->kernel_sp = idle->thread.esp0;
			
 
				+
			
 
				+	ctxt->event_callback_cs     = __KERNEL_CS;
			
 
				+	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
			
 
				+	ctxt->failsafe_callback_cs  = __KERNEL_CS;
			
 
				+	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
			
 
				+
			
 
				+	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
			
 
				+	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
			
 
				+
			
 
				+	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
			
 
				+		BUG();
			
 
				+
			
 
				+	kfree(ctxt);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int __cpuinit xen_cpu_up(unsigned int cpu)
			
 
				+{
			
 
				+	struct task_struct *idle = idle_task(cpu);
			
 
				+	int rc;
			
 
				+
			
 
				+#if 0
			
 
				+	rc = cpu_up_check(cpu);
			
 
				+	if (rc)
			
 
				+		return rc;
			
 
				+#endif
			
 
				+
			
 
				+	init_gdt(cpu);
			
 
				+	per_cpu(current_task, cpu) = idle;
			
 
				+	irq_ctx_init(cpu);
			
 
				+	xen_setup_timer(cpu);
			
 
				+
			
 
				+	/* make sure interrupts start blocked */
			
 
				+	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
			
 
				+
			
 
				+	rc = cpu_initialize_context(cpu, idle);
			
 
				+	if (rc)
			
 
				+		return rc;
			
 
				+
			
 
				+	if (num_online_cpus() == 1)
			
 
				+		alternatives_smp_switch(1);
			
 
				+
			
 
				+	rc = xen_smp_intr_init(cpu);
			
 
				+	if (rc)
			
 
				+		return rc;
			
 
				+
			
 
				+	smp_store_cpu_info(cpu);
			
 
				+	set_cpu_sibling_map(cpu);
			
 
				+	/* This must be done before setting cpu_online_map */
			
 
				+	wmb();
			
 
				+
			
 
				+	cpu_set(cpu, cpu_online_map);
			
 
				+
			
 
				+	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
			
 
				+	BUG_ON(rc);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+void xen_smp_cpus_done(unsigned int max_cpus)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static void stop_self(void *v)
			
 
				+{
			
 
				+	int cpu = smp_processor_id();
			
 
				+
			
 
				+	/* make sure we're not pinning something down */
			
 
				+	load_cr3(swapper_pg_dir);
			
 
				+	/* should set up a minimal gdt */
			
 
				+
			
 
				+	HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
			
 
				+	BUG();
			
 
				+}
			
 
				+
			
 
				+void xen_smp_send_stop(void)
			
 
				+{
			
 
				+	smp_call_function(stop_self, NULL, 0, 0);
			
 
				+}
			
 
				+
			
 
				+void xen_smp_send_reschedule(int cpu)
			
 
				+{
			
 
				+	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
			
 
				+{
			
 
				+	unsigned cpu;
			
 
				+
			
 
				+	cpus_and(mask, mask, cpu_online_map);
			
 
				+
			
 
				+	for_each_cpu_mask(cpu, mask)
			
 
				+		xen_send_IPI_one(cpu, vector);
			
 
				+}
			
 
				+
			
 
				+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
			
 
				+{
			
 
				+	void (*func) (void *info) = call_data->func;
			
 
				+	void *info = call_data->info;
			
 
				+	int wait = call_data->wait;
			
 
				+
			
 
				+	/*
			
 
				+	 * Notify initiating CPU that I've grabbed the data and am
			
 
				+	 * about to execute the function
			
 
				+	 */
			
 
				+	mb();
			
 
				+	atomic_inc(&call_data->started);
			
 
				+	/*
			
 
				+	 * At this point the info structure may be out of scope unless wait==1
			
 
				+	 */
			
 
				+	irq_enter();
			
 
				+	(*func)(info);
			
 
				+	irq_exit();
			
 
				+
			
 
				+	if (wait) {
			
 
				+		mb();		/* commit everything before setting finished */
			
 
				+		atomic_inc(&call_data->finished);
			
 
				+	}
			
 
				+
			
 
				+	return IRQ_HANDLED;
			
 
				+}
			
 
				+
			
 
				+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
			
 
				+			       void *info, int wait)
			
 
				+{
			
 
				+	struct call_data_struct data;
			
 
				+	int cpus;
			
 
				+
			
 
				+	/* Holding any lock stops cpus from going down. */
			
 
				+	spin_lock(&call_lock);
			
 
				+
			
 
				+	cpu_clear(smp_processor_id(), mask);
			
 
				+
			
 
				+	cpus = cpus_weight(mask);
			
 
				+	if (!cpus) {
			
 
				+		spin_unlock(&call_lock);
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	/* Can deadlock when called with interrupts disabled */
			
 
				+	WARN_ON(irqs_disabled());
			
 
				+
			
 
				+	data.func = func;
			
 
				+	data.info = info;
			
 
				+	atomic_set(&data.started, 0);
			
 
				+	data.wait = wait;
			
 
				+	if (wait)
			
 
				+		atomic_set(&data.finished, 0);
			
 
				+
			
 
				+	call_data = &data;
			
 
				+	mb();			/* write everything before IPI */
			
 
				+
			
 
				+	/* Send a message to other CPUs and wait for them to respond */
			
 
				+	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
			
 
				+
			
 
				+	/* Make sure other vcpus get a chance to run.
			
 
				+	   XXX too severe?  Maybe we should check the other CPU's states? */
			
 
				+	HYPERVISOR_sched_op(SCHEDOP_yield, 0);
			
 
				+
			
 
				+	/* Wait for response */
			
 
				+	while (atomic_read(&data.started) != cpus ||
			
 
				+	       (wait && atomic_read(&data.finished) != cpus))
			
 
				+		cpu_relax();
			
 
				+
			
 
				+	spin_unlock(&call_lock);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,590 @@
 
				+/*
			
 
				+ * Xen time implementation.
			
 
				+ *
			
 
				+ * This is implemented in terms of a clocksource driver which uses
			
 
				+ * the hypervisor clock as a nanosecond timebase, and a clockevent
			
 
				+ * driver which uses the hypervisor's timer mechanism.
			
 
				+ *
			
 
				+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
			
 
				+ */
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/interrupt.h>
			
 
				+#include <linux/clocksource.h>
			
 
				+#include <linux/clockchips.h>
			
 
				+#include <linux/kernel_stat.h>
			
 
				+
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/vcpu.h>
			
 
				+
			
 
				+#include "xen-ops.h"
			
 
				+
			
 
				+#define XEN_SHIFT 22
			
 
				+
			
 
				+/* Xen may fire a timer up to this many ns early */
			
 
				+#define TIMER_SLOP	100000
			
 
				+#define NS_PER_TICK	(1000000000LL / HZ)
			
 
				+
			
 
				+static cycle_t xen_clocksource_read(void);
			
 
				+
			
 
				+/* These are periodically updated in shared_info, and then copied here. */
			
 
				+struct shadow_time_info {
			
 
				+	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
			
 
				+	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
			
 
				+	u32 tsc_to_nsec_mul;
			
 
				+	int tsc_shift;
			
 
				+	u32 version;
			
 
				+};
			
 
				+
			
 
				+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
			
 
				+
			
 
				+/* runstate info updated by Xen */
			
 
				+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
			
 
				+
			
 
				+/* snapshots of runstate info */
			
 
				+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
			
 
				+
			
 
				+/* unused ns of stolen and blocked time */
			
 
				+static DEFINE_PER_CPU(u64, residual_stolen);
			
 
				+static DEFINE_PER_CPU(u64, residual_blocked);
			
 
				+
			
 
				+/* return a consistent snapshot of a 64-bit time/counter value */
			
 
				+static u64 get64(const u64 *p)
			
 
				+{
			
 
				+	u64 ret;
			
 
				+
			
 
				+	if (BITS_PER_LONG < 64) {
			
 
				+		u32 *p32 = (u32 *)p;
			
 
				+		u32 h, l;
			
 
				+
			
 
				+		/*
			
 
				+		 * Read high then low, and then make sure high is
			
 
				+		 * still the same; this will only loop if low wraps
			
 
				+		 * and carries into high.
			
 
				+		 * XXX some clean way to make this endian-proof?
			
 
				+		 */
			
 
				+		do {
			
 
				+			h = p32[1];
			
 
				+			barrier();
			
 
				+			l = p32[0];
			
 
				+			barrier();
			
 
				+		} while (p32[1] != h);
			
 
				+
			
 
				+		ret = (((u64)h) << 32) | l;
			
 
				+	} else
			
 
				+		ret = *p;
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Runstate accounting
			
 
				+ */
			
 
				+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
			
 
				+{
			
 
				+	u64 state_time;
			
 
				+	struct vcpu_runstate_info *state;
			
 
				+
			
 
				+	BUG_ON(preemptible());
			
 
				+
			
 
				+	state = &__get_cpu_var(runstate);
			
 
				+
			
 
				+	/*
			
 
				+	 * The runstate info is always updated by the hypervisor on
			
 
				+	 * the current CPU, so there's no need to use anything
			
 
				+	 * stronger than a compiler barrier when fetching it.
			
 
				+	 */
			
 
				+	do {
			
 
				+		state_time = get64(&state->state_entry_time);
			
 
				+		barrier();
			
 
				+		*res = *state;
			
 
				+		barrier();
			
 
				+	} while (get64(&state->state_entry_time) != state_time);
			
 
				+}
			
 
				+
			
 
				+static void setup_runstate_info(int cpu)
			
 
				+{
			
 
				+	struct vcpu_register_runstate_memory_area area;
			
 
				+
			
 
				+	area.addr.v = &per_cpu(runstate, cpu);
			
 
				+
			
 
				+	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			
 
				+			       cpu, &area))
			
 
				+		BUG();
			
 
				+}
			
 
				+
			
 
				+static void do_stolen_accounting(void)
			
 
				+{
			
 
				+	struct vcpu_runstate_info state;
			
 
				+	struct vcpu_runstate_info *snap;
			
 
				+	s64 blocked, runnable, offline, stolen;
			
 
				+	cputime_t ticks;
			
 
				+
			
 
				+	get_runstate_snapshot(&state);
			
 
				+
			
 
				+	WARN_ON(state.state != RUNSTATE_running);
			
 
				+
			
 
				+	snap = &__get_cpu_var(runstate_snapshot);
			
 
				+
			
 
				+	/* work out how much time the VCPU has not been runn*ing*  */
			
 
				+	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
			
 
				+	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
			
 
				+	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
			
 
				+
			
 
				+	*snap = state;
			
 
				+
			
 
				+	/* Add the appropriate number of ticks of stolen time,
			
 
				+	   including any left-overs from last time.  Passing NULL to
			
 
				+	   account_steal_time accounts the time as stolen. */
			
 
				+	stolen = runnable + offline + __get_cpu_var(residual_stolen);
			
 
				+
			
 
				+	if (stolen < 0)
			
 
				+		stolen = 0;
			
 
				+
			
 
				+	ticks = 0;
			
 
				+	while (stolen >= NS_PER_TICK) {
			
 
				+		ticks++;
			
 
				+		stolen -= NS_PER_TICK;
			
 
				+	}
			
 
				+	__get_cpu_var(residual_stolen) = stolen;
			
 
				+	account_steal_time(NULL, ticks);
			
 
				+
			
 
				+	/* Add the appropriate number of ticks of blocked time,
			
 
				+	   including any left-overs from last time.  Passing idle to
			
 
				+	   account_steal_time accounts the time as idle/wait. */
			
 
				+	blocked += __get_cpu_var(residual_blocked);
			
 
				+
			
 
				+	if (blocked < 0)
			
 
				+		blocked = 0;
			
 
				+
			
 
				+	ticks = 0;
			
 
				+	while (blocked >= NS_PER_TICK) {
			
 
				+		ticks++;
			
 
				+		blocked -= NS_PER_TICK;
			
 
				+	}
			
 
				+	__get_cpu_var(residual_blocked) = blocked;
			
 
				+	account_steal_time(idle_task(smp_processor_id()), ticks);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Xen sched_clock implementation.  Returns the number of unstolen
			
 
				+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
			
 
				+ * states.
			
 
				+ */
			
 
				+unsigned long long xen_sched_clock(void)
			
 
				+{
			
 
				+	struct vcpu_runstate_info state;
			
 
				+	cycle_t now;
			
 
				+	u64 ret;
			
 
				+	s64 offset;
			
 
				+
			
 
				+	/*
			
 
				+	 * Ideally sched_clock should be called on a per-cpu basis
			
 
				+	 * anyway, so preempt should already be disabled, but that's
			
 
				+	 * not current practice at the moment.
			
 
				+	 */
			
 
				+	preempt_disable();
			
 
				+
			
 
				+	now = xen_clocksource_read();
			
 
				+
			
 
				+	get_runstate_snapshot(&state);
			
 
				+
			
 
				+	WARN_ON(state.state != RUNSTATE_running);
			
 
				+
			
 
				+	offset = now - state.state_entry_time;
			
 
				+	if (offset < 0)
			
 
				+		offset = 0;
			
 
				+
			
 
				+	ret = state.time[RUNSTATE_blocked] +
			
 
				+		state.time[RUNSTATE_running] +
			
 
				+		offset;
			
 
				+
			
 
				+	preempt_enable();
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Get the CPU speed from Xen */
			
 
				+unsigned long xen_cpu_khz(void)
			
 
				+{
			
 
				+	u64 cpu_khz = 1000000ULL << 32;
			
 
				+	const struct vcpu_time_info *info =
			
 
				+		&HYPERVISOR_shared_info->vcpu_info[0].time;
			
 
				+
			
 
				+	do_div(cpu_khz, info->tsc_to_system_mul);
			
 
				+	if (info->tsc_shift < 0)
			
 
				+		cpu_khz <<= -info->tsc_shift;
			
 
				+	else
			
 
				+		cpu_khz >>= info->tsc_shift;
			
 
				+
			
 
				+	return cpu_khz;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Reads a consistent set of time-base values from Xen, into a shadow data
			
 
				+ * area.
			
 
				+ */
			
 
				+static unsigned get_time_values_from_xen(void)
			
 
				+{
			
 
				+	struct vcpu_time_info   *src;
			
 
				+	struct shadow_time_info *dst;
			
 
				+
			
 
				+	/* src is shared memory with the hypervisor, so we need to
			
 
				+	   make sure we get a consistent snapshot, even in the face of
			
 
				+	   being preempted. */
			
 
				+	src = &__get_cpu_var(xen_vcpu)->time;
			
 
				+	dst = &__get_cpu_var(shadow_time);
			
 
				+
			
 
				+	do {
			
 
				+		dst->version = src->version;
			
 
				+		rmb();		/* fetch version before data */
			
 
				+		dst->tsc_timestamp     = src->tsc_timestamp;
			
 
				+		dst->system_timestamp  = src->system_time;
			
 
				+		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
			
 
				+		dst->tsc_shift         = src->tsc_shift;
			
 
				+		rmb();		/* test version after fetching data */
			
 
				+	} while ((src->version & 1) | (dst->version ^ src->version));
			
 
				+
			
 
				+	return dst->version;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
			
 
				+ * yielding a 64-bit result.
			
 
				+ */
			
 
				+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
			
 
				+{
			
 
				+	u64 product;
			
 
				+#ifdef __i386__
			
 
				+	u32 tmp1, tmp2;
			
 
				+#endif
			
 
				+
			
 
				+	if (shift < 0)
			
 
				+		delta >>= -shift;
			
 
				+	else
			
 
				+		delta <<= shift;
			
 
				+
			
 
				+#ifdef __i386__
			
 
				+	__asm__ (
			
 
				+		"mul  %5       ; "
			
 
				+		"mov  %4,%%eax ; "
			
 
				+		"mov  %%edx,%4 ; "
			
 
				+		"mul  %5       ; "
			
 
				+		"xor  %5,%5    ; "
			
 
				+		"add  %4,%%eax ; "
			
 
				+		"adc  %5,%%edx ; "
			
 
				+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
			
 
				+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
			
 
				+#elif __x86_64__
			
 
				+	__asm__ (
			
 
				+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
			
 
				+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
			
 
				+#else
			
 
				+#error implement me!
			
 
				+#endif
			
 
				+
			
 
				+	return product;
			
 
				+}
			
 
				+
			
 
				+static u64 get_nsec_offset(struct shadow_time_info *shadow)
			
 
				+{
			
 
				+	u64 now, delta;
			
 
				+	now = native_read_tsc();
			
 
				+	delta = now - shadow->tsc_timestamp;
			
 
				+	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
			
 
				+}
			
 
				+
			
 
				+static cycle_t xen_clocksource_read(void)
			
 
				+{
			
 
				+	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
			
 
				+	cycle_t ret;
			
 
				+	unsigned version;
			
 
				+
			
 
				+	do {
			
 
				+		version = get_time_values_from_xen();
			
 
				+		barrier();
			
 
				+		ret = shadow->system_timestamp + get_nsec_offset(shadow);
			
 
				+		barrier();
			
 
				+	} while (version != __get_cpu_var(xen_vcpu)->time.version);
			
 
				+
			
 
				+	put_cpu_var(shadow_time);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static void xen_read_wallclock(struct timespec *ts)
			
 
				+{
			
 
				+	const struct shared_info *s = HYPERVISOR_shared_info;
			
 
				+	u32 version;
			
 
				+	u64 delta;
			
 
				+	struct timespec now;
			
 
				+
			
 
				+	/* get wallclock at system boot */
			
 
				+	do {
			
 
				+		version = s->wc_version;
			
 
				+		rmb();		/* fetch version before time */
			
 
				+		now.tv_sec  = s->wc_sec;
			
 
				+		now.tv_nsec = s->wc_nsec;
			
 
				+		rmb();		/* fetch time before checking version */
			
 
				+	} while ((s->wc_version & 1) | (version ^ s->wc_version));
			
 
				+
			
 
				+	delta = xen_clocksource_read();	/* time since system boot */
			
 
				+	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
			
 
				+
			
 
				+	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
			
 
				+	now.tv_sec = delta;
			
 
				+
			
 
				+	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
			
 
				+}
			
 
				+
			
 
				+unsigned long xen_get_wallclock(void)
			
 
				+{
			
 
				+	struct timespec ts;
			
 
				+
			
 
				+	xen_read_wallclock(&ts);
			
 
				+
			
 
				+	return ts.tv_sec;
			
 
				+}
			
 
				+
			
 
				+int xen_set_wallclock(unsigned long now)
			
 
				+{
			
 
				+	/* do nothing for domU */
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static struct clocksource xen_clocksource __read_mostly = {
			
 
				+	.name = "xen",
			
 
				+	.rating = 400,
			
 
				+	.read = xen_clocksource_read,
			
 
				+	.mask = ~0,
			
 
				+	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
			
 
				+	.shift = XEN_SHIFT,
			
 
				+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+   Xen clockevent implementation
			
 
				+
			
 
				+   Xen has two clockevent implementations:
			
 
				+
			
 
				+   The old timer_op one works with all released versions of Xen prior
			
 
				+   to version 3.0.4.  This version of the hypervisor provides a
			
 
				+   single-shot timer with nanosecond resolution.  However, sharing the
			
 
				+   same event channel is a 100Hz tick which is delivered while the
			
 
				+   vcpu is running.  We don't care about or use this tick, but it will
			
 
				+   cause the core time code to think the timer fired too soon, and
			
 
				+   will end up resetting it each time.  It could be filtered, but
			
 
				+   doing so has complications when the ktime clocksource is not yet
			
 
				+   the xen clocksource (ie, at boot time).
			
 
				+
			
 
				+   The new vcpu_op-based timer interface allows the tick timer period
			
 
				+   to be changed or turned off.  The tick timer is not useful as a
			
 
				+   periodic timer because events are only delivered to running vcpus.
			
 
				+   The one-shot timer can report when a timeout is in the past, so
			
 
				+   set_next_event is capable of returning -ETIME when appropriate.
			
 
				+   This interface is used when available.
			
 
				+*/
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+  Get a hypervisor absolute time.  In theory we could maintain an
			
 
				+  offset between the kernel's time and the hypervisor's time, and
			
 
				+  apply that to a kernel's absolute timeout.  Unfortunately the
			
 
				+  hypervisor and kernel times can drift even if the kernel is using
			
 
				+  the Xen clocksource, because ntp can warp the kernel's clocksource.
			
 
				+*/
			
 
				+static s64 get_abs_timeout(unsigned long delta)
			
 
				+{
			
 
				+	return xen_clocksource_read() + delta;
			
 
				+}
			
 
				+
			
 
				+static void xen_timerop_set_mode(enum clock_event_mode mode,
			
 
				+				 struct clock_event_device *evt)
			
 
				+{
			
 
				+	switch (mode) {
			
 
				+	case CLOCK_EVT_MODE_PERIODIC:
			
 
				+		/* unsupported */
			
 
				+		WARN_ON(1);
			
 
				+		break;
			
 
				+
			
 
				+	case CLOCK_EVT_MODE_ONESHOT:
			
 
				+		break;
			
 
				+
			
 
				+	case CLOCK_EVT_MODE_UNUSED:
			
 
				+	case CLOCK_EVT_MODE_SHUTDOWN:
			
 
				+		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
			
 
				+		break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int xen_timerop_set_next_event(unsigned long delta,
			
 
				+				      struct clock_event_device *evt)
			
 
				+{
			
 
				+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
			
 
				+
			
 
				+	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
			
 
				+		BUG();
			
 
				+
			
 
				+	/* We may have missed the deadline, but there's no real way of
			
 
				+	   knowing for sure.  If the event was in the past, then we'll
			
 
				+	   get an immediate interrupt. */
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static const struct clock_event_device xen_timerop_clockevent = {
			
 
				+	.name = "xen",
			
 
				+	.features = CLOCK_EVT_FEAT_ONESHOT,
			
 
				+
			
 
				+	.max_delta_ns = 0xffffffff,
			
 
				+	.min_delta_ns = TIMER_SLOP,
			
 
				+
			
 
				+	.mult = 1,
			
 
				+	.shift = 0,
			
 
				+	.rating = 500,
			
 
				+
			
 
				+	.set_mode = xen_timerop_set_mode,
			
 
				+	.set_next_event = xen_timerop_set_next_event,
			
 
				+};
			
 
				+
			
 
				+
			
 
				+
			
 
				+static void xen_vcpuop_set_mode(enum clock_event_mode mode,
			
 
				+				struct clock_event_device *evt)
			
 
				+{
			
 
				+	int cpu = smp_processor_id();
			
 
				+
			
 
				+	switch (mode) {
			
 
				+	case CLOCK_EVT_MODE_PERIODIC:
			
 
				+		WARN_ON(1);	/* unsupported */
			
 
				+		break;
			
 
				+
			
 
				+	case CLOCK_EVT_MODE_ONESHOT:
			
 
				+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			
 
				+			BUG();
			
 
				+		break;
			
 
				+
			
 
				+	case CLOCK_EVT_MODE_UNUSED:
			
 
				+	case CLOCK_EVT_MODE_SHUTDOWN:
			
 
				+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
			
 
				+		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			
 
				+			BUG();
			
 
				+		break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int xen_vcpuop_set_next_event(unsigned long delta,
			
 
				+				     struct clock_event_device *evt)
			
 
				+{
			
 
				+	int cpu = smp_processor_id();
			
 
				+	struct vcpu_set_singleshot_timer single;
			
 
				+	int ret;
			
 
				+
			
 
				+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
			
 
				+
			
 
				+	single.timeout_abs_ns = get_abs_timeout(delta);
			
 
				+	single.flags = VCPU_SSHOTTMR_future;
			
 
				+
			
 
				+	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
			
 
				+
			
 
				+	BUG_ON(ret != 0 && ret != -ETIME);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static const struct clock_event_device xen_vcpuop_clockevent = {
			
 
				+	.name = "xen",
			
 
				+	.features = CLOCK_EVT_FEAT_ONESHOT,
			
 
				+
			
 
				+	.max_delta_ns = 0xffffffff,
			
 
				+	.min_delta_ns = TIMER_SLOP,
			
 
				+
			
 
				+	.mult = 1,
			
 
				+	.shift = 0,
			
 
				+	.rating = 500,
			
 
				+
			
 
				+	.set_mode = xen_vcpuop_set_mode,
			
 
				+	.set_next_event = xen_vcpuop_set_next_event,
			
 
				+};
			
 
				+
			
 
				+static const struct clock_event_device *xen_clockevent =
			
 
				+	&xen_timerop_clockevent;
			
 
				+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
			
 
				+
			
 
				+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
			
 
				+{
			
 
				+	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
			
 
				+	irqreturn_t ret;
			
 
				+
			
 
				+	ret = IRQ_NONE;
			
 
				+	if (evt->event_handler) {
			
 
				+		evt->event_handler(evt);
			
 
				+		ret = IRQ_HANDLED;
			
 
				+	}
			
 
				+
			
 
				+	do_stolen_accounting();
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+void xen_setup_timer(int cpu)
			
 
				+{
			
 
				+	const char *name;
			
 
				+	struct clock_event_device *evt;
			
 
				+	int irq;
			
 
				+
			
 
				+	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
			
 
				+
			
 
				+	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
			
 
				+	if (!name)
			
 
				+		name = "<timer kasprintf failed>";
			
 
				+
			
 
				+	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
			
 
				+				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
			
 
				+				      name, NULL);
			
 
				+
			
 
				+	evt = &per_cpu(xen_clock_events, cpu);
			
 
				+	memcpy(evt, xen_clockevent, sizeof(*evt));
			
 
				+
			
 
				+	evt->cpumask = cpumask_of_cpu(cpu);
			
 
				+	evt->irq = irq;
			
 
				+
			
 
				+	setup_runstate_info(cpu);
			
 
				+}
			
 
				+
			
 
				+void xen_setup_cpu_clockevents(void)
			
 
				+{
			
 
				+	BUG_ON(preemptible());
			
 
				+
			
 
				+	clockevents_register_device(&__get_cpu_var(xen_clock_events));
			
 
				+}
			
 
				+
			
 
				+__init void xen_time_init(void)
			
 
				+{
			
 
				+	int cpu = smp_processor_id();
			
 
				+
			
 
				+	get_time_values_from_xen();
			
 
				+
			
 
				+	clocksource_register(&xen_clocksource);
			
 
				+
			
 
				+	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
			
 
				+		/* Successfully turned off 100Hz tick, so we have the
			
 
				+		   vcpuop-based timer interface */
			
 
				+		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
			
 
				+		xen_clockevent = &xen_vcpuop_clockevent;
			
 
				+	}
			
 
				+
			
 
				+	/* Set initial system time with full resolution */
			
 
				+	xen_read_wallclock(&xtime);
			
 
				+	set_normalized_timespec(&wall_to_monotonic,
			
 
				+				-xtime.tv_sec, -xtime.tv_nsec);
			
 
				+
			
 
				+	tsc_disable = 0;
			
 
				+
			
 
				+	xen_setup_timer(cpu);
			
 
				+	xen_setup_cpu_clockevents();
			
 
				+}
			
--- a/arch/i386/xen/xen-asm.S
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,291 @@
 
				+/*
			
 
				+	Asm versions of Xen pv-ops, suitable for either direct use or inlining.
			
 
				+	The inline versions are the same as the direct-use versions, with the
			
 
				+	pre- and post-amble chopped off.
			
 
				+
			
 
				+	This code is encoded for size rather than absolute efficiency,
			
 
				+	with a view to being able to inline as much as possible.
			
 
				+
			
 
				+	We only bother with direct forms (ie, vcpu in pda) of the operations
			
 
				+	here; the indirect forms are better handled in C, since they're
			
 
				+	generally too large to inline anyway.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/linkage.h>
			
 
				+
			
 
				+#include <asm/asm-offsets.h>
			
 
				+#include <asm/thread_info.h>
			
 
				+#include <asm/percpu.h>
			
 
				+#include <asm/processor-flags.h>
			
 
				+#include <asm/segment.h>
			
 
				+
			
 
				+#include <xen/interface/xen.h>
			
 
				+
			
 
				+#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
			
 
				+#define ENDPATCH(x)	.globl x##_end; x##_end=.
			
 
				+
			
 
				+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
			
 
				+#define XEN_EFLAGS_NMI	0x80000000
			
 
				+
			
 
				+/*
			
 
				+	Enable events.  This clears the event mask and tests the pending
			
 
				+	event status with a single AND operation.  If there are pending
			
 
				+	events, then enter the hypervisor to get them handled.
			
 
				+ */
			
 
				+ENTRY(xen_irq_enable_direct)
			
 
				+	/* Clear mask and test pending */
			
 
				+	andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
			
 
				+	/* Preempt here doesn't matter because that will deal with
			
 
				+	   any pending interrupts.  The pending check may end up being
			
 
				+	   run on the wrong CPU, but that doesn't hurt. */
			
 
				+	jz 1f
			
 
				+2:	call check_events
			
 
				+1:
			
 
				+ENDPATCH(xen_irq_enable_direct)
			
 
				+	ret
			
 
				+	ENDPROC(xen_irq_enable_direct)
			
 
				+	RELOC(xen_irq_enable_direct, 2b+1)
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+	Disabling events is simply a matter of making the event mask
			
 
				+	non-zero.
			
 
				+ */
			
 
				+ENTRY(xen_irq_disable_direct)
			
 
				+	movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
			
 
				+ENDPATCH(xen_irq_disable_direct)
			
 
				+	ret
			
 
				+	ENDPROC(xen_irq_disable_direct)
			
 
				+	RELOC(xen_irq_disable_direct, 0)
			
 
				+
			
 
				+/*
			
 
				+	(xen_)save_fl is used to get the current interrupt enable status.
			
 
				+	Callers expect the status to be in X86_EFLAGS_IF, and other bits
			
 
				+	may be set in the return value.  We take advantage of this by
			
 
				+	making sure that X86_EFLAGS_IF has the right value (and other bits
			
 
				+	in that byte are 0), but other bits in the return value are
			
 
				+	undefined.  We need to toggle the state of the bit, because
			
 
				+	Xen and x86 use opposite senses (mask vs enable).
			
 
				+ */
			
 
				+ENTRY(xen_save_fl_direct)
			
 
				+	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
			
 
				+	setz %ah
			
 
				+	addb %ah,%ah
			
 
				+ENDPATCH(xen_save_fl_direct)
			
 
				+	ret
			
 
				+	ENDPROC(xen_save_fl_direct)
			
 
				+	RELOC(xen_save_fl_direct, 0)
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+	In principle the caller should be passing us a value returned
			
 
				+	from xen_save_fl_direct, but for robustness' sake we test only
			
 
				+	the X86_EFLAGS_IF flag rather than the whole byte. After
			
 
				+	setting the interrupt mask state, it checks for unmasked
			
 
				+	pending events and enters the hypervisor to get them delivered
			
 
				+	if so.
			
 
				+ */
			
 
				+ENTRY(xen_restore_fl_direct)
			
 
				+	testb $X86_EFLAGS_IF>>8, %ah
			
 
				+	setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
			
 
				+	/* Preempt here doesn't matter because that will deal with
			
 
				+	   any pending interrupts.  The pending check may end up being
			
 
				+	   run on the wrong CPU, but that doesn't hurt. */
			
 
				+
			
 
				+	/* check for unmasked and pending */
			
 
				+	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
			
 
				+	jz 1f
			
 
				+2:	call check_events
			
 
				+1:
			
 
				+ENDPATCH(xen_restore_fl_direct)
			
 
				+	ret
			
 
				+	ENDPROC(xen_restore_fl_direct)
			
 
				+	RELOC(xen_restore_fl_direct, 2b+1)
			
 
				+
			
 
				+/*
			
 
				+	This is run where a normal iret would be run, with the same stack setup:
			
 
				+	      8: eflags
			
 
				+	      4: cs
			
 
				+	esp-> 0: eip
			
 
				+
			
 
				+	This attempts to make sure that any pending events are dealt
			
 
				+	with on return to usermode, but there is a small window in
			
 
				+	which an event can happen just before entering usermode.  If
			
 
				+	the nested interrupt ends up setting one of the TIF_WORK_MASK
			
 
				+	pending work flags, they will not be tested again before
			
 
				+	returning to usermode. This means that a process can end up
			
 
				+	with pending work, which will be unprocessed until the process
			
 
				+	enters and leaves the kernel again, which could be an
			
 
				+	unbounded amount of time.  This means that a pending signal or
			
 
				+	reschedule event could be indefinitely delayed.
			
 
				+
			
 
				+	The fix is to notice a nested interrupt in the critical
			
 
				+	window, and if one occurs, then fold the nested interrupt into
			
 
				+	the current interrupt stack frame, and re-process it
			
 
				+	iteratively rather than recursively.  This means that it will
			
 
				+	exit via the normal path, and all pending work will be dealt
			
 
				+	with appropriately.
			
 
				+
			
 
				+	Because the nested interrupt handler needs to deal with the
			
 
				+	current stack state in whatever form it's in, we keep things
			
 
				+	simple by only using a single register which is pushed/popped
			
 
				+	on the stack.
			
 
				+
			
 
				+	Non-direct iret could be done in the same way, but it would
			
 
				+	require an annoying amount of code duplication.  We'll assume
			
 
				+	that direct mode will be the common case once the hypervisor
			
 
				+	support becomes commonplace.
			
 
				+ */
			
 
				+ENTRY(xen_iret_direct)
			
 
				+	/* test eflags for special cases */
			
 
				+	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
			
 
				+	jnz hyper_iret
			
 
				+
			
 
				+	push %eax
			
 
				+	ESP_OFFSET=4	# bytes pushed onto stack
			
 
				+
			
 
				+	/* Store vcpu_info pointer for easy access.  Do it this
			
 
				+	   way to avoid having to reload %fs */
			
 
				+#ifdef CONFIG_SMP
			
 
				+	GET_THREAD_INFO(%eax)
			
 
				+	movl TI_cpu(%eax),%eax
			
 
				+	movl __per_cpu_offset(,%eax,4),%eax
			
 
				+	lea per_cpu__xen_vcpu_info(%eax),%eax
			
 
				+#else
			
 
				+	movl $per_cpu__xen_vcpu_info, %eax
			
 
				+#endif
			
 
				+
			
 
				+	/* check IF state we're restoring */
			
 
				+	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
			
 
				+
			
 
				+	/* Maybe enable events.  Once this happens we could get a
			
 
				+	   recursive event, so the critical region starts immediately
			
 
				+	   afterwards.  However, if that happens we don't end up
			
 
				+	   resuming the code, so we don't have to be worried about
			
 
				+	   being preempted to another CPU. */
			
 
				+	setz XEN_vcpu_info_mask(%eax)
			
 
				+xen_iret_start_crit:
			
 
				+
			
 
				+	/* check for unmasked and pending */
			
 
				+	cmpw $0x0001, XEN_vcpu_info_pending(%eax)
			
 
				+
			
 
				+	/* If there's something pending, mask events again so we
			
 
				+	   can jump back into xen_hypervisor_callback */
			
 
				+	sete XEN_vcpu_info_mask(%eax)
			
 
				+
			
 
				+	popl %eax
			
 
				+
			
 
				+	/* From this point on the registers are restored and the stack
			
 
				+	   updated, so we don't need to worry about it if we're preempted */
			
 
				+iret_restore_end:
			
 
				+
			
 
				+	/* Jump to hypervisor_callback after fixing up the stack.
			
 
				+	   Events are masked, so jumping out of the critical
			
 
				+	   region is OK. */
			
 
				+	je xen_hypervisor_callback
			
 
				+
			
 
				+	iret
			
 
				+xen_iret_end_crit:
			
 
				+
			
 
				+hyper_iret:
			
 
				+	/* put this out of line since it's very rarely used */
			
 
				+	jmp hypercall_page + __HYPERVISOR_iret * 32
			
 
				+
			
 
				+	.globl xen_iret_start_crit, xen_iret_end_crit
			
 
				+
			
 
				+/*
			
 
				+   This is called by xen_hypervisor_callback in entry.S when it sees
			
 
				+   that the EIP at the time of interrupt was between xen_iret_start_crit
			
 
				+   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
			
 
				+   a more refined determination of what to do.
			
 
				+
			
 
				+   The stack format at this point is:
			
 
				+	----------------
			
 
				+	 ss		: (ss/esp may be present if we came from usermode)
			
 
				+	 esp		:
			
 
				+	 eflags		}  outer exception info
			
 
				+	 cs		}
			
 
				+	 eip		}
			
 
				+	---------------- <- edi (copy dest)
			
 
				+	 eax		:  outer eax if it hasn't been restored
			
 
				+	----------------
			
 
				+	 eflags		}  nested exception info
			
 
				+	 cs		}   (no ss/esp because we're nested
			
 
				+	 eip		}    from the same ring)
			
 
				+	 orig_eax	}<- esi (copy src)
			
 
				+	 - - - - - - - -
			
 
				+	 fs		}
			
 
				+	 es		}
			
 
				+	 ds		}  SAVE_ALL state
			
 
				+	 eax		}
			
 
				+	  :		:
			
 
				+	 ebx		}
			
 
				+	----------------
			
 
				+	 return addr	 <- esp
			
 
				+	----------------
			
 
				+
			
 
				+   In order to deliver the nested exception properly, we need to shift
			
 
				+   everything from the return addr up to the error code so it
			
 
				+   sits just under the outer exception info.  This means that when we
			
 
				+   handle the exception, we do it in the context of the outer exception
			
 
				+   rather than starting a new one.
			
 
				+
			
 
				+   The only caveat is that if the outer eax hasn't been
			
 
				+   restored yet (ie, it's still on stack), we need to insert
			
 
				+   its value into the SAVE_ALL state before going on, since
			
 
				+   it's usermode state which we eventually need to restore.
			
 
				+ */
			
 
				+ENTRY(xen_iret_crit_fixup)
			
 
				+	/* offsets +4 for return address */
			
 
				+
			
 
				+	/*
			
 
				+	   Paranoia: Make sure we're really coming from kernel space.
			
 
				+	   One could imagine a case where userspace jumps into the
			
 
				+	   critical range address, but just before the CPU delivers a GP,
			
 
				+	   it decides to deliver an interrupt instead.  Unlikely?
			
 
				+	   Definitely.  Easy to avoid?  Yes.  The Intel documents
			
 
				+	   explicitly say that the reported EIP for a bad jump is the
			
 
				+	   jump instruction itself, not the destination, but some virtual
			
 
				+	   environments get this wrong.
			
 
				+	 */
			
 
				+	movl PT_CS+4(%esp), %ecx
			
 
				+	andl $SEGMENT_RPL_MASK, %ecx
			
 
				+	cmpl $USER_RPL, %ecx
			
 
				+	je 2f
			
 
				+
			
 
				+	lea PT_ORIG_EAX+4(%esp), %esi
			
 
				+	lea PT_EFLAGS+4(%esp), %edi
			
 
				+
			
 
				+	/* If eip is before iret_restore_end then stack
			
 
				+	   hasn't been restored yet. */
			
 
				+	cmp $iret_restore_end, %eax
			
 
				+	jae 1f
			
 
				+
			
 
				+	movl 0+4(%edi),%eax		/* copy EAX */
			
 
				+	movl %eax, PT_EAX+4(%esp)
			
 
				+
			
 
				+	lea ESP_OFFSET(%edi),%edi	/* move dest up over saved regs */
			
 
				+
			
 
				+	/* set up the copy */
			
 
				+1:	std
			
 
				+	mov $(PT_EIP+4) / 4, %ecx	/* copy ret+saved regs up to orig_eax */
			
 
				+	rep movsl
			
 
				+	cld
			
 
				+
			
 
				+	lea 4(%edi),%esp		/* point esp to new frame */
			
 
				+2:	ret
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+	Force an event check by making a hypercall,
			
 
				+	but preserve regs before making the call.
			
 
				+ */
			
 
				+check_events:
			
 
				+	push %eax
			
 
				+	push %ecx
			
 
				+	push %edx
			
 
				+	call force_evtchn_callback
			
 
				+	pop %edx
			
 
				+	pop %ecx
			
 
				+	pop %eax
			
 
				+	ret
			
--- a/arch/i386/xen/xen-head.S
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,36 @@
 
				+/* Xen-specific pieces of head.S, intended to be included in the right
			
 
				+	place in head.S */
			
 
				+
			
 
				+#ifdef CONFIG_XEN
			
 
				+
			
 
				+#include <linux/elfnote.h>
			
 
				+#include <asm/boot.h>
			
 
				+#include <xen/interface/elfnote.h>
			
 
				+
			
 
				+ENTRY(startup_xen)
			
 
				+	movl %esi,xen_start_info
			
 
				+	cld
			
 
				+	movl $(init_thread_union+THREAD_SIZE),%esp
			
 
				+	jmp xen_start_kernel
			
 
				+
			
 
				+.pushsection ".bss.page_aligned"
			
 
				+	.align PAGE_SIZE_asm
			
 
				+ENTRY(hypercall_page)
			
 
				+	.skip 0x1000
			
 
				+.popsection
			
 
				+
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
			
 
				+#else
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
			
 
				+#endif
			
 
				+	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
			
 
				+
			
 
				+#endif /*CONFIG_XEN */
			
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,71 @@
 
				+#ifndef XEN_OPS_H
			
 
				+#define XEN_OPS_H
			
 
				+
			
 
				+#include <linux/init.h>
			
 
				+
			
 
				+/* These are code, but not functions.  Defined in entry.S */
			
 
				+extern const char xen_hypervisor_callback[];
			
 
				+extern const char xen_failsafe_callback[];
			
 
				+
			
 
				+void xen_copy_trap_info(struct trap_info *traps);
			
 
				+
			
 
				+DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
			
 
				+DECLARE_PER_CPU(unsigned long, xen_cr3);
			
 
				+
			
 
				+extern struct start_info *xen_start_info;
			
 
				+extern struct shared_info *HYPERVISOR_shared_info;
			
 
				+
			
 
				+char * __init xen_memory_setup(void);
			
 
				+void __init xen_arch_setup(void);
			
 
				+void __init xen_init_IRQ(void);
			
 
				+
			
 
				+void xen_setup_timer(int cpu);
			
 
				+void xen_setup_cpu_clockevents(void);
			
 
				+unsigned long xen_cpu_khz(void);
			
 
				+void __init xen_time_init(void);
			
 
				+unsigned long xen_get_wallclock(void);
			
 
				+int xen_set_wallclock(unsigned long time);
			
 
				+unsigned long long xen_sched_clock(void);
			
 
				+
			
 
				+void xen_mark_init_mm_pinned(void);
			
 
				+
			
 
				+DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
			
 
				+
			
 
				+static inline unsigned xen_get_lazy_mode(void)
			
 
				+{
			
 
				+	return x86_read_percpu(xen_lazy_mode);
			
 
				+}
			
 
				+
			
 
				+void __init xen_fill_possible_map(void);
			
 
				+
			
 
				+void __init xen_setup_vcpu_info_placement(void);
			
 
				+void xen_smp_prepare_boot_cpu(void);
			
 
				+void xen_smp_prepare_cpus(unsigned int max_cpus);
			
 
				+int xen_cpu_up(unsigned int cpu);
			
 
				+void xen_smp_cpus_done(unsigned int max_cpus);
			
 
				+
			
 
				+void xen_smp_send_stop(void);
			
 
				+void xen_smp_send_reschedule(int cpu);
			
 
				+int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
			
 
				+			   int wait);
			
 
				+int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
			
 
				+				 int nonatomic, int wait);
			
 
				+
			
 
				+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
			
 
				+			       void *info, int wait);
			
 
				+
			
 
				+
			
 
				+/* Declare an asm function, along with symbols needed to make it
			
 
				+   inlineable */
			
 
				+#define DECL_ASM(ret, name, ...)		\
			
 
				+	ret name(__VA_ARGS__);			\
			
 
				+	extern char name##_end[];		\
			
 
				+	extern char name##_reloc[]		\
			
 
				+
			
 
				+DECL_ASM(void, xen_irq_enable_direct, void);
			
 
				+DECL_ASM(void, xen_irq_disable_direct, void);
			
 
				+DECL_ASM(unsigned long, xen_save_fl_direct, void);
			
 
				+DECL_ASM(void, xen_restore_fl_direct, unsigned long);
			
 
				+
			
 
				+void xen_iret_direct(void);
			
 
				+#endif /* XEN_OPS_H */
			
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -6,6 +6,7 @@
 
				 #include <asm/io.h>
			
 
				 #include <asm/processor.h>
			
 
				 #include <asm/fcntl.h>
			
 
				+#include <xen/hvc-console.h>
			
 
				 
			
 
				 /* Simple VGA output */
			
 
				 
			
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
 
				  		simnow_init(buf + 6);
			
 
				  		early_console = &simnow_console;
			
 
				  		keep_early = 1;
			
 
				+#ifdef CONFIG_HVC_XEN
			
 
				+	} else if (!strncmp(buf, "xen", 3)) {
			
 
				+		early_console = &xenboot_console;
			
 
				+#endif
			
 
				 	}
			
 
				 
			
 
				 	if (keep_early)
			
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
 
				 	if (events != atomic_read(&mce_logged) && trigger[0]) {
			
 
				 		/* Small race window, but should be harmless.  */
			
 
				 		atomic_set(&mce_logged, events);
			
 
				-		call_usermodehelper(trigger, trigger_argv, NULL, -1);
			
 
				+		call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI)		+= acpi/
 
				 obj-$(CONFIG_PNP)		+= pnp/
			
 
				 obj-$(CONFIG_ARM_AMBA)		+= amba/
			
 
				 
			
 
				+obj-$(CONFIG_XEN)		+= xen/
			
 
				+
			
 
				 # char/ comes before serial/ etc so that the VT console is the boot-time
			
 
				 # default.
			
 
				 obj-y				+= char/
			
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -40,6 +40,7 @@
 
				 #include <linux/jiffies.h>
			
 
				 #include <linux/kmod.h>
			
 
				 #include <linux/seq_file.h>
			
 
				+#include <linux/reboot.h>
			
 
				 #include <asm/uaccess.h>
			
 
				 
			
 
				 #include <acpi/acpi_bus.h>
			
@@ -59,7 +60,6 @@
 
				 #define ACPI_THERMAL_NOTIFY_CRITICAL	0xF0
			
 
				 #define ACPI_THERMAL_NOTIFY_HOT		0xF1
			
 
				 #define ACPI_THERMAL_MODE_ACTIVE	0x00
			
 
				-#define ACPI_THERMAL_PATH_POWEROFF	"/sbin/poweroff"
			
 
				 
			
 
				 #define ACPI_THERMAL_MAX_ACTIVE	10
			
 
				 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
			
@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int acpi_thermal_call_usermode(char *path)
			
 
				-{
			
 
				-	char *argv[2] = { NULL, NULL };
			
 
				-	char *envp[3] = { NULL, NULL, NULL };
			
 
				-
			
 
				-
			
 
				-	if (!path)
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	argv[0] = path;
			
 
				-
			
 
				-	/* minimal command environment */
			
 
				-	envp[0] = "HOME=/";
			
 
				-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
			
 
				-
			
 
				-	call_usermodehelper(argv[0], argv, envp, 0);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				 static int acpi_thermal_critical(struct acpi_thermal *tz)
			
 
				 {
			
 
				 	if (!tz || !tz->trips.critical.flags.valid)
			
@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
 
				 	acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
			
 
				 				tz->trips.critical.flags.enabled);
			
 
				 
			
 
				-	acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF);
			
 
				+	orderly_poweroff(true);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -427,4 +427,13 @@ config XILINX_SYSACE
 
				 	help
			
 
				 	  Include support for the Xilinx SystemACE CompactFlash interface
			
 
				 
			
 
				+config XEN_BLKDEV_FRONTEND
			
 
				+	tristate "Xen virtual block device support"
			
 
				+	depends on XEN
			
 
				+	default y
			
 
				+	help
			
 
				+	  This driver implements the front-end of the Xen virtual
			
 
				+	  block device driver.  It communicates with a back-end driver
			
 
				+	  in another domain which drives the actual block device.
			
 
				+
			
 
				 endif # BLK_DEV
			
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD)		+= viodasd.o
 
				 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
			
 
				 obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
			
 
				 
			
 
				+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
			
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,988 @@
 
				+/*
			
 
				+ * blkfront.c
			
 
				+ *
			
 
				+ * XenLinux virtual block device driver.
			
 
				+ *
			
 
				+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
			
 
				+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
			
 
				+ * Copyright (c) 2004, Christian Limpach
			
 
				+ * Copyright (c) 2004, Andrew Warfield
			
 
				+ * Copyright (c) 2005, Christopher Clark
			
 
				+ * Copyright (c) 2005, XenSource Ltd
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/interrupt.h>
			
 
				+#include <linux/blkdev.h>
			
 
				+#include <linux/module.h>
			
 
				+
			
 
				+#include <xen/xenbus.h>
			
 
				+#include <xen/grant_table.h>
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/page.h>
			
 
				+
			
 
				+#include <xen/interface/grant_table.h>
			
 
				+#include <xen/interface/io/blkif.h>
			
 
				+
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+
			
 
				+enum blkif_state {
			
 
				+	BLKIF_STATE_DISCONNECTED,
			
 
				+	BLKIF_STATE_CONNECTED,
			
 
				+	BLKIF_STATE_SUSPENDED,
			
 
				+};
			
 
				+
			
 
				+struct blk_shadow {
			
 
				+	struct blkif_request req;
			
 
				+	unsigned long request;
			
 
				+	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
			
 
				+};
			
 
				+
			
 
				+static struct block_device_operations xlvbd_block_fops;
			
 
				+
			
 
				+#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
			
 
				+
			
 
				+/*
			
 
				+ * We have one of these per vbd, whether ide, scsi or 'other'.  They
			
 
				+ * hang in private_data off the gendisk structure. We may end up
			
 
				+ * putting all kinds of interesting stuff here :-)
			
 
				+ */
			
 
				+struct blkfront_info
			
 
				+{
			
 
				+	struct xenbus_device *xbdev;
			
 
				+	dev_t dev;
			
 
				+	struct gendisk *gd;
			
 
				+	int vdevice;
			
 
				+	blkif_vdev_t handle;
			
 
				+	enum blkif_state connected;
			
 
				+	int ring_ref;
			
 
				+	struct blkif_front_ring ring;
			
 
				+	unsigned int evtchn, irq;
			
 
				+	struct request_queue *rq;
			
 
				+	struct work_struct work;
			
 
				+	struct gnttab_free_callback callback;
			
 
				+	struct blk_shadow shadow[BLK_RING_SIZE];
			
 
				+	unsigned long shadow_free;
			
 
				+	int feature_barrier;
			
 
				+
			
 
				+	/**
			
 
				+	 * The number of people holding this device open.  We won't allow a
			
 
				+	 * hot-unplug unless this is 0.
			
 
				+	 */
			
 
				+	int users;
			
 
				+};
			
 
				+
			
 
				+static DEFINE_SPINLOCK(blkif_io_lock);
			
 
				+
			
 
				+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
			
 
				+	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
			
 
				+#define GRANT_INVALID_REF	0
			
 
				+
			
 
				+#define PARTS_PER_DISK		16
			
 
				+
			
 
				+#define BLKIF_MAJOR(dev) ((dev)>>8)
			
 
				+#define BLKIF_MINOR(dev) ((dev) & 0xff)
			
 
				+
			
 
				+#define DEV_NAME	"xvd"	/* name in /dev */
			
 
				+
			
 
				+/* Information about our VBDs. */
			
 
				+#define MAX_VBDS 64
			
 
				+static LIST_HEAD(vbds_list);
			
 
				+
			
 
				+static int get_id_from_freelist(struct blkfront_info *info)	/* pop the head of the shadow free list */
			
 
				+{
			
 
				+	unsigned long free = info->shadow_free;
			
 
				+	BUG_ON(free >= BLK_RING_SIZE);	/* shadow[] has BLK_RING_SIZE entries, so valid ids are 0..BLK_RING_SIZE-1 */
			
 
				+	info->shadow_free = info->shadow[free].req.id;	/* free slots chain through req.id */
			
 
				+	info->shadow[free].req.id = 0x0fffffee; /* debug */
			
 
				+	return free;
			
 
				+}
			
 
				+
			
 
				+static void add_id_to_freelist(struct blkfront_info *info,
			
 
				+			       unsigned long id)	/* push shadow slot @id back onto the free list */
			
 
				+{
			
 
				+	info->shadow[id].req.id  = info->shadow_free;	/* link the slot to the old list head */
			
 
				+	info->shadow[id].request = 0;	/* no request is attached to this slot any more */
			
 
				+	info->shadow_free = id;	/* the slot becomes the new list head */
			
 
				+}
			
 
				+
			
 
				+static void blkif_restart_queue_callback(void *arg)
			
 
				+{
			
 
				+	struct blkfront_info *info = (struct blkfront_info *)arg;
			
 
				+	schedule_work(&info->work);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * blkif_queue_request
			
 
				+ *
			
 
				+ * request block io
			
 
				+ *
			
 
				+ * id: for guest use only.
			
 
				+ * operation: BLKIF_OP_{READ,WRITE,WRITE_BARRIER}
			
 
				+ * buffer: buffer to read/write into. this should be a
			
 
				+ *   virtual address in the guest os.
			
 
				+ */
			
 
				+static int blkif_queue_request(struct request *req)
			
 
				+{
			
 
				+	struct blkfront_info *info = req->rq_disk->private_data;
			
 
				+	unsigned long buffer_mfn;
			
 
				+	struct blkif_request *ring_req;
			
 
				+	struct bio *bio;
			
 
				+	struct bio_vec *bvec;
			
 
				+	int idx;
			
 
				+	unsigned long id;
			
 
				+	unsigned int fsect, lsect;
			
 
				+	int ref;
			
 
				+	grant_ref_t gref_head;
			
 
				+
			
 
				+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
			
 
				+		return 1;
			
 
				+
			
 
				+	if (gnttab_alloc_grant_references(
			
 
				+		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
			
 
				+		gnttab_request_free_callback(
			
 
				+			&info->callback,
			
 
				+			blkif_restart_queue_callback,
			
 
				+			info,
			
 
				+			BLKIF_MAX_SEGMENTS_PER_REQUEST);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	/* Fill out a communications ring structure. */
			
 
				+	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
			
 
				+	id = get_id_from_freelist(info);
			
 
				+	info->shadow[id].request = (unsigned long)req;
			
 
				+
			
 
				+	ring_req->id = id;
			
 
				+	ring_req->sector_number = (blkif_sector_t)req->sector;
			
 
				+	ring_req->handle = info->handle;
			
 
				+
			
 
				+	ring_req->operation = rq_data_dir(req) ?
			
 
				+		BLKIF_OP_WRITE : BLKIF_OP_READ;
			
 
				+	if (blk_barrier_rq(req))
			
 
				+		ring_req->operation = BLKIF_OP_WRITE_BARRIER;
			
 
				+
			
 
				+	ring_req->nr_segments = 0;
			
 
				+	rq_for_each_bio (bio, req) {
			
 
				+		bio_for_each_segment (bvec, bio, idx) {
			
 
				+			BUG_ON(ring_req->nr_segments
			
 
				+			       == BLKIF_MAX_SEGMENTS_PER_REQUEST);
			
 
				+			buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
			
 
				+			fsect = bvec->bv_offset >> 9;
			
 
				+			lsect = fsect + (bvec->bv_len >> 9) - 1;
			
 
				+			/* install a grant reference. */
			
 
				+			ref = gnttab_claim_grant_reference(&gref_head);
			
 
				+			BUG_ON(ref == -ENOSPC);
			
 
				+
			
 
				+			gnttab_grant_foreign_access_ref(
			
 
				+				ref,
			
 
				+				info->xbdev->otherend_id,
			
 
				+				buffer_mfn,
			
 
				+				rq_data_dir(req) );
			
 
				+
			
 
				+			info->shadow[id].frame[ring_req->nr_segments] =
			
 
				+				mfn_to_pfn(buffer_mfn);
			
 
				+
			
 
				+			ring_req->seg[ring_req->nr_segments] =
			
 
				+				(struct blkif_request_segment) {
			
 
				+					.gref       = ref,
			
 
				+					.first_sect = fsect,
			
 
				+					.last_sect  = lsect };
			
 
				+
			
 
				+			ring_req->nr_segments++;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	info->ring.req_prod_pvt++;
			
 
				+
			
 
				+	/* Keep a private copy so we can reissue requests when recovering. */
			
 
				+	info->shadow[id].req = *ring_req;
			
 
				+
			
 
				+	gnttab_free_grant_references(gref_head);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static inline void flush_requests(struct blkfront_info *info)	/* publish queued ring requests */
			
 
				+{
			
 
				+	int notify;
			
 
				+
			
 
				+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);	/* sets notify */
			
 
				+
			
 
				+	if (notify)
			
 
				+		notify_remote_via_irq(info->irq);	/* signal the other end only when the ring macro asked for it */
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * do_blkif_request
			
 
				+ *  read or write blocks; requests are taken from a request queue
			
 
				+ */
			
 
				+static void do_blkif_request(request_queue_t *rq)	/* request_fn: move queued requests onto the ring */
			
 
				+{
			
 
				+	struct blkfront_info *info = NULL;
			
 
				+	struct request *req;
			
 
				+	int queued;
			
 
				+
			
 
				+	pr_debug("Entered do_blkif_request\n");
			
 
				+
			
 
				+	queued = 0;
			
 
				+
			
 
				+	while ((req = elv_next_request(rq)) != NULL) {
			
 
				+		info = req->rq_disk->private_data;
			
 
				+		if (!blk_fs_request(req)) {
			
 
				+			end_request(req, 0);	/* only fs requests are handled; end anything else with uptodate == 0 */
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		if (RING_FULL(&info->ring))
			
 
				+			goto wait;	/* no free ring slot: stop the queue, request stays queued */
			
 
				+
			
 
				+		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
			
 
				+			 "(%u/%li) buffer:%p [%s]\n",
			
 
				+			 req, req->cmd, (unsigned long)req->sector,
			
 
				+			 req->current_nr_sectors,
			
 
				+			 req->nr_sectors, req->buffer,
			
 
				+			 rq_data_dir(req) ? "write" : "read");
			
 
				+
			
 
				+
			
 
				+		blkdev_dequeue_request(req);
			
 
				+		if (blkif_queue_request(req)) {
			
 
				+			blk_requeue_request(rq, req);	/* could not be placed on the ring: hand it back */
			
 
				+wait:	/* also entered directly by the RING_FULL check above */
			
 
				+			/* Avoid pointless unplugs. */
			
 
				+			blk_stop_queue(rq);
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		queued++;
			
 
				+	}
			
 
				+
			
 
				+	if (queued != 0)
			
 
				+		flush_requests(info);	/* info is non-NULL here: queued != 0 means the loop assigned it */
			
 
				+}
			
 
				+
			
 
				+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
			
 
				+{
			
 
				+	request_queue_t *rq;
			
 
				+
			
 
				+	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
			
 
				+	if (rq == NULL)
			
 
				+		return -1;
			
 
				+
			
 
				+	elevator_init(rq, "noop");
			
 
				+
			
 
				+	/* Hard sector size and max sectors impersonate the equiv. hardware. */
			
 
				+	blk_queue_hardsect_size(rq, sector_size);
			
 
				+	blk_queue_max_sectors(rq, 512);
			
 
				+
			
 
				+	/* Each segment in a request is up to an aligned page in size. */
			
 
				+	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
			
 
				+	blk_queue_max_segment_size(rq, PAGE_SIZE);
			
 
				+
			
 
				+	/* Ensure a merged request will fit in a single I/O ring slot. */
			
 
				+	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
			
 
				+	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
			
 
				+
			
 
				+	/* Make sure buffer addresses are sector-aligned. */
			
 
				+	blk_queue_dma_alignment(rq, 511);
			
 
				+
			
 
				+	gd->queue = rq;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Apply info->feature_barrier to the request queue: draining ordered mode
				+ * when the flag is set, no ordering otherwise.  Logs the resulting state.
				+ * Returns 0 on success or the error from blk_queue_ordered().
				+ */
				+static int xlvbd_barrier(struct blkfront_info *info)
				+{
				+	int rc;
				+
				+	if (info->feature_barrier)
				+		rc = blk_queue_ordered(info->rq, QUEUE_ORDERED_DRAIN, NULL);
				+	else
				+		rc = blk_queue_ordered(info->rq, QUEUE_ORDERED_NONE, NULL);
				+	if (rc)
				+		return rc;
				+
				+	printk(KERN_INFO "blkfront: %s: barriers %s\n",
				+	       info->gd->disk_name,
				+	       info->feature_barrier ? "enabled" : "disabled");
				+	return 0;
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Allocate and initialise the gendisk and request queue for one virtual
				+ * block device and record both in @info.  The disk is not registered
				+ * here; blkfront_connect() calls add_disk() afterwards.
				+ *
				+ * @minor:       first minor number.  A multiple of PARTS_PER_DISK gets
				+ *               PARTS_PER_DISK minors (whole disk); anything else gets
				+ *               a single minor.
				+ * @capacity:    value handed to set_capacity().
				+ * @vdevice:     not used by this function.
				+ * @vdisk_info:  VDISK_READONLY / VDISK_REMOVABLE / VDISK_CDROM flag bits.
				+ * @sector_size: hard sector size for the queue.
				+ * @info:        per-device state; info->gd and info->rq must be NULL.
				+ *
				+ * Returns 0 on success, -ENODEV if the disk or its queue cannot be set up.
				+ */
				+static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
				+			       int vdevice, u16 vdisk_info, u16 sector_size,
				+			       struct blkfront_info *info)
				+{
				+	struct gendisk *gd;
				+	int nr_minors = 1;
				+	int err = -ENODEV;
				+
				+	BUG_ON(info->gd != NULL);
				+	BUG_ON(info->rq != NULL);
				+
				+	if ((minor % PARTS_PER_DISK) == 0)
				+		nr_minors = PARTS_PER_DISK;
				+
				+	gd = alloc_disk(nr_minors);
				+	if (gd == NULL)
				+		goto out;
				+
				+	if (nr_minors > 1)
				+		sprintf(gd->disk_name, "%s%c", DEV_NAME,
				+			'a' + minor / PARTS_PER_DISK);
				+	else
				+		sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
				+			'a' + minor / PARTS_PER_DISK,
				+			minor % PARTS_PER_DISK);
				+
				+	gd->major = XENVBD_MAJOR;
				+	gd->first_minor = minor;
				+	gd->fops = &xlvbd_block_fops;
				+	gd->private_data = info;
				+	gd->driverfs_dev = &(info->xbdev->dev);
				+	set_capacity(gd, capacity);
				+
				+	if (xlvbd_init_blk_queue(gd, sector_size)) {
				+		/*
				+		 * The disk was never registered with add_disk(), so
				+		 * del_gendisk() is the wrong teardown; just drop the
				+		 * reference taken by alloc_disk().
				+		 */
				+		put_disk(gd);
				+		goto out;
				+	}
				+
				+	info->rq = gd->queue;
				+	info->gd = gd;
				+
				+	if (info->feature_barrier)
				+		xlvbd_barrier(info);
				+
				+	if (vdisk_info & VDISK_READONLY)
				+		set_disk_ro(gd, 1);
				+
				+	if (vdisk_info & VDISK_REMOVABLE)
				+		gd->flags |= GENHD_FL_REMOVABLE;
				+
				+	if (vdisk_info & VDISK_CDROM)
				+		gd->flags |= GENHD_FL_CD;
				+
				+	return 0;
				+
				+ out:
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * If the ring has room, restart the block queue and run the request
				+ * function straight away.  Does nothing while the ring is full.
				+ */
				+static void kick_pending_request_queues(struct blkfront_info *info)
				+{
				+	if (RING_FULL(&info->ring))
				+		return;
				+
				+	/* Re-enable calldowns, then kick things off immediately. */
				+	blk_start_queue(info->rq);
				+	do_blkif_request(info->rq);
				+}
			
 
				+
			
 
				+/*
				+ * Work handler (installed by INIT_WORK in blkfront_probe): under
				+ * blkif_io_lock, kick the request queue if the device is connected.
				+ */
				+static void blkif_restart_queue(struct work_struct *work)
				+{
				+	struct blkfront_info *info;
				+
				+	info = container_of(work, struct blkfront_info, work);
				+
				+	spin_lock_irq(&blkif_io_lock);
				+	if (info->connected == BLKIF_STATE_CONNECTED)
				+		kick_pending_request_queues(info);
				+	spin_unlock_irq(&blkif_io_lock);
				+}
			
 
				+
			
 
				+static void blkif_free(struct blkfront_info *info, int suspend)
			
 
				+{
			
 
				+	/* Prevent new requests being issued until we fix things up. */
			
 
				+	spin_lock_irq(&blkif_io_lock);
			
 
				+	info->connected = suspend ?
			
 
				+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
			
 
				+	/* No more blkif_request(). */
			
 
				+	if (info->rq)
			
 
				+		blk_stop_queue(info->rq);
			
 
				+	/* No more gnttab callback work. */
			
 
				+	gnttab_cancel_free_callback(&info->callback);
			
 
				+	spin_unlock_irq(&blkif_io_lock);
			
 
				+
			
 
				+	/* Flush gnttab callback work. Must be done with no locks held. */
			
 
				+	flush_scheduled_work();
			
 
				+
			
 
				+	/* Free resources associated with old device channel. */
			
 
				+	if (info->ring_ref != GRANT_INVALID_REF) {
			
 
				+		gnttab_end_foreign_access(info->ring_ref, 0,
			
 
				+					  (unsigned long)info->ring.sring);
			
 
				+		info->ring_ref = GRANT_INVALID_REF;
			
 
				+		info->ring.sring = NULL;
			
 
				+	}
			
 
				+	if (info->irq)
			
 
				+		unbind_from_irqhandler(info->irq, info);
			
 
				+	info->evtchn = info->irq = 0;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+/* End foreign access on the grant reference of every segment of @s. */
				+static void blkif_completion(struct blk_shadow *s)
				+{
				+	int seg = 0;
				+
				+	while (seg < s->req.nr_segments) {
				+		gnttab_end_foreign_access(s->req.seg[seg].gref, 0, 0UL);
				+		seg++;
				+	}
				+}
			
 
				+
			
 
				+static irqreturn_t blkif_interrupt(int irq, void *dev_id)
			
 
				+{
			
 
				+	struct request *req;
			
 
				+	struct blkif_response *bret;
			
 
				+	RING_IDX i, rp;
			
 
				+	unsigned long flags;
			
 
				+	struct blkfront_info *info = (struct blkfront_info *)dev_id;
			
 
				+	int uptodate;
			
 
				+
			
 
				+	spin_lock_irqsave(&blkif_io_lock, flags);
			
 
				+
			
 
				+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
			
 
				+		spin_unlock_irqrestore(&blkif_io_lock, flags);
			
 
				+		return IRQ_HANDLED;
			
 
				+	}
			
 
				+
			
 
				+ again:
			
 
				+	rp = info->ring.sring->rsp_prod;
			
 
				+	rmb(); /* Ensure we see queued responses up to 'rp'. */
			
 
				+
			
 
				+	for (i = info->ring.rsp_cons; i != rp; i++) {
			
 
				+		unsigned long id;
			
 
				+		int ret;
			
 
				+
			
 
				+		bret = RING_GET_RESPONSE(&info->ring, i);
			
 
				+		id   = bret->id;
			
 
				+		req  = (struct request *)info->shadow[id].request;
			
 
				+
			
 
				+		blkif_completion(&info->shadow[id]);
			
 
				+
			
 
				+		add_id_to_freelist(info, id);
			
 
				+
			
 
				+		uptodate = (bret->status == BLKIF_RSP_OKAY);
			
 
				+		switch (bret->operation) {
			
 
				+		case BLKIF_OP_WRITE_BARRIER:
			
 
				+			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
			
 
				+				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
			
 
				+				       info->gd->disk_name);
			
 
				+				uptodate = -EOPNOTSUPP;
			
 
				+				info->feature_barrier = 0;
			
 
				+				xlvbd_barrier(info);
			
 
				+			}
			
 
				+			/* fall through */
			
 
				+		case BLKIF_OP_READ:
			
 
				+		case BLKIF_OP_WRITE:
			
 
				+			if (unlikely(bret->status != BLKIF_RSP_OKAY))
			
 
				+				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
			
 
				+					"request: %x\n", bret->status);
			
 
				+
			
 
				+			ret = end_that_request_first(req, uptodate,
			
 
				+				req->hard_nr_sectors);
			
 
				+			BUG_ON(ret);
			
 
				+			end_that_request_last(req, uptodate);
			
 
				+			break;
			
 
				+		default:
			
 
				+			BUG();
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	info->ring.rsp_cons = i;
			
 
				+
			
 
				+	if (i != info->ring.req_prod_pvt) {
			
 
				+		int more_to_do;
			
 
				+		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
			
 
				+		if (more_to_do)
			
 
				+			goto again;
			
 
				+	} else
			
 
				+		info->ring.sring->rsp_event = i + 1;
			
 
				+
			
 
				+	kick_pending_request_queues(info);
			
 
				+
			
 
				+	spin_unlock_irqrestore(&blkif_io_lock, flags);
			
 
				+
			
 
				+	return IRQ_HANDLED;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Allocate one page for the shared ring, initialise the front ring in
				+ * @info, grant the page to the other end, allocate an event channel and
				+ * bind it to blkif_interrupt().
				+ *
				+ * On success info->ring_ref, info->evtchn and info->irq are set and 0 is
				+ * returned.  On failure whatever was set up is released through
				+ * blkif_free() and a negative errno is returned.
				+ */
				+static int setup_blkring(struct xenbus_device *dev,
				+			 struct blkfront_info *info)
				+{
				+	struct blkif_sring *sring;
				+	int err;
				+
				+	info->ring_ref = GRANT_INVALID_REF;
				+
				+	sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
				+	if (!sring) {
				+		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
				+		return -ENOMEM;
				+	}
				+	SHARED_RING_INIT(sring);
				+	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
				+
				+	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
				+	if (err < 0) {
				+		/*
				+		 * ring_ref is still invalid, so blkif_free() will not
				+		 * release the page; free it here.
				+		 */
				+		free_page((unsigned long)sring);
				+		info->ring.sring = NULL;
				+		goto fail;
				+	}
				+	info->ring_ref = err;
				+
				+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
				+	if (err)
				+		goto fail;
				+
				+	err = bind_evtchn_to_irqhandler(info->evtchn,
				+					blkif_interrupt,
				+					IRQF_SAMPLE_RANDOM, "blkif", info);
				+	if (err <= 0) {
				+		/*
				+		 * This driver uses irq 0 as "no irq" (see blkif_free()),
				+		 * so a zero return is a failure too.  Map it to an errno
				+		 * so the caller does not mistake it for success after
				+		 * the ring has been torn down.
				+		 */
				+		if (err == 0)
				+			err = -ENODEV;
				+		xenbus_dev_fatal(dev, err,
				+				 "bind_evtchn_to_irqhandler failed");
				+		goto fail;
				+	}
				+	info->irq = err;
				+
				+	return 0;
				+fail:
				+	blkif_free(info, 0);
				+	return err;
				+}
			
 
				+
			
 
				+
			
 
				+/* Common code used when first setting up, and when resuming. */
			
 
				+static int talk_to_backend(struct xenbus_device *dev,
			
 
				+			   struct blkfront_info *info)
			
 
				+{
			
 
				+	const char *message = NULL;
			
 
				+	struct xenbus_transaction xbt;
			
 
				+	int err;
			
 
				+
			
 
				+	/* Create shared ring, alloc event channel. */
			
 
				+	err = setup_blkring(dev, info);
			
 
				+	if (err)
			
 
				+		goto out;
			
 
				+
			
 
				+again:
			
 
				+	err = xenbus_transaction_start(&xbt);
			
 
				+	if (err) {
			
 
				+		xenbus_dev_fatal(dev, err, "starting transaction");
			
 
				+		goto destroy_blkring;
			
 
				+	}
			
 
				+
			
 
				+	err = xenbus_printf(xbt, dev->nodename,
			
 
				+			    "ring-ref", "%u", info->ring_ref);
			
 
				+	if (err) {
			
 
				+		message = "writing ring-ref";
			
 
				+		goto abort_transaction;
			
 
				+	}
			
 
				+	err = xenbus_printf(xbt, dev->nodename,
			
 
				+			    "event-channel", "%u", info->evtchn);
			
 
				+	if (err) {
			
 
				+		message = "writing event-channel";
			
 
				+		goto abort_transaction;
			
 
				+	}
			
 
				+
			
 
				+	err = xenbus_transaction_end(xbt, 0);
			
 
				+	if (err) {
			
 
				+		if (err == -EAGAIN)
			
 
				+			goto again;
			
 
				+		xenbus_dev_fatal(dev, err, "completing transaction");
			
 
				+		goto destroy_blkring;
			
 
				+	}
			
 
				+
			
 
				+	xenbus_switch_state(dev, XenbusStateInitialised);
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+ abort_transaction:
			
 
				+	xenbus_transaction_end(xbt, 1);
			
 
				+	if (message)
			
 
				+		xenbus_dev_fatal(dev, err, "%s", message);
			
 
				+ destroy_blkring:
			
 
				+	blkif_free(info, 0);
			
 
				+ out:
			
 
				+	return err;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/**
			
 
				+ * Entry point to this code when a new device is created.  Allocate the basic
			
 
				+ * structures and the ring buffer for communication with the backend, and
			
 
				+ * inform the backend of the appropriate details for those.  Switch to
			
 
				+ * Initialised state.
			
 
				+ */
			
 
				+static int blkfront_probe(struct xenbus_device *dev,
			
 
				+			  const struct xenbus_device_id *id)
			
 
				+{
			
 
				+	int err, vdevice, i;
			
 
				+	struct blkfront_info *info;
			
 
				+
			
 
				+	/* FIXME: Use dynamic device id if this is not set. */
			
 
				+	err = xenbus_scanf(XBT_NIL, dev->nodename,
			
 
				+			   "virtual-device", "%i", &vdevice);
			
 
				+	if (err != 1) {
			
 
				+		xenbus_dev_fatal(dev, err, "reading virtual-device");
			
 
				+		return err;
			
 
				+	}
			
 
				+
			
 
				+	info = kzalloc(sizeof(*info), GFP_KERNEL);
			
 
				+	if (!info) {
			
 
				+		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	info->xbdev = dev;
			
 
				+	info->vdevice = vdevice;
			
 
				+	info->connected = BLKIF_STATE_DISCONNECTED;
			
 
				+	INIT_WORK(&info->work, blkif_restart_queue);
			
 
				+
			
 
				+	for (i = 0; i < BLK_RING_SIZE; i++)
			
 
				+		info->shadow[i].req.id = i+1;
			
 
				+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
			
 
				+
			
 
				+	/* Front end dir is a number, which is used as the id. */
			
 
				+	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
			
 
				+	dev->dev.driver_data = info;
			
 
				+
			
 
				+	err = talk_to_backend(dev, info);
			
 
				+	if (err) {
			
 
				+		kfree(info);
			
 
				+		dev->dev.driver_data = NULL;
			
 
				+		return err;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static int blkif_recover(struct blkfront_info *info)
			
 
				+{
			
 
				+	int i;
			
 
				+	struct blkif_request *req;
			
 
				+	struct blk_shadow *copy;
			
 
				+	int j;
			
 
				+
			
 
				+	/* Stage 1: Make a safe copy of the shadow state. */
			
 
				+	copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
			
 
				+	if (!copy)
			
 
				+		return -ENOMEM;
			
 
				+	memcpy(copy, info->shadow, sizeof(info->shadow));
			
 
				+
			
 
				+	/* Stage 2: Set up free list. */
			
 
				+	memset(&info->shadow, 0, sizeof(info->shadow));
			
 
				+	for (i = 0; i < BLK_RING_SIZE; i++)
			
 
				+		info->shadow[i].req.id = i+1;
			
 
				+	info->shadow_free = info->ring.req_prod_pvt;
			
 
				+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
			
 
				+
			
 
				+	/* Stage 3: Find pending requests and requeue them. */
			
 
				+	for (i = 0; i < BLK_RING_SIZE; i++) {
			
 
				+		/* Not in use? */
			
 
				+		if (copy[i].request == 0)
			
 
				+			continue;
			
 
				+
			
 
				+		/* Grab a request slot and copy shadow state into it. */
			
 
				+		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
			
 
				+		*req = copy[i].req;
			
 
				+
			
 
				+		/* We get a new request id, and must reset the shadow state. */
			
 
				+		req->id = get_id_from_freelist(info);
			
 
				+		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
			
 
				+
			
 
				+		/* Rewrite any grant references invalidated by susp/resume. */
			
 
				+		for (j = 0; j < req->nr_segments; j++)
			
 
				+			gnttab_grant_foreign_access_ref(
			
 
				+				req->seg[j].gref,
			
 
				+				info->xbdev->otherend_id,
			
 
				+				pfn_to_mfn(info->shadow[req->id].frame[j]),
			
 
				+				rq_data_dir(
			
 
				+					(struct request *)
			
 
				+					info->shadow[req->id].request));
			
 
				+		info->shadow[req->id].req = *req;
			
 
				+
			
 
				+		info->ring.req_prod_pvt++;
			
 
				+	}
			
 
				+
			
 
				+	kfree(copy);
			
 
				+
			
 
				+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
			
 
				+
			
 
				+	spin_lock_irq(&blkif_io_lock);
			
 
				+
			
 
				+	/* Now safe for us to use the shared ring */
			
 
				+	info->connected = BLKIF_STATE_CONNECTED;
			
 
				+
			
 
				+	/* Send off requeued requests */
			
 
				+	flush_requests(info);
			
 
				+
			
 
				+	/* Kick any other new requests queued since we resumed */
			
 
				+	kick_pending_request_queues(info);
			
 
				+
			
 
				+	spin_unlock_irq(&blkif_io_lock);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
			
 
				+ * driver restart.  We tear down our blkif structure and recreate it, but
			
 
				+ * leave the device-layer structures intact so that this is transparent to the
			
 
				+ * rest of the kernel.
			
 
				+ */
			
 
				+static int blkfront_resume(struct xenbus_device *dev)
				+{
				+	struct blkfront_info *info = dev->dev.driver_data;
				+	int rc;
				+
				+	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
				+
				+	/* A connected device is marked suspended; anything else disconnected. */
				+	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
				+
				+	rc = talk_to_backend(dev, info);
				+	if (rc)
				+		return rc;
				+
				+	/* Only a suspended device has in-flight requests to replay. */
				+	if (info->connected != BLKIF_STATE_SUSPENDED)
				+		return 0;
				+
				+	return blkif_recover(info);
				+}
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Invoked when the backend is finally 'ready' (and has produced
			
 
				+ * the details about the physical device - #sectors, size, etc).
			
 
				+ */
			
 
				+static void blkfront_connect(struct blkfront_info *info)
			
 
				+{
			
 
				+	unsigned long long sectors;
			
 
				+	unsigned long sector_size;
			
 
				+	unsigned int binfo;
			
 
				+	int err;
			
 
				+
			
 
				+	if ((info->connected == BLKIF_STATE_CONNECTED) ||
			
 
				+	    (info->connected == BLKIF_STATE_SUSPENDED) )
			
 
				+		return;
			
 
				+
			
 
				+	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
			
 
				+		__func__, info->xbdev->otherend);
			
 
				+
			
 
				+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			
 
				+			    "sectors", "%llu", &sectors,
			
 
				+			    "info", "%u", &binfo,
			
 
				+			    "sector-size", "%lu", &sector_size,
			
 
				+			    NULL);
			
 
				+	if (err) {
			
 
				+		xenbus_dev_fatal(info->xbdev, err,
			
 
				+				 "reading backend fields at %s",
			
 
				+				 info->xbdev->otherend);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			
 
				+			    "feature-barrier", "%lu", &info->feature_barrier,
			
 
				+			    NULL);
			
 
				+	if (err)
			
 
				+		info->feature_barrier = 0;
			
 
				+
			
 
				+	err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
			
 
				+				  sectors, info->vdevice,
			
 
				+				  binfo, sector_size, info);
			
 
				+	if (err) {
			
 
				+		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
			
 
				+				 info->xbdev->otherend);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
			
 
				+
			
 
				+	/* Kick pending requests. */
			
 
				+	spin_lock_irq(&blkif_io_lock);
			
 
				+	info->connected = BLKIF_STATE_CONNECTED;
			
 
				+	kick_pending_request_queues(info);
			
 
				+	spin_unlock_irq(&blkif_io_lock);
			
 
				+
			
 
				+	add_disk(info->gd);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Handle the change of state of the backend to Closing.  We must delete our
			
 
				+ * device-layer structures now, to ensure that writes are flushed through to
			
 
				+ * the backend.  Once this is done, we can switch to Closed in
			
 
				+ * acknowledgement.
			
 
				+ */
			
 
				+/*
				+ * Tear down the gendisk and request queue of @dev (if a queue exists) and
				+ * then report the frontend as closed via xenbus_frontend_closed().
				+ */
				+static void blkfront_closing(struct xenbus_device *dev)
				+{
				+	struct blkfront_info *info = dev->dev.driver_data;
				+	unsigned long flags;
				+
				+	dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
				+
				+	if (info->rq == NULL)
				+		goto out;
				+
				+	/*
				+	 * del_gendisk() may sleep, so it must not run under blkif_io_lock
				+	 * with interrupts disabled.  It is still done first, while the
				+	 * queue is running, as before.
				+	 */
				+	del_gendisk(info->gd);
				+
				+	spin_lock_irqsave(&blkif_io_lock, flags);
				+
				+	/* No more blkif_request(). */
				+	blk_stop_queue(info->rq);
				+
				+	/* No more gnttab callback work. */
				+	gnttab_cancel_free_callback(&info->callback);
				+	spin_unlock_irqrestore(&blkif_io_lock, flags);
				+
				+	/* Flush gnttab callback work. Must be done with no locks held. */
				+	flush_scheduled_work();
				+
				+	blk_cleanup_queue(info->rq);
				+	info->rq = NULL;
				+
				+ out:
				+	xenbus_frontend_closed(dev);
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Callback received when the backend's state changes.
			
 
				+ */
			
 
				+static void backend_changed(struct xenbus_device *dev,
				+			    enum xenbus_state backend_state)
				+{
				+	struct blkfront_info *info = dev->dev.driver_data;
				+	struct block_device *bd;
				+
				+	dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
				+
				+	switch (backend_state) {
				+	case XenbusStateInitialising:
				+	case XenbusStateInitWait:
				+	case XenbusStateInitialised:
				+	case XenbusStateUnknown:
				+	case XenbusStateClosed:
				+		/* Nothing to do for these backend states. */
				+		break;
				+
				+	case XenbusStateConnected:
				+		blkfront_connect(info);
				+		break;
				+
				+	case XenbusStateClosing:
				+		bd = bdget(info->dev);
				+		if (bd == NULL) {
				+			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
				+			/* No block_device: bd->bd_mutex must not be touched. */
				+			break;
				+		}
				+
				+		/* bd_mutex is held while info->users is inspected. */
				+		mutex_lock(&bd->bd_mutex);
				+		if (info->users > 0)
				+			xenbus_dev_error(dev, -EBUSY,
				+					 "Device in use; refusing to close");
				+		else
				+			blkfront_closing(dev);
				+		mutex_unlock(&bd->bd_mutex);
				+		bdput(bd);
				+		break;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * xenbus .remove callback: release the ring/irq resources through
				+ * blkif_free() and free the per-device state.  Always returns 0.
				+ */
				+static int blkfront_remove(struct xenbus_device *dev)
				+{
				+	struct blkfront_info *priv = dev->dev.driver_data;
				+
				+	dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
				+
				+	blkif_free(priv, 0);
				+	kfree(priv);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Block-device open: count one more user of the disk.  Always returns 0. */
				+static int blkif_open(struct inode *inode, struct file *filep)
				+{
				+	struct gendisk *disk = inode->i_bdev->bd_disk;
				+	struct blkfront_info *info = disk->private_data;
				+
				+	info->users++;
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Block-device release: drop one user.  When the last user goes away and
				+ * the backend is in the Closing state, complete the close that was
				+ * refused while the device was still in use.  Always returns 0.
				+ */
				+static int blkif_release(struct inode *inode, struct file *filep)
				+{
				+	struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
				+	struct xenbus_device *dev;
				+
				+	info->users--;
				+	if (info->users != 0)
				+		return 0;
				+
				+	/* Check whether we have been instructed to close.  We will
				+	   have ignored this request initially, as the device was
				+	   still mounted. */
				+	dev = info->xbdev;
				+	if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosing)
				+		blkfront_closing(dev);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+static struct block_device_operations xlvbd_block_fops =
			
 
				+{
			
 
				+	.owner = THIS_MODULE,
			
 
				+	.open = blkif_open,
			
 
				+	.release = blkif_release,
			
 
				+};
			
 
				+
			
 
				+
			
 
				+static struct xenbus_device_id blkfront_ids[] = {
			
 
				+	{ "vbd" },
			
 
				+	{ "" }
			
 
				+};
			
 
				+
			
 
				+static struct xenbus_driver blkfront = {
			
 
				+	.name = "vbd",
			
 
				+	.owner = THIS_MODULE,
			
 
				+	.ids = blkfront_ids,
			
 
				+	.probe = blkfront_probe,
			
 
				+	.remove = blkfront_remove,
			
 
				+	.resume = blkfront_resume,
			
 
				+	.otherend_changed = backend_changed,
			
 
				+};
			
 
				+
			
 
				+/*
				+ * Module init: refuse to load when not running on Xen, claim the block
				+ * major XENVBD_MAJOR, then register the xenbus frontend driver.
				+ * Returns 0 on success or a negative errno.
				+ */
				+static int __init xlblk_init(void)
				+{
				+	int err;
				+
				+	if (!is_running_on_xen())
				+		return -ENODEV;
				+
				+	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
				+		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
				+		       XENVBD_MAJOR, DEV_NAME);
				+		return -ENODEV;
				+	}
				+
				+	err = xenbus_register_frontend(&blkfront);
				+	if (err) {
				+		/* Do not leave the major registered if the driver is not. */
				+		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
				+		return err;
				+	}
				+
				+	return 0;
				+}
			
 
				+module_init(xlblk_init);
			
 
				+
			
 
				+
			
 
				+/*
				+ * Module exit: undo xlblk_init() in reverse order - unregister the
				+ * xenbus driver, then give back the block major.
				+ */
				+static void __exit xlblk_exit(void)
				+{
				+	xenbus_unregister_driver(&blkfront);
				+	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
				+}
			
 
				+module_exit(xlblk_exit);
			
 
				+
			
 
				+MODULE_DESCRIPTION("Xen virtual block device frontend");
			
 
				+MODULE_LICENSE("GPL");
			
 
				+MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
			
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -604,6 +604,14 @@ config HVC_BEAT
 
				 	help
			
 
				 	  Toshiba's Cell Reference Set Beat Console device driver
			
 
				 
			
 
				+config HVC_XEN
			
 
				+	bool "Xen Hypervisor Console support"
			
 
				+	depends on XEN
			
 
				+	select HVC_DRIVER
			
 
				+	default y
			
 
				+	help
			
 
				+	  Xen virtual console device driver
			
 
				+
			
 
				 config HVCS
			
 
				 	tristate "IBM Hypervisor Virtual Console Server support"
			
 
				 	depends on PPC_PSERIES
			
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES)	+= hvc_iseries.o
 
				 obj-$(CONFIG_HVC_RTAS)		+= hvc_rtas.o
			
 
				 obj-$(CONFIG_HVC_BEAT)		+= hvc_beat.o
			
 
				 obj-$(CONFIG_HVC_DRIVER)	+= hvc_console.o
			
 
				+obj-$(CONFIG_HVC_XEN)		+= hvc_xen.o
			
 
				 obj-$(CONFIG_RAW_DRIVER)	+= raw.o
			
 
				 obj-$(CONFIG_SGI_SNSC)		+= snsc.o snsc_event.o
			
 
				 obj-$(CONFIG_MSPEC)		+= mspec.o
			
--- a/drivers/char/hvc_xen.c
+++ b/drivers/char/hvc_xen.c
@@ -0,0 +1,159 @@
 
				+/*
			
 
				+ * xen console driver interface to hvc_console.c
			
 
				+ *
			
 
				+ * (c) 2007 Gerd Hoffmann <kraxel@suse.de>
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2 of the License, or
			
 
				+ * (at your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful,
			
 
				+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+ * GNU General Public License for more details.
			
 
				+ *
			
 
				+ * You should have received a copy of the GNU General Public License
			
 
				+ * along with this program; if not, write to the Free Software
			
 
				+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
			
 
				+ */
			
 
				+
			
 
				+#include <linux/console.h>
			
 
				+#include <linux/delay.h>
			
 
				+#include <linux/err.h>
			
 
				+#include <linux/init.h>
			
 
				+#include <linux/types.h>
			
 
				+
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <xen/page.h>
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/interface/io/console.h>
			
 
				+#include <xen/hvc-console.h>
			
 
				+
			
 
				+#include "hvc_console.h"
			
 
				+
			
 
				+#define HVC_COOKIE   0x58656e /* "Xen" in hex */
			
 
				+
			
 
				+static struct hvc_struct *hvc;
			
 
				+static int xencons_irq;
			
 
				+
			
 
				+/* ------------------------------------------------------------------ */
			
 
				+
			
 
				+static inline struct xencons_interface *xencons_interface(void)
			
 
				+{
			
 
				+	return mfn_to_virt(xen_start_info->console.domU.mfn);
			
 
				+}
			
 
				+
			
 
				+static inline void notify_daemon(void)
			
 
				+{
			
 
				+	/* Use evtchn: this is called early, before irq is set up. */
			
 
				+	notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
			
 
				+}
			
 
				+
			
 
				+static int write_console(uint32_t vtermno, const char *data, int len)
			
 
				+{
			
 
				+	struct xencons_interface *intf = xencons_interface();
			
 
				+	XENCONS_RING_IDX cons, prod;
			
 
				+	int sent = 0;
			
 
				+
			
 
				+	cons = intf->out_cons;
			
 
				+	prod = intf->out_prod;
			
 
				+	mb();			/* update queue values before going on */
			
 
				+	BUG_ON((prod - cons) > sizeof(intf->out));
			
 
				+
			
 
				+	while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
			
 
				+		intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
			
 
				+
			
 
				+	wmb();			/* write ring before updating pointer */
			
 
				+	intf->out_prod = prod;
			
 
				+
			
 
				+	notify_daemon();
			
 
				+	return sent;
			
 
				+}
			
 
				+
			
 
				+static int read_console(uint32_t vtermno, char *buf, int len)
			
 
				+{
			
 
				+	struct xencons_interface *intf = xencons_interface();
			
 
				+	XENCONS_RING_IDX cons, prod;
			
 
				+	int recv = 0;
			
 
				+
			
 
				+	cons = intf->in_cons;
			
 
				+	prod = intf->in_prod;
			
 
				+	mb();			/* get pointers before reading ring */
			
 
				+	BUG_ON((prod - cons) > sizeof(intf->in));
			
 
				+
			
 
				+	while (cons != prod && recv < len)
			
 
				+		buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
			
 
				+
			
 
				+	mb();			/* read ring before consuming */
			
 
				+	intf->in_cons = cons;
			
 
				+
			
 
				+	notify_daemon();
			
 
				+	return recv;
			
 
				+}
			
 
				+
			
 
				+static struct hv_ops hvc_ops = {
			
 
				+	.get_chars = read_console,
			
 
				+	.put_chars = write_console,
			
 
				+};
			
 
				+
			
 
				+static int __init xen_init(void)
			
 
				+{
			
 
				+	struct hvc_struct *hp;
			
 
				+
			
 
				+	if (!is_running_on_xen())
			
 
				+		return 0;
			
 
				+
			
 
				+	xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
			
 
				+	if (xencons_irq < 0)
			
 
				+		xencons_irq = 0 /* NO_IRQ */;
			
 
				+	hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
			
 
				+	if (IS_ERR(hp))
			
 
				+		return PTR_ERR(hp);
			
 
				+
			
 
				+	hvc = hp;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/* Module exit: remove the hvc instance if xen_init() created one. */
				+static void __exit xen_fini(void)
				+{
				+	if (!hvc)
				+		return;
				+
				+	hvc_remove(hvc);
				+}
			
 
				+
			
 
				+/*
				+ * Console initcall: when running on Xen, instantiate the hvc console
				+ * for HVC_COOKIE at index 0.  Always returns 0.
				+ */
				+static int xen_cons_init(void)
				+{
				+	if (is_running_on_xen())
				+		hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+module_init(xen_init);
			
 
				+module_exit(xen_fini);
			
 
				+console_initcall(xen_cons_init);
			
 
				+
			
 
				+/*
				+ * Boot console write hook: emit @len bytes of @string through
				+ * write_console(), sending "\r\n" in place of every '\n'.
				+ *
				+ * @console: unused.
				+ * @string:  bytes to write; only the first @len bytes are examined.
				+ * @len:     number of bytes in @string.
				+ */
				+static void xenboot_write_console(struct console *console, const char *string,
				+				  unsigned len)
				+{
				+	unsigned int linelen, off = 0;
				+	const char *pos;
				+
				+	/*
				+	 * memchr() bounds the search to the caller's buffer; the line
				+	 * length is measured from the current offset, not from the start
				+	 * of the string.
				+	 */
				+	while (off < len &&
				+	       (pos = memchr(string + off, '\n', len - off)) != NULL) {
				+		linelen = pos - (string + off);
				+		write_console(0, string + off, linelen);
				+		write_console(0, "\r\n", 2);
				+		off += linelen + 1;	/* skip the '\n' itself */
				+	}
				+
				+	/* Trailing text with no newline. */
				+	if (off < len)
				+		write_console(0, string + off, len - off);
				+}
			
 
				+
			
 
				+struct console xenboot_console = {
			
 
				+	.name		= "xenboot",
			
 
				+	.write		= xenboot_write_console,
			
 
				+	.flags		= CON_PRINTBUFFER | CON_BOOT,
			
 
				+};
			
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
 
				 				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
			
 
				 				NULL };
			
 
				 
			
 
				-	return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
			
 
				+	return call_usermodehelper(critical_overtemp_path,
			
 
				+				   argv, envp, UMH_WAIT_EXEC);
			
 
				 }
			
 
				 
			
 
				 
			
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
 
				 				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
			
 
				 				NULL };
			
 
				 
			
 
				-	return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
			
 
				+	return call_usermodehelper(critical_overtemp_path,
			
 
				+				   argv, envp, UMH_WAIT_EXEC);
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(wf_critical_overtemp);
			
 
				 
			
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig"
 
				 
			
 
				 source "drivers/s390/net/Kconfig"
			
 
				 
			
 
				+config XEN_NETDEV_FRONTEND
			
 
				+	tristate "Xen network device frontend driver"
			
 
				+	depends on XEN
			
 
				+	default y
			
 
				+	help
			
 
				+	  The network device frontend driver allows the kernel to
			
 
				+	  access network devices exported by a virtual
			
 
				+	  machine containing a physical network device driver. The
			
 
				+	  frontend driver is intended for unprivileged guest domains;
			
 
				+	  if you are compiling a kernel for a Xen guest, you almost
			
 
				+	  certainly want to enable this.
			
 
				+
			
 
				 config ISERIES_VETH
			
 
				 	tristate "iSeries Virtual Ethernet driver support"
			
 
				 	depends on PPC_ISERIES
			
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o
 
				 obj-$(CONFIG_SLIP) += slip.o
			
 
				 obj-$(CONFIG_SLHC) += slhc.o
			
 
				 
			
 
				+obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
			
 
				+
			
 
				 obj-$(CONFIG_DUMMY) += dummy.o
			
 
				 obj-$(CONFIG_IFB) += ifb.o
			
 
				 obj-$(CONFIG_MACVLAN) += macvlan.o
			
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
 
				 	sprintf(portarg, "%ld", bc->pdev->port->base);
			
 
				 	printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
			
 
				 
			
 
				-	return call_usermodehelper(eppconfig_path, argv, envp, 1);
			
 
				+	return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
			
 
				 }
			
 
				 
			
 
				 /* ---------------------------------------------------------------------- */
			
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -0,0 +1,1863 @@
 
				+/*
			
 
				+ * Virtual network driver for conversing with remote driver backends.
			
 
				+ *
			
 
				+ * Copyright (c) 2002-2005, K A Fraser
			
 
				+ * Copyright (c) 2005, XenSource Ltd
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/netdevice.h>
			
 
				+#include <linux/etherdevice.h>
			
 
				+#include <linux/skbuff.h>
			
 
				+#include <linux/ethtool.h>
			
 
				+#include <linux/if_ether.h>
			
 
				+#include <linux/tcp.h>
			
 
				+#include <linux/udp.h>
			
 
				+#include <linux/moduleparam.h>
			
 
				+#include <linux/mm.h>
			
 
				+#include <net/ip.h>
			
 
				+
			
 
				+#include <xen/xenbus.h>
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/page.h>
			
 
				+#include <xen/grant_table.h>
			
 
				+
			
 
				+#include <xen/interface/io/netif.h>
			
 
				+#include <xen/interface/memory.h>
			
 
				+#include <xen/interface/grant_table.h>
			
 
				+
			
 
				+static struct ethtool_ops xennet_ethtool_ops;
			
 
				+
			
 
				+struct netfront_cb {
			
 
				+	struct page *page;
			
 
				+	unsigned offset;
			
 
				+};
			
 
				+
			
 
				+#define NETFRONT_SKB_CB(skb)	((struct netfront_cb *)((skb)->cb))
			
 
				+
			
 
				+#define RX_COPY_THRESHOLD 256
			
 
				+
			
 
				+#define GRANT_INVALID_REF	0
			
 
				+
			
 
				+#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
			
 
				+#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
			
 
				+#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
			
 
				+
			
 
				+struct netfront_info {
			
 
				+	struct list_head list;
			
 
				+	struct net_device *netdev;
			
 
				+
			
 
				+	struct net_device_stats stats;
			
 
				+
			
 
				+	struct xen_netif_tx_front_ring tx;
			
 
				+	struct xen_netif_rx_front_ring rx;
			
 
				+
			
 
				+	spinlock_t   tx_lock;
			
 
				+	spinlock_t   rx_lock;
			
 
				+
			
 
				+	unsigned int evtchn;
			
 
				+
			
 
				+	/* Receive-ring batched refills. */
			
 
				+#define RX_MIN_TARGET 8
			
 
				+#define RX_DFL_MIN_TARGET 64
			
 
				+#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
			
 
				+	unsigned rx_min_target, rx_max_target, rx_target;
			
 
				+	struct sk_buff_head rx_batch;
			
 
				+
			
 
				+	struct timer_list rx_refill_timer;
			
 
				+
			
 
				+	/*
			
 
				+	 * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
			
 
				+	 * are linked from tx_skb_freelist through skb_entry.link.
			
 
				+	 *
			
 
				+	 *  NB. Freelist index entries are always going to be less than
			
 
				+	 *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
			
 
				+	 *  greater than PAGE_OFFSET: we use this property to distinguish
			
 
				+	 *  them.
			
 
				+	 */
			
 
				+	union skb_entry {
			
 
				+		struct sk_buff *skb;
			
 
				+		unsigned link;
			
 
				+	} tx_skbs[NET_TX_RING_SIZE];
			
 
				+	grant_ref_t gref_tx_head;
			
 
				+	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
			
 
				+	unsigned tx_skb_freelist;
			
 
				+
			
 
				+	struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
			
 
				+	grant_ref_t gref_rx_head;
			
 
				+	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
			
 
				+
			
 
				+	struct xenbus_device *xbdev;
			
 
				+	int tx_ring_ref;
			
 
				+	int rx_ring_ref;
			
 
				+
			
 
				+	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
			
 
				+	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
			
 
				+	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
			
 
				+};
			
 
				+
			
 
				+struct netfront_rx_info {
			
 
				+	struct xen_netif_rx_response rx;
			
 
				+	struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Helpers for acquiring and freeing slots in tx_skbs[].
			
 
				+ */
			
 
				+
			
 
				+static void add_id_to_freelist(unsigned *head, union skb_entry *list,
			
 
				+			       unsigned short id)
			
 
				+{
			
 
				+	list[id].link = *head;
			
 
				+	*head = id;
			
 
				+}
			
 
				+
			
 
				+static unsigned short get_id_from_freelist(unsigned *head,
			
 
				+					   union skb_entry *list)
			
 
				+{
			
 
				+	unsigned int id = *head;
			
 
				+	*head = list[id].link;
			
 
				+	return id;
			
 
				+}
			
 
				+
			
 
				+static int xennet_rxidx(RING_IDX idx)
			
 
				+{
			
 
				+	return idx & (NET_RX_RING_SIZE - 1);
			
 
				+}
			
 
				+
			
 
				+static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
			
 
				+					 RING_IDX ri)
			
 
				+{
			
 
				+	int i = xennet_rxidx(ri);
			
 
				+	struct sk_buff *skb = np->rx_skbs[i];
			
 
				+	np->rx_skbs[i] = NULL;
			
 
				+	return skb;
			
 
				+}
			
 
				+
			
 
				+static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
			
 
				+					    RING_IDX ri)
			
 
				+{
			
 
				+	int i = xennet_rxidx(ri);
			
 
				+	grant_ref_t ref = np->grant_rx_ref[i];
			
 
				+	np->grant_rx_ref[i] = GRANT_INVALID_REF;
			
 
				+	return ref;
			
 
				+}
			
 
				+
			
 
				+#ifdef CONFIG_SYSFS
			
 
				+static int xennet_sysfs_addif(struct net_device *netdev);
			
 
				+static void xennet_sysfs_delif(struct net_device *netdev);
			
 
				+#else /* !CONFIG_SYSFS */
			
 
				+#define xennet_sysfs_addif(dev) (0)
			
 
				+#define xennet_sysfs_delif(dev) do { } while (0)
			
 
				+#endif
			
 
				+
			
 
				+static int xennet_can_sg(struct net_device *dev)
			
 
				+{
			
 
				+	return dev->features & NETIF_F_SG;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void rx_refill_timeout(unsigned long data)
			
 
				+{
			
 
				+	struct net_device *dev = (struct net_device *)data;
			
 
				+	netif_rx_schedule(dev);
			
 
				+}
			
 
				+
			
 
				+static int netfront_tx_slot_available(struct netfront_info *np)
			
 
				+{
			
 
				+	return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
			
 
				+		(TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
			
 
				+}
			
 
				+
			
 
				+static void xennet_maybe_wake_tx(struct net_device *dev)
			
 
				+{
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+
			
 
				+	if (unlikely(netif_queue_stopped(dev)) &&
			
 
				+	    netfront_tx_slot_available(np) &&
			
 
				+	    likely(netif_running(dev)))
			
 
				+		netif_wake_queue(dev);
			
 
				+}
			
 
				+
			
 
				+static void xennet_alloc_rx_buffers(struct net_device *dev)
			
 
				+{
			
 
				+	unsigned short id;
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+	struct sk_buff *skb;
			
 
				+	struct page *page;
			
 
				+	int i, batch_target, notify;
			
 
				+	RING_IDX req_prod = np->rx.req_prod_pvt;
			
 
				+	struct xen_memory_reservation reservation;
			
 
				+	grant_ref_t ref;
			
 
				+	unsigned long pfn;
			
 
				+	void *vaddr;
			
 
				+	int nr_flips;
			
 
				+	struct xen_netif_rx_request *req;
			
 
				+
			
 
				+	if (unlikely(!netif_carrier_ok(dev)))
			
 
				+		return;
			
 
				+
			
 
				+	/*
			
 
				+	 * Allocate skbuffs greedily, even though we batch updates to the
			
 
				+	 * receive ring. This creates a less bursty demand on the memory
			
 
				+	 * allocator, so should reduce the chance of failed allocation requests
			
 
				+	 * both for ourself and for other kernel subsystems.
			
 
				+	 */
			
 
				+	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
			
 
				+	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
			
 
				+		skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD,
			
 
				+					 GFP_ATOMIC | __GFP_NOWARN);
			
 
				+		if (unlikely(!skb))
			
 
				+			goto no_skb;
			
 
				+
			
 
				+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
			
 
				+		if (!page) {
			
 
				+			kfree_skb(skb);
			
 
				+no_skb:
			
 
				+			/* Any skbuffs queued for refill? Force them out. */
			
 
				+			if (i != 0)
			
 
				+				goto refill;
			
 
				+			/* Could not allocate any skbuffs. Try again later. */
			
 
				+			mod_timer(&np->rx_refill_timer,
			
 
				+				  jiffies + (HZ/10));
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		skb_shinfo(skb)->frags[0].page = page;
			
 
				+		skb_shinfo(skb)->nr_frags = 1;
			
 
				+		__skb_queue_tail(&np->rx_batch, skb);
			
 
				+	}
			
 
				+
			
 
				+	/* Is the batch large enough to be worthwhile? */
			
 
				+	if (i < (np->rx_target/2)) {
			
 
				+		if (req_prod > np->rx.sring->req_prod)
			
 
				+			goto push;
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	/* Adjust our fill target if we risked running out of buffers. */
			
 
				+	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
			
 
				+	    ((np->rx_target *= 2) > np->rx_max_target))
			
 
				+		np->rx_target = np->rx_max_target;
			
 
				+
			
 
				+ refill:
			
 
				+	for (nr_flips = i = 0; ; i++) {
			
 
				+		skb = __skb_dequeue(&np->rx_batch);
			
 
				+		if (skb == NULL)
			
 
				+			break;
			
 
				+
			
 
				+		skb->dev = dev;
			
 
				+
			
 
				+		id = xennet_rxidx(req_prod + i);
			
 
				+
			
 
				+		BUG_ON(np->rx_skbs[id]);
			
 
				+		np->rx_skbs[id] = skb;
			
 
				+
			
 
				+		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
			
 
				+		BUG_ON((signed short)ref < 0);
			
 
				+		np->grant_rx_ref[id] = ref;
			
 
				+
			
 
				+		pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
			
 
				+		vaddr = page_address(skb_shinfo(skb)->frags[0].page);
			
 
				+
			
 
				+		req = RING_GET_REQUEST(&np->rx, req_prod + i);
			
 
				+		gnttab_grant_foreign_access_ref(ref,
			
 
				+						np->xbdev->otherend_id,
			
 
				+						pfn_to_mfn(pfn),
			
 
				+						0);
			
 
				+
			
 
				+		req->id = id;
			
 
				+		req->gref = ref;
			
 
				+	}
			
 
				+
			
 
				+	if (nr_flips != 0) {
			
 
				+		reservation.extent_start = np->rx_pfn_array;
			
 
				+		reservation.nr_extents   = nr_flips;
			
 
				+		reservation.extent_order = 0;
			
 
				+		reservation.address_bits = 0;
			
 
				+		reservation.domid        = DOMID_SELF;
			
 
				+
			
 
				+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
			
 
				+			/* After all PTEs have been zapped, flush the TLB. */
			
 
				+			np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
			
 
				+				UVMF_TLB_FLUSH|UVMF_ALL;
			
 
				+
			
 
				+			/* Give away a batch of pages. */
			
 
				+			np->rx_mcl[i].op = __HYPERVISOR_memory_op;
			
 
				+			np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
			
 
				+			np->rx_mcl[i].args[1] = (unsigned long)&reservation;
			
 
				+
			
 
				+			/* Zap PTEs and give away pages in one big
			
 
				+			 * multicall. */
			
 
				+			(void)HYPERVISOR_multicall(np->rx_mcl, i+1);
			
 
				+
			
 
				+			/* Check return status of HYPERVISOR_memory_op(). */
			
 
				+			if (unlikely(np->rx_mcl[i].result != i))
			
 
				+				panic("Unable to reduce memory reservation\n");
			
 
				+		} else {
			
 
				+			if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
			
 
				+						 &reservation) != i)
			
 
				+				panic("Unable to reduce memory reservation\n");
			
 
				+		}
			
 
				+	} else {
			
 
				+		wmb();		/* barrier so backend seens requests */
			
 
				+	}
			
 
				+
			
 
				+	/* Above is a suitable barrier to ensure backend will see requests. */
			
 
				+	np->rx.req_prod_pvt = req_prod + i;
			
 
				+ push:
			
 
				+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
			
 
				+	if (notify)
			
 
				+		notify_remote_via_irq(np->netdev->irq);
			
 
				+}
			
 
				+
			
 
				+static int xennet_open(struct net_device *dev)
			
 
				+{
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+
			
 
				+	memset(&np->stats, 0, sizeof(np->stats));
			
 
				+
			
 
				+	spin_lock_bh(&np->rx_lock);
			
 
				+	if (netif_carrier_ok(dev)) {
			
 
				+		xennet_alloc_rx_buffers(dev);
			
 
				+		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
			
 
				+		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
			
 
				+			netif_rx_schedule(dev);
			
 
				+	}
			
 
				+	spin_unlock_bh(&np->rx_lock);
			
 
				+
			
 
				+	xennet_maybe_wake_tx(dev);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void xennet_tx_buf_gc(struct net_device *dev)
			
 
				+{
			
 
				+	RING_IDX cons, prod;
			
 
				+	unsigned short id;
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+	struct sk_buff *skb;
			
 
				+
			
 
				+	BUG_ON(!netif_carrier_ok(dev));
			
 
				+
			
 
				+	do {
			
 
				+		prod = np->tx.sring->rsp_prod;
			
 
				+		rmb(); /* Ensure we see responses up to 'prod'. */
			
 
				+
			
 
				+		for (cons = np->tx.rsp_cons; cons != prod; cons++) {
			
 
				+			struct xen_netif_tx_response *txrsp;
			
 
				+
			
 
				+			txrsp = RING_GET_RESPONSE(&np->tx, cons);
			
 
				+			if (txrsp->status == NETIF_RSP_NULL)
			
 
				+				continue;
			
 
				+
			
 
				+			id  = txrsp->id;
			
 
				+			skb = np->tx_skbs[id].skb;
			
 
				+			if (unlikely(gnttab_query_foreign_access(
			
 
				+				np->grant_tx_ref[id]) != 0)) {
			
 
				+				printk(KERN_ALERT "xennet_tx_buf_gc: warning "
			
 
				+				       "-- grant still in use by backend "
			
 
				+				       "domain.\n");
			
 
				+				BUG();
			
 
				+			}
			
 
				+			gnttab_end_foreign_access_ref(
			
 
				+				np->grant_tx_ref[id], GNTMAP_readonly);
			
 
				+			gnttab_release_grant_reference(
			
 
				+				&np->gref_tx_head, np->grant_tx_ref[id]);
			
 
				+			np->grant_tx_ref[id] = GRANT_INVALID_REF;
			
 
				+			add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
			
 
				+			dev_kfree_skb_irq(skb);
			
 
				+		}
			
 
				+
			
 
				+		np->tx.rsp_cons = prod;
			
 
				+
			
 
				+		/*
			
 
				+		 * Set a new event, then check for race with update of tx_cons.
			
 
				+		 * Note that it is essential to schedule a callback, no matter
			
 
				+		 * how few buffers are pending. Even if there is space in the
			
 
				+		 * transmit ring, higher layers may be blocked because too much
			
 
				+		 * data is outstanding: in such cases notification from Xen is
			
 
				+		 * likely to be the only kick that we'll get.
			
 
				+		 */
			
 
				+		np->tx.sring->rsp_event =
			
 
				+			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
			
 
				+		mb();		/* update shared area */
			
 
				+	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
			
 
				+
			
 
				+	xennet_maybe_wake_tx(dev);
			
 
				+}
			
 
				+
			
 
				+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
			
 
				+			      struct xen_netif_tx_request *tx)
			
 
				+{
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+	char *data = skb->data;
			
 
				+	unsigned long mfn;
			
 
				+	RING_IDX prod = np->tx.req_prod_pvt;
			
 
				+	int frags = skb_shinfo(skb)->nr_frags;
			
 
				+	unsigned int offset = offset_in_page(data);
			
 
				+	unsigned int len = skb_headlen(skb);
			
 
				+	unsigned int id;
			
 
				+	grant_ref_t ref;
			
 
				+	int i;
			
 
				+
			
 
				+	/* While the header overlaps a page boundary (including being
			
 
				+	   larger than a page), split it into page-sized chunks. */
			
 
				+	while (len > PAGE_SIZE - offset) {
			
 
				+		tx->size = PAGE_SIZE - offset;
			
 
				+		tx->flags |= NETTXF_more_data;
			
 
				+		len -= tx->size;
			
 
				+		data += tx->size;
			
 
				+		offset = 0;
			
 
				+
			
 
				+		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
			
 
				+		np->tx_skbs[id].skb = skb_get(skb);
			
 
				+		tx = RING_GET_REQUEST(&np->tx, prod++);
			
 
				+		tx->id = id;
			
 
				+		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
			
 
				+		BUG_ON((signed short)ref < 0);
			
 
				+
			
 
				+		mfn = virt_to_mfn(data);
			
 
				+		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
			
 
				+						mfn, GNTMAP_readonly);
			
 
				+
			
 
				+		tx->gref = np->grant_tx_ref[id] = ref;
			
 
				+		tx->offset = offset;
			
 
				+		tx->size = len;
			
 
				+		tx->flags = 0;
			
 
				+	}
			
 
				+
			
 
				+	/* Grant backend access to each skb fragment page. */
			
 
				+	for (i = 0; i < frags; i++) {
			
 
				+		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
			
 
				+
			
 
				+		tx->flags |= NETTXF_more_data;
			
 
				+
			
 
				+		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
			
 
				+		np->tx_skbs[id].skb = skb_get(skb);
			
 
				+		tx = RING_GET_REQUEST(&np->tx, prod++);
			
 
				+		tx->id = id;
			
 
				+		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
			
 
				+		BUG_ON((signed short)ref < 0);
			
 
				+
			
 
				+		mfn = pfn_to_mfn(page_to_pfn(frag->page));
			
 
				+		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
			
 
				+						mfn, GNTMAP_readonly);
			
 
				+
			
 
				+		tx->gref = np->grant_tx_ref[id] = ref;
			
 
				+		tx->offset = frag->page_offset;
			
 
				+		tx->size = frag->size;
			
 
				+		tx->flags = 0;
			
 
				+	}
			
 
				+
			
 
				+	np->tx.req_prod_pvt = prod;
			
 
				+}
			
 
				+
			
 
				+static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
			
 
				+{
			
 
				+	unsigned short id;
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+	struct xen_netif_tx_request *tx;
			
 
				+	struct xen_netif_extra_info *extra;
			
 
				+	char *data = skb->data;
			
 
				+	RING_IDX i;
			
 
				+	grant_ref_t ref;
			
 
				+	unsigned long mfn;
			
 
				+	int notify;
			
 
				+	int frags = skb_shinfo(skb)->nr_frags;
			
 
				+	unsigned int offset = offset_in_page(data);
			
 
				+	unsigned int len = skb_headlen(skb);
			
 
				+
			
 
				+	frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
			
 
				+	if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
			
 
				+		printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
			
 
				+		       frags);
			
 
				+		dump_stack();
			
 
				+		goto drop;
			
 
				+	}
			
 
				+
			
 
				+	spin_lock_irq(&np->tx_lock);
			
 
				+
			
 
				+	if (unlikely(!netif_carrier_ok(dev) ||
			
 
				+		     (frags > 1 && !xennet_can_sg(dev)) ||
			
 
				+		     netif_needs_gso(dev, skb))) {
			
 
				+		spin_unlock_irq(&np->tx_lock);
			
 
				+		goto drop;
			
 
				+	}
			
 
				+
			
 
				+	i = np->tx.req_prod_pvt;
			
 
				+
			
 
				+	id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
			
 
				+	np->tx_skbs[id].skb = skb;
			
 
				+
			
 
				+	tx = RING_GET_REQUEST(&np->tx, i);
			
 
				+
			
 
				+	tx->id   = id;
			
 
				+	ref = gnttab_claim_grant_reference(&np->gref_tx_head);
			
 
				+	BUG_ON((signed short)ref < 0);
			
 
				+	mfn = virt_to_mfn(data);
			
 
				+	gnttab_grant_foreign_access_ref(
			
 
				+		ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
			
 
				+	tx->gref = np->grant_tx_ref[id] = ref;
			
 
				+	tx->offset = offset;
			
 
				+	tx->size = len;
			
 
				+	extra = NULL;
			
 
				+
			
 
				+	tx->flags = 0;
			
 
				+	if (skb->ip_summed == CHECKSUM_PARTIAL)
			
 
				+		/* local packet? */
			
 
				+		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
			
 
				+	else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
			
 
				+		/* remote but checksummed. */
			
 
				+		tx->flags |= NETTXF_data_validated;
			
 
				+
			
 
				+	if (skb_shinfo(skb)->gso_size) {
			
 
				+		struct xen_netif_extra_info *gso;
			
 
				+
			
 
				+		gso = (struct xen_netif_extra_info *)
			
 
				+			RING_GET_REQUEST(&np->tx, ++i);
			
 
				+
			
 
				+		if (extra)
			
 
				+			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
			
 
				+		else
			
 
				+			tx->flags |= NETTXF_extra_info;
			
 
				+
			
 
				+		gso->u.gso.size = skb_shinfo(skb)->gso_size;
			
 
				+		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
			
 
				+		gso->u.gso.pad = 0;
			
 
				+		gso->u.gso.features = 0;
			
 
				+
			
 
				+		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
			
 
				+		gso->flags = 0;
			
 
				+		extra = gso;
			
 
				+	}
			
 
				+
			
 
				+	np->tx.req_prod_pvt = i + 1;
			
 
				+
			
 
				+	xennet_make_frags(skb, dev, tx);
			
 
				+	tx->size = skb->len;
			
 
				+
			
 
				+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
			
 
				+	if (notify)
			
 
				+		notify_remote_via_irq(np->netdev->irq);
			
 
				+
			
 
				+	xennet_tx_buf_gc(dev);
			
 
				+
			
 
				+	if (!netfront_tx_slot_available(np))
			
 
				+		netif_stop_queue(dev);
			
 
				+
			
 
				+	spin_unlock_irq(&np->tx_lock);
			
 
				+
			
 
				+	np->stats.tx_bytes += skb->len;
			
 
				+	np->stats.tx_packets++;
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+ drop:
			
 
				+	np->stats.tx_dropped++;
			
 
				+	dev_kfree_skb(skb);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int xennet_close(struct net_device *dev)
			
 
				+{
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+	netif_stop_queue(np->netdev);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static struct net_device_stats *xennet_get_stats(struct net_device *dev)
			
 
				+{
			
 
				+	struct netfront_info *np = netdev_priv(dev);
			
 
				+	return &np->stats;
			
 
				+}
			
 
				+
			
 
				+static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
			
 
				+				grant_ref_t ref)
			
 
				+{
			
 
				+	int new = xennet_rxidx(np->rx.req_prod_pvt);
			
 
				+
			
 
				+	BUG_ON(np->rx_skbs[new]);
			
 
				+	np->rx_skbs[new] = skb;
			
 
				+	np->grant_rx_ref[new] = ref;
			
 
				+	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
			
 
				+	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
			
 
				+	np->rx.req_prod_pvt++;
			
 
				+}
			
 
				+
			
 
				+static int xennet_get_extras(struct netfront_info *np,
			
 
				+			     struct xen_netif_extra_info *extras,
			
 
				+			     RING_IDX rp)
			
 
				+
			
 
				+{
			
 
				+	struct xen_netif_extra_info *extra;
			
 
				+	struct device *dev = &np->netdev->dev;
			
 
				+	RING_IDX cons = np->rx.rsp_cons;
			
 
				+	int err = 0;
			
 
				+
			
 
				+	do {
			
 
				+		struct sk_buff *skb;
			
 
				+		grant_ref_t ref;
			
 
				+
			
 
				+		if (unlikely(cons + 1 == rp)) {
			
 
				+			if (net_ratelimit())
			
 
				+				dev_warn(dev, "Missing extra info\n");
			
 
				+			err = -EBADR;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		extra = (struct xen_netif_extra_info *)
			
 
				+			RING_GET_RESPONSE(&np->rx, ++cons);
			
 
				+
			
 
				+		if (unlikely(!extra->type ||
			
 
				+			     extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
			
 
				+			if (net_ratelimit())
			
 
				+				dev_warn(dev, "Invalid extra type: %d\n",
			
 
				+					extra->type);
			
 
				+			err = -EINVAL;
			
 
				+		} else {
			
 
				+			memcpy(&extras[extra->type - 1], extra,
			
 
				+			       sizeof(*extra));
			
 
				+		}
			
 
				+
			
 
				+		skb = xennet_get_rx_skb(np, cons);
			
 
				+		ref = xennet_get_rx_ref(np, cons);
			
 
				+		xennet_move_rx_slot(np, skb, ref);
			
 
				+	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
			
 
				+
			
 
				+	np->rx.rsp_cons = cons;
			
 
				+	return err;
			
 
				+}
			
 
				+
			
 
				+static int xennet_get_responses(struct netfront_info *np,
			
 
				+				struct netfront_rx_info *rinfo, RING_IDX rp,
			
 
				+				struct sk_buff_head *list)
			
 
				+{
			
 
				+	struct xen_netif_rx_response *rx = &rinfo->rx;
			
 
				+	struct xen_netif_extra_info *extras = rinfo->extras;
			
 
				+	struct device *dev = &np->netdev->dev;
			
 
				+	RING_IDX cons = np->rx.rsp_cons;
			
 
				+	struct sk_buff *skb = xennet_get_rx_skb(np, cons);
			
 
				+	grant_ref_t ref = xennet_get_rx_ref(np, cons);
			
 
				+	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
			
 
				+	int frags = 1;
			
 
				+	int err = 0;
			
 
				+	unsigned long ret;
			
 
				+
			
 
				+	if (rx->flags & NETRXF_extra_info) {
			
 
				+		err = xennet_get_extras(np, extras, rp);
			
 
				+		cons = np->rx.rsp_cons;
			
 
				+	}
			
 
				+
			
 
				+	for (;;) {
			
 
				+		if (unlikely(rx->status < 0 ||
			
 
				+			     rx->offset + rx->status > PAGE_SIZE)) {
			
 
				+			if (net_ratelimit())
			
 
				+				dev_warn(dev, "rx->offset: %x, size: %u\n",
			
 
				+					 rx->offset, rx->status);
			
 
				+			xennet_move_rx_slot(np, skb, ref);
			
 
				+			err = -EINVAL;
			
 
				+			goto next;
			
 
				+		}
			
 
				+
			
 
				+		/*
			
 
				+		 * This definitely indicates a bug, either in this driver or in
			
 
				+		 * the backend driver. In future this should flag the bad
			
 
				+		 * situation to the system controller to reboot the backend.
			
 
				+		 */
			
 
				+		if (ref == GRANT_INVALID_REF) {
			
 
				+			if (net_ratelimit())
			
 
				+				dev_warn(dev, "Bad rx response id %d.\n",
			
 
				+					 rx->id);
			
 
				+			err = -EINVAL;
			
 
				+			goto next;
			
 
				+		}
			
 
				+
			
 
				+		ret = gnttab_end_foreign_access_ref(ref, 0);
			
 
				+		BUG_ON(!ret);
			
 
				+
			
 
				+		gnttab_release_grant_reference(&np->gref_rx_head, ref);
			
 
				+
			
 
				+		__skb_queue_tail(list, skb);
			
 
				+
			
 
				+next:
			
 
				+		if (!(rx->flags & NETRXF_more_data))
			
 
				+			break;
			
 
				+
			
 
				+		if (cons + frags == rp) {
			
 
				+			if (net_ratelimit())
			
 
				+				dev_warn(dev, "Need more frags\n");
			
 
				+			err = -ENOENT;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
			
 
				+		skb = xennet_get_rx_skb(np, cons + frags);
			
 
				+		ref = xennet_get_rx_ref(np, cons + frags);
			
 
				+		frags++;
			
 
				+	}
			
 
				+
			
 
				+	if (unlikely(frags > max)) {
			
 
				+		if (net_ratelimit())
			
 
				+			dev_warn(dev, "Too many frags\n");
			
 
				+		err = -E2BIG;
			
 
				+	}
			
 
				+
			
 
				+	if (unlikely(err))
			
 
				+		np->rx.rsp_cons = cons + frags;
			
 
				+
			
 
				+	return err;
			
 
				+}
			
 
				+
			
 
				+static int xennet_set_skb_gso(struct sk_buff *skb,
			
 
				+			      struct xen_netif_extra_info *gso)
			
 
				+{
			
 
				+	if (!gso->u.gso.size) {
			
 
				+		if (net_ratelimit())
			
 
				+			printk(KERN_WARNING "GSO size must not be zero.\n");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	/* Currently only TCPv4 S.O. is supported. */
			
 
				+	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
			
 
				+		if (net_ratelimit())
			
 
				+			printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	skb_shinfo(skb)->gso_size = gso->u.gso.size;
			
 
				+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
			
 
				+
			
 
				+	/* Header must be checked, and gso_segs computed. */
			
 
				+	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
			
 
				+	skb_shinfo(skb)->gso_segs = 0;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static RING_IDX xennet_fill_frags(struct netfront_info *np,
			
 
				+				  struct sk_buff *skb,
			
 
				+				  struct sk_buff_head *list)
			
 
				+{
			
 
				+	struct skb_shared_info *shinfo = skb_shinfo(skb);
			
 
				+	int nr_frags = shinfo->nr_frags;
			
 
				+	RING_IDX cons = np->rx.rsp_cons;
			
 
				+	skb_frag_t *frag = shinfo->frags + nr_frags;
			
 
				+	struct sk_buff *nskb;
			
 
				+
			
 
				+	while ((nskb = __skb_dequeue(list))) {
			
 
				+		struct xen_netif_rx_response *rx =
			
 
				+			RING_GET_RESPONSE(&np->rx, ++cons);
			
 
				+
			
 
				+		frag->page = skb_shinfo(nskb)->frags[0].page;
			
 
				+		frag->page_offset = rx->offset;
			
 
				+		frag->size = rx->status;
			
 
				+
			
 
				+		skb->data_len += rx->status;
			
 
				+
			
 
				+		skb_shinfo(nskb)->nr_frags = 0;
			
 
				+		kfree_skb(nskb);
			
 
				+
			
 
				+		frag++;
			
 
				+		nr_frags++;
			
 
				+	}
			
 
				+
			
 
				+	shinfo->nr_frags = nr_frags;
			
 
				+	return cons;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Fill in skb->csum_start and skb->csum_offset for an IPv4 TCP or UDP
				+ * packet whose checksum has been left blank by the sender.
				+ *
				+ * skb->data is expected to point at the IP header.  The transport header
				+ * is located from iph->ihl and must lie, together with its 2-byte
				+ * checksum field, before skb_tail_pointer(skb).
				+ *
				+ * Returns 0 on success, or -EPROTO when the packet is not IPv4, is not
				+ * TCP/UDP, or the required header bytes are not present.
				+ */
				+static int skb_checksum_setup(struct sk_buff *skb)
				+{
				+	struct iphdr *iph;
				+	unsigned char *th;
				+
				+	if (skb->protocol != htons(ETH_P_IP))
				+		return -EPROTO;
				+
				+	iph = (void *)skb->data;
				+	th = skb->data + 4 * iph->ihl;
				+	if (th >= skb_tail_pointer(skb))
				+		return -EPROTO;
				+
				+	skb->csum_start = th - skb->head;
				+	switch (iph->protocol) {
				+	case IPPROTO_TCP:
				+		skb->csum_offset = offsetof(struct tcphdr, check);
				+		break;
				+	case IPPROTO_UDP:
				+		skb->csum_offset = offsetof(struct udphdr, check);
				+		break;
				+	default:
				+		/* Terminate the record with '\n' so it is not merged with
				+		 * whatever is logged next. */
				+		if (net_ratelimit())
				+			printk(KERN_ERR "Attempting to checksum a non-"
				+			       "TCP/UDP packet, dropping a protocol"
				+			       " %d packet\n", iph->protocol);
				+		return -EPROTO;
				+	}
				+
				+	/* The 16-bit checksum field itself must be inside the linear data. */
				+	if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
				+		return -EPROTO;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Drain 'rxq' and hand every packet to the network stack.
				+ *
				+ * For each skb the header bytes are copied from the receive page
				+ * recorded in NETFRONT_SKB_CB() into the skb's linear area, the Ethernet
				+ * protocol is resolved, and CHECKSUM_PARTIAL packets get their checksum
				+ * fields set up.  Packets failing checksum setup are freed and counted
				+ * as rx errors.
				+ *
				+ * Returns the number of packets that were dropped.
				+ */
				+static int handle_incoming_queue(struct net_device *dev,
				+				  struct sk_buff_head *rxq)
				+{
				+	struct netfront_info *np = netdev_priv(dev);
				+	struct sk_buff *skb;
				+	int dropped = 0;
				+
				+	for (;;) {
				+		struct page *pg;
				+		unsigned off;
				+
				+		skb = __skb_dequeue(rxq);
				+		if (skb == NULL)
				+			break;
				+
				+		pg = NETFRONT_SKB_CB(skb)->page;
				+		off = NETFRONT_SKB_CB(skb)->offset;
				+
				+		memcpy(skb->data, page_address(pg) + off, skb_headlen(skb));
				+
				+		/* Free the page unless fragment 0 still refers to it. */
				+		if (skb_shinfo(skb)->frags[0].page != pg)
				+			__free_page(pg);
				+
				+		/* Needs the header in place, hence done after the copy. */
				+		skb->protocol = eth_type_trans(skb, dev);
				+
				+		if (skb->ip_summed == CHECKSUM_PARTIAL &&
				+		    skb_checksum_setup(skb)) {
				+			kfree_skb(skb);
				+			dropped++;
				+			np->stats.rx_errors++;
				+			continue;
				+		}
				+
				+		np->stats.rx_packets++;
				+		np->stats.rx_bytes += skb->len;
				+
				+		/* Deliver to the stack. */
				+		netif_receive_skb(skb);
				+		dev->last_rx = jiffies;
				+	}
				+
				+	return dropped;
				+}
			
 
				+
			
 
				+static int xennet_poll(struct net_device *dev, int *pbudget)
				+{	/*
				+	 * Poll routine installed as netdev->poll: consume rx ring
				+	 * responses, build skbs from them and pass them up the stack.
				+	 *
				+	 * At most min(*pbudget, dev->quota) packets are taken per call;
				+	 * both counters are decremented by the packets actually delivered.
				+	 * Returns 0 when the carrier is down, otherwise more_to_do (which
				+	 * stays 1 unless the final ring check found nothing left and
				+	 * polling was completed).
				+	 */
				+	struct netfront_info *np = netdev_priv(dev);
				+	struct sk_buff *skb;
				+	struct netfront_rx_info rinfo;
				+	struct xen_netif_rx_response *rx = &rinfo.rx;
				+	struct xen_netif_extra_info *extras = rinfo.extras;
				+	RING_IDX i, rp;
				+	int work_done, budget, more_to_do = 1;
				+	struct sk_buff_head rxq;	/* good packets, delivered below */
				+	struct sk_buff_head errq;	/* skbs of failed packets, freed below */
				+	struct sk_buff_head tmpq;	/* skbs of the packet being assembled */
				+	unsigned long flags;
				+	unsigned int len;
				+	int err;
				+
				+	spin_lock(&np->rx_lock);
				+
				+	if (unlikely(!netif_carrier_ok(dev))) {
				+		spin_unlock(&np->rx_lock);
				+		return 0;
				+	}
				+
				+	skb_queue_head_init(&rxq);
				+	skb_queue_head_init(&errq);
				+	skb_queue_head_init(&tmpq);
				+
				+	budget = *pbudget;
				+	if (budget > dev->quota)
				+		budget = dev->quota;
				+	rp = np->rx.sring->rsp_prod;
				+	rmb(); /* Ensure we see queued responses up to 'rp'. */
				+
				+	i = np->rx.rsp_cons;
				+	work_done = 0;
				+	while ((i != rp) && (work_done < budget)) {
				+		memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));	/* private copy of the response */
				+		memset(extras, 0, sizeof(rinfo.extras));
				+
				+		err = xennet_get_responses(np, &rinfo, rp, &tmpq);
				+
				+		if (unlikely(err)) {
				+err:	/* also reached by goto from the GSO failure below */
				+			while ((skb = __skb_dequeue(&tmpq)))
				+				__skb_queue_tail(&errq, skb);
				+			np->stats.rx_errors++;
				+			i = np->rx.rsp_cons;	/* resume from the updated consumer index */
				+			continue;
				+		}
				+
				+		skb = __skb_dequeue(&tmpq);	/* first skb becomes the packet head */
				+
				+		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
				+			struct xen_netif_extra_info *gso;
				+			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
				+
				+			if (unlikely(xennet_set_skb_gso(skb, gso))) {
				+				__skb_queue_head(&tmpq, skb);
				+				np->rx.rsp_cons += skb_queue_len(&tmpq);	/* skip one slot per queued skb */
				+				goto err;
				+			}
				+		}
				+
				+		NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;	/* read back in handle_incoming_queue() */
				+		NETFRONT_SKB_CB(skb)->offset = rx->offset;
				+
				+		len = rx->status;
				+		if (len > RX_COPY_THRESHOLD)
				+			len = RX_COPY_THRESHOLD;
				+		skb_put(skb, len);	/* linear area holds at most RX_COPY_THRESHOLD bytes */
				+
				+		if (rx->status > len) {	/* remainder stays in the page as fragment 0 */
				+			skb_shinfo(skb)->frags[0].page_offset =
				+				rx->offset + len;
				+			skb_shinfo(skb)->frags[0].size = rx->status - len;
				+			skb->data_len = rx->status - len;
				+		} else {	/* everything fits in the linear area */
				+			skb_shinfo(skb)->frags[0].page = NULL;
				+			skb_shinfo(skb)->nr_frags = 0;
				+		}
				+
				+		i = xennet_fill_frags(np, skb, &tmpq);
				+
				+		/*
				+		 * Truesize approximates the size of true data plus
				+		 * any supervisor overheads. Adding hypervisor
				+		 * overheads has been shown to significantly reduce
				+		 * achievable bandwidth with the default receive
				+		 * buffer size. It is therefore not wise to account
				+		 * for it here.
				+		 *
				+		 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
				+		 * to RX_COPY_THRESHOLD + the supervisor
				+		 * overheads. Here, we add the size of the data pulled
				+		 * in xennet_fill_frags().
				+		 *
				+		 * We also adjust for any unused space in the main
				+		 * data area by subtracting (RX_COPY_THRESHOLD -
				+		 * len). This is especially important with drivers
				+		 * which split incoming packets into header and data,
				+		 * using only 66 bytes of the main data area (see the
				+		 * e1000 driver for example.)  On such systems,
				+		 * without this last adjustment, our achievable
				+		 * receive throughput using the standard receive
				+		 * buffer size was cut by 25%(!!!).
				+		 */
				+		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
				+		skb->len += skb->data_len;
				+
				+		if (rx->flags & NETRXF_csum_blank)
				+			skb->ip_summed = CHECKSUM_PARTIAL;
				+		else if (rx->flags & NETRXF_data_validated)
				+			skb->ip_summed = CHECKSUM_UNNECESSARY;
				+
				+		__skb_queue_tail(&rxq, skb);
				+
				+		np->rx.rsp_cons = ++i;
				+		work_done++;
				+	}
				+
				+	while ((skb = __skb_dequeue(&errq)))
				+		kfree_skb(skb);
				+
				+	work_done -= handle_incoming_queue(dev, &rxq);	/* dropped packets do not count as work */
				+
				+	/* If we get a callback with very few responses, reduce fill target. */
				+	/* NB. Note exponential increase, linear decrease. */
				+	if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
				+	     ((3*np->rx_target) / 4)) &&
				+	    (--np->rx_target < np->rx_min_target))
				+		np->rx_target = np->rx_min_target;
				+
				+	xennet_alloc_rx_buffers(dev);
				+
				+	*pbudget   -= work_done;
				+	dev->quota -= work_done;
				+
				+	if (work_done < budget) {
				+		local_irq_save(flags);
				+
				+		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
				+		if (!more_to_do)
				+			__netif_rx_complete(dev);
				+
				+		local_irq_restore(flags);
				+	}
				+
				+	spin_unlock(&np->rx_lock);
				+
				+	return more_to_do;
				+}
			
 
				+
			
 
				+/*
				+ * change_mtu hook: validate and apply a new MTU.
				+ *
				+ * The upper limit is 65535 - ETH_HLEN when xennet_can_sg(dev) is true
				+ * and ETH_DATA_LEN otherwise.  The lower limit is 68, the minimum IPv4
				+ * MTU, so that zero or negative values are rejected instead of being
				+ * stored in dev->mtu.
				+ *
				+ * Returns 0 on success or -EINVAL when 'mtu' is out of range.
				+ */
				+static int xennet_change_mtu(struct net_device *dev, int mtu)
				+{
				+	int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
				+
				+	if (mtu < 68 || mtu > max)
				+		return -EINVAL;
				+	dev->mtu = mtu;
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Release every transmit buffer that is still outstanding.
				+ *
				+ * For each in-use tx slot the grant is ended and returned to the tx
				+ * grant pool, the slot is put back on the tx_skbs free list and the skb
				+ * is freed with dev_kfree_skb_irq().
				+ */
				+static void xennet_release_tx_bufs(struct netfront_info *np)
				+{
				+	int slot;
				+
				+	for (slot = 0; slot < NET_TX_RING_SIZE; slot++) {
				+		struct sk_buff *pending = np->tx_skbs[slot].skb;
				+
				+		/*
				+		 * Values below PAGE_OFFSET are free-list links stored in
				+		 * the same slot, not skb pointers: nothing to release.
				+		 */
				+		if ((unsigned long)pending >= PAGE_OFFSET) {
				+			gnttab_end_foreign_access_ref(np->grant_tx_ref[slot],
				+						      GNTMAP_readonly);
				+			gnttab_release_grant_reference(&np->gref_tx_head,
				+						       np->grant_tx_ref[slot]);
				+			np->grant_tx_ref[slot] = GRANT_INVALID_REF;
				+			add_id_to_freelist(&np->tx_skb_freelist,
				+					   np->tx_skbs, slot);
				+			dev_kfree_skb_irq(pending);
				+		}
				+	}
				+}
			
 
				+
			
 
				+static void xennet_release_rx_bufs(struct netfront_info *np)
				+{	/*
				+	 * Intended to reclaim outstanding receive buffers.  As written it
				+	 * only logs a warning and returns: everything after the early
				+	 * 'return' below is unreachable and handles page-transfer
				+	 * (flipping) grants, not the copying receive path.
				+	 */
				+	struct mmu_update      *mmu = np->rx_mmu;
				+	struct multicall_entry *mcl = np->rx_mcl;
				+	struct sk_buff_head free_list;
				+	struct sk_buff *skb;
				+	unsigned long mfn;
				+	int xfer = 0, noxfer = 0, unused = 0;
				+	int id, ref;
				+
				+	dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
				+			 __func__);
				+	return;	/* NOTE(review): makes the remainder dead code until the FIXME is resolved */
				+
				+	skb_queue_head_init(&free_list);
				+
				+	spin_lock_bh(&np->rx_lock);
				+
				+	for (id = 0; id < NET_RX_RING_SIZE; id++) {
				+		ref = np->grant_rx_ref[id];
				+		if (ref == GRANT_INVALID_REF) {
				+			unused++;
				+			continue;
				+		}
				+
				+		skb = np->rx_skbs[id];
				+		mfn = gnttab_end_foreign_transfer_ref(ref);
				+		gnttab_release_grant_reference(&np->gref_rx_head, ref);
				+		np->grant_rx_ref[id] = GRANT_INVALID_REF;
				+
				+		if (0 == mfn) {	/* no frame came back for this slot */
				+			skb_shinfo(skb)->nr_frags = 0;
				+			dev_kfree_skb(skb);
				+			noxfer++;
				+			continue;
				+		}
				+
				+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
				+			/* Remap the page. */
				+			struct page *page = skb_shinfo(skb)->frags[0].page;
				+			unsigned long pfn = page_to_pfn(page);
				+			void *vaddr = page_address(page);
				+
				+			MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
				+						mfn_pte(mfn, PAGE_KERNEL),
				+						0);
				+			mcl++;
				+			mmu->ptr = ((u64)mfn << PAGE_SHIFT)
				+				| MMU_MACHPHYS_UPDATE;
				+			mmu->val = pfn;
				+			mmu++;
				+
				+			set_phys_to_machine(pfn, mfn);
				+		}
				+		__skb_queue_tail(&free_list, skb);	/* freed after the batched hypercall */
				+		xfer++;
				+	}
				+
				+	dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
				+		 __func__, xfer, noxfer, unused);
				+
				+	if (xfer) {
				+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
				+			/* Do all the remapping work and M2P updates. */
				+			MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
				+					 0, DOMID_SELF);
				+			mcl++;
				+			HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
				+		}
				+	}
				+
				+	while ((skb = __skb_dequeue(&free_list)) != NULL)
				+		dev_kfree_skb(skb);
				+
				+	spin_unlock_bh(&np->rx_lock);
				+}
			
 
				+
			
 
				+/*
				+ * netdev->uninit hook: release outstanding tx/rx buffers, then hand
				+ * both grant-reference pools back.
				+ */
				+static void xennet_uninit(struct net_device *dev)
				+{
				+	struct netfront_info *priv = netdev_priv(dev);
				+
				+	/* Buffers first: releasing them returns references to the pools. */
				+	xennet_release_tx_bufs(priv);
				+	xennet_release_rx_bufs(priv);
				+
				+	gnttab_free_grant_references(priv->gref_tx_head);
				+	gnttab_free_grant_references(priv->gref_rx_head);
				+}
			
 
				+
			
 
				+static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
				+{	/*
				+	 * Allocate an Ethernet net_device with a netfront_info private
				+	 * area, initialise its locks, rx targets, refill timer, tx free
				+	 * list and grant-reference pools, and install the driver hooks.
				+	 *
				+	 * Returns the new device with the carrier off, or
				+	 * ERR_PTR(-ENOMEM) on allocation failure.  The device is not
				+	 * registered here.
				+	 */
				+	int i, err;
				+	struct net_device *netdev;
				+	struct netfront_info *np;
				+
				+	netdev = alloc_etherdev(sizeof(struct netfront_info));
				+	if (!netdev) {
				+		printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
				+		       __func__);
				+		return ERR_PTR(-ENOMEM);
				+	}
				+
				+	np                   = netdev_priv(netdev);
				+	np->xbdev            = dev;
				+
				+	spin_lock_init(&np->tx_lock);
				+	spin_lock_init(&np->rx_lock);
				+
				+	skb_queue_head_init(&np->rx_batch);
				+	np->rx_target     = RX_DFL_MIN_TARGET;
				+	np->rx_min_target = RX_DFL_MIN_TARGET;
				+	np->rx_max_target = RX_MAX_TARGET;
				+
				+	init_timer(&np->rx_refill_timer);
				+	np->rx_refill_timer.data = (unsigned long)netdev;	/* argument passed to rx_refill_timeout */
				+	np->rx_refill_timer.function = rx_refill_timeout;
				+
				+	/* Initialise tx_skbs as a free chain containing every entry. */
				+	np->tx_skb_freelist = 0;
				+	for (i = 0; i < NET_TX_RING_SIZE; i++) {
				+		np->tx_skbs[i].link = i+1;	/* each slot links to its successor */
				+		np->grant_tx_ref[i] = GRANT_INVALID_REF;
				+	}
				+
				+	/* Clear out rx_skbs */
				+	for (i = 0; i < NET_RX_RING_SIZE; i++) {
				+		np->rx_skbs[i] = NULL;
				+		np->grant_rx_ref[i] = GRANT_INVALID_REF;
				+	}
				+
				+	/* A grant for every tx ring slot */
				+	if (gnttab_alloc_grant_references(TX_MAX_TARGET,
				+					  &np->gref_tx_head) < 0) {
				+		printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
				+		err = -ENOMEM;
				+		goto exit;
				+	}
				+	/* A grant for every rx ring slot */
				+	if (gnttab_alloc_grant_references(RX_MAX_TARGET,
				+					  &np->gref_rx_head) < 0) {
				+		printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
				+		err = -ENOMEM;
				+		goto exit_free_tx;
				+	}
				+
				+	netdev->open            = xennet_open;
				+	netdev->hard_start_xmit = xennet_start_xmit;
				+	netdev->stop            = xennet_close;
				+	netdev->get_stats       = xennet_get_stats;
				+	netdev->poll            = xennet_poll;
				+	netdev->uninit          = xennet_uninit;
				+	netdev->change_mtu	= xennet_change_mtu;
				+	netdev->weight          = 64;
				+	netdev->features        = NETIF_F_IP_CSUM;
				+
				+	SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
				+	SET_MODULE_OWNER(netdev);
				+	SET_NETDEV_DEV(netdev, &dev->dev);
				+
				+	np->netdev = netdev;
				+
				+	netif_carrier_off(netdev);	/* xennet_connect() turns the carrier on */
				+
				+	return netdev;
				+
				+ exit_free_tx:
				+	gnttab_free_grant_references(np->gref_tx_head);
				+ exit:
				+	free_netdev(netdev);
				+	return ERR_PTR(err);
				+}
			
 
				+
			
 
				+/**
				+ * Probe callback, run when a new device appears.  Creates the net_device
				+ * through xennet_create_dev(), stores its private data as the xenbus
				+ * device's driver data, registers the netdev and adds its sysfs
				+ * attributes.
				+ *
				+ * Returns 0 on success or a negative errno; on failure the net_device is
				+ * freed and the driver data pointer is cleared.
				+ */
				+static int __devinit netfront_probe(struct xenbus_device *dev,
				+				    const struct xenbus_device_id *id)
				+{
				+	struct net_device *netdev = xennet_create_dev(dev);
				+	struct netfront_info *info;
				+	int err;
				+
				+	if (IS_ERR(netdev)) {
				+		err = PTR_ERR(netdev);
				+		xenbus_dev_fatal(dev, err, "creating netdev");
				+		return err;
				+	}
				+
				+	info = netdev_priv(netdev);
				+	dev->dev.driver_data = info;
				+
				+	err = register_netdev(info->netdev);
				+	if (err != 0) {
				+		printk(KERN_WARNING "%s: register_netdev err=%d\n",
				+		       __func__, err);
				+	} else {
				+		err = xennet_sysfs_addif(info->netdev);
				+		if (err == 0)
				+			return 0;
				+
				+		/* sysfs setup failed: take the registration back first */
				+		unregister_netdev(info->netdev);
				+		printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
				+		       __func__, err);
				+	}
				+
				+	/* Common failure exit. */
				+	free_netdev(netdev);
				+	dev->dev.driver_data = NULL;
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Revoke the grant 'ref' covering 'page'.  Does nothing when 'ref' is
				+ * GRANT_INVALID_REF.
				+ */
				+static void xennet_end_access(int ref, void *page)
				+{
				+	if (ref == GRANT_INVALID_REF)
				+		return;
				+
				+	/* Ending the access also frees the page as a side-effect. */
				+	gnttab_end_foreign_access(ref, 0, (unsigned long)page);
				+}
			
 
				+
			
 
				+/*
				+ * Detach from the backend: mark the carrier off under both ring locks,
				+ * unbind the interrupt handler if one is bound, revoke the grants on
				+ * both shared rings (which frees their pages) and reset the ring state
				+ * kept in 'info'.
				+ */
				+static void xennet_disconnect_backend(struct netfront_info *info)
				+{
				+	struct net_device *netdev = info->netdev;
				+
				+	/* Quiesce the interface so nothing touches the rings meanwhile. */
				+	spin_lock_bh(&info->rx_lock);
				+	spin_lock_irq(&info->tx_lock);
				+	netif_carrier_off(netdev);
				+	spin_unlock_irq(&info->tx_lock);
				+	spin_unlock_bh(&info->rx_lock);
				+
				+	if (netdev->irq != 0)
				+		unbind_from_irqhandler(netdev->irq, netdev);
				+	netdev->irq = 0;
				+	info->evtchn = 0;
				+
				+	/* Revoke the ring grants; the pages are freed along the way. */
				+	xennet_end_access(info->tx_ring_ref, info->tx.sring);
				+	xennet_end_access(info->rx_ring_ref, info->rx.sring);
				+
				+	info->tx_ring_ref = GRANT_INVALID_REF;
				+	info->tx.sring = NULL;
				+	info->rx_ring_ref = GRANT_INVALID_REF;
				+	info->rx.sring = NULL;
				+}
			
 
				+
			
 
				+/**
				+ * Resume callback, used after suspend/resume or a backend driver
				+ * restart.  Only the connection to the backend is torn down here; the
				+ * net_device itself is left registered so the rest of the kernel does
				+ * not notice.
				+ *
				+ * Always returns 0.
				+ */
				+static int netfront_resume(struct xenbus_device *dev)
				+{
				+	struct netfront_info *np = dev->dev.driver_data;
				+
				+	dev_dbg(&dev->dev, "%s\n", dev->nodename);
				+	xennet_disconnect_backend(np);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Read the "mac" node under dev->nodename and parse it as ETH_ALEN
				+ * colon-separated hexadecimal octets into 'mac'.
				+ *
				+ * Every octet must contain at least one hex digit, must not exceed 0xff
				+ * (larger values used to be truncated silently into the u8), and must be
				+ * followed by ':' -- or by the end of the string for the last octet.
				+ *
				+ * Returns 0 on success, the xenbus_read() error if the node cannot be
				+ * read, or -ENOENT if the string is malformed.  mac[] is only written
				+ * for octets that passed validation.
				+ */
				+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
				+{
				+	char *s, *e, *macstr;
				+	int i;
				+	int err = 0;
				+
				+	macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
				+	if (IS_ERR(macstr))
				+		return PTR_ERR(macstr);
				+
				+	for (i = 0; i < ETH_ALEN; i++) {
				+		unsigned long octet = simple_strtoul(s, &e, 16);
				+		char expected = (i == ETH_ALEN-1) ? '\0' : ':';
				+
				+		if ((s == e) || (*e != expected) || (octet > 0xff)) {
				+			err = -ENOENT;
				+			break;
				+		}
				+		mac[i] = octet;
				+		s = e+1;
				+	}
				+
				+	/* The string returned by xenbus_read() is ours to free. */
				+	kfree(macstr);
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Interrupt handler bound to the device's event channel.  With the
				+ * carrier up it reclaims completed tx buffers and schedules the rx poll
				+ * if responses are waiting.  Always reports IRQ_HANDLED.
				+ */
				+static irqreturn_t xennet_interrupt(int irq, void *dev_id)
				+{
				+	struct net_device *netdev = dev_id;
				+	struct netfront_info *info = netdev_priv(netdev);
				+	unsigned long irqflags;
				+
				+	spin_lock_irqsave(&info->tx_lock, irqflags);
				+
				+	if (unlikely(!netif_carrier_ok(netdev)))
				+		goto unlock;
				+
				+	xennet_tx_buf_gc(netdev);
				+
				+	/* tx_lock also guards this look at the rx shared-ring indexes. */
				+	if (RING_HAS_UNCONSUMED_RESPONSES(&info->rx))
				+		netif_rx_schedule(netdev);
				+
				+unlock:
				+	spin_unlock_irqrestore(&info->tx_lock, irqflags);
				+	return IRQ_HANDLED;
				+}
			
 
				+
			
 
				+/*
				+ * Set up the resources shared with the backend: read the MAC address,
				+ * allocate and grant one page each for the tx and rx rings, allocate an
				+ * event channel and bind it to xennet_interrupt().
				+ *
				+ * On success returns 0 with info->tx_ring_ref, info->rx_ring_ref,
				+ * info->evtchn and netdev->irq filled in.
				+ *
				+ * On failure returns a negative errno.  Any ring that had already been
				+ * granted is revoked and its page freed, and the ring references and
				+ * sring pointers are reset, so no page is leaked and no pointer to a
				+ * freed page is left in 'info'.
				+ */
				+static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
				+{
				+	struct xen_netif_tx_sring *txs;
				+	struct xen_netif_rx_sring *rxs;
				+	int err;
				+	struct net_device *netdev = info->netdev;
				+
				+	info->tx_ring_ref = GRANT_INVALID_REF;
				+	info->rx_ring_ref = GRANT_INVALID_REF;
				+	info->rx.sring = NULL;
				+	info->tx.sring = NULL;
				+	netdev->irq = 0;
				+
				+	err = xen_net_read_mac(dev, netdev->dev_addr);
				+	if (err) {
				+		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
				+		goto fail;
				+	}
				+
				+	txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
				+	if (!txs) {
				+		err = -ENOMEM;
				+		xenbus_dev_fatal(dev, err, "allocating tx ring page");
				+		goto fail;
				+	}
				+	SHARED_RING_INIT(txs);
				+	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
				+
				+	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
				+	if (err < 0) {
				+		/* Never granted: free directly and drop the stale pointer. */
				+		free_page((unsigned long)txs);
				+		info->tx.sring = NULL;
				+		goto fail;
				+	}
				+
				+	info->tx_ring_ref = err;
				+	rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
				+	if (!rxs) {
				+		err = -ENOMEM;
				+		xenbus_dev_fatal(dev, err, "allocating rx ring page");
				+		goto fail;
				+	}
				+	SHARED_RING_INIT(rxs);
				+	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
				+
				+	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
				+	if (err < 0) {
				+		/* Never granted: free directly and drop the stale pointer. */
				+		free_page((unsigned long)rxs);
				+		info->rx.sring = NULL;
				+		goto fail;
				+	}
				+	info->rx_ring_ref = err;
				+
				+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
				+	if (err)
				+		goto fail;
				+
				+	err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
				+					IRQF_SAMPLE_RANDOM, netdev->name,
				+					netdev);
				+	if (err < 0)
				+		goto fail;
				+	netdev->irq = err;
				+	return 0;
				+
				+ fail:
				+	/*
				+	 * Undo the grants made so far.  xennet_end_access() ignores
				+	 * GRANT_INVALID_REF and frees the page of any valid reference.
				+	 * NOTE(review): an event channel allocated before a failed bind
				+	 * is not released here -- confirm whether it should be.
				+	 */
				+	xennet_end_access(info->rx_ring_ref, info->rx.sring);
				+	xennet_end_access(info->tx_ring_ref, info->tx.sring);
				+	info->rx_ring_ref = GRANT_INVALID_REF;
				+	info->tx_ring_ref = GRANT_INVALID_REF;
				+	info->rx.sring = NULL;
				+	info->tx.sring = NULL;
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Common code used when first setting up, and when resuming.
				+ *
				+ * Creates the shared rings and event channel via setup_netfront(), then
				+ * publishes the ring references, the event channel and the frontend
				+ * feature flags under dev->nodename inside one xenbus transaction.  The
				+ * transaction is restarted when ending it returns -EAGAIN.
				+ *
				+ * Returns 0 on success or a negative errno.  Failures after
				+ * setup_netfront() succeeded go through xennet_disconnect_backend().
				+ */
				+static int talk_to_backend(struct xenbus_device *dev,
				+			   struct netfront_info *info)
				+{
				+	const char *message;	/* names the failed write for the error report */
				+	struct xenbus_transaction xbt;
				+	int err;
				+
				+	/* Create shared ring, alloc event channel. */
				+	err = setup_netfront(dev, info);
				+	if (err)
				+		goto out;
				+
				+again:
				+	err = xenbus_transaction_start(&xbt);
				+	if (err) {
				+		xenbus_dev_fatal(dev, err, "starting transaction");
				+		goto destroy_ring;
				+	}
				+
				+	err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
				+			    info->tx_ring_ref);
				+	if (err) {
				+		message = "writing tx ring-ref";
				+		goto abort_transaction;
				+	}
				+	err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
				+			    info->rx_ring_ref);
				+	if (err) {
				+		message = "writing rx ring-ref";
				+		goto abort_transaction;
				+	}
				+	err = xenbus_printf(xbt, dev->nodename,
				+			    "event-channel", "%u", info->evtchn);
				+	if (err) {
				+		message = "writing event-channel";
				+		goto abort_transaction;
				+	}
				+
				+	err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
				+			    1);
				+	if (err) {
				+		message = "writing request-rx-copy";
				+		goto abort_transaction;
				+	}
				+
				+	err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
				+	if (err) {
				+		message = "writing feature-rx-notify";
				+		goto abort_transaction;
				+	}
				+
				+	err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
				+	if (err) {
				+		message = "writing feature-sg";
				+		goto abort_transaction;
				+	}
				+
				+	err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
				+	if (err) {
				+		message = "writing feature-gso-tcpv4";
				+		goto abort_transaction;
				+	}
				+
				+	err = xenbus_transaction_end(xbt, 0);
				+	if (err) {
				+		if (err == -EAGAIN)	/* retry the whole transaction */
				+			goto again;
				+		xenbus_dev_fatal(dev, err, "completing transaction");
				+		goto destroy_ring;
				+	}
				+
				+	return 0;
				+
				+ abort_transaction:
				+	xenbus_transaction_end(xbt, 1);	/* second argument 1: abort instead of commit */
				+	xenbus_dev_fatal(dev, err, "%s", message);
				+ destroy_ring:
				+	xennet_disconnect_backend(info);
				+ out:
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * ethtool set_sg hook.
				+ *
				+ * Enabling scatter/gather requires the backend to advertise a non-zero
				+ * "feature-sg"; otherwise -ENOSYS is returned.  Disabling it first caps
				+ * dev->mtu at ETH_DATA_LEN.  In every other case the result of
				+ * ethtool_op_set_sg() is returned.
				+ */
				+static int xennet_set_sg(struct net_device *dev, u32 data)
				+{
				+	if (!data) {
				+		if (dev->mtu > ETH_DATA_LEN)
				+			dev->mtu = ETH_DATA_LEN;
				+	} else {
				+		struct netfront_info *info = netdev_priv(dev);
				+		int backend_sg;
				+		int rc;
				+
				+		rc = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
				+				  "feature-sg", "%d", &backend_sg);
				+		/* An unreadable node counts as "not supported". */
				+		if (rc < 0)
				+			backend_sg = 0;
				+		if (backend_sg == 0)
				+			return -ENOSYS;
				+	}
				+
				+	return ethtool_op_set_sg(dev, data);
				+}
			
 
				+
			
 
				+/*
				+ * ethtool set_tso hook.
				+ *
				+ * Enabling TSO requires the backend to advertise a non-zero
				+ * "feature-gso-tcpv4"; otherwise -ENOSYS is returned.  In every other
				+ * case the result of ethtool_op_set_tso() is returned.
				+ */
				+static int xennet_set_tso(struct net_device *dev, u32 data)
				+{
				+	struct netfront_info *info;
				+	int backend_tso;
				+
				+	if (!data)
				+		return ethtool_op_set_tso(dev, data);
				+
				+	info = netdev_priv(dev);
				+	/* An unreadable node counts as "not supported". */
				+	if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
				+			 "feature-gso-tcpv4", "%d", &backend_tso) < 0)
				+		backend_tso = 0;
				+	if (backend_tso == 0)
				+		return -ENOSYS;
				+
				+	return ethtool_op_set_tso(dev, data);
				+}
			
 
				+
			
 
				+/*
				+ * Reset the offload feature bits and re-enable what can be used:
				+ * all GSO bits are cleared except NETIF_F_GSO_ROBUST, scatter/gather is
				+ * switched off, and then -- only if checksum offload is enabled --
				+ * scatter/gather and, when that succeeds, TSO are switched on.
				+ */
				+static void xennet_set_features(struct net_device *dev)
				+{
				+	/* Keep the bits below the GSO range, add ROBUST. */
				+	dev->features = (dev->features & ((1 << NETIF_F_GSO_SHIFT) - 1))
				+			| NETIF_F_GSO_ROBUST;
				+	xennet_set_sg(dev, 0);
				+
				+	/* Scatter/gather and TSO both depend on checksum offload. */
				+	if ((dev->features & NETIF_F_IP_CSUM) && xennet_set_sg(dev, 1) == 0)
				+		xennet_set_tso(dev, 1);
				+}
			
 
				+
			
 
				+/*
				+ * Connect (or reconnect) to the backend.
				+ *
				+ * Requires the backend to advertise "feature-rx-copy"; otherwise
				+ * -ENODEV is returned.  Publishes the ring details through
				+ * talk_to_backend(), refreshes the offload features, and then, with
				+ * both ring locks held, discards pending tx buffers, re-posts every rx
				+ * buffer still held onto the new rx ring, turns the carrier on and
				+ * notifies the backend.
				+ *
				+ * Returns 0 on success or a negative errno.
				+ */
				+static int xennet_connect(struct net_device *dev)
				+{
				+	struct netfront_info *np = netdev_priv(dev);
				+	int i, requeue_idx, err;
				+	struct sk_buff *skb;
				+	grant_ref_t ref;
				+	struct xen_netif_rx_request *req;
				+	unsigned int feature_rx_copy;
				+
				+	err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
				+			   "feature-rx-copy", "%u", &feature_rx_copy);
				+	if (err != 1)
				+		feature_rx_copy = 0;
				+
				+	if (!feature_rx_copy) {
				+		/* Message is newline-terminated so the log record is complete. */
				+		dev_info(&dev->dev,
				+			 "backend does not support copying receive path\n");
				+		return -ENODEV;
				+	}
				+
				+	err = talk_to_backend(np->xbdev, np);
				+	if (err)
				+		return err;
				+
				+	xennet_set_features(dev);
				+
				+	spin_lock_bh(&np->rx_lock);
				+	spin_lock_irq(&np->tx_lock);
				+
				+	/* Step 1: Discard all pending TX packet fragments. */
				+	xennet_release_tx_bufs(np);
				+
				+	/* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
				+	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
				+		if (!np->rx_skbs[i])
				+			continue;
				+
				+		/* Compact surviving buffers to the front of the arrays. */
				+		skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
				+		ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
				+		req = RING_GET_REQUEST(&np->rx, requeue_idx);
				+
				+		gnttab_grant_foreign_access_ref(
				+			ref, np->xbdev->otherend_id,
				+			pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
				+					       frags->page)),
				+			0);
				+		req->gref = ref;
				+		req->id   = requeue_idx;
				+
				+		requeue_idx++;
				+	}
				+
				+	np->rx.req_prod_pvt = requeue_idx;
				+
				+	/*
				+	 * Step 3: All public and private state should now be sane.  Get
				+	 * ready to start sending and receiving packets and give the driver
				+	 * domain a kick because we've probably just requeued some
				+	 * packets.
				+	 */
				+	netif_carrier_on(np->netdev);
				+	notify_remote_via_irq(np->netdev->irq);
				+	xennet_tx_buf_gc(dev);
				+	xennet_alloc_rx_buffers(dev);
				+
				+	spin_unlock_irq(&np->tx_lock);
				+	spin_unlock_bh(&np->rx_lock);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/**
				+ * xenbus callback invoked when the backend changes state.
				+ *
				+ * InitWait: if this frontend is still Initialising, connect and, on
				+ * success, switch to Connected.  Closing: mark the frontend closed.
				+ * Every other listed state is ignored.
				+ */
				+static void backend_changed(struct xenbus_device *dev,
				+			    enum xenbus_state backend_state)
				+{
				+	struct netfront_info *info = dev->dev.driver_data;
				+	struct net_device *nd = info->netdev;
				+
				+	dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
				+
				+	switch (backend_state) {
				+	case XenbusStateClosing:
				+		xenbus_frontend_closed(dev);
				+		break;
				+
				+	case XenbusStateInitWait:
				+		if (dev->state == XenbusStateInitialising &&
				+		    xennet_connect(nd) == 0)
				+			xenbus_switch_state(dev, XenbusStateConnected);
				+		break;
				+
				+	case XenbusStateInitialising:
				+	case XenbusStateInitialised:
				+	case XenbusStateConnected:
				+	case XenbusStateUnknown:
				+	case XenbusStateClosed:
				+		/* nothing to do */
				+		break;
				+	}
				+}
			
 
				+
			
 
				+static struct ethtool_ops xennet_ethtool_ops =
				+{	/* ethtool hooks, installed by xennet_create_dev() via SET_ETHTOOL_OPS() */
				+	.get_tx_csum = ethtool_op_get_tx_csum,
				+	.set_tx_csum = ethtool_op_set_tx_csum,
				+	.get_sg = ethtool_op_get_sg,
				+	.set_sg = xennet_set_sg,	/* driver wrapper: checks backend "feature-sg" first */
				+	.get_tso = ethtool_op_get_tso,
				+	.set_tso = xennet_set_tso,	/* driver wrapper: checks backend "feature-gso-tcpv4" first */
				+	.get_link = ethtool_op_get_link,
				+};
			
 
				+
			
 
				+#ifdef CONFIG_SYSFS
			
 
				+/* sysfs show handler: print the minimum rx buffer target as "%u\n". */
				+static ssize_t show_rxbuf_min(struct device *dev,
				+			      struct device_attribute *attr, char *buf)
				+{
				+	struct netfront_info *np = netdev_priv(to_net_dev(dev));
				+
				+	return sprintf(buf, "%u\n", np->rx_min_target);
				+}
			
 
				+
			
 
				+/*
				+ * sysfs store handler for the minimum rx buffer target.
				+ *
				+ * Needs CAP_NET_ADMIN (-EPERM otherwise).  The value is parsed with
				+ * base auto-detection (-EBADMSG if no digits were consumed) and clamped
				+ * to [RX_MIN_TARGET, RX_MAX_TARGET].  The maximum and current targets
				+ * are raised if they would otherwise fall below the new minimum, and
				+ * the rx ring is refilled.  Returns 'len' on success.
				+ */
				+static ssize_t store_rxbuf_min(struct device *dev,
				+			       struct device_attribute *attr,
				+			       const char *buf, size_t len)
				+{
				+	struct net_device *nd = to_net_dev(dev);
				+	struct netfront_info *info = netdev_priv(nd);
				+	unsigned long wanted;
				+	char *tail;
				+
				+	if (!capable(CAP_NET_ADMIN))
				+		return -EPERM;
				+
				+	wanted = simple_strtoul(buf, &tail, 0);
				+	if (tail == buf)
				+		return -EBADMSG;
				+
				+	/* Clamp into the supported range. */
				+	if (wanted < RX_MIN_TARGET)
				+		wanted = RX_MIN_TARGET;
				+	if (wanted > RX_MAX_TARGET)
				+		wanted = RX_MAX_TARGET;
				+
				+	spin_lock_bh(&info->rx_lock);
				+
				+	/* Keep min <= current and min <= max. */
				+	if (info->rx_max_target < wanted)
				+		info->rx_max_target = wanted;
				+	info->rx_min_target = wanted;
				+	if (info->rx_target < wanted)
				+		info->rx_target = wanted;
				+
				+	xennet_alloc_rx_buffers(nd);
				+
				+	spin_unlock_bh(&info->rx_lock);
				+
				+	return len;
				+}
			
 
				+
			
 
				+/* sysfs "rxbuf_max" read handler: print rx_max_target as one decimal line. */
				+static ssize_t show_rxbuf_max(struct device *dev,
				+			      struct device_attribute *attr, char *buf)
				+{
				+	struct netfront_info *np = netdev_priv(to_net_dev(dev));
				+
				+	return sprintf(buf, "%u\n", np->rx_max_target);
				+}
			
 
				+
			
 
				+/*
				+ * sysfs "rxbuf_max" write handler.
				+ *
				+ * Parses an unsigned number from @buf, clamps it to
				+ * [RX_MIN_TARGET, RX_MAX_TARGET] and installs it as the new maximum,
				+ * lowering the minimum and current targets where they would otherwise
				+ * exceed it.  Returns @len on success, -EPERM without CAP_NET_ADMIN, or
				+ * -EBADMSG when no digits could be parsed.
				+ */
				+static ssize_t store_rxbuf_max(struct device *dev,
				+			       struct device_attribute *attr,
				+			       const char *buf, size_t len)
				+{
				+	struct net_device *netdev = to_net_dev(dev);
				+	struct netfront_info *np = netdev_priv(netdev);
				+	unsigned long val;
				+	char *end;
				+
				+	if (!capable(CAP_NET_ADMIN))
				+		return -EPERM;
				+
				+	val = simple_strtoul(buf, &end, 0);
				+	if (end == buf)
				+		return -EBADMSG;
				+
				+	/* Clamp the request into the supported range. */
				+	if (val < RX_MIN_TARGET)
				+		val = RX_MIN_TARGET;
				+	if (val > RX_MAX_TARGET)
				+		val = RX_MAX_TARGET;
				+
				+	spin_lock_bh(&np->rx_lock);
				+
				+	np->rx_max_target = val;
				+	/* Keep min <= max and cur <= max. */
				+	if (np->rx_min_target > val)
				+		np->rx_min_target = val;
				+	if (np->rx_target > val)
				+		np->rx_target = val;
				+
				+	xennet_alloc_rx_buffers(netdev);
				+
				+	spin_unlock_bh(&np->rx_lock);
				+
				+	return len;
				+}
			
 
				+
			
 
				+/* sysfs "rxbuf_cur" read handler: print rx_target as one decimal line. */
				+static ssize_t show_rxbuf_cur(struct device *dev,
				+			      struct device_attribute *attr, char *buf)
				+{
				+	struct netfront_info *np = netdev_priv(to_net_dev(dev));
				+
				+	return sprintf(buf, "%u\n", np->rx_target);
				+}
			
 
				+
			
 
				+/*
				+ * Per-interface sysfs attributes: rxbuf_min and rxbuf_max are readable by
				+ * everyone and writable by the owner; rxbuf_cur is read-only (no store
				+ * handler).
				+ */
				+static struct device_attribute xennet_attrs[] = {
				+	__ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
				+	__ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
				+	__ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
				+};
			
 
				+
			
 
				+/*
				+ * Create every xennet_attrs[] file under @netdev's device.
				+ * On the first failure the files created so far are removed again
				+ * (newest first) and the error from device_create_file() is returned;
				+ * returns 0 when all files were created.
				+ */
				+static int xennet_sysfs_addif(struct net_device *netdev)
				+{
				+	int idx;
				+	int rc;
				+
				+	for (idx = 0; idx < ARRAY_SIZE(xennet_attrs); idx++) {
				+		rc = device_create_file(&netdev->dev, &xennet_attrs[idx]);
				+		if (!rc)
				+			continue;
				+
				+		/* Undo the attributes that were already created. */
				+		while (--idx >= 0)
				+			device_remove_file(&netdev->dev, &xennet_attrs[idx]);
				+		return rc;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Remove every xennet_attrs[] file from @netdev's device, in array order. */
				+static void xennet_sysfs_delif(struct net_device *netdev)
				+{
				+	struct device_attribute *attr;
				+
				+	for (attr = xennet_attrs;
				+	     attr < xennet_attrs + ARRAY_SIZE(xennet_attrs); attr++)
				+		device_remove_file(&netdev->dev, attr);
				+}
			
 
				+
			
 
				+#endif /* CONFIG_SYSFS */
			
 
				+
			
 
				+/* xenbus device types handled by this driver; the empty string ends the list. */
				+static struct xenbus_device_id netfront_ids[] = {
				+	{ "vif" },
				+	{ "" }
				+};
			
 
				+
			
 
				+
			
 
				+/*
				+ * xenbus remove callback: tear down the network device attached to @dev.
				+ * The driver state is taken from dev->dev.driver_data.  Always returns 0.
				+ */
				+static int __devexit xennet_remove(struct xenbus_device *dev)
				+{
				+	struct netfront_info *info = dev->dev.driver_data;
				+
				+	dev_dbg(&dev->dev, "%s\n", dev->nodename);
				+
				+	unregister_netdev(info->netdev);
				+
				+	xennet_disconnect_backend(info);
				+
				+	del_timer_sync(&info->rx_refill_timer);
				+
				+	/*
				+	 * NOTE(review): the sysfs attributes are removed only after
				+	 * unregister_netdev(); confirm this ordering is intended.
				+	 */
				+	xennet_sysfs_delif(info->netdev);
				+
				+	/* Must stay last: nothing may touch info->netdev after this. */
				+	free_netdev(info->netdev);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * xenbus driver description for the "vif" frontend: binds the probe,
				+ * remove, resume and backend-state-change callbacks to the device types
				+ * listed in netfront_ids.
				+ */
				+static struct xenbus_driver netfront = {
				+	.name = "vif",
				+	.owner = THIS_MODULE,
				+	.ids = netfront_ids,
				+	.probe = netfront_probe,
				+	.remove = __devexit_p(xennet_remove),
				+	.resume = netfront_resume,
				+	.otherend_changed = backend_changed,
				+};
			
 
				+
			
 
				+/*
				+ * Module init: register the "vif" frontend with xenbus.
				+ * Returns -ENODEV when not running on Xen, 0 (without registering) in the
				+ * initial domain, otherwise the result of xenbus_register_frontend().
				+ */
				+static int __init netif_init(void)
				+{
				+	if (!is_running_on_xen())
				+		return -ENODEV;
				+
				+	if (!is_initial_xendomain()) {
				+		printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
				+		return xenbus_register_frontend(&netfront);
				+	}
				+
				+	return 0;
				+}
			
 
				+module_init(netif_init);
			
 
				+
			
 
				+
			
 
				+/*
				+ * Module exit: unregister the frontend driver.  Nothing was registered in
				+ * the initial domain (see netif_init), so there is nothing to undo there.
				+ */
				+static void __exit netif_exit(void)
				+{
				+	if (is_initial_xendomain())
				+		return;
				+
				+	/* A void function must not "return <expr>;" -- just make the call. */
				+	xenbus_unregister_driver(&netfront);
				+}
			
 
				+module_exit(netif_exit);
			
 
				+
			
 
				+MODULE_DESCRIPTION("Xen virtual network device frontend");
			
 
				+MODULE_LICENSE("GPL");
			
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
 
				 		info->location_id, info->serial, info->capabilities);
			
 
				 	envp[i] = NULL;
			
 
				 	
			
 
				-	value = call_usermodehelper (argv [0], argv, envp, 0);
			
 
				+	value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
			
 
				 	kfree (buf);
			
 
				 	kfree (envp);
			
 
				 	return 0;
			
--- a/drivers/sbus/char/bbc_envctrl.c
+++ b/drivers/sbus/char/bbc_envctrl.c
@@ -7,6 +7,7 @@
 
				 #include <linux/kthread.h>
			
 
				 #include <linux/delay.h>
			
 
				 #include <linux/kmod.h>
			
 
				+#include <linux/reboot.h>
			
 
				 #include <asm/oplib.h>
			
 
				 #include <asm/ebus.h>
			
 
				 
			
@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
 
				 static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
			
 
				 {
			
 
				 	static int shutting_down = 0;
			
 
				-	static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
			
 
				-	char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
			
 
				 	char *type = "???";
			
 
				 	s8 val = -1;
			
 
				 
			
@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 
				 	printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
			
 
				 
			
 
				 	shutting_down = 1;
			
 
				-	if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0)
			
 
				+	if (orderly_poweroff(true) < 0)
			
 
				 		printk(KERN_CRIT "envctrl: shutdown execution failed\n");
			
 
				 }
			
 
				 
			
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -26,6 +26,7 @@
 
				 #include <linux/ioport.h>
			
 
				 #include <linux/miscdevice.h>
			
 
				 #include <linux/kmod.h>
			
 
				+#include <linux/reboot.h>
			
 
				 
			
 
				 #include <asm/ebus.h>
			
 
				 #include <asm/uaccess.h>
			
@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
 
				 static void envctrl_do_shutdown(void)
			
 
				 {
			
 
				 	static int inprog = 0;
			
 
				-	static char *envp[] = {	
			
 
				-		"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
			
 
				-	char *argv[] = { 
			
 
				-		"/sbin/shutdown", "-h", "now", NULL };	
			
 
				 	int ret;
			
 
				 
			
 
				 	if (inprog != 0)
			
@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
 
				 
			
 
				 	inprog = 1;
			
 
				 	printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
			
 
				-	ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0);
			
 
				+	ret = orderly_poweroff(true);
			
 
				 	if (ret < 0) {
			
 
				 		printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); 
			
 
				 		inprog = 0;  /* unlikely to succeed, but we could try again */
			
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -0,0 +1,2 @@
 
				+obj-y	+= grant-table.o
			
 
				+obj-y	+= xenbus/
			
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -0,0 +1,582 @@
 
				+/******************************************************************************
			
 
				+ * grant-table.c
			
 
				+ *
			
 
				+ * Granting foreign access to our memory reservation.
			
 
				+ *
			
 
				+ * Copyright (c) 2005-2006, Christopher Clark
			
 
				+ * Copyright (c) 2004-2005, K A Fraser
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/mm.h>
			
 
				+#include <linux/vmalloc.h>
			
 
				+#include <linux/uaccess.h>
			
 
				+
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/page.h>
			
 
				+#include <xen/grant_table.h>
			
 
				+
			
 
				+#include <asm/pgtable.h>
			
 
				+#include <asm/sync_bitops.h>
			
 
				+
			
 
				+
			
 
				+/* External tools reserve first few grant table entries. */
				+#define NR_RESERVED_ENTRIES 8
				+/* Terminator stored in the last link of a free-list chain. */
				+#define GNTTAB_LIST_END 0xffffffff
				+/* Number of grant entries that fit in one page of the shared table. */
				+#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
				+
				+/* Pages of "next" links forming the free lists; see gnttab_entry(). */
				+static grant_ref_t **gnttab_list;
				+static unsigned int nr_grant_frames;
				+static unsigned int boot_max_nr_grant_frames;
				+/* Length and first element of the global free list. */
				+static int gnttab_free_count;
				+static grant_ref_t gnttab_free_head;
				+/* Taken around updates of the global free list and the callback list. */
				+static DEFINE_SPINLOCK(gnttab_list_lock);
				+
				+/* The grant entry table itself, indexed by grant reference. */
				+static struct grant_entry *shared;
				+
				+/* Callbacks waiting for enough free entries; see do_free_callbacks(). */
				+static struct gnttab_free_callback *gnttab_free_callback_list;
				+
				+static int gnttab_expand(unsigned int req_entries);
				+
				+/* Grant references (free-list links) per page of gnttab_list. */
				+#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
			
 
				+
			
 
				+/* Locate the free-list link slot belonging to grant reference @entry. */
				+static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
				+{
				+	grant_ref_t *page = gnttab_list[entry / RPP];
				+
				+	return page + (entry % RPP);
				+}
				+/* This can be used as an l-value */
				+#define gnttab_entry(entry) (*__gnttab_entry(entry))
			
 
				+
			
 
				+/*
				+ * Detach a chain of @count references from the global free list.
				+ *
				+ * The table is expanded first if the free list is too short.  Returns the
				+ * first reference of the detached chain (whose last link is set to
				+ * GNTTAB_LIST_END), or the negative error from gnttab_expand().
				+ */
				+static int get_free_entries(unsigned count)
				+{
				+	unsigned long flags;
				+	grant_ref_t first, last;
				+	int rc;
				+
				+	spin_lock_irqsave(&gnttab_list_lock, flags);
				+
				+	if (gnttab_free_count < count) {
				+		rc = gnttab_expand(count - gnttab_free_count);
				+		if (rc < 0) {
				+			spin_unlock_irqrestore(&gnttab_list_lock, flags);
				+			return rc;
				+		}
				+	}
				+
				+	first = last = gnttab_free_head;
				+	gnttab_free_count -= count;
				+
				+	/* Walk to the last entry of the chain being handed out. */
				+	for (; count > 1; count--)
				+		last = gnttab_entry(last);
				+
				+	/* Cut the chain off the free list and terminate it. */
				+	gnttab_free_head = gnttab_entry(last);
				+	gnttab_entry(last) = GNTTAB_LIST_END;
				+
				+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
				+
				+	return first;
				+}
			
 
				+
			
 
				+/*
				+ * Run every queued callback whose requested count is now covered by the
				+ * free list; callbacks that still cannot be satisfied are put back on the
				+ * list.
				+ */
				+static void do_free_callbacks(void)
				+{
				+	struct gnttab_free_callback *cb = gnttab_free_callback_list;
				+
				+	/* Start from an empty list and rebuild it with the leftovers. */
				+	gnttab_free_callback_list = NULL;
				+
				+	while (cb != NULL) {
				+		struct gnttab_free_callback *next = cb->next;
				+
				+		if (gnttab_free_count < cb->count) {
				+			cb->next = gnttab_free_callback_list;
				+			gnttab_free_callback_list = cb;
				+		} else {
				+			cb->next = NULL;
				+			cb->fn(cb->arg);
				+		}
				+
				+		cb = next;
				+	}
				+}
			
 
				+
			
 
				+/* Process the callback list, but only if anything is queued on it. */
				+static inline void check_free_callbacks(void)
				+{
				+	if (likely(!gnttab_free_callback_list))
				+		return;
				+
				+	do_free_callbacks();
				+}
			
 
				+
			
 
				+/* Push @ref onto the global free list and run any satisfiable callbacks. */
				+static void put_free_entry(grant_ref_t ref)
				+{
				+	unsigned long irqflags;
				+
				+	spin_lock_irqsave(&gnttab_list_lock, irqflags);
				+
				+	gnttab_free_count++;
				+	gnttab_entry(ref) = gnttab_free_head;
				+	gnttab_free_head = ref;
				+
				+	check_free_callbacks();
				+
				+	spin_unlock_irqrestore(&gnttab_list_lock, irqflags);
				+}
			
 
				+
			
 
				+/*
				+ * Fill in shared[@ref] with @frame and @domid, then publish it by writing
				+ * @flags after a write barrier.
				+ */
				+static void update_grant_entry(grant_ref_t ref, domid_t domid,
				+			       unsigned long frame, unsigned flags)
				+{
				+	/*
				+	 * Introducing a valid entry into the grant table:
				+	 *  1. Write ent->domid.
				+	 *  2. Write ent->frame:
				+	 *      GTF_permit_access:   Frame to which access is permitted.
				+	 *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
				+	 *                           frame, or zero if none.
				+	 *  3. Write memory barrier (WMB).
				+	 *  4. Write ent->flags, inc. valid type.
				+	 *
				+	 * Steps 1 and 2 are performed in the opposite order below; both
				+	 * happen before the barrier.
				+	 */
				+	shared[ref].frame = frame;
				+	shared[ref].domid = domid;
				+	wmb();
				+	shared[ref].flags = flags;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Public grant-issuing interface functions
			
 
				+ */
			
 
				+/*
				+ * Grant @domid access to @frame through the already-allocated reference
				+ * @ref; the grant is read-only when @readonly is non-zero.
				+ */
				+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
				+				     unsigned long frame, int readonly)
				+{
				+	unsigned flags = GTF_permit_access;
				+
				+	if (readonly)
				+		flags |= GTF_readonly;
				+
				+	update_grant_entry(ref, domid, frame, flags);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
			
 
				+
			
 
				+/*
				+ * Allocate a grant reference and grant @domid access to @frame with it.
				+ * Returns the new reference, or -ENOSPC if none could be allocated.
				+ */
				+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
				+				int readonly)
				+{
				+	int ref = get_free_entries(1);
				+
				+	if (likely(ref >= 0)) {
				+		gnttab_grant_foreign_access_ref(ref, domid, frame, readonly);
				+		return ref;
				+	}
				+
				+	return -ENOSPC;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
			
 
				+
			
 
				+/*
				+ * Return the GTF_reading/GTF_writing bits currently set on @ref; non-zero
				+ * means the entry is marked as being read or written.
				+ */
				+int gnttab_query_foreign_access(grant_ref_t ref)
				+{
				+	u16 in_use = shared[ref].flags;
				+
				+	in_use &= GTF_reading|GTF_writing;
				+
				+	return in_use;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
			
 
				+
			
 
				+/*
				+ * Try to revoke the grant @ref by atomically clearing its flags.
				+ * Returns 1 on success, or 0 (after logging an alert) if the flags show
				+ * the entry is still being read or written.  @readonly is not used.
				+ */
				+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
				+{
				+	u16 flags, nflags;
				+
				+	nflags = shared[ref].flags;
				+	do {
				+		flags = nflags;
				+		if (flags & (GTF_reading|GTF_writing)) {
				+			printk(KERN_ALERT "WARNING: g.e. still in use!\n");
				+			return 0;
				+		}
				+	/* Retry with the fresh value if the flags changed under us. */
				+	} while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
				+
				+	return 1;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
			
 
				+
			
 
				+/*
				+ * Revoke the grant @ref and, on success, return the reference to the free
				+ * list and free @page (if non-zero).  If the grant is still in use both
				+ * the reference and the page are leaked and a warning is logged.
				+ */
				+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
				+			       unsigned long page)
				+{
				+	if (!gnttab_end_foreign_access_ref(ref, readonly)) {
				+		/* XXX This needs to be fixed so that the ref and page are
				+		   placed on a list to be freed up later. */
				+		printk(KERN_WARNING
				+		       "WARNING: leaking g.e. and page still in use!\n");
				+		return;
				+	}
				+
				+	put_free_entry(ref);
				+	if (page != 0)
				+		free_page(page);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
			
 
				+
			
 
				+/*
				+ * Allocate a grant reference and set it up to accept a page transfer from
				+ * @domid into @pfn.  Returns the new reference, or -ENOSPC if none could
				+ * be allocated.
				+ */
				+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
				+{
				+	int ref = get_free_entries(1);
				+
				+	if (likely(ref >= 0)) {
				+		gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
				+		return ref;
				+	}
				+
				+	return -ENOSPC;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
			
 
				+
			
 
				+/*
				+ * Mark the already-allocated reference @ref as accepting a transfer from
				+ * @domid, with @pfn stored in the entry's frame field.
				+ */
				+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
				+				       unsigned long pfn)
				+{
				+	update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
			
 
				+
			
 
				+/*
				+ * Finish a transfer grant on @ref.
				+ *
				+ * Returns 0 if the transfer had not been committed and the entry could be
				+ * reclaimed; otherwise busy-waits for completion and returns the frame
				+ * number found in the entry (which must be non-zero).
				+ */
				+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
				+{
				+	unsigned long frame;
				+	u16           flags;
				+
				+	/*
				+	 * If a transfer is not even yet started, try to reclaim the grant
				+	 * reference and return failure (== 0).
				+	 */
				+	while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
				+		/* Clearing the flags only succeeds if nothing changed them. */
				+		if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
				+			return 0;
				+		cpu_relax();
				+	}
				+
				+	/* If a transfer is in progress then wait until it is completed. */
				+	while (!(flags & GTF_transfer_completed)) {
				+		flags = shared[ref].flags;
				+		cpu_relax();
				+	}
				+
				+	rmb();	/* Read the frame number /after/ reading completion status. */
				+	frame = shared[ref].frame;
				+	BUG_ON(frame == 0);
				+
				+	return frame;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
			
 
				+
			
 
				+/*
				+ * Finish a transfer grant on @ref and return the reference to the free
				+ * list.  Returns whatever gnttab_end_foreign_transfer_ref() returned
				+ * (0 if the transfer never started, otherwise the frame number).
				+ */
				+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
				+{
				+	unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
				+	/* The reference is released whether or not a frame arrived. */
				+	put_free_entry(ref);
				+	return frame;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
			
 
				+
			
 
				+/* Return the single reference @ref to the global free list. */
				+void gnttab_free_grant_reference(grant_ref_t ref)
				+{
				+	put_free_entry(ref);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
			
 
				+
			
 
				+/*
				+ * Return a whole chain of references, starting at @head and ending at the
				+ * link holding GNTTAB_LIST_END, to the global free list.  An empty chain
				+ * (@head == GNTTAB_LIST_END) is a no-op.
				+ */
				+void gnttab_free_grant_references(grant_ref_t head)
				+{
				+	grant_ref_t tail;
				+	unsigned long flags;
				+	int nr;
				+
				+	if (head == GNTTAB_LIST_END)
				+		return;
				+
				+	spin_lock_irqsave(&gnttab_list_lock, flags);
				+
				+	/* Find the last element, counting the chain length on the way. */
				+	nr = 1;
				+	for (tail = head; gnttab_entry(tail) != GNTTAB_LIST_END;
				+	     tail = gnttab_entry(tail))
				+		nr++;
				+
				+	/* Splice the chain in front of the current free list. */
				+	gnttab_entry(tail) = gnttab_free_head;
				+	gnttab_free_head = head;
				+	gnttab_free_count += nr;
				+
				+	check_free_callbacks();
				+
				+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
			
 
				+
			
 
				+/*
				+ * Reserve a private chain of @count references and store its first
				+ * element in *@head.  Returns 0 on success or -ENOSPC, in which case
				+ * *@head is left untouched.
				+ */
				+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
				+{
				+	int first = get_free_entries(count);
				+
				+	if (first >= 0) {
				+		*head = first;
				+		return 0;
				+	}
				+
				+	return -ENOSPC;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
			
 
				+
			
 
				+/* Return non-zero if the private chain at *@private_head has no entries. */
				+int gnttab_empty_grant_references(const grant_ref_t *private_head)
				+{
				+	return (*private_head == GNTTAB_LIST_END);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
			
 
				+
			
 
				+/*
				+ * Pop the first reference off the private chain at *@private_head.
				+ * Returns that reference, or -ENOSPC if the chain is empty.
				+ */
				+int gnttab_claim_grant_reference(grant_ref_t *private_head)
				+{
				+	grant_ref_t ref = *private_head;
				+
				+	if (likely(ref != GNTTAB_LIST_END)) {
				+		*private_head = gnttab_entry(ref);
				+		return ref;
				+	}
				+
				+	return -ENOSPC;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
			
 
				+
			
 
				+/* Push @release back onto the front of the private chain at *@private_head. */
				+void gnttab_release_grant_reference(grant_ref_t *private_head,
				+				    grant_ref_t release)
				+{
				+	gnttab_entry(release) = *private_head;
				+	*private_head = release;
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
			
 
				+
			
 
				+/*
				+ * Queue @callback so that @fn(@arg) is invoked once at least @count
				+ * references are free.  A callback whose ->next is already set is left
				+ * untouched.  The list is checked immediately, so @fn may run before this
				+ * function returns.
				+ */
				+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
				+				  void (*fn)(void *), void *arg, u16 count)
				+{
				+	unsigned long flags;
				+
				+	spin_lock_irqsave(&gnttab_list_lock, flags);
				+
				+	if (!callback->next) {
				+		callback->fn = fn;
				+		callback->arg = arg;
				+		callback->count = count;
				+		callback->next = gnttab_free_callback_list;
				+		gnttab_free_callback_list = callback;
				+		check_free_callbacks();
				+	}
				+
				+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
			
 
				+
			
 
				+/* Unlink @callback from the pending-callback list if it is queued there. */
				+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
				+{
				+	struct gnttab_free_callback **link = &gnttab_free_callback_list;
				+	unsigned long flags;
				+
				+	spin_lock_irqsave(&gnttab_list_lock, flags);
				+
				+	while (*link) {
				+		if (*link == callback) {
				+			*link = callback->next;
				+			break;
				+		}
				+		link = &(*link)->next;
				+	}
				+
				+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
				+}
			
 
				+EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
			
 
				+
			
 
				+/*
				+ * Extend the free list to cover @more_frames additional grant frames.
				+ *
				+ * Allocates the extra free-list pages that are needed, chains the new
				+ * entries together, splices them in front of the existing free list and
				+ * runs any callbacks that can now be satisfied.
				+ *
				+ * Returns 0 on success or -ENOMEM, in which case every page allocated by
				+ * this call has been released again and no state was changed.
				+ */
				+static int grow_gnttab_list(unsigned int more_frames)
				+{
				+	unsigned int new_nr_grant_frames, extra_entries, i;
				+	unsigned int nr_glist_frames, new_nr_glist_frames;
				+
				+	new_nr_grant_frames = nr_grant_frames + more_frames;
				+	extra_entries       = more_frames * GREFS_PER_GRANT_FRAME;
				+
				+	/*
				+	 * gnttab_list[] is indexed in units of free-list pages (RPP refs
				+	 * each), not grant frames (GREFS_PER_GRANT_FRAME refs each), so
				+	 * convert before allocating.  Indexing it by grant frame would
				+	 * allocate pages that are never used and could run past the end
				+	 * of the pointer array.
				+	 */
				+	nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME +
				+			   RPP - 1) / RPP;
				+	new_nr_glist_frames = (new_nr_grant_frames * GREFS_PER_GRANT_FRAME +
				+			       RPP - 1) / RPP;
				+
				+	for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
				+		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
				+		if (!gnttab_list[i])
				+			goto grow_nomem;
				+	}
				+
				+	/* Chain the new entries: each one points at its successor. */
				+	for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames;
				+	     i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
				+		gnttab_entry(i) = i + 1;
				+
				+	/* The last new entry links to the old free list. */
				+	gnttab_entry(i) = gnttab_free_head;
				+	gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames;
				+	gnttab_free_count += extra_entries;
				+
				+	nr_grant_frames = new_nr_grant_frames;
				+
				+	check_free_callbacks();
				+
				+	return 0;
				+
				+grow_nomem:
				+	/*
				+	 * Slot i is the allocation that failed; free only the pages below
				+	 * it.  "i-- >" keeps the unsigned index from wrapping past zero.
				+	 */
				+	while (i-- > nr_glist_frames)
				+		free_page((unsigned long) gnttab_list[i]);
				+	return -ENOMEM;
				+}
			
 
				+
			
 
				+/*
				+ * Ask the hypervisor how many grant frames this domain may use.  Falls
				+ * back to 4 when the query hypercall fails or reports a non-okay status.
				+ */
				+static unsigned int __max_nr_grant_frames(void)
				+{
				+	struct gnttab_query_size query;
				+	int rc;
				+
				+	query.dom = DOMID_SELF;
				+	rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
				+
				+	if ((rc >= 0) && (query.status == GNTST_okay))
				+		return query.max_nr_frames;
				+
				+	return 4; /* Legacy max supported number of frames */
				+}
			
 
				+
			
 
				+/*
				+ * Current frame limit: the hypervisor's present answer, capped at the
				+ * value recorded in boot_max_nr_grant_frames.
				+ */
				+static inline unsigned int max_nr_grant_frames(void)
				+{
				+	unsigned int xen_max = __max_nr_grant_frames();
				+
				+	return (xen_max > boot_max_nr_grant_frames) ?
				+		boot_max_nr_grant_frames : xen_max;
				+}
			
 
				+
			
 
				+/*
				+ * apply_to_page_range() callback: point @pte at the next machine frame.
				+ * @data is the address of a cursor into an array of frame numbers; the
				+ * cursor is advanced by one on every call.  Always returns 0.
				+ */
				+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
				+		      unsigned long addr, void *data)
				+{
				+	unsigned long **cursor = (unsigned long **)data;
				+	unsigned long mfn = **cursor;
				+
				+	set_pte_at(&init_mm, addr, pte, mfn_pte(mfn, PAGE_KERNEL));
				+	++*cursor;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * apply_to_page_range() callback: clear @pte.  @pmd_page and @data are
				+ * not used.  Always returns 0.
				+ */
				+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
				+			unsigned long addr, void *data)
				+{
				+
				+	set_pte_at(&init_mm, addr, pte, __pte(0));
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Ask the hypervisor to set up grant frames 0..@end_idx and map them at
				+ * the "shared" virtual area, allocating that area on first use.
				+ *
				+ * @start_idx is not used: the whole range from frame 0 is requested and
				+ * mapped every time.
				+ *
				+ * Returns 0 on success, -ENOMEM if the temporary frame list cannot be
				+ * allocated, or -ENOSYS if the hypercall reports -ENOSYS.  Any other
				+ * failure trips a BUG_ON().
				+ */
				+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
				+{
				+	struct gnttab_setup_table setup;
				+	unsigned long *frames;
				+	unsigned int nr_gframes = end_idx + 1;
				+	int rc;
				+
				+	frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
				+	if (!frames)
				+		return -ENOMEM;
				+
				+	setup.dom        = DOMID_SELF;
				+	setup.nr_frames  = nr_gframes;
				+	setup.frame_list = frames;
				+
				+	rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
				+	if (rc == -ENOSYS) {
				+		kfree(frames);
				+		return -ENOSYS;
				+	}
				+
				+	BUG_ON(rc || setup.status);
				+
				+	if (shared == NULL) {
				+		struct vm_struct *area;
				+		/* Reserve address space for the largest table we may ever map. */
				+		area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
				+		BUG_ON(area == NULL);
				+		shared = area->addr;
				+	}
				+	/*
				+	 * &frames is passed so that map_pte_fn() can advance the pointer by
				+	 * one frame number per PTE it fills in.
				+	 */
				+	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
				+				 PAGE_SIZE * nr_gframes,
				+				 map_pte_fn, &frames);
				+	BUG_ON(rc);
				+	frames -= nr_gframes; /* adjust after map_pte_fn() */
				+
				+	kfree(frames);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * (Re)map all nr_grant_frames frames of the grant table.  Returns -ENOSYS
				+ * if the current frame limit is below what is already in use, otherwise
				+ * the result of gnttab_map().
				+ */
				+static int gnttab_resume(void)
				+{
				+	unsigned int limit = max_nr_grant_frames();
				+
				+	if (nr_grant_frames > limit)
				+		return -ENOSYS;
				+
				+	return gnttab_map(0, nr_grant_frames - 1);
				+}
			
 
				+
			
 
				+/*
				+ * Clear the PTEs covering the nr_grant_frames pages mapped at "shared".
				+ * The result of apply_to_page_range() is ignored; always returns 0.
				+ */
				+static int gnttab_suspend(void)
				+{
				+	apply_to_page_range(&init_mm, (unsigned long)shared,
				+			    PAGE_SIZE * nr_grant_frames,
				+			    unmap_pte_fn, NULL);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Grow the grant table by enough whole frames to hold @req_entries more
				+ * entries.  Returns 0 on success, -ENOSPC if that would exceed the frame
				+ * limit, or the error from gnttab_map() / grow_gnttab_list().
				+ */
				+static int gnttab_expand(unsigned int req_entries)
				+{
				+	unsigned int cur = nr_grant_frames;
				+	unsigned int extra;
				+	int rc;
				+
				+	/* Round the request up to a whole number of frames. */
				+	extra = (req_entries + GREFS_PER_GRANT_FRAME - 1) /
				+		GREFS_PER_GRANT_FRAME;
				+
				+	if (cur + extra > max_nr_grant_frames())
				+		return -ENOSPC;
				+
				+	rc = gnttab_map(cur, cur + extra - 1);
				+	if (rc != 0)
				+		return rc;
				+
				+	return grow_gnttab_list(extra);
				+}
			
 
				+
			
 
				+/*
				+ * Boot-time initialisation of the grant table.
				+ *
				+ * Sizes the free-list page array for the maximum number of grant frames
				+ * reported at boot, allocates the initial free-list page(s), maps the
				+ * first grant frame and builds the initial free list, skipping the first
				+ * NR_RESERVED_ENTRIES references.
				+ *
				+ * Returns 0 on success, -ENODEV when not running on Xen or when the
				+ * table cannot be mapped, and -ENOMEM on allocation failure.  Everything
				+ * allocated here is released again on any failure.
				+ */
				+static int __devinit gnttab_init(void)
				+{
				+	int i;
				+	unsigned int max_nr_glist_frames;
				+	unsigned int nr_init_grefs;
				+	int ret;
				+
				+	if (!is_running_on_xen())
				+		return -ENODEV;
				+
				+	nr_grant_frames = 1;
				+	boot_max_nr_grant_frames = __max_nr_grant_frames();
				+
				+	/* Determine the maximum number of frames required for the
				+	 * grant reference free list on the current hypervisor.
				+	 * Round up so that a partially used last page still gets a slot.
				+	 */
				+	max_nr_glist_frames = (boot_max_nr_grant_frames *
				+			       GREFS_PER_GRANT_FRAME + RPP - 1) / RPP;
				+
				+	gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
				+			      GFP_KERNEL);
				+	if (gnttab_list == NULL)
				+		return -ENOMEM;
				+
				+	for (i = 0; i < nr_grant_frames; i++) {
				+		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
				+		if (gnttab_list[i] == NULL) {
				+			ret = -ENOMEM;
				+			goto ini_fail;
				+		}
				+	}
				+
				+	/* Do not leak the list pages if the table cannot be mapped. */
				+	if (gnttab_resume() < 0) {
				+		ret = -ENODEV;
				+		goto ini_fail;
				+	}
				+
				+	nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
				+
				+	/* Chain all non-reserved entries into the initial free list. */
				+	for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
				+		gnttab_entry(i) = i + 1;
				+
				+	gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
				+	gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
				+	gnttab_free_head  = NR_RESERVED_ENTRIES;
				+
				+	printk("Grant table initialized\n");
				+	return 0;
				+
				+ ini_fail:
				+	/* i is one past the last page that was successfully allocated. */
				+	for (i--; i >= 0; i--)
				+		free_page((unsigned long)gnttab_list[i]);
				+	kfree(gnttab_list);
				+	return ret;
				+}
			
 
				+
			
 
				+core_initcall(gnttab_init);
			
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -0,0 +1,7 @@
 
				+obj-y	+= xenbus.o
			
 
				+
			
 
				+xenbus-objs =
			
 
				+xenbus-objs += xenbus_client.o
			
 
				+xenbus-objs += xenbus_comms.o
			
 
				+xenbus-objs += xenbus_xs.o
			
 
				+xenbus-objs += xenbus_probe.o
			
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -0,0 +1,569 @@
 
				+/******************************************************************************
			
 
				+ * Client-facing interface for the Xenbus driver.  In other words, the
			
 
				+ * interface between the Xenbus and the device-specific code, be it the
			
 
				+ * frontend or the backend of that driver.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 XenSource Ltd
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/types.h>
			
 
				+#include <linux/vmalloc.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/event_channel.h>
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/grant_table.h>
			
 
				+#include <xen/xenbus.h>
			
 
				+
			
 
				+/**
				+ * xenbus_strstate - printable name of a xenbus state
				+ * @state: state to name
				+ *
				+ * Returns the constant string naming @state, or "INVALID" when @state
				+ * indexes past the end of the name table.
				+ */
				+const char *xenbus_strstate(enum xenbus_state state)
				+{
				+	static const char *const state_names[] = {
				+		[XenbusStateUnknown]      = "Unknown",
				+		[XenbusStateInitialising] = "Initialising",
				+		[XenbusStateInitWait]     = "InitWait",
				+		[XenbusStateInitialised]  = "Initialised",
				+		[XenbusStateConnected]    = "Connected",
				+		[XenbusStateClosing]      = "Closing",
				+		[XenbusStateClosed]       = "Closed",
				+	};
				+
				+	if (state < ARRAY_SIZE(state_names))
				+		return state_names[state];
				+
				+	return "INVALID";
				+}
				+EXPORT_SYMBOL_GPL(xenbus_strstate);
			
 
				+
			
 
				+/**
				+ * xenbus_watch_path - register a watch
				+ * @dev: xenbus device
				+ * @path: path to watch
				+ * @watch: watch to register
				+ * @callback: callback to register
				+ *
				+ * Store @path and @callback in @watch and register it.  Returns 0 on
				+ * success, or the error from register_xenbus_watch().  On success @path
				+ * is kept as @watch->node and stays the caller's to free.  On error
				+ * @watch->node and @watch->callback are reset to NULL and the failure is
				+ * reported through xenbus_dev_fatal().
				+ */
				+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
				+		      struct xenbus_watch *watch,
				+		      void (*callback)(struct xenbus_watch *,
				+				       const char **, unsigned int))
				+{
				+	int rc;
				+
				+	watch->node = path;
				+	watch->callback = callback;
				+
				+	rc = register_xenbus_watch(watch);
				+	if (!rc)
				+		return rc;
				+
				+	/* Registration failed: leave the watch empty and report it. */
				+	watch->node = NULL;
				+	watch->callback = NULL;
				+	xenbus_dev_fatal(dev, rc, "adding watch on %s", path);
				+
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_watch_path);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
				+ * @dev: xenbus device
				+ * @watch: watch to register
				+ * @callback: callback to register
				+ * @pathfmt: format of path to watch
				+ *
				+ * Format the path from @pathfmt and its arguments into a newly allocated
				+ * string and register @watch on it via xenbus_watch_path().  Returns 0 on
				+ * success, or -errno on error.  On success the allocated path is saved as
				+ * @watch->node and becomes the caller's to kfree().  On error the path is
				+ * freed here, so the caller has nothing to free, and the failure is
				+ * reported through xenbus_dev_fatal().
				+ */
				+int xenbus_watch_pathfmt(struct xenbus_device *dev,
				+			 struct xenbus_watch *watch,
				+			 void (*callback)(struct xenbus_watch *,
				+					const char **, unsigned int),
				+			 const char *pathfmt, ...)
				+{
				+	va_list args;
				+	char *formatted;
				+	int rc;
				+
				+	va_start(args, pathfmt);
				+	formatted = kvasprintf(GFP_KERNEL, pathfmt, args);
				+	va_end(args);
				+
				+	if (formatted == NULL) {
				+		xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
				+		return -ENOMEM;
				+	}
				+
				+	rc = xenbus_watch_path(dev, formatted, watch, callback);
				+	if (rc != 0)
				+		kfree(formatted);	/* the watch did not take the path */
				+
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_switch_state - advertise a new driver state in the store
				+ * @dev: xenbus device
				+ * @state: new state
				+ *
				+ * Returns 0 when @state is already the cached state, when the "state"
				+ * node cannot be read, or when the new state was written.  Returns the
				+ * error from xenbus_printf() when the write fails; unless @state is
				+ * XenbusStateClosing, that failure is also reported through
				+ * xenbus_dev_fatal().
				+ */
				+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
				+{
				+	/*
				+	 * The state is only written when it differs from the cached value,
				+	 * so that watches do not fire unnecessarily.  If the node has gone
				+	 * the device is tearing down, and writing would resurrect that
				+	 * directory, so nothing is written either.
				+	 *
				+	 * Because of the cached value this function does not work inside a
				+	 * Xenstore transaction (something it tried to do in the past):
				+	 * dev->state would not be reset if the transaction was aborted.
				+	 */
				+	int stored_state;
				+	int rc;
				+
				+	if (dev->state == state)
				+		return 0;
				+
				+	/* Only used to learn whether the node still exists. */
				+	rc = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
				+			  &stored_state);
				+	if (rc != 1)
				+		return 0;
				+
				+	rc = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
				+	if (rc == 0) {
				+		dev->state = state;
				+		return 0;
				+	}
				+
				+	if (state != XenbusStateClosing) /* Avoid looping */
				+		xenbus_dev_fatal(dev, rc, "writing new state");
				+
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_switch_state);
			
 
				+
			
 
				+/**
				+ * xenbus_frontend_closed - mark the device closed and signal completion
				+ * @dev: xenbus device
				+ *
				+ * Switches @dev to XenbusStateClosed and completes @dev->down.  Always
				+ * returns 0; the result of xenbus_switch_state() is not checked.
				+ */
				+int xenbus_frontend_closed(struct xenbus_device *dev)
				+{
				+	xenbus_switch_state(dev, XenbusStateClosed);
				+	complete(&dev->down);
				+	return 0;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
			
 
				+
			
 
				+/**
				+ * error_path - build the store path of a device's error node
				+ * @dev: xenbus device
				+ *
				+ * Return the path to the error node for the given device ("error/"
				+ * followed by @dev->nodename), or NULL on failure.  If the value returned
				+ * is non-NULL, then it is the caller's to kfree.
				+ */
				+static char *error_path(struct xenbus_device *dev)
				+{
				+	return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
				+}
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_va_dev_error - log an error and record it in the store
				+ * @dev: xenbus device
				+ * @err: negative errno to report
				+ * @fmt: error message format
				+ * @ap: arguments for @fmt
				+ *
				+ * Formats "<-err> <message>" into a temporary buffer, logs it with
				+ * dev_err() and writes it to the "error" key under the device's error
				+ * node.  A message that does not fit in the buffer is truncated.  Nothing
				+ * is reported if the buffer itself cannot be allocated.
				+ */
				+static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
				+				const char *fmt, va_list ap)
				+{
				+	unsigned int len;
				+	char *printf_buffer;
				+	char *path_buffer;
				+
				+#define PRINTF_BUFFER_SIZE 4096
				+	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
				+	if (printf_buffer == NULL)
				+		return;
				+
				+	len = sprintf(printf_buffer, "%i ", -err);
				+
				+	/*
				+	 * vsnprintf() never writes past the size it is given and always
				+	 * terminates the string, so an over-long message is simply cut
				+	 * short.  That is no reason to bring the kernel down while
				+	 * reporting an error.
				+	 */
				+	vsnprintf(printf_buffer + len, PRINTF_BUFFER_SIZE - len, fmt, ap);
				+
				+	dev_err(&dev->dev, "%s\n", printf_buffer);
				+
				+	path_buffer = error_path(dev);
				+	if (path_buffer == NULL ||
				+	    xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0)
				+		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
				+			dev->nodename, printf_buffer);
				+
				+	/* kfree(NULL) is a no-op, so both paths share this cleanup. */
				+	kfree(printf_buffer);
				+	kfree(path_buffer);
				+}
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_dev_error - report an error for a device
				+ * @dev: xenbus device
				+ * @err: error to report
				+ * @fmt: error message format
				+ *
				+ * Pass the given negative errno and formatted message on to
				+ * xenbus_va_dev_error(), which logs them and records them in the store.
				+ */
				+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
				+{
				+	va_list args;
				+
				+	va_start(args, fmt);
				+	xenbus_va_dev_error(dev, err, fmt, args);
				+	va_end(args);
				+}
				+EXPORT_SYMBOL_GPL(xenbus_dev_error);
			
 
				+
			
 
				+/**
				+ * xenbus_dev_fatal - report an error and start closing the device
				+ * @dev: xenbus device
				+ * @err: error to report
				+ * @fmt: error message format
				+ *
				+ * Reports the error exactly as xenbus_dev_error() does, then calls
				+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
				+ * closedown of this driver and its peer.
				+ */
				+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
				+{
				+	va_list args;
				+
				+	va_start(args, fmt);
				+	xenbus_va_dev_error(dev, err, fmt, args);
				+	va_end(args);
				+
				+	xenbus_switch_state(dev, XenbusStateClosing);
				+}
				+EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
			
 
				+
			
 
				+/**
				+ * xenbus_grant_ring - grant the peer access to a ring page
				+ * @dev: xenbus device
				+ * @ring_mfn: mfn of ring to grant
				+ *
				+ * Grant access to the given @ring_mfn to the peer of the given device.
				+ * Returns whatever gnttab_grant_foreign_access() returned.  A negative
				+ * value is an error and is additionally reported through
				+ * xenbus_dev_fatal().
				+ */
				+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
				+{
				+	int rc;
				+
				+	rc = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
				+	if (rc >= 0)
				+		return rc;
				+
				+	xenbus_dev_fatal(dev, rc, "granting access to ring page");
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_grant_ring);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_alloc_evtchn - allocate an unbound event channel for a device
				+ * @dev: xenbus device
				+ * @port: where the new local port is stored on success
				+ *
				+ * Asks the hypervisor for an unbound event channel in this domain that
				+ * the peer domain (@dev->otherend_id) may bind to.  Returns 0 and sets
				+ * *@port on success.  On failure returns the hypercall's error, leaves
				+ * *@port untouched and reports the error through xenbus_dev_fatal().
				+ */
				+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
				+{
				+	struct evtchn_alloc_unbound request;
				+	int rc;
				+
				+	request.dom = DOMID_SELF;
				+	request.remote_dom = dev->otherend_id;
				+
				+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &request);
				+	if (rc) {
				+		xenbus_dev_fatal(dev, rc, "allocating event channel");
				+		return rc;
				+	}
				+
				+	*port = request.port;
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_bind_evtchn - bind to an event channel offered by the peer
				+ * @dev: xenbus device
				+ * @remote_port: port of the channel in the peer domain
				+ * @port: where the local port is stored on success
				+ *
				+ * Binds to the interdomain event channel @remote_port of the peer domain
				+ * (@dev->otherend_id).  Returns 0 and sets *@port on success.  On failure
				+ * returns the hypercall's error, leaves *@port untouched and reports the
				+ * error through xenbus_dev_fatal().
				+ */
				+int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
				+{
				+	struct evtchn_bind_interdomain request;
				+	int rc;
				+
				+	request.remote_dom = dev->otherend_id;
				+	request.remote_port = remote_port;
				+
				+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &request);
				+	if (rc) {
				+		xenbus_dev_fatal(dev, rc,
				+				 "binding to event channel %d from domain %d",
				+				 remote_port, dev->otherend_id);
				+		return rc;
				+	}
				+
				+	*port = request.local_port;
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_free_evtchn - close an event channel
				+ * @dev: xenbus device, used only for error reporting
				+ * @port: local port to close
				+ *
				+ * Returns 0 on success, or the hypercall's error.  An error is reported
				+ * through xenbus_dev_error(), so the device state is not changed.
				+ */
				+int xenbus_free_evtchn(struct xenbus_device *dev, int port)
				+{
				+	struct evtchn_close request;
				+	int rc;
				+
				+	request.port = port;
				+
				+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_close, &request);
				+	if (!rc)
				+		return rc;
				+
				+	xenbus_dev_error(dev, rc, "freeing event channel %d", port);
				+	return rc;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_map_ring_valloc - allocate address space and map a granted page
				+ * @dev: xenbus device
				+ * @gnt_ref: grant reference
				+ * @vaddr: pointer to address to be filled out by mapping
				+ *
				+ * Based on Rusty Russell's skeleton driver's map_page.
				+ * Allocates one page of virtual address space, maps the page granted by
				+ * the peer domain as @gnt_ref at that address, and stores the address in
				+ * *@vaddr.  The grant handle is kept in the area's phys_addr field so
				+ * that xenbus_unmap_ring_vfree() can find it again.
				+ *
				+ * Returns 0 on success.  Returns -ENOMEM, with *@vaddr NULL, when no
				+ * address space could be allocated.  Returns the GNTST_* status (see
				+ * xen/include/interface/grant_table.h) when the mapping fails; in that
				+ * case the area is freed, *@vaddr is NULL, and the error is reported
				+ * through xenbus_dev_fatal().
				+ */
				+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
				+{
				+	struct vm_struct *vm;
				+	struct gnttab_map_grant_ref map_op = {
				+		.flags = GNTMAP_host_map,
				+		.ref   = gnt_ref,
				+		.dom   = dev->otherend_id,
				+	};
				+
				+	*vaddr = NULL;
				+
				+	vm = alloc_vm_area(PAGE_SIZE);
				+	if (vm == NULL)
				+		return -ENOMEM;
				+
				+	map_op.host_addr = (unsigned long)vm->addr;
				+
				+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map_op, 1))
				+		BUG();
				+
				+	if (map_op.status == GNTST_okay) {
				+		/* Stuff the handle in an unused field */
				+		vm->phys_addr = (unsigned long)map_op.handle;
				+		*vaddr = vm->addr;
				+		return 0;
				+	}
				+
				+	free_vm_area(vm);
				+	xenbus_dev_fatal(dev, map_op.status,
				+			 "mapping in shared page %d from domain %d",
				+			 gnt_ref, dev->otherend_id);
				+	return map_op.status;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_map_ring - map a granted page at a caller-supplied address
				+ * @dev: xenbus device
				+ * @gnt_ref: grant reference
				+ * @handle: pointer to grant handle to be filled
				+ * @vaddr: address to be mapped to
				+ *
				+ * Map a page of memory into this domain from another domain's grant table.
				+ * xenbus_map_ring does not allocate the virtual address space (you must do
				+ * this yourself!). It only maps in the page to the specified address.
				+ *
				+ * Returns the GNTST_* status of the operation (see
				+ * xen/include/interface/grant_table.h).  On GNTST_okay *@handle is set.
				+ * On any other status *@handle is left untouched and the error is
				+ * reported through xenbus_dev_fatal().
				+ */
				+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
				+		    grant_handle_t *handle, void *vaddr)
				+{
				+	struct gnttab_map_grant_ref map_op = {
				+		.host_addr = (unsigned long)vaddr,
				+		.flags     = GNTMAP_host_map,
				+		.ref       = gnt_ref,
				+		.dom       = dev->otherend_id,
				+	};
				+
				+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map_op, 1))
				+		BUG();
				+
				+	if (map_op.status == GNTST_okay) {
				+		*handle = map_op.handle;
				+		return map_op.status;
				+	}
				+
				+	xenbus_dev_fatal(dev, map_op.status,
				+			 "mapping in shared page %d from domain %d",
				+			 gnt_ref, dev->otherend_id);
				+	return map_op.status;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_map_ring);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_unmap_ring_vfree - unmap a granted page and free its address space
				+ * @dev: xenbus device
				+ * @vaddr: addr to unmap
				+ *
				+ * Based on Rusty Russell's skeleton driver's unmap_page.
				+ * Counterpart of xenbus_map_ring_valloc(): looks up the vm area that
				+ * starts at @vaddr, unmaps the grant using the handle stored in the
				+ * area's phys_addr field, and frees the area if the unmap succeeded.
				+ *
				+ * Returns the GNTST_* status of the unmap (see
				+ * xen/include/interface/grant_table.h), or GNTST_bad_virt_addr when no
				+ * area starts at @vaddr.  Failures are reported through
				+ * xenbus_dev_error(); on an unmap failure the area is not freed.
				+ */
				+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
				+{
				+	struct gnttab_unmap_grant_ref unmap_op = {
				+		.host_addr = (unsigned long)vaddr,
				+	};
				+	struct vm_struct *vm;
				+
				+	/* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
				+	 * method so that we don't have to muck with vmalloc internals here.
				+	 * We could force the user to hang on to their struct vm_struct from
				+	 * xenbus_map_ring_valloc, but this short walk considerably simplifies
				+	 * this API.
				+	 */
				+	read_lock(&vmlist_lock);
				+	vm = vmlist;
				+	while (vm != NULL && vm->addr != vaddr)
				+		vm = vm->next;
				+	read_unlock(&vmlist_lock);
				+
				+	if (vm == NULL) {
				+		xenbus_dev_error(dev, -ENOENT,
				+				 "can't find mapped virtual address %p", vaddr);
				+		return GNTST_bad_virt_addr;
				+	}
				+
				+	/* The handle was stashed here by xenbus_map_ring_valloc(). */
				+	unmap_op.handle = (grant_handle_t)vm->phys_addr;
				+
				+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap_op, 1))
				+		BUG();
				+
				+	if (unmap_op.status != GNTST_okay)
				+		xenbus_dev_error(dev, unmap_op.status,
				+				 "unmapping page at handle %d error %d",
				+				 (int16_t)vm->phys_addr, unmap_op.status);
				+	else
				+		free_vm_area(vm);
				+
				+	return unmap_op.status;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_unmap_ring - unmap a granted page
				+ * @dev: xenbus device
				+ * @handle: grant handle
				+ * @vaddr: addr to unmap
				+ *
				+ * Unmap a page of memory in this domain that was imported from another
				+ * domain.  The virtual address space itself is not freed here.
				+ * Returns the GNTST_* status of the operation (see
				+ * xen/include/interface/grant_table.h); any status other than GNTST_okay
				+ * is also reported through xenbus_dev_error().
				+ */
				+int xenbus_unmap_ring(struct xenbus_device *dev,
				+		      grant_handle_t handle, void *vaddr)
				+{
				+	struct gnttab_unmap_grant_ref unmap_op = {
				+		.host_addr = (unsigned long)vaddr,
				+		.handle    = handle,
				+	};
				+
				+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap_op, 1))
				+		BUG();
				+
				+	if (unmap_op.status == GNTST_okay)
				+		return unmap_op.status;
				+
				+	xenbus_dev_error(dev, unmap_op.status,
				+			 "unmapping page at handle %d error %d",
				+			 handle, unmap_op.status);
				+	return unmap_op.status;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
			
 
				+
			
 
				+
			
 
				+/**
				+ * xenbus_read_driver_state - read a driver's state from the store
				+ * @path: path for driver
				+ *
				+ * Reads the "state" key under @path.  Returns the state found there, or
				+ * XenbusStateUnknown if xenbus_gather() reports an error.
				+ */
				+enum xenbus_state xenbus_read_driver_state(const char *path)
				+{
				+	enum xenbus_state state;
				+
				+	if (xenbus_gather(XBT_NIL, path, "state", "%d", &state, NULL))
				+		return XenbusStateUnknown;
				+
				+	return state;
				+}
				+EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
			
--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -0,0 +1,233 @@
 
				+/******************************************************************************
			
 
				+ * xenbus_comms.c
			
 
				+ *
			
 
				+ * Low level code to talk to Xen Store: ringbuffer and event channel.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/wait.h>
			
 
				+#include <linux/interrupt.h>
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/err.h>
			
 
				+#include <xen/xenbus.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/page.h>
			
 
				+#include "xenbus_comms.h"
			
 
				+
			
 
				+static int xenbus_irq;
			
 
				+
			
 
				+static DECLARE_WORK(probe_work, xenbus_probe);
			
 
				+
			
 
				+static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
			
 
				+
			
 
				+/*
				+ * Interrupt handler for the store event channel.
				+ *
				+ * The first time it runs while xenstored_ready is still clear, it sets
				+ * the flag and schedules probe_work.  Every invocation wakes the
				+ * sleepers on xb_waitq.  Neither argument is used.
				+ */
				+static irqreturn_t wake_waiting(int irq, void *unused)
				+{
				+	if (unlikely(!xenstored_ready)) {
				+		xenstored_ready = 1;
				+		schedule_work(&probe_work);
				+	}
				+
				+	wake_up(&xb_waitq);
				+
				+	return IRQ_HANDLED;
				+}
			
 
				+
			
 
				+/*
				+ * Sanity-check a consumer/producer index pair: returns non-zero when the
				+ * distance prod - cons does not exceed XENSTORE_RING_SIZE, zero otherwise.
				+ * NOTE(review): presumably relies on XENSTORE_RING_IDX being an unsigned
				+ * type so the subtraction wraps -- confirm against its definition.
				+ */
				+static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
				+{
				+	return ((prod - cons) <= XENSTORE_RING_SIZE);
				+}
			
 
				+
			
 
				+/*
				+ * Locate the next contiguous region of @buf that may be written.
				+ *
				+ * Returns the address in @buf at the masked producer index and stores in
				+ * *@len the number of bytes usable from there: the distance to the end of
				+ * the ring, capped at the free space XENSTORE_RING_SIZE - (prod - cons).
				+ * *@len may come back as 0.
				+ */
				+static void *get_output_chunk(XENSTORE_RING_IDX cons,
				+			      XENSTORE_RING_IDX prod,
				+			      char *buf, uint32_t *len)
				+{
				+	/* Bytes between the producer position and the end of the ring. */
				+	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
				+	/* Never offer more than the ring has free. */
				+	if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
				+		*len = XENSTORE_RING_SIZE - (prod - cons);
				+	return buf + MASK_XENSTORE_IDX(prod);
				+}
			
 
				+
			
 
				+/*
				+ * Locate the next contiguous region of @buf that may be read.
				+ *
				+ * Returns the address in @buf at the masked consumer index and stores in
				+ * *@len the number of bytes readable from there: the distance to the end
				+ * of the ring, capped at the pending amount prod - cons.  *@len may come
				+ * back as 0.
				+ */
				+static const void *get_input_chunk(XENSTORE_RING_IDX cons,
				+				   XENSTORE_RING_IDX prod,
				+				   const char *buf, uint32_t *len)
				+{
				+	/* Bytes between the consumer position and the end of the ring. */
				+	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
				+	/* Never offer more than has been produced. */
				+	if ((prod - cons) < *len)
				+		*len = prod - cons;
				+	return buf + MASK_XENSTORE_IDX(cons);
				+}
			
 
				+
			
 
				+/**
				+ * xb_write - low level write
				+ * @data: buffer to send
				+ * @len: length of buffer
				+ *
				+ * Copies @len bytes from @data into the request ring, sleeping
				+ * interruptibly whenever the ring is full and notifying the other end
				+ * after each chunk.
				+ *
				+ * Returns 0 once everything has been written.  Returns the negative value
				+ * from wait_event_interruptible() if the wait is interrupted, or -EIO
				+ * (after resetting both request indexes to 0) if the ring indexes fail
				+ * check_indexes().  In both error cases part of @data may already have
				+ * been written.
				+ */
				+int xb_write(const void *data, unsigned len)
				+{
				+	struct xenstore_domain_interface *intf = xen_store_interface;
				+	XENSTORE_RING_IDX cons, prod;
				+	int rc;
				+
				+	while (len != 0) {
				+		void *dst;
				+		unsigned int avail;
				+
				+		/* Sleep until the request ring is no longer full. */
				+		rc = wait_event_interruptible(
				+			xb_waitq,
				+			(intf->req_prod - intf->req_cons) !=
				+			XENSTORE_RING_SIZE);
				+		if (rc < 0)
				+			return rc;
				+
				+		/* Read indexes, then verify. */
				+		cons = intf->req_cons;
				+		prod = intf->req_prod;
				+		if (!check_indexes(cons, prod)) {
				+			intf->req_cons = intf->req_prod = 0;
				+			return -EIO;
				+		}
				+
				+		dst = get_output_chunk(cons, prod, intf->req, &avail);
				+		/* No room in this snapshot of the indexes: wait again. */
				+		if (avail == 0)
				+			continue;
				+		if (avail > len)
				+			avail = len;
				+
				+		/* Must write data /after/ reading the consumer index. */
				+		mb();
				+
				+		memcpy(dst, data, avail);
				+		data += avail;
				+		len -= avail;
				+
				+		/* Other side must not see new producer until data is there. */
				+		wmb();
				+		intf->req_prod += avail;
				+
				+		/* Implies mb(): other side will see the updated producer. */
				+		notify_remote_via_evtchn(xen_store_evtchn);
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Returns non-zero when the response ring's consumer and producer indexes
				+ * differ, i.e. there is response data that has not been consumed yet.
				+ */
				+int xb_data_to_read(void)
				+{
				+	struct xenstore_domain_interface *intf = xen_store_interface;
				+	return (intf->rsp_cons != intf->rsp_prod);
				+}
			
 
				+
			
 
				+/*
				+ * Sleep interruptibly on xb_waitq until xb_data_to_read() is true.
				+ * Returns the result of wait_event_interruptible().
				+ */
				+int xb_wait_for_data_to_read(void)
				+{
				+	return wait_event_interruptible(xb_waitq, xb_data_to_read());
				+}
			
 
				+
			
 
				+/**
				+ * xb_read - low level read
				+ * @data: buffer to fill
				+ * @len: number of bytes to read
				+ *
				+ * Copies @len bytes from the response ring into @data, sleeping
				+ * interruptibly whenever the ring is empty and notifying the other end
				+ * after each chunk.
				+ *
				+ * Returns 0 once everything has been read.  Returns the negative value
				+ * from xb_wait_for_data_to_read() if the wait is interrupted, or -EIO
				+ * (after resetting both response indexes to 0) if the ring indexes fail
				+ * check_indexes().  In both error cases part of @data may already have
				+ * been filled.
				+ */
				+int xb_read(void *data, unsigned len)
				+{
				+	struct xenstore_domain_interface *intf = xen_store_interface;
				+	XENSTORE_RING_IDX cons, prod;
				+	int rc;
				+
				+	while (len != 0) {
				+		unsigned int avail;
				+		const char *src;
				+
				+		rc = xb_wait_for_data_to_read();
				+		if (rc < 0)
				+			return rc;
				+
				+		/* Read indexes, then verify. */
				+		cons = intf->rsp_cons;
				+		prod = intf->rsp_prod;
				+		if (!check_indexes(cons, prod)) {
				+			intf->rsp_cons = intf->rsp_prod = 0;
				+			return -EIO;
				+		}
				+
				+		src = get_input_chunk(cons, prod, intf->rsp, &avail);
				+		/* Nothing readable in this snapshot of the indexes: wait again. */
				+		if (avail == 0)
				+			continue;
				+		if (avail > len)
				+			avail = len;
				+
				+		/* Must read data /after/ reading the producer index. */
				+		rmb();
				+
				+		memcpy(data, src, avail);
				+		data += avail;
				+		len -= avail;
				+
				+		/* Other side must not see free space until we've copied out */
				+		mb();
				+		intf->rsp_cons += avail;
				+
				+		pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
				+
				+		/* Implies mb(): other side will see the updated consumer. */
				+		notify_remote_via_evtchn(xen_store_evtchn);
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/**
				+ * xb_init_comms - Set up interrupt handler off store event channel.
				+ *
				+ * Checks that both rings are quiescent: a non-empty request ring is only
				+ * logged, while a non-empty response ring is logged and then discarded by
				+ * advancing the consumer index to the producer index.  Any handler bound
				+ * by an earlier call is then unbound, and wake_waiting() is bound to
				+ * xen_store_evtchn.
				+ *
				+ * Returns 0 on success.  If the bind does not yield a positive irq, the
				+ * value from bind_evtchn_to_irqhandler() is returned and no irq is
				+ * recorded.
				+ * NOTE(review): a bind result of exactly 0 is treated as failure but is
				+ * returned as 0 -- confirm whether the bind can ever return 0.
				+ */
				+int xb_init_comms(void)
				+{
				+	struct xenstore_domain_interface *intf = xen_store_interface;
				+	int err;
				+
				+	if (intf->req_prod != intf->req_cons)
				+		printk(KERN_ERR "XENBUS request ring is not quiescent "
				+		       "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
				+
				+	if (intf->rsp_prod != intf->rsp_cons) {
				+		printk(KERN_WARNING "XENBUS response ring is not quiescent "
				+		       "(%08x:%08x): fixing up\n",
				+		       intf->rsp_cons, intf->rsp_prod);
				+		intf->rsp_cons = intf->rsp_prod;
				+	}
				+
				+	if (xenbus_irq) {
				+		unbind_from_irqhandler(xenbus_irq, &xb_waitq);
				+		/*
				+		 * Forget the old irq right away: if the bind below fails,
				+		 * a later call must not unbind it a second time.
				+		 */
				+		xenbus_irq = 0;
				+	}
				+
				+	err = bind_evtchn_to_irqhandler(
				+		xen_store_evtchn, wake_waiting,
				+		0, "xenbus", &xb_waitq);
				+	if (err <= 0) {
				+		printk(KERN_ERR "XENBUS request irq failed %i\n", err);
				+		return err;
				+	}
				+
				+	xenbus_irq = err;
				+
				+	return 0;
				+}
			
--- a/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -0,0 +1,46 @@
 
				+/*
			
 
				+ * Private include for xenbus communications.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#ifndef _XENBUS_COMMS_H
			
 
				+#define _XENBUS_COMMS_H
			
 
				+
			
 
				+int xs_init(void);		/* called from xenbus_probe_init() to set up xenstore comms */
				+int xb_init_comms(void);	/* (re)binds the store event channel handler; 0 on success */
				+
				+/* Low level routines.
				+ * NOTE(review): presumably raw byte transfer over the shared xenstore
				+ * ring; only part of the implementation is visible from here, so
				+ * confirm the exact contracts against xenbus_comms.c. */
				+int xb_write(const void *data, unsigned len);
				+int xb_read(void *data, unsigned len);
				+int xb_data_to_read(void);
				+int xb_wait_for_data_to_read(void);
				+int xs_input_avail(void);
				+extern struct xenstore_domain_interface *xen_store_interface;	/* defined in xenbus_probe.c */
				+extern int xen_store_evtchn;	/* defined in xenbus_probe.c */
			
 
				+
			
 
				+#endif /* _XENBUS_COMMS_H */
			
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -0,0 +1,935 @@
 
				+/******************************************************************************
			
 
				+ * Talks to Xen Store to figure out what devices we have.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
			
 
				+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
			
 
				+ * Copyright (C) 2005, 2006 XenSource Ltd
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#define DPRINTK(fmt, args...)				\
				+	pr_debug("xenbus_probe (%s:%d) " fmt ".\n",	\
				+		 __func__, __LINE__, ##args)	/* debug trace tagged with caller and line; appends ".\n" to fmt */
			
 
				+
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/err.h>
			
 
				+#include <linux/string.h>
			
 
				+#include <linux/ctype.h>
			
 
				+#include <linux/fcntl.h>
			
 
				+#include <linux/mm.h>
			
 
				+#include <linux/notifier.h>
			
 
				+#include <linux/kthread.h>
			
 
				+#include <linux/mutex.h>
			
 
				+#include <linux/io.h>
			
 
				+
			
 
				+#include <asm/page.h>
			
 
				+#include <asm/pgtable.h>
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <xen/xenbus.h>
			
 
				+#include <xen/events.h>
			
 
				+#include <xen/page.h>
			
 
				+
			
 
				+#include "xenbus_comms.h"
			
 
				+#include "xenbus_probe.h"
			
 
				+
			
 
				+int xen_store_evtchn;	/* copied from xen_start_info->store_evtchn in xenbus_probe_init() */
				+struct xenstore_domain_interface *xen_store_interface;	/* mfn_to_virt(xen_store_mfn), set in xenbus_probe_init() */
				+static unsigned long xen_store_mfn;	/* copied from xen_start_info->store_mfn for non-initial domains */
				+
				+static BLOCKING_NOTIFIER_HEAD(xenstore_chain);	/* called by xenbus_probe() after device enumeration */
				+
				+static void wait_for_devices(struct xenbus_driver *xendrv);	/* defined near the end of this file */
				+
				+static int xenbus_probe_frontend(const char *type, const char *name);	/* needed by the xenbus_frontend initializer */
				+
				+static void xenbus_dev_shutdown(struct device *_dev);	/* needed by the xenbus_frontend initializer */
			
 
				+
			
 
				+/*
				+ * Scan the id table 'arr', which ends at the first entry whose devicetype
				+ * is the empty string, and return the first entry whose devicetype equals
				+ * dev->devicetype.  Returns NULL when nothing matches.
				+ */
				+static const struct xenbus_device_id *
				+match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
				+{
				+	const struct xenbus_device_id *id = arr;
				+
				+	while (id->devicetype[0] != '\0') {
				+		if (strcmp(id->devicetype, dev->devicetype) == 0)
				+			return id;
				+		id++;
				+	}
				+	return NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Bus match callback: returns 1 when the driver's id table contains the
				+ * device's type, 0 otherwise (including when the driver has no id table).
				+ */
				+int xenbus_match(struct device *_dev, struct device_driver *_drv)
				+{
				+	struct xenbus_driver *xdrv = to_xenbus_driver(_drv);
				+	struct xenbus_device *xdev;
				+
				+	if (xdrv->ids == NULL)
				+		return 0;
				+
				+	xdev = to_xenbus_device(_dev);
				+	return match_device(xdrv->ids, xdev) ? 1 : 0;
				+}
			
 
				+
			
 
				+/*
				+ * device/<type>/<id> => <type>-<id>
				+ *
				+ * Writes the bus id derived from 'nodename' into 'bus_id' (BUS_ID_SIZE
				+ * bytes).  Returns 0 on success, or -EINVAL when nodename has no '/',
				+ * when the part after the first '/' does not fit, or when that part
				+ * contains no further '/'.
				+ */
				+static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
				+{
				+	/* Keep nodename intact so the warning can show the full path. */
				+	const char *rest = strchr(nodename, '/');
				+	char *sep;
				+
				+	if (!rest || strlen(rest + 1) >= BUS_ID_SIZE) {
				+		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
				+		return -EINVAL;
				+	}
				+
				+	strlcpy(bus_id, rest + 1, BUS_ID_SIZE);
				+	sep = strchr(bus_id, '/');
				+	if (!sep) {
				+		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
				+		return -EINVAL;
				+	}
				+	/* Only the first separator is rewritten. */
				+	*sep = '-';
				+	return 0;
				+}
			
 
				+
			
 
				+
			
 
				+/* Release the cached other-end path and leave the field NULL. */
				+static void free_otherend_details(struct xenbus_device *dev)
				+{
				+	const void *old = dev->otherend;
				+
				+	dev->otherend = NULL;
				+	kfree(old);
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Unregister the other-end watch, if one is set up (node non-NULL), then
				+ * free its node string and clear the pointer.
				+ */
				+static void free_otherend_watch(struct xenbus_device *dev)
				+{
				+	struct xenbus_watch *watch = &dev->otherend_watch;
				+
				+	if (!watch->node)
				+		return;
				+
				+	unregister_xenbus_watch(watch);
				+	kfree(watch->node);
				+	watch->node = NULL;
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Read the peer's domain id (from 'id_node') and xenstore path (from
				+ * 'path_node') below xendev->nodename into xendev->otherend_id and
				+ * xendev->otherend.
				+ *
				+ * Returns 0 on success.  A failing xenbus_gather() is reported through
				+ * xenbus_dev_fatal() and its error returned; an empty or non-existent
				+ * peer path is reported likewise, the path is freed and -ENOENT returned.
				+ */
				+int read_otherend_details(struct xenbus_device *xendev,
				+				 char *id_node, char *path_node)
				+{
				+	int rc;
				+
				+	rc = xenbus_gather(XBT_NIL, xendev->nodename,
				+			   id_node, "%i", &xendev->otherend_id,
				+			   path_node, NULL, &xendev->otherend,
				+			   NULL);
				+	if (rc != 0) {
				+		xenbus_dev_fatal(xendev, rc,
				+				 "reading other end details from %s",
				+				 xendev->nodename);
				+		return rc;
				+	}
				+
				+	/* A usable peer path is non-empty and present in the store. */
				+	if (xendev->otherend[0] != '\0' &&
				+	    xenbus_exists(XBT_NIL, xendev->otherend, ""))
				+		return 0;
				+
				+	xenbus_dev_fatal(xendev, -ENOENT,
				+			 "unable to read other end from %s.  "
				+			 "missing or inaccessible.",
				+			 xendev->nodename);
				+	free_otherend_details(xendev);
				+	return -ENOENT;
				+}
			
 
				+
			
 
				+
			
 
				+/* Frontend flavour of read_otherend_details(): the peer is the backend. */
				+static int read_backend_details(struct xenbus_device *xendev)
				+{
				+	int rc = read_otherend_details(xendev, "backend-id", "backend");
				+
				+	return rc;
				+}
			
 
				+
			
 
				+
			
 
				+/* Bus type for frontend drivers.
				+ * Frontend devices are enumerated below the "device" root;
				+ * .get_bus_id and .probe are the frontend-specific hooks from this file,
				+ * while .bus carries the generic driver-core callbacks. */
				+static struct xen_bus_type xenbus_frontend = {
				+	.root = "device",
				+	.levels = 2, 		/* device/type/<id> */
				+	.get_bus_id = frontend_bus_id,
				+	.probe = xenbus_probe_frontend,
				+	.bus = {
				+		.name     = "xen",
				+		.match    = xenbus_match,
				+		.probe    = xenbus_dev_probe,
				+		.remove   = xenbus_dev_remove,
				+		.shutdown = xenbus_dev_shutdown,
				+	},
				+};
			
 
				+
			
 
				+/*
				+ * Watch callback for the peer's "state" node.
				+ *
				+ * Ignores events whose path does not start with the current peer path,
				+ * reads the peer's state and hands it to the driver's otherend_changed
				+ * hook.  Once system_state is past SYSTEM_RUNNING the driver is not
				+ * called; a frontend device whose peer is Closing is moved on via
				+ * xenbus_frontend_closed() instead.
				+ */
				+static void otherend_changed(struct xenbus_watch *watch,
				+			     const char **vec, unsigned int len)
				+{
				+	struct xenbus_device *dev =
				+		container_of(watch, struct xenbus_device, otherend_watch);
				+	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
				+	enum xenbus_state state;
				+
				+	/* Protect us against watches firing on old details when the otherend
				+	   details change, say immediately after a resume. */
				+	if (!dev->otherend ||
				+	    strncmp(dev->otherend, vec[XS_WATCH_PATH],
				+		    strlen(dev->otherend))) {
				+		dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
				+		return;
				+	}
				+
				+	state = xenbus_read_driver_state(dev->otherend);
				+
				+	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s",
				+		state, xenbus_strstate(state), dev->otherend_watch.node,
				+		vec[XS_WATCH_PATH]);
				+
				+	/*
				+	 * Ignore xenbus transitions during shutdown. This prevents us doing
				+	 * work that can fail e.g., when the rootfs is gone.
				+	 */
				+	if (system_state > SYSTEM_RUNNING) {
				+		/* Initialised directly; no self-assignment placeholder. */
				+		struct xen_bus_type *xbus =
				+			container_of(dev->dev.bus, struct xen_bus_type, bus);
				+
				+		/* If we're frontend, drive the state machine to Closed. */
				+		/* This should cause the backend to release our resources. */
				+		if ((xbus == &xenbus_frontend) && (state == XenbusStateClosing))
				+			xenbus_frontend_closed(dev);
				+		return;
				+	}
				+
				+	if (drv->otherend_changed)
				+		drv->otherend_changed(dev, state);
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Drop any existing peer watch and peer path, then ask the driver's
				+ * read_otherend_details hook to look the peer up again.  Returns that
				+ * hook's result.
				+ */
				+static int talk_to_otherend(struct xenbus_device *dev)
				+{
				+	struct xenbus_driver *xdrv;
				+
				+	/* Discard whatever was recorded about the previous peer. */
				+	free_otherend_watch(dev);
				+	free_otherend_details(dev);
				+
				+	xdrv = to_xenbus_driver(dev->dev.driver);
				+	return xdrv->read_otherend_details(dev);
				+}
			
 
				+
			
 
				+
			
 
				+/* Install otherend_changed() as a watch on "<otherend>/state". */
				+static int watch_otherend(struct xenbus_device *dev)
				+{
				+	int rc;
				+
				+	rc = xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
				+				  "%s/%s", dev->otherend, "state");
				+	return rc;
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Bus probe callback.
				+ *
				+ * Looks up the peer, runs the driver's probe hook and then starts
				+ * watching the peer's state.  A missing probe hook, a device type that is
				+ * not in the driver's id table, or a failing probe hook is reported via
				+ * xenbus_dev_error(), switches the device to Closed and yields -ENODEV.
				+ * Failures of talk_to_otherend() or watch_otherend() are only warned
				+ * about and returned unchanged.
				+ */
				+int xenbus_dev_probe(struct device *_dev)
				+{
				+	struct xenbus_device *xdev = to_xenbus_device(_dev);
				+	struct xenbus_driver *xdrv = to_xenbus_driver(_dev->driver);
				+	const struct xenbus_device_id *id;
				+	int err = -ENODEV;
				+
				+	DPRINTK("%s", xdev->nodename);
				+
				+	if (!xdrv->probe)
				+		goto fail;
				+
				+	id = match_device(xdrv->ids, xdev);
				+	if (!id)
				+		goto fail;
				+
				+	err = talk_to_otherend(xdev);
				+	if (err) {
				+		dev_warn(&xdev->dev, "talk_to_otherend on %s failed.\n",
				+			 xdev->nodename);
				+		return err;
				+	}
				+
				+	err = xdrv->probe(xdev, id);
				+	if (err)
				+		goto fail;
				+
				+	err = watch_otherend(xdev);
				+	if (err)
				+		dev_warn(&xdev->dev, "watch_otherend on %s failed.\n",
				+			 xdev->nodename);
				+
				+	/* Zero on success, otherwise the watch_otherend() error. */
				+	return err;
				+
				+fail:
				+	xenbus_dev_error(xdev, err, "xenbus_dev_probe on %s", xdev->nodename);
				+	xenbus_switch_state(xdev, XenbusStateClosed);
				+	return -ENODEV;
				+}
			
 
				+
			
 
				+/*
				+ * Bus remove callback: drop the peer watch and path, run the driver's
				+ * remove hook if it has one, and switch the device to Closed.
				+ * Always returns 0.
				+ */
				+int xenbus_dev_remove(struct device *_dev)
				+{
				+	struct xenbus_device *xdev = to_xenbus_device(_dev);
				+	struct xenbus_driver *xdrv = to_xenbus_driver(_dev->driver);
				+
				+	DPRINTK("%s", xdev->nodename);
				+
				+	/* Peer bookkeeping goes first, before the driver tears down. */
				+	free_otherend_watch(xdev);
				+	free_otherend_details(xdev);
				+
				+	if (xdrv->remove != NULL)
				+		xdrv->remove(xdev);
				+
				+	xenbus_switch_state(xdev, XenbusStateClosed);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Bus shutdown callback.  A Connected device is switched to Closing and
				+ * given up to 5*HZ jiffies for its 'down' completion to fire; devices in
				+ * any other state are skipped with a log message.  A device reference is
				+ * held for the duration.
				+ */
				+static void xenbus_dev_shutdown(struct device *_dev)
				+{
				+	struct xenbus_device *dev = to_xenbus_device(_dev);
				+	unsigned long remaining;
				+
				+	DPRINTK("%s", dev->nodename);
				+
				+	get_device(&dev->dev);
				+	if (dev->state == XenbusStateConnected) {
				+		xenbus_switch_state(dev, XenbusStateClosing);
				+		remaining = wait_for_completion_timeout(&dev->down, 5*HZ);
				+		if (remaining == 0)
				+			printk(KERN_INFO "%s: %s timeout closing device\n",
				+			       __func__, dev->nodename);
				+	} else {
				+		printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
				+		       dev->nodename, xenbus_strstate(dev->state));
				+	}
				+	put_device(&dev->dev);
				+}
			
 
				+
			
 
				+/*
				+ * Fill in the embedded struct device_driver (name, bus, owner, module
				+ * name) and register it with the driver core.  Returns the result of
				+ * driver_register().
				+ */
				+int xenbus_register_driver_common(struct xenbus_driver *drv,
				+				  struct xen_bus_type *bus,
				+				  struct module *owner,
				+				  const char *mod_name)
				+{
				+	struct device_driver *core = &drv->driver;
				+
				+	core->name = drv->name;
				+	core->bus = &bus->bus;
				+	core->owner = owner;
				+	core->mod_name = mod_name;
				+
				+	return driver_register(core);
				+}
			
 
				+
			
 
				+/*
				+ * Register 'drv' on the frontend bus.  The driver's peer lookup is set to
				+ * read_backend_details().  Returns 0 on success or the registration
				+ * error.
				+ */
				+int __xenbus_register_frontend(struct xenbus_driver *drv,
				+			       struct module *owner, const char *mod_name)
				+{
				+	int ret;
				+
				+	drv->read_otherend_details = read_backend_details;
				+
				+	ret = xenbus_register_driver_common(drv, &xenbus_frontend,
				+					    owner, mod_name);
				+	if (ret == 0) {
				+		/* If this driver is loaded as a module wait for devices to attach. */
				+		wait_for_devices(drv);
				+	}
				+
				+	return ret;
				+}
				+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
			
 
				+
			
 
				+/* Remove the driver's embedded struct device_driver from the driver core. */
				+void xenbus_unregister_driver(struct xenbus_driver *drv)
				+{
				+	struct device_driver *core = &drv->driver;
				+
				+	driver_unregister(core);
				+}
				+EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
			
 
				+
			
 
				+struct xb_find_info	/* in/out cookie for the bus_for_each_dev() callbacks cmp_dev() and cleanup_dev() */
				+{
				+	struct xenbus_device *dev;	/* out: matching device (a reference is taken), or NULL */
				+	const char *nodename;		/* in: xenstore path to look for */
				+};
			
 
				+
			
 
				+/*
				+ * bus_for_each_dev() callback: when the device's nodename equals
				+ * info->nodename, store it in info->dev with an extra reference and
				+ * return 1; otherwise return 0.
				+ */
				+static int cmp_dev(struct device *dev, void *data)
				+{
				+	struct xb_find_info *info = data;
				+	struct xenbus_device *candidate = to_xenbus_device(dev);
				+
				+	if (strcmp(candidate->nodename, info->nodename) != 0)
				+		return 0;
				+
				+	info->dev = candidate;
				+	get_device(dev);
				+	return 1;
				+}
			
 
				+
			
 
				+/*
				+ * Look up the device on 'bus' whose nodename is exactly 'nodename'.
				+ * Returns it with a reference held (taken in cmp_dev(); the caller must
				+ * put_device() it), or NULL if there is none.
				+ */
				+struct xenbus_device *xenbus_device_find(const char *nodename,
				+					 struct bus_type *bus)
				+{
				+	struct xb_find_info info;
				+
				+	info.dev = NULL;
				+	info.nodename = nodename;
				+	bus_for_each_dev(bus, NULL, &info, cmp_dev);
				+
				+	return info.dev;
				+}
			
 
				+
			
 
				+/*
				+ * bus_for_each_dev() callback: select a device whose nodename is
				+ * info->nodename itself or lies below it ("<nodename>/...").  On a match
				+ * the device is stored in info->dev with an extra reference and 1 is
				+ * returned; otherwise 0.
				+ */
				+static int cleanup_dev(struct device *dev, void *data)
				+{
				+	struct xenbus_device *xendev = to_xenbus_device(dev);
				+	struct xb_find_info *info = data;
				+	size_t len = strlen(info->nodename);	/* size_t: no signed/unsigned mixing */
				+
				+	DPRINTK("%s", info->nodename);
				+
				+	/* Match the info->nodename path, or any subdirectory of that path. */
				+	if (strncmp(xendev->nodename, info->nodename, len))
				+		return 0;
				+
				+	/*
				+	 * The first len characters are equal, so index len is in bounds:
				+	 * '\0' there means an exact match; anything else must be '/' for
				+	 * the device to really be a subdirectory.
				+	 */
				+	if (xendev->nodename[len] != '\0' && xendev->nodename[len] != '/')
				+		return 0;
				+
				+	info->dev = xendev;
				+	get_device(dev);
				+	return 1;
				+}
			
 
				+
			
 
				+/*
				+ * Unregister every device on 'bus' located at or below 'path'.  The bus
				+ * is rescanned from the start after each removal until cleanup_dev()
				+ * finds nothing more.
				+ */
				+static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
				+{
				+	struct xb_find_info info = { .nodename = path };
				+
				+	for (;;) {
				+		info.dev = NULL;
				+		bus_for_each_dev(bus, NULL, &info, cleanup_dev);
				+		if (!info.dev)
				+			break;
				+
				+		device_unregister(&info.dev->dev);
				+		/* Drop the reference taken by cleanup_dev(). */
				+		put_device(&info.dev->dev);
				+	}
				+}
			
 
				+
			
 
				+/* Device release callback: free the enclosing xenbus_device. */
				+static void xenbus_dev_release(struct device *dev)
				+{
				+	if (!dev)
				+		return;
				+
				+	kfree(to_xenbus_device(dev));
				+}
			
 
				+
			
 
				+/* Show handler for the read-only "nodename" attribute: path plus newline. */
				+static ssize_t xendev_show_nodename(struct device *dev,
				+				    struct device_attribute *attr, char *buf)
				+{
				+	struct xenbus_device *xendev = to_xenbus_device(dev);
				+
				+	return sprintf(buf, "%s\n", xendev->nodename);
				+}
				+DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
			
 
				+
			
 
				+/* Show handler for the read-only "devtype" attribute: type plus newline. */
				+static ssize_t xendev_show_devtype(struct device *dev,
				+				   struct device_attribute *attr, char *buf)
				+{
				+	struct xenbus_device *xendev = to_xenbus_device(dev);
				+
				+	return sprintf(buf, "%s\n", xendev->devicetype);
				+}
				+DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
			
 
				+
			
 
				+
			
 
				+/*
				+ * Create and register a xenbus_device for the xenstore node 'nodename'
				+ * of device type 'type' on 'bus'.
				+ *
				+ * Nodes whose state is not Initialising are ignored (returns 0).  The
				+ * nodename and type strings are copied into the same allocation as the
				+ * device.  Returns 0 on success, -ENOMEM if the allocation fails, or the
				+ * error from get_bus_id / device_register / device_create_file.
				+ *
				+ * Once device_register() has been called the structure is owned by the
				+ * driver core and is freed only through xenbus_dev_release(); the error
				+ * paths below must therefore drop a reference instead of calling kfree().
				+ */
				+int xenbus_probe_node(struct xen_bus_type *bus,
				+		      const char *type,
				+		      const char *nodename)
				+{
				+	int err;
				+	struct xenbus_device *xendev;
				+	size_t stringlen;
				+	char *tmpstring;
				+
				+	enum xenbus_state state = xenbus_read_driver_state(nodename);
				+
				+	if (state != XenbusStateInitialising) {
				+		/* Device is not new, so ignore it.  This can happen if a
				+		   device is going away after switching to Closed.  */
				+		return 0;
				+	}
				+
				+	stringlen = strlen(nodename) + 1 + strlen(type) + 1;
				+	xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
				+	if (!xendev)
				+		return -ENOMEM;
				+
				+	xendev->state = XenbusStateInitialising;
				+
				+	/* Copy the strings into the extra space. */
				+
				+	tmpstring = (char *)(xendev + 1);
				+	strcpy(tmpstring, nodename);
				+	xendev->nodename = tmpstring;
				+
				+	tmpstring += strlen(tmpstring) + 1;
				+	strcpy(tmpstring, type);
				+	xendev->devicetype = tmpstring;
				+	init_completion(&xendev->down);
				+
				+	xendev->dev.bus = &bus->bus;
				+	xendev->dev.release = xenbus_dev_release;
				+
				+	err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
				+	if (err) {
				+		/* Never handed to the driver core: a plain kfree is right. */
				+		kfree(xendev);
				+		return err;
				+	}
				+
				+	/* Register with generic device framework. */
				+	err = device_register(&xendev->dev);
				+	if (err) {
				+		/* Drop the initial reference; the release callback frees. */
				+		put_device(&xendev->dev);
				+		return err;
				+	}
				+
				+	err = device_create_file(&xendev->dev, &dev_attr_nodename);
				+	if (err)
				+		goto fail_unregister;
				+
				+	err = device_create_file(&xendev->dev, &dev_attr_devtype);
				+	if (err)
				+		goto fail_remove_file;
				+
				+	return 0;
				+fail_remove_file:
				+	device_remove_file(&xendev->dev, &dev_attr_nodename);
				+fail_unregister:
				+	/*
				+	 * device_unregister() drops the last reference, which frees xendev
				+	 * via xenbus_dev_release(); freeing it again here would be a
				+	 * double free.
				+	 */
				+	device_unregister(&xendev->dev);
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Probe hook of the frontend bus: build "<root>/<type>/<name>"
				+ * (device/<typename>/<name>) and probe that node.  Returns -ENOMEM if the
				+ * path cannot be allocated, otherwise the result of xenbus_probe_node().
				+ */
				+static int xenbus_probe_frontend(const char *type, const char *name)
				+{
				+	int err;
				+	char *path = kasprintf(GFP_KERNEL, "%s/%s/%s",
				+			       xenbus_frontend.root, type, name);
				+
				+	if (path == NULL)
				+		return -ENOMEM;
				+
				+	DPRINTK("%s", path);
				+
				+	err = xenbus_probe_node(&xenbus_frontend, type, path);
				+	kfree(path);
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Probe every entry listed under "<bus->root>/<type>" with the bus's
				+ * probe hook, stopping at the first error.  Returns 0, the error from
				+ * xenbus_directory(), or the first probe error.
				+ */
				+static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
				+{
				+	int err = 0;
				+	char **dir;
				+	/* Unsigned index to match dir_n, as in xenbus_probe_devices(). */
				+	unsigned int i, dir_n = 0;
				+
				+	dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
				+	if (IS_ERR(dir))
				+		return PTR_ERR(dir);
				+
				+	for (i = 0; i < dir_n; i++) {
				+		err = bus->probe(type, dir[i]);
				+		if (err)
				+			break;
				+	}
				+	kfree(dir);
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Enumerate the device types listed under bus->root and probe each of
				+ * them, stopping at the first error.  Returns 0, the error from
				+ * xenbus_directory(), or the first per-type error.
				+ */
				+int xenbus_probe_devices(struct xen_bus_type *bus)
				+{
				+	char **types;
				+	unsigned int n_types, idx;
				+	int err = 0;
				+
				+	types = xenbus_directory(XBT_NIL, bus->root, "", &n_types);
				+	if (IS_ERR(types))
				+		return PTR_ERR(types);
				+
				+	for (idx = 0; idx < n_types && !err; idx++)
				+		err = xenbus_probe_device_type(bus, types[idx]);
				+
				+	kfree(types);
				+	return err;
				+}
			
 
				+
			
 
				+/* Count how many times character 'c' occurs in the string 'str'. */
				+static unsigned int char_count(const char *str, char c)
				+{
				+	unsigned int hits = 0;
				+	const char *p;
				+
				+	for (p = str; *p != '\0'; p++)
				+		hits += (*p == c);
				+	return hits;
				+}
			
 
				+
			
 
				+/*
				+ * Return the offset of the (len+1)-th occurrence of 'c' in 'str', i.e.
				+ * the length of the prefix holding exactly 'len' separators.  If the
				+ * string ends after exactly 'len' separators its full length is returned;
				+ * with fewer separators the result is -ERANGE.
				+ */
				+static int strsep_len(const char *str, char c, unsigned int len)
				+{
				+	const char *p;
				+
				+	for (p = str; *p != '\0'; p++) {
				+		if (*p != c)
				+			continue;
				+		if (len == 0)
				+			return (int)(p - str);
				+		len--;
				+	}
				+	return (len == 0) ? (int)(p - str) : -ERANGE;
				+}
			
 
				+
			
 
				+/*
				+ * React to a change at xenstore path 'node' on 'bus'.
				+ *
				+ * Paths with fewer than two '/' are ignored.  If the node no longer
				+ * exists, devices at or below it are unregistered.  Otherwise the device
				+ * root (the first bus->levels separators' worth of the path) is computed
				+ * and, unless a device for it already exists, probed.
				+ */
				+void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
				+{
				+	struct xenbus_device *existing;
				+	char type[BUS_ID_SIZE];
				+	const char *type_start;
				+	const char *root;
				+	int rootlen;
				+
				+	if (char_count(node, '/') < 2)
				+		return;
				+
				+	if (!xenbus_exists(XBT_NIL, node, "")) {
				+		xenbus_cleanup_devices(node, &bus->bus);
				+		return;
				+	}
				+
				+	/* backend/<type>/... or device/<type>/... */
				+	type_start = strchr(node, '/') + 1;
				+	snprintf(type, BUS_ID_SIZE, "%.*s",
				+		 (int)strcspn(type_start, "/"), type_start);
				+	type[BUS_ID_SIZE-1] = '\0';
				+
				+	rootlen = strsep_len(node, '/', bus->levels);
				+	if (rootlen < 0)
				+		return;
				+	root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
				+	if (!root)
				+		return;
				+
				+	existing = xenbus_device_find(root, &bus->bus);
				+	if (existing)
				+		put_device(&existing->dev);	/* known device: just drop the lookup ref */
				+	else
				+		xenbus_probe_node(bus, type, root);
				+
				+	kfree(root);
				+}
			
 
				+
			
 
				+/* Watch callback: forward the changed path to the frontend bus handler. */
				+static void frontend_changed(struct xenbus_watch *watch,
				+			     const char **vec, unsigned int len)
				+{
				+	const char *path = vec[XS_WATCH_PATH];
				+
				+	DPRINTK("");
				+
				+	xenbus_dev_changed(path, &xenbus_frontend);
				+}
			
 
				+
			
 
				+/* We watch for devices appearing and vanishing.
				+ * Changes under "device" invoke frontend_changed(); the watch is
				+ * registered from xenbus_probe(). */
				+static struct xenbus_watch fe_watch = {
				+	.node = "device",
				+	.callback = frontend_changed,
				+};
			
 
				+
			
 
				+/*
				+ * Per-device suspend step: call the bound driver's suspend hook, if any,
				+ * and log a failure.  Always returns 0, so one failing device does not
				+ * cut the bus walk short.
				+ */
				+static int suspend_dev(struct device *dev, void *data)
				+{
				+	struct xenbus_driver *drv;
				+	struct xenbus_device *xdev;
				+	int err;
				+
				+	DPRINTK("");
				+
				+	if (!dev->driver)
				+		return 0;
				+
				+	drv = to_xenbus_driver(dev->driver);
				+	if (!drv->suspend)
				+		return 0;
				+
				+	xdev = container_of(dev, struct xenbus_device, dev);
				+	err = drv->suspend(xdev);
				+	if (err)
				+		printk(KERN_WARNING
				+		       "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Per-device suspend-cancel step: call the bound driver's suspend_cancel
				+ * hook, if any, and log a failure.  Always returns 0.
				+ */
				+static int suspend_cancel_dev(struct device *dev, void *data)
				+{
				+	struct xenbus_driver *drv;
				+	struct xenbus_device *xdev;
				+	int err;
				+
				+	DPRINTK("");
				+
				+	if (!dev->driver)
				+		return 0;
				+
				+	drv = to_xenbus_driver(dev->driver);
				+	if (!drv->suspend_cancel)
				+		return 0;
				+
				+	xdev = container_of(dev, struct xenbus_device, dev);
				+	err = drv->suspend_cancel(xdev);
				+	if (err)
				+		printk(KERN_WARNING
				+		       "xenbus: suspend_cancel %s failed: %i\n",
				+		       dev->bus_id, err);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Per-device resume step for devices that have a driver: look the peer
				+ * up again, reset the local state to Initialising, run the driver's
				+ * resume hook (if any) and re-establish the peer state watch.
				+ * Returns 0 on success or the first error, which is also logged.
				+ */
				+static int resume_dev(struct device *dev, void *data)
				+{
				+	struct xenbus_driver *xdrv;
				+	struct xenbus_device *xendev;
				+	int rc;
				+
				+	DPRINTK("");
				+
				+	if (!dev->driver)
				+		return 0;
				+
				+	xdrv = to_xenbus_driver(dev->driver);
				+	xendev = container_of(dev, struct xenbus_device, dev);
				+
				+	rc = talk_to_otherend(xendev);
				+	if (rc) {
				+		printk(KERN_WARNING
				+		       "xenbus: resume (talk_to_otherend) %s failed: %i\n",
				+		       dev->bus_id, rc);
				+		return rc;
				+	}
				+
				+	xendev->state = XenbusStateInitialising;
				+
				+	if (xdrv->resume) {
				+		rc = xdrv->resume(xendev);
				+		if (rc) {
				+			printk(KERN_WARNING
				+			       "xenbus: resume %s failed: %i\n",
				+			       dev->bus_id, rc);
				+			return rc;
				+		}
				+	}
				+
				+	rc = watch_otherend(xendev);
				+	if (rc)
				+		printk(KERN_WARNING
				+		       "xenbus_probe: resume (watch_otherend) %s failed: "
				+		       "%d.\n", dev->bus_id, rc);
				+
				+	return rc;
				+}
			
 
				+
			
 
				+/*
				+ * Suspend entry point: run suspend_dev() over the frontend bus and the
				+ * backend, then suspend the xenstore layer.
				+ */
				+void xenbus_suspend(void)
				+{
				+	struct bus_type *fe_bus = &xenbus_frontend.bus;
				+
				+	DPRINTK("");
				+
				+	bus_for_each_dev(fe_bus, NULL, NULL, suspend_dev);
				+	xenbus_backend_suspend(suspend_dev);
				+	xs_suspend();
				+}
				+EXPORT_SYMBOL_GPL(xenbus_suspend);
			
 
				+
			
 
				+/*
				+ * Resume entry point: re-initialise the store comms, resume the xenstore
				+ * layer, then run resume_dev() over the frontend bus and the backend.
				+ */
				+void xenbus_resume(void)
				+{
				+	struct bus_type *fe_bus = &xenbus_frontend.bus;
				+
				+	xb_init_comms();
				+	xs_resume();
				+	bus_for_each_dev(fe_bus, NULL, NULL, resume_dev);
				+	xenbus_backend_resume(resume_dev);
				+}
				+EXPORT_SYMBOL_GPL(xenbus_resume);
			
 
				+
			
 
				+/*
				+ * Suspend-cancel entry point: cancel the xenstore suspend, then run
				+ * suspend_cancel_dev() over the frontend bus and, through
				+ * xenbus_backend_resume(), the backend.
				+ */
				+void xenbus_suspend_cancel(void)
				+{
				+	struct bus_type *fe_bus = &xenbus_frontend.bus;
				+
				+	xs_suspend_cancel();
				+	bus_for_each_dev(fe_bus, NULL, NULL, suspend_cancel_dev);
				+	xenbus_backend_resume(suspend_cancel_dev);
				+}
				+EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
			
 
				+
			
 
				+/* A flag to determine if xenstored is 'ready' (i.e. has started).
				+ * Set to 1 by xenbus_probe_init() for non-initial domains; while it is
				+ * > 0, register_xenstore_notifier() calls the notifier immediately. */
				+int xenstored_ready = 0;
			
 
				+
			
 
				+
			
 
				+/*
				+ * Ask to be told when xenstore is up.
				+ *
				+ * If xenstored is already flagged ready, the notifier is called at once
				+ * and its result returned.  Otherwise the block is queued on
				+ * xenstore_chain, which xenbus_probe() runs later; the result of the
				+ * chain registration is returned instead of being dropped.
				+ */
				+int register_xenstore_notifier(struct notifier_block *nb)
				+{
				+	if (xenstored_ready > 0)
				+		return nb->notifier_call(nb, 0, NULL);
				+
				+	return blocking_notifier_chain_register(&xenstore_chain, nb);
				+}
				+EXPORT_SYMBOL_GPL(register_xenstore_notifier);
			
 
				+
			
 
				+/* Take a notifier block off xenstore_chain again. */
				+void unregister_xenstore_notifier(struct notifier_block *nb)
				+{
				+	(void)blocking_notifier_chain_unregister(&xenstore_chain, nb);
				+}
				+EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
			
 
				+
			
 
				+/*
				+ * Enumerate the frontend devices in xenstore, start watching for
				+ * changes, let the backend do the same, and finally run the
				+ * xenstore_chain notifiers.  Must only run once xenstored_ready is set;
				+ * the argument is unused.
				+ */
				+void xenbus_probe(struct work_struct *unused)
				+{
				+	BUG_ON(xenstored_ready <= 0);
				+
				+	/* Enumerate devices in xenstore and watch for changes. */
				+	xenbus_probe_devices(&xenbus_frontend);
				+	register_xenbus_watch(&fe_watch);
				+	xenbus_backend_probe_and_watch();
				+
				+	/* Notify others that xenstore is up */
				+	blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
				+}
			
 
				+
			
 
				+/*
				+ * Initcall: register the frontend and backend buses, record the store
				+ * event channel and page from xen_start_info (non-initial domains only),
				+ * bring up the xenstore interface and run the first probe.
				+ *
				+ * Returns 0 on success, -ENODEV when not running on Xen, or the error of
				+ * the step that failed; buses registered so far are unregistered again.
				+ */
				+static int __init xenbus_probe_init(void)
				+{
				+	int err;
				+
				+	DPRINTK("");
				+
				+	if (!is_running_on_xen())
				+		return -ENODEV;
				+
				+	/* Register ourselves with the kernel bus subsystem */
				+	err = bus_register(&xenbus_frontend.bus);
				+	if (err)
				+		return err;
				+
				+	err = xenbus_backend_bus_register();
				+	if (err)
				+		goto out_unreg_front;
				+
				+	/*
				+	 * Domain0 doesn't have a store_evtchn or store_mfn yet
				+	 * (dom0 not yet supported), so only other domains fill these in.
				+	 */
				+	if (!is_initial_xendomain()) {
				+		xenstored_ready = 1;
				+		xen_store_evtchn = xen_start_info->store_evtchn;
				+		xen_store_mfn = xen_start_info->store_mfn;
				+	}
				+	xen_store_interface = mfn_to_virt(xen_store_mfn);
				+
				+	/* Initialize the interface to xenstore. */
				+	err = xs_init();
				+	if (err) {
				+		printk(KERN_WARNING
				+		       "XENBUS: Error initializing xenstore comms: %i\n", err);
				+		goto out_unreg_back;
				+	}
				+
				+	if (!is_initial_xendomain())
				+		xenbus_probe(NULL);
				+
				+	return 0;
				+
				+  out_unreg_back:
				+	xenbus_backend_bus_unregister();
				+
				+  out_unreg_front:
				+	bus_unregister(&xenbus_frontend.bus);
				+
				+	return err;
				+}
				+
				+postcore_initcall(xenbus_probe_init);
			
 
				+
			
 
				+MODULE_LICENSE("GPL");
			
 
				+
			
 
				+/*
				+ * bus_for_each_dev() callback: non-zero when 'dev' has a driver (the one
				+ * passed in 'data', if that is non-NULL) but is not yet Connected.
				+ */
				+static int is_disconnected_device(struct device *dev, void *data)
				+{
				+	struct device_driver *only = data;
				+
				+	/*
				+	 * A device with no driver will never connect. We care only about
				+	 * devices which should currently be in the process of connecting.
				+	 */
				+	if (!dev->driver)
				+		return 0;
				+
				+	/* Is this search limited to a particular driver? */
				+	if (only && dev->driver != only)
				+		return 0;
				+
				+	return to_xenbus_device(dev)->state != XenbusStateConnected;
				+}
			
 
				+
			
 
				+/*
				+ * Non-zero if some frontend device (optionally restricted to 'drv') is
				+ * still waiting to reach Connected.
				+ */
				+static int exists_disconnected_device(struct device_driver *drv)
				+{
				+	struct bus_type *fe_bus = &xenbus_frontend.bus;
				+
				+	return bus_for_each_dev(fe_bus, NULL, drv, is_disconnected_device);
				+}
			
 
				+
			
 
				+/*
				+ * bus_for_each_dev() callback: log devices without a driver and devices
				+ * that failed to reach Connected.  'data' optionally limits the report
				+ * to one driver.  Always returns 0.
				+ */
				+static int print_device_status(struct device *dev, void *data)
				+{
				+	struct xenbus_device *xendev = to_xenbus_device(dev);
				+	struct device_driver *only = data;
				+
				+	/* Is this operation limited to a particular driver? */
				+	if (only && dev->driver != only)
				+		return 0;
				+
				+	if (!dev->driver) {
				+		/* Information only: is this too noisy? */
				+		printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
				+		       xendev->nodename);
				+		return 0;
				+	}
				+
				+	if (xendev->state != XenbusStateConnected)
				+		printk(KERN_WARNING "XENBUS: Timeout connecting "
				+		       "to device: %s (state %d)\n",
				+		       xendev->nodename, xendev->state);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* We only wait for device setup after most initcalls have run.
				+ * Set to 1 by boot_wait_for_devices(); until then wait_for_devices()
				+ * returns immediately. */
				+static int ready_to_wait_for_devices;
			
 
				+
			
 
				+/*
				+ * Give all currently configured devices up to 10 seconds to connect.
				+ * This is what guarantees that the filesystems and / or network devices
				+ * needed for boot are available before the boot is allowed to proceed.
				+ *
				+ * It has to run from a late_initcall: after the frontend device drivers
				+ * have been initialised, but before the root fs is mounted.
				+ *
				+ * A possible improvement would be for the tools to add a per-device flag
				+ * to the store entry saying whether the device is needed at boot time.
				+ * People who know what they are doing could then speed up their boot
				+ * slightly, at the price of tools or manual intervention to set those
				+ * flags up correctly.
				+ *
				+ * 'xendrv' limits the wait (and the status report that follows it) to a
				+ * single driver; NULL means every frontend device.
				+ */
				+static void wait_for_devices(struct xenbus_driver *xendrv)
				+{
				+	unsigned long deadline = jiffies + 10*HZ;
				+	struct device_driver *drv = NULL;
				+
				+	if (xendrv)
				+		drv = &xendrv->driver;
				+
				+	if (!ready_to_wait_for_devices || !is_running_on_xen())
				+		return;
				+
				+	/* Poll in HZ/10 steps until everything connected or time is up. */
				+	while (exists_disconnected_device(drv) &&
				+	       !time_after(jiffies, deadline))
				+		schedule_timeout_interruptible(HZ/10);
				+
				+	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
				+			 print_device_status);
				+}
			
 
				+
			
 
				+#ifndef MODULE
				+/*
				+ * Late initcall (built-in only): from now on waiting is permitted, and
				+ * the first wait covers every frontend device.  Always returns 0.
				+ */
				+static int __init boot_wait_for_devices(void)
				+{
				+	ready_to_wait_for_devices = 1;
				+
				+	wait_for_devices(NULL);
				+
				+	return 0;
				+}
				+
				+late_initcall(boot_wait_for_devices);
				+#endif
			
--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -0,0 +1,74 @@
 
				+/******************************************************************************
			
 
				+ * xenbus_probe.h
			
 
				+ *
			
 
				+ * Talks to Xen Store to figure out what devices we have.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
			
 
				+ * Copyright (C) 2005 XenSource Ltd.
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#ifndef _XENBUS_PROBE_H
			
 
				+#define _XENBUS_PROBE_H
			
 
				+
			
 
				+#ifdef CONFIG_XEN_BACKEND
			
 
				+extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
			
 
				+extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
			
 
				+extern void xenbus_backend_probe_and_watch(void);
			
 
				+extern int xenbus_backend_bus_register(void);
			
 
				+extern void xenbus_backend_bus_unregister(void);
			
 
				+#else
			
 
				+static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
			
 
				+static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
			
 
				+static inline void xenbus_backend_probe_and_watch(void) {}
			
 
				+static inline int xenbus_backend_bus_register(void) { return 0; }
			
 
				+static inline void xenbus_backend_bus_unregister(void) {}
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Description of one xenbus bus flavour, wrapping the generic driver-model
				+ * bus_type together with the xenbus-specific hooks used by the probe code.
				+ */
				+struct xen_bus_type
				+{
				+	/* NOTE(review): presumably the xenstore directory this bus lives under - confirm. */
				+	char *root;
				+	/* NOTE(review): presumably the directory depth of a device node below root - confirm. */
				+	unsigned int levels;
				+	/* Derive a bus_id (at most BUS_ID_SIZE bytes) from a node name. */
				+	int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
				+	/* Probe callback taking a device type and a directory name. */
				+	int (*probe)(const char *type, const char *dir);
				+	/* Embedded driver-model bus. */
				+	struct bus_type bus;
				+};
			
 
				+
			
 
				+extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
			
 
				+extern int xenbus_dev_probe(struct device *_dev);
			
 
				+extern int xenbus_dev_remove(struct device *_dev);
			
 
				+extern int xenbus_register_driver_common(struct xenbus_driver *drv,
			
 
				+					 struct xen_bus_type *bus,
			
 
				+					 struct module *owner,
			
 
				+					 const char *mod_name);
			
 
				+extern int xenbus_probe_node(struct xen_bus_type *bus,
			
 
				+			     const char *type,
			
 
				+			     const char *nodename);
			
 
				+extern int xenbus_probe_devices(struct xen_bus_type *bus);
			
 
				+
			
 
				+extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
			
 
				+
			
 
				+#endif
			
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -0,0 +1,861 @@
 
				+/******************************************************************************
			
 
				+ * xenbus_xs.c
			
 
				+ *
			
 
				+ * This is the kernel equivalent of the "xs" library.  We don't need everything
			
 
				+ * and we use xenbus_comms for communication.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/unistd.h>
			
 
				+#include <linux/errno.h>
			
 
				+#include <linux/types.h>
			
 
				+#include <linux/uio.h>
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/string.h>
			
 
				+#include <linux/err.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/fcntl.h>
			
 
				+#include <linux/kthread.h>
			
 
				+#include <linux/rwsem.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/mutex.h>
			
 
				+#include <xen/xenbus.h>
			
 
				+#include "xenbus_comms.h"
			
 
				+
			
 
				+/*
				+ * One message received from xenstored, queued either on
				+ * xs_state.reply_list (replies) or on watch_events (watch events).
				+ */
				+struct xs_stored_msg {
				+	/* Linkage on the reply list or the watch event list. */
				+	struct list_head list;
				+
				+	/* Wire header as read from the ring; hdr.len is the payload size. */
				+	struct xsd_sockmsg hdr;
				+
				+	union {
				+		/* Queued replies. */
				+		struct {
				+			/* kmalloc'ed payload, NUL-terminated by process_msg(). */
				+			char *body;
				+		} reply;
				+
				+		/* Queued watch events. */
				+		struct {
				+			/* Registered watch this event is delivered to. */
				+			struct xenbus_watch *handle;
				+			/* Payload split into strings by split(); one kfree() frees it. */
				+			char **vec;
				+			/* Number of entries in vec. */
				+			unsigned int vec_size;
				+		} watch;
				+	} u;
				+};
			
 
				+
			
 
				+/*
				+ * Global state of the kernel's connection to xenstored: the queue of
				+ * received replies plus the locks that serialise requests and keep
				+ * transactions and watches consistent across save/restore.
				+ */
				+struct xs_handle {
				+	/* A list of replies. Currently only one will ever be outstanding. */
				+	struct list_head reply_list;
				+	/* Protects reply_list. */
				+	spinlock_t reply_lock;
				+	/* Woken by the reader thread when a reply has been queued. */
				+	wait_queue_head_t reply_waitq;
				+
				+	/*
				+	 * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
				+	 * response_mutex is never taken simultaneously with the other three.
				+	 */
				+
				+	/* One request at a time. */
				+	struct mutex request_mutex;
				+
				+	/* Protect xenbus reader thread against save/restore. */
				+	struct mutex response_mutex;
				+
				+	/* Protect transactions against save/restore. */
				+	struct rw_semaphore transaction_mutex;
				+
				+	/* Protect watch (de)register against save/restore. */
				+	struct rw_semaphore watch_mutex;
				+};
			
 
				+
			
 
				+static struct xs_handle xs_state;
			
 
				+
			
 
				+/* List of registered watches, and a lock to protect it. */
			
 
				+static LIST_HEAD(watches);
			
 
				+static DEFINE_SPINLOCK(watches_lock);
			
 
				+
			
 
				+/* List of pending watch callback events, and a lock to protect it. */
			
 
				+static LIST_HEAD(watch_events);
			
 
				+static DEFINE_SPINLOCK(watch_events_lock);
			
 
				+
			
 
				+/*
			
 
				+ * Details of the xenwatch callback kernel thread. The thread waits on the
			
 
				+ * watch_events_waitq for work to do (queued on watch_events list). When it
			
 
				+ * wakes up it acquires the xenwatch_mutex before reading the list and
			
 
				+ * carrying out work.
			
 
				+ */
			
 
				+static pid_t xenwatch_pid;
			
 
				+static DEFINE_MUTEX(xenwatch_mutex);
			
 
				+static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
			
 
				+
			
 
				+/*
				+ * Translate an error string sent by xenstored into an errno value.
				+ *
				+ * Returns the POSITIVE errno from the xsd_errors table (callers negate
				+ * it), or EINVAL after logging a warning when the string is unknown.
				+ */
				+static int get_error(const char *errorstring)
				+{
				+	unsigned int i;
				+
				+	for (i = 0; i < ARRAY_SIZE(xsd_errors); i++) {
				+		if (strcmp(errorstring, xsd_errors[i].errstring) == 0)
				+			return xsd_errors[i].errnum;
				+	}
				+
				+	/* Terminate the message with a newline so the log line is flushed. */
				+	printk(KERN_WARNING
				+	       "XENBUS xen store gave: unknown error %s\n",
				+	       errorstring);
				+	return EINVAL;
				+}
			
 
				+
			
 
				+/*
				+ * Block until a reply is queued on xs_state.reply_list, dequeue it and
				+ * hand its body to the caller.  *type receives the reply's message type
				+ * and, when len is non-NULL, *len its payload length.  The returned body
				+ * is the buffer stored in the queued message; the message wrapper itself
				+ * is freed here.
				+ */
				+static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
				+{
				+	struct xs_stored_msg *reply;
				+	char *payload;
				+
				+	spin_lock(&xs_state.reply_lock);
				+
				+	for (;;) {
				+		if (!list_empty(&xs_state.reply_list))
				+			break;
				+		/* Sleep without the spinlock, then re-check under it. */
				+		spin_unlock(&xs_state.reply_lock);
				+		/* XXX FIXME: Avoid synchronous wait for response here. */
				+		wait_event(xs_state.reply_waitq,
				+			   !list_empty(&xs_state.reply_list));
				+		spin_lock(&xs_state.reply_lock);
				+	}
				+
				+	reply = list_entry(xs_state.reply_list.next,
				+			   struct xs_stored_msg, list);
				+	list_del(&reply->list);
				+
				+	spin_unlock(&xs_state.reply_lock);
				+
				+	payload = reply->u.reply.body;
				+	*type = reply->hdr.type;
				+	if (len != NULL)
				+		*len = reply->hdr.len;
				+
				+	kfree(reply);
				+
				+	return payload;
				+}
			
 
				+
			
 
				+/*
				+ * Send a fully formed message (header immediately followed by msg->len
				+ * bytes of payload) and wait for the reply.  On return msg->type and
				+ * msg->len describe the reply; if the write fails msg->type is set to
				+ * XS_ERROR and ERR_PTR(err) is returned.
				+ *
				+ * transaction_mutex is taken for reading when a transaction is started
				+ * and dropped again when the reply is XS_TRANSACTION_END or when the
				+ * start request itself came back as XS_ERROR.
				+ */
				+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
				+{
				+	/* Remember the request type: msg->type is overwritten by the reply. */
				+	int starts_txn = (msg->type == XS_TRANSACTION_START);
				+	void *reply;
				+	int rc;
				+
				+	if (starts_txn)
				+		down_read(&xs_state.transaction_mutex);
				+
				+	mutex_lock(&xs_state.request_mutex);
				+
				+	rc = xb_write(msg, sizeof(*msg) + msg->len);
				+	if (rc == 0) {
				+		reply = read_reply(&msg->type, &msg->len);
				+	} else {
				+		msg->type = XS_ERROR;
				+		reply = ERR_PTR(rc);
				+	}
				+
				+	mutex_unlock(&xs_state.request_mutex);
				+
				+	if (msg->type == XS_TRANSACTION_END ||
				+	    (starts_txn && msg->type == XS_ERROR))
				+		up_read(&xs_state.transaction_mutex);
				+
				+	return reply;
				+}
			
 
				+
			
 
				+/* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
			
 
				+/*
				+ * Build a header for transaction t and message type 'type', write it
				+ * followed by the num_vecs buffers in iovec, then wait for the reply.
				+ * When len is non-NULL it receives the reply's payload length.
				+ *
				+ * Returns the reply body, or an ERR_PTR(): the xb_write() error, the
				+ * negated errno decoded from an XS_ERROR reply, or -EINVAL when the
				+ * reply type differs from the request type.
				+ */
				+static void *xs_talkv(struct xenbus_transaction t,
				+		      enum xsd_sockmsg_type type,
				+		      const struct kvec *iovec,
				+		      unsigned int num_vecs,
				+		      unsigned int *len)
				+{
				+	struct xsd_sockmsg hdr;
				+	void *reply = NULL;
				+	unsigned int v;
				+	int rc;
				+
				+	hdr.tx_id = t.id;
				+	hdr.req_id = 0;
				+	hdr.type = type;
				+	hdr.len = 0;
				+	for (v = 0; v < num_vecs; v++)
				+		hdr.len += iovec[v].iov_len;
				+
				+	mutex_lock(&xs_state.request_mutex);
				+
				+	/* Header first, then each payload fragment; stop at first failure. */
				+	rc = xb_write(&hdr, sizeof(hdr));
				+	for (v = 0; !rc && v < num_vecs; v++)
				+		rc = xb_write(iovec[v].iov_base, iovec[v].iov_len);
				+
				+	if (rc) {
				+		mutex_unlock(&xs_state.request_mutex);
				+		return ERR_PTR(rc);
				+	}
				+
				+	reply = read_reply(&hdr.type, len);
				+
				+	mutex_unlock(&xs_state.request_mutex);
				+
				+	if (IS_ERR(reply))
				+		return reply;
				+
				+	if (hdr.type == XS_ERROR) {
				+		/* The body carries an error name; map it to an errno. */
				+		rc = get_error(reply);
				+		kfree(reply);
				+		return ERR_PTR(-rc);
				+	}
				+
				+	if (hdr.type == type)
				+		return reply;
				+
				+	if (printk_ratelimit())
				+		printk(KERN_WARNING
				+		       "XENBUS unexpected type [%d], expected [%d]\n",
				+		       hdr.type, type);
				+	kfree(reply);
				+	return ERR_PTR(-EINVAL);
				+}
			
 
				+
			
 
				+/* Simplified version of xs_talkv: single message. */
			
 
				+/*
				+ * Convenience wrapper around xs_talkv() for requests whose payload is a
				+ * single NUL-terminated string (the terminator is sent too).
				+ */
				+static void *xs_single(struct xenbus_transaction t,
				+		       enum xsd_sockmsg_type type,
				+		       const char *string,
				+		       unsigned int *len)
				+{
				+	struct kvec vec;
				+
				+	vec.iov_len = strlen(string) + 1;
				+	vec.iov_base = (void *)string;
				+
				+	return xs_talkv(t, type, &vec, 1, len);
				+}
			
 
				+
			
 
				+/* Many commands only need an ack, don't care what it says. */
			
 
				+/*
				+ * Reduce a reply pointer to a status code: the encoded errno for an
				+ * ERR_PTR, otherwise 0 after freeing the (unneeded) reply body.
				+ */
				+static int xs_error(char *reply)
				+{
				+	int status = 0;
				+
				+	if (IS_ERR(reply))
				+		status = PTR_ERR(reply);
				+	else
				+		kfree(reply);
				+
				+	return status;
				+}
			
 
				+
			
 
				+/*
				+ * Count the NUL-terminated strings packed back to back in the first
				+ * len bytes of strings.
				+ */
				+static unsigned int count_strings(const char *strings, unsigned int len)
				+{
				+	const char *end = strings + len;
				+	const char *cur = strings;
				+	unsigned int count = 0;
				+
				+	while (cur < end) {
				+		count++;
				+		/* Skip the string and its terminator. */
				+		cur += strlen(cur) + 1;
				+	}
				+
				+	return count;
				+}
			
 
				+
			
 
				+/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
			
 
				+/*
				+ * Allocate "dir/name", or just a copy of dir when name is empty.
				+ * Returns ERR_PTR(-ENOMEM) if the allocation fails; otherwise the
				+ * caller owns the buffer and must kfree() it.
				+ */
				+static char *join(const char *dir, const char *name)
				+{
				+	char *path;
				+
				+	if (name[0] == '\0')
				+		path = kasprintf(GFP_KERNEL, "%s", dir);
				+	else
				+		path = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
				+
				+	if (path == NULL)
				+		return ERR_PTR(-ENOMEM);
				+	return path;
				+}
			
 
				+
			
 
				+/*
				+ * Turn a buffer of len bytes holding packed NUL-terminated strings into
				+ * an array of string pointers.  The pointer array and a copy of the
				+ * string data share one allocation, so a single kfree() of the result
				+ * releases everything.  The input buffer is always freed.  *num receives
				+ * the number of strings.  Returns ERR_PTR(-ENOMEM) on allocation failure.
				+ */
				+static char **split(char *strings, unsigned int len, unsigned int *num)
				+{
				+	char **vec;
				+	char *copy, *cur;
				+	unsigned int idx = 0;
				+
				+	*num = count_strings(strings, len);
				+
				+	/* Pointer table first, string bytes right behind it. */
				+	vec = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
				+	if (vec == NULL) {
				+		kfree(strings);
				+		return ERR_PTR(-ENOMEM);
				+	}
				+
				+	copy = (char *)&vec[*num];
				+	memcpy(copy, strings, len);
				+	kfree(strings);
				+
				+	/* Point each slot at the start of its string inside the copy. */
				+	for (cur = copy; cur < copy + len; cur += strlen(cur) + 1)
				+		vec[idx++] = cur;
				+	*num = idx;
				+
				+	return vec;
				+}
			
 
				+
			
 
				+/*
				+ * Issue XS_DIRECTORY for dir/node and return the reply split into an
				+ * array of *num strings (one kfree() releases it), or an ERR_PTR().
				+ */
				+char **xenbus_directory(struct xenbus_transaction t,
				+			const char *dir, const char *node, unsigned int *num)
				+{
				+	unsigned int reply_len;
				+	char *reply;
				+	char *path = join(dir, node);
				+
				+	if (IS_ERR(path))
				+		return (char **)path;
				+
				+	reply = xs_single(t, XS_DIRECTORY, path, &reply_len);
				+	kfree(path);
				+
				+	if (!IS_ERR(reply))
				+		return split(reply, reply_len, num);
				+
				+	return (char **)reply;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_directory);
			
 
				+
			
 
				+/* Check if a path exists. Return 1 if it does. */
			
 
				+/*
				+ * Test whether dir/node exists by attempting to list it.
				+ * Returns 1 if the listing succeeds and 0 on any error.
				+ */
				+int xenbus_exists(struct xenbus_transaction t,
				+		  const char *dir, const char *node)
				+{
				+	char **d;
				+	/* Must match xenbus_directory()'s 'unsigned int *num' parameter. */
				+	unsigned int dir_n;
				+
				+	d = xenbus_directory(t, dir, node, &dir_n);
				+	if (IS_ERR(d))
				+		return 0;
				+	kfree(d);
				+	return 1;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_exists);
			
 
				+
			
 
				+/* Get the value of a single file.
			
 
				+ * Returns a kmalloced value: call kfree() on it after use.
			
 
				+ * len indicates length in bytes.
			
 
				+ */
			
 
				+void *xenbus_read(struct xenbus_transaction t,
				+		  const char *dir, const char *node, unsigned int *len)
				+{
				+	void *value;
				+	char *path = join(dir, node);
				+
				+	/* Pass the allocation failure through as an ERR_PTR. */
				+	if (IS_ERR(path))
				+		return (void *)path;
				+
				+	value = xs_single(t, XS_READ, path, len);
				+	kfree(path);
				+
				+	return value;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_read);
			
 
				+
			
 
				+/* Write the value of a single file.
			
 
				+ * Returns -err on failure.
			
 
				+ */
			
 
				+int xenbus_write(struct xenbus_transaction t,
				+		 const char *dir, const char *node, const char *string)
				+{
				+	struct kvec parts[2];
				+	const char *path;
				+	int status;
				+
				+	path = join(dir, node);
				+	if (IS_ERR(path))
				+		return PTR_ERR(path);
				+
				+	/* The path goes out with its NUL terminator, the value without. */
				+	parts[0].iov_base = (void *)path;
				+	parts[0].iov_len = strlen(path) + 1;
				+	parts[1].iov_base = (void *)string;
				+	parts[1].iov_len = strlen(string);
				+
				+	status = xs_error(xs_talkv(t, XS_WRITE, parts,
				+				   ARRAY_SIZE(parts), NULL));
				+	kfree(path);
				+
				+	return status;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_write);
			
 
				+
			
 
				+/* Create a new directory. */
			
 
				+int xenbus_mkdir(struct xenbus_transaction t,
				+		 const char *dir, const char *node)
				+{
				+	int status;
				+	char *path = join(dir, node);
				+
				+	if (IS_ERR(path))
				+		return PTR_ERR(path);
				+
				+	/* Only the acknowledgement matters; xs_error() frees the reply. */
				+	status = xs_error(xs_single(t, XS_MKDIR, path, NULL));
				+	kfree(path);
				+
				+	return status;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_mkdir);
			
 
				+
			
 
				+/* Destroy a file or directory (directories must be empty). */
			
 
				+int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
				+{
				+	int status;
				+	char *path = join(dir, node);
				+
				+	if (IS_ERR(path))
				+		return PTR_ERR(path);
				+
				+	/* Only the acknowledgement matters; xs_error() frees the reply. */
				+	status = xs_error(xs_single(t, XS_RM, path, NULL));
				+	kfree(path);
				+
				+	return status;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_rm);
			
 
				+
			
 
				+/* Start a transaction: changes by others will not be seen during this
			
 
				+ * transaction, and changes will not be visible to others until end.
			
 
				+ */
			
 
				+int xenbus_transaction_start(struct xenbus_transaction *t)
				+{
				+	char *reply;
				+
				+	/*
				+	 * Held for reading for the life of the transaction; released here
				+	 * only on failure, otherwise by xenbus_transaction_end().
				+	 */
				+	down_read(&xs_state.transaction_mutex);
				+
				+	reply = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
				+	if (!IS_ERR(reply)) {
				+		/* The reply body is the transaction id in text form. */
				+		t->id = simple_strtoul(reply, NULL, 0);
				+		kfree(reply);
				+		return 0;
				+	}
				+
				+	up_read(&xs_state.transaction_mutex);
				+	return PTR_ERR(reply);
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_transaction_start);
			
 
				+
			
 
				+/* End a transaction.
			
 
				+ * If abort is true, transaction is discarded instead of committed.
			
 
				+ */
			
 
				+int xenbus_transaction_end(struct xenbus_transaction t, int abort)
				+{
				+	char flag[2];
				+	int status;
				+
				+	/* The payload is "F" when aborting and "T" otherwise. */
				+	flag[0] = abort ? 'F' : 'T';
				+	flag[1] = '\0';
				+
				+	status = xs_error(xs_single(t, XS_TRANSACTION_END, flag, NULL));
				+
				+	/* Drop the read lock taken by xenbus_transaction_start(). */
				+	up_read(&xs_state.transaction_mutex);
				+
				+	return status;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_transaction_end);
			
 
				+
			
 
				+/* Single read and scanf: returns -errno or num scanned. */
			
 
				+int xenbus_scanf(struct xenbus_transaction t,
				+		 const char *dir, const char *node, const char *fmt, ...)
				+{
				+	va_list args;
				+	char *text;
				+	int matched;
				+
				+	text = xenbus_read(t, dir, node, NULL);
				+	if (IS_ERR(text))
				+		return PTR_ERR(text);
				+
				+	va_start(args, fmt);
				+	matched = vsscanf(text, fmt, args);
				+	va_end(args);
				+	kfree(text);
				+
				+	/* Nothing converted: report it with a distinctive errno. */
				+	return (matched == 0) ? -ERANGE : matched;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_scanf);
			
 
				+
			
 
				+/* Single printf and write: returns -errno or 0. */
			
 
				+int xenbus_printf(struct xenbus_transaction t,
				+		  const char *dir, const char *node, const char *fmt, ...)
				+{
				+#define PRINTF_BUFFER_SIZE 4096
				+	va_list args;
				+	char *buf;
				+	int n;
				+	int status;
				+
				+	buf = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
				+	if (!buf)
				+		return -ENOMEM;
				+
				+	va_start(args, fmt);
				+	n = vsnprintf(buf, PRINTF_BUFFER_SIZE, fmt, args);
				+	va_end(args);
				+
				+	/* A formatted value that does not fit the buffer is treated as fatal. */
				+	BUG_ON(n > PRINTF_BUFFER_SIZE-1);
				+
				+	status = xenbus_write(t, dir, node, buf);
				+	kfree(buf);
				+
				+	return status;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_printf);
			
 
				+
			
 
				+/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
			
 
				+int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
				+{
				+	va_list args;
				+	const char *name;
				+	int status = 0;
				+
				+	va_start(args, dir);
				+	for (;;) {
				+		const char *fmt;
				+		void *dest;
				+		char *value;
				+
				+		/* Stop on the first failure or at the terminating NULL name. */
				+		if (status != 0)
				+			break;
				+		name = va_arg(args, char *);
				+		if (name == NULL)
				+			break;
				+
				+		fmt = va_arg(args, char *);
				+		dest = va_arg(args, void *);
				+
				+		value = xenbus_read(t, dir, name, NULL);
				+		if (IS_ERR(value)) {
				+			status = PTR_ERR(value);
				+			break;
				+		}
				+
				+		if (fmt == NULL) {
				+			/* No format: hand the raw string to the caller. */
				+			*(char **)dest = value;
				+			continue;
				+		}
				+
				+		if (sscanf(value, fmt, dest) == 0)
				+			status = -EINVAL;
				+		kfree(value);
				+	}
				+	va_end(args);
				+
				+	return status;
				+}
			
 
				+EXPORT_SYMBOL_GPL(xenbus_gather);
			
 
				+
			
 
				+/*
				+ * Send XS_WATCH for path with the given token (both NUL-terminated on
				+ * the wire).  Returns 0 or a negative errno.
				+ */
				+static int xs_watch(const char *path, const char *token)
				+{
				+	struct kvec parts[2];
				+	void *reply;
				+
				+	parts[0].iov_base = (void *)path;
				+	parts[0].iov_len = strlen(path) + 1;
				+	parts[1].iov_base = (void *)token;
				+	parts[1].iov_len = strlen(token) + 1;
				+
				+	reply = xs_talkv(XBT_NIL, XS_WATCH, parts, ARRAY_SIZE(parts), NULL);
				+	return xs_error(reply);
				+}
			
 
				+
			
 
				+/*
				+ * Send XS_UNWATCH for path with the given token (both NUL-terminated
				+ * on the wire).  Returns 0 or a negative errno.
				+ */
				+static int xs_unwatch(const char *path, const char *token)
				+{
				+	struct kvec parts[2];
				+	void *reply;
				+
				+	parts[0].iov_base = (char *)path;
				+	parts[0].iov_len = strlen(path) + 1;
				+	parts[1].iov_base = (char *)token;
				+	parts[1].iov_len = strlen(token) + 1;
				+
				+	reply = xs_talkv(XBT_NIL, XS_UNWATCH, parts, ARRAY_SIZE(parts), NULL);
				+	return xs_error(reply);
				+}
			
 
				+
			
 
				+/*
				+ * Look up a registered watch from its token.  The token is the watch's
				+ * address printed in hex, so it is parsed back into a pointer and
				+ * returned only if that pointer really is on the watches list.
				+ * Callers in this file hold watches_lock around the call.
				+ */
				+static struct xenbus_watch *find_watch(const char *token)
				+{
				+	struct xenbus_watch *wanted = (void *)simple_strtoul(token, NULL, 16);
				+	struct xenbus_watch *w;
				+
				+	list_for_each_entry(w, &watches, list) {
				+		if (w == wanted)
				+			return w;
				+	}
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/* Register callback to watch this node. */
			
 
				+int register_xenbus_watch(struct xenbus_watch *watch)
				+{
				+	/* The token is the watch pointer rendered as hex digits. */
				+	char token[sizeof(watch) * 2 + 1];
				+	int rc;
				+
				+	sprintf(token, "%lX", (long)watch);
				+
				+	down_read(&xs_state.watch_mutex);
				+
				+	/* Publish the watch locally before asking xenstored for events. */
				+	spin_lock(&watches_lock);
				+	BUG_ON(find_watch(token));
				+	list_add(&watch->list, &watches);
				+	spin_unlock(&watches_lock);
				+
				+	rc = xs_watch(watch->node, token);
				+
				+	/* Success, or -EEXIST from multiple registration: keep the entry. */
				+	if (rc == 0 || rc == -EEXIST)
				+		goto out;
				+
				+	spin_lock(&watches_lock);
				+	list_del(&watch->list);
				+	spin_unlock(&watches_lock);
				+
				+ out:
				+	up_read(&xs_state.watch_mutex);
				+
				+	return rc;
				+}
			
 
				+EXPORT_SYMBOL_GPL(register_xenbus_watch);
			
 
				+
			
 
				+/*
				+ * Remove a watch: drop it from the local list, tell xenstored to stop
				+ * sending events for it (a failure is only logged), then discard any
				+ * events for it that are still queued for the xenwatch thread.
				+ */
				+void unregister_xenbus_watch(struct xenbus_watch *watch)
				+{
				+	char token[sizeof(watch) * 2 + 1];
				+	struct xs_stored_msg *ev, *next;
				+	int on_xenwatch;
				+	int rc;
				+
				+	sprintf(token, "%lX", (long)watch);
				+
				+	down_read(&xs_state.watch_mutex);
				+
				+	spin_lock(&watches_lock);
				+	BUG_ON(!find_watch(token));
				+	list_del(&watch->list);
				+	spin_unlock(&watches_lock);
				+
				+	rc = xs_unwatch(watch->node, token);
				+	if (rc)
				+		printk(KERN_WARNING
				+		       "XENBUS Failed to release watch %s: %i\n",
				+		       watch->node, rc);
				+
				+	up_read(&xs_state.watch_mutex);
				+
				+	/*
				+	 * Wait for any running callback to finish - unless we are the
				+	 * xenwatch thread itself, which already holds xenwatch_mutex
				+	 * while it runs callbacks.
				+	 */
				+	on_xenwatch = (current->pid == xenwatch_pid);
				+	if (!on_xenwatch)
				+		mutex_lock(&xenwatch_mutex);
				+
				+	/* Cancel pending watch events. */
				+	spin_lock(&watch_events_lock);
				+	list_for_each_entry_safe(ev, next, &watch_events, list) {
				+		if (ev->u.watch.handle == watch) {
				+			list_del(&ev->list);
				+			kfree(ev->u.watch.vec);
				+			kfree(ev);
				+		}
				+	}
				+	spin_unlock(&watch_events_lock);
				+
				+	if (!on_xenwatch)
				+		mutex_unlock(&xenwatch_mutex);
				+}
			
 
				+EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
			
 
				+
			
 
				+/*
				+ * Take every xs_state lock exclusively.  The first three follow the
				+ * documented order transaction_mutex -> watch_mutex -> request_mutex;
				+ * response_mutex is taken last.  The locks stay held on return and are
				+ * released by xs_resume() or xs_suspend_cancel().
				+ */
				+void xs_suspend(void)
				+{
				+	down_write(&xs_state.transaction_mutex);
				+	down_write(&xs_state.watch_mutex);
				+	mutex_lock(&xs_state.request_mutex);
				+	mutex_lock(&xs_state.response_mutex);
				+}
			
 
				+
			
 
				+/*
				+ * Undo xs_suspend(): release the response, request and transaction
				+ * locks, then - still holding watch_mutex for writing - send XS_WATCH
				+ * again for every watch on the watches list before finally releasing
				+ * watch_mutex.  The result of each xs_watch() call is not checked.
				+ */
				+void xs_resume(void)
				+{
				+	struct xenbus_watch *watch;
				+	/* Room for the watch pointer printed as hex, plus the terminator. */
				+	char token[sizeof(watch) * 2 + 1];
				+
				+	mutex_unlock(&xs_state.response_mutex);
				+	mutex_unlock(&xs_state.request_mutex);
				+	up_write(&xs_state.transaction_mutex);
				+
				+	/* No need for watches_lock: the watch_mutex is sufficient. */
				+	list_for_each_entry(watch, &watches, list) {
				+		sprintf(token, "%lX", (long)watch);
				+		xs_watch(watch->node, token);
				+	}
				+
				+	up_write(&xs_state.watch_mutex);
				+}
			
 
				+
			
 
				+/*
				+ * Release the four locks taken by xs_suspend(), in the reverse of the
				+ * order in which they were acquired.  Unlike xs_resume(), no watches
				+ * are re-registered.
				+ */
				+void xs_suspend_cancel(void)
				+{
				+	mutex_unlock(&xs_state.response_mutex);
				+	mutex_unlock(&xs_state.request_mutex);
				+	up_write(&xs_state.watch_mutex);
				+	up_write(&xs_state.transaction_mutex);
				+}
			
 
				+
			
 
				+/*
				+ * Body of the "xenwatch" kernel thread: sleep until a watch event is
				+ * queued, then - holding xenwatch_mutex - take one event off the list,
				+ * run its watch callback and free it.  Loops until asked to stop.
				+ */
				+static int xenwatch_thread(void *unused)
				+{
				+	struct xs_stored_msg *ev;
				+
				+	while (1) {
				+		wait_event_interruptible(watch_events_waitq,
				+					 !list_empty(&watch_events));
				+
				+		if (kthread_should_stop())
				+			break;
				+
				+		mutex_lock(&xenwatch_mutex);
				+
				+		/* Detach the oldest event, if the list still has one. */
				+		ev = NULL;
				+		spin_lock(&watch_events_lock);
				+		if (!list_empty(&watch_events)) {
				+			ev = list_entry(watch_events.next,
				+					struct xs_stored_msg, list);
				+			list_del(&ev->list);
				+		}
				+		spin_unlock(&watch_events_lock);
				+
				+		if (ev != NULL) {
				+			struct xenbus_watch *w = ev->u.watch.handle;
				+
				+			w->callback(w, (const char **)ev->u.watch.vec,
				+				    ev->u.watch.vec_size);
				+			kfree(ev->u.watch.vec);
				+			kfree(ev);
				+		}
				+
				+		mutex_unlock(&xenwatch_mutex);
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Read one complete message from xenstored and queue it.
				+ *
				+ * Watch events are split into a string vector and queued on
				+ * watch_events for the xenwatch thread, or dropped if no registered
				+ * watch matches their token.  Every other message is queued on
				+ * xs_state.reply_list and the waiting requester is woken.
				+ *
				+ * Returns 0 on success or a negative errno from waiting, reading or
				+ * allocating.  response_mutex is held from the moment data is known to
				+ * be available until the message has been consumed.
				+ */
				+static int process_msg(void)
				+{
				+	struct xs_stored_msg *msg;
				+	char *body;
				+	int err;
				+
				+	/*
				+	 * We must disallow save/restore while reading a xenstore message.
				+	 * A partial read across s/r leaves us out of sync with xenstored.
				+	 */
				+	for (;;) {
				+		err = xb_wait_for_data_to_read();
				+		if (err)
				+			return err;
				+		mutex_lock(&xs_state.response_mutex);
				+		if (xb_data_to_read())
				+			break;
				+		/* We raced with save/restore: pending data 'disappeared'. */
				+		mutex_unlock(&xs_state.response_mutex);
				+	}
				+
				+	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
				+	if (msg == NULL) {
				+		err = -ENOMEM;
				+		goto out;
				+	}
				+
				+	err = xb_read(&msg->hdr, sizeof(msg->hdr));
				+	if (err) {
				+		kfree(msg);
				+		goto out;
				+	}
				+
				+	/* One extra byte so the payload can always be NUL-terminated. */
				+	body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
				+	if (body == NULL) {
				+		kfree(msg);
				+		err = -ENOMEM;
				+		goto out;
				+	}
				+
				+	err = xb_read(body, msg->hdr.len);
				+	if (err) {
				+		kfree(body);
				+		kfree(msg);
				+		goto out;
				+	}
				+	body[msg->hdr.len] = '\0';
				+
				+	if (msg->hdr.type == XS_WATCH_EVENT) {
				+		/* split() consumes body, whether it succeeds or fails. */
				+		msg->u.watch.vec = split(body, msg->hdr.len,
				+					 &msg->u.watch.vec_size);
				+		if (IS_ERR(msg->u.watch.vec)) {
				+			/* Fetch the error code BEFORE msg is freed. */
				+			err = PTR_ERR(msg->u.watch.vec);
				+			kfree(msg);
				+			goto out;
				+		}
				+
				+		spin_lock(&watches_lock);
				+		/* A malformed event may lack the token string entirely. */
				+		if (msg->u.watch.vec_size > XS_WATCH_TOKEN)
				+			msg->u.watch.handle = find_watch(
				+				msg->u.watch.vec[XS_WATCH_TOKEN]);
				+		else
				+			msg->u.watch.handle = NULL;
				+		if (msg->u.watch.handle != NULL) {
				+			spin_lock(&watch_events_lock);
				+			list_add_tail(&msg->list, &watch_events);
				+			wake_up(&watch_events_waitq);
				+			spin_unlock(&watch_events_lock);
				+		} else {
				+			/* No such watch (any more): drop the event. */
				+			kfree(msg->u.watch.vec);
				+			kfree(msg);
				+		}
				+		spin_unlock(&watches_lock);
				+	} else {
				+		msg->u.reply.body = body;
				+		spin_lock(&xs_state.reply_lock);
				+		list_add_tail(&msg->list, &xs_state.reply_list);
				+		spin_unlock(&xs_state.reply_lock);
				+		wake_up(&xs_state.reply_waitq);
				+	}
				+
				+ out:
				+	mutex_unlock(&xs_state.response_mutex);
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Body of the "xenbus" kernel thread: keep pulling messages off the
				+ * ring via process_msg(), logging any error, until asked to stop.
				+ */
				+static int xenbus_thread(void *unused)
				+{
				+	int rc;
				+
				+	do {
				+		rc = process_msg();
				+		if (rc)
				+			printk(KERN_WARNING "XENBUS error %d while reading "
				+			       "message\n", rc);
				+	} while (!kthread_should_stop());
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Initialise xs_state, bring up the ring to xenstored and start the
				+ * "xenwatch" and "xenbus" kernel threads.  Returns 0 or a negative
				+ * errno from xb_init_comms() or kthread_run().
				+ */
				+int xs_init(void)
				+{
				+	struct task_struct *thread;
				+	int rc;
				+
				+	/* Reply queue and its wait queue. */
				+	INIT_LIST_HEAD(&xs_state.reply_list);
				+	spin_lock_init(&xs_state.reply_lock);
				+	init_waitqueue_head(&xs_state.reply_waitq);
				+
				+	/* Request/response serialisation and save/restore protection. */
				+	mutex_init(&xs_state.request_mutex);
				+	mutex_init(&xs_state.response_mutex);
				+	init_rwsem(&xs_state.transaction_mutex);
				+	init_rwsem(&xs_state.watch_mutex);
				+
				+	/* Initialize the shared memory rings to talk to xenstored */
				+	rc = xb_init_comms();
				+	if (rc != 0)
				+		return rc;
				+
				+	thread = kthread_run(xenwatch_thread, NULL, "xenwatch");
				+	if (IS_ERR(thread))
				+		return PTR_ERR(thread);
				+	/* Remembered so unregister_xenbus_watch() can recognise this thread. */
				+	xenwatch_pid = thread->pid;
				+
				+	thread = kthread_run(xenbus_thread, NULL, "xenbus");
				+	if (IS_ERR(thread))
				+		return PTR_ERR(thread);
				+
				+	return 0;
				+}
			
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 
				 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
			
 
				 	envp[2] = NULL;
			
 
				 
			
 
				-	ret = call_usermodehelper(argv[0], argv, envp, 1);
			
 
				+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
			
 
				 	if (ret < 0)
			
 
				 		mlog_errno(ret);
			
 
				 }
			
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,6 +41,7 @@ extern int irqbalance_disable(char *str);
 
				 extern void fixup_irqs(cpumask_t map);
			
 
				 #endif
			
 
				 
			
 
				+unsigned int do_IRQ(struct pt_regs *regs);
			
 
				 void init_IRQ(void);
			
 
				 void __init native_init_IRQ(void);
			
 
				 
			
--- a/include/asm-i386/mach-default/irq_vectors_limits.h
+++ b/include/asm-i386/mach-default/irq_vectors_limits.h
@@ -1,7 +1,7 @@
 
				 #ifndef _ASM_IRQ_VECTORS_LIMITS_H
			
 
				 #define _ASM_IRQ_VECTORS_LIMITS_H
			
 
				 
			
 
				-#ifdef CONFIG_X86_IO_APIC
			
 
				+#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT)
			
 
				 #define NR_IRQS 224
			
 
				 # if (224 >= 32 * NR_CPUS)
			
 
				 # define NR_IRQ_VECTORS NR_IRQS
			
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -32,6 +32,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+void leave_mm(unsigned long cpu);
			
 
				+
			
 
				 static inline void switch_mm(struct mm_struct *prev,
			
 
				 			     struct mm_struct *next,
			
 
				 			     struct task_struct *tsk)
			
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -52,6 +52,8 @@ struct paravirt_ops
 
				 	/* Basic arch-specific setup */
			
 
				 	void (*arch_setup)(void);
			
 
				 	char *(*memory_setup)(void);
			
 
				+	void (*post_allocator_init)(void);
			
 
				+
			
 
				 	void (*init_IRQ)(void);
			
 
				 	void (*time_init)(void);
			
 
				 
			
@@ -116,7 +118,7 @@ struct paravirt_ops
 
				 
			
 
				 	u64 (*read_tsc)(void);
			
 
				 	u64 (*read_pmc)(void);
			
 
				- 	u64 (*get_scheduled_cycles)(void);
			
 
				+	unsigned long long (*sched_clock)(void);
			
 
				 	unsigned long (*get_cpu_khz)(void);
			
 
				 
			
 
				 	/* Segment descriptor handling */
			
@@ -173,7 +175,7 @@ struct paravirt_ops
 
				 				 unsigned long va);
			
 
				 
			
 
				 	/* Hooks for allocating/releasing pagetable pages */
			
 
				-	void (*alloc_pt)(u32 pfn);
			
 
				+	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
			
 
				 	void (*alloc_pd)(u32 pfn);
			
 
				 	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
			
 
				 	void (*release_pt)(u32 pfn);
			
@@ -260,6 +262,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
 
				 unsigned paravirt_patch_insns(void *site, unsigned len,
			
 
				 			      const char *start, const char *end);
			
 
				 
			
 
				+int paravirt_disable_iospace(void);
			
 
				 
			
 
				 /*
			
 
				  * This generates an indirect call based on the operation type number.
			
@@ -563,7 +566,10 @@ static inline u64 paravirt_read_tsc(void)
 
				 
			
 
				 #define rdtscll(val) (val = paravirt_read_tsc())
			
 
				 
			
 
				-#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
			
 
				+static inline unsigned long long paravirt_sched_clock(void)
			
 
				+{
			
 
				+	return PVOP_CALL0(unsigned long long, sched_clock);
			
 
				+}
			
 
				 #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
			
 
				 
			
 
				 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
			
@@ -669,6 +675,12 @@ static inline void setup_secondary_clock(void)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+static inline void paravirt_post_allocator_init(void)
			
 
				+{
			
 
				+	if (paravirt_ops.post_allocator_init)
			
 
				+		(*paravirt_ops.post_allocator_init)();
			
 
				+}
			
 
				+
			
 
				 static inline void paravirt_pagetable_setup_start(pgd_t *base)
			
 
				 {
			
 
				 	if (paravirt_ops.pagetable_setup_start)
			
@@ -725,9 +737,9 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 
				 	PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va);
			
 
				 }
			
 
				 
			
 
				-static inline void paravirt_alloc_pt(unsigned pfn)
			
 
				+static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
			
 
				 {
			
 
				-	PVOP_VCALL1(alloc_pt, pfn);
			
 
				+	PVOP_VCALL2(alloc_pt, mm, pfn);
			
 
				 }
			
 
				 static inline void paravirt_release_pt(unsigned pfn)
			
 
				 {
			
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -7,7 +7,7 @@
 
				 #ifdef CONFIG_PARAVIRT
			
 
				 #include <asm/paravirt.h>
			
 
				 #else
			
 
				-#define paravirt_alloc_pt(pfn) do { } while (0)
			
 
				+#define paravirt_alloc_pt(mm, pfn) do { } while (0)
			
 
				 #define paravirt_alloc_pd(pfn) do { } while (0)
			
 
				 #define paravirt_alloc_pd(pfn) do { } while (0)
			
 
				 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
			
@@ -17,13 +17,13 @@
 
				 
			
 
				 #define pmd_populate_kernel(mm, pmd, pte)			\
			
 
				 do {								\
			
 
				-	paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT);		\
			
 
				+	paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);		\
			
 
				 	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));		\
			
 
				 } while (0)
			
 
				 
			
 
				 #define pmd_populate(mm, pmd, pte) 				\
			
 
				 do {								\
			
 
				-	paravirt_alloc_pt(page_to_pfn(pte));			\
			
 
				+	paravirt_alloc_pt(mm, page_to_pfn(pte));		\
			
 
				 	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
			
 
				 		((unsigned long long)page_to_pfn(pte) <<	\
			
 
				 			(unsigned long long) PAGE_SHIFT)));	\
			
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -81,6 +81,10 @@ void __init add_memory_region(unsigned long long start,
 
				 
			
 
				 extern unsigned long init_pg_tables_end;
			
 
				 
			
 
				+#ifndef CONFIG_PARAVIRT
			
 
				+#define paravirt_post_allocator_init()	do {} while (0)
			
 
				+#endif
			
 
				+
			
 
				 #endif /* __ASSEMBLY__ */
			
 
				 
			
 
				 #endif  /*  __KERNEL__  */
			
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -43,9 +43,12 @@ extern u8 x86_cpu_to_apicid[];
 
				 
			
 
				 #define cpu_physical_id(cpu)	x86_cpu_to_apicid[cpu]
			
 
				 
			
 
				+extern void set_cpu_sibling_map(int cpu);
			
 
				+
			
 
				 #ifdef CONFIG_HOTPLUG_CPU
			
 
				 extern void cpu_exit_clear(void);
			
 
				 extern void cpu_uninit(void);
			
 
				+extern void remove_siblinginfo(int cpu);
			
 
				 #endif
			
 
				 
			
 
				 struct smp_ops
			
@@ -129,6 +132,8 @@ extern int __cpu_disable(void);
 
				 extern void __cpu_die(unsigned int cpu);
			
 
				 extern unsigned int num_processors;
			
 
				 
			
 
				+void __cpuinit smp_store_cpu_info(int id);
			
 
				+
			
 
				 #endif /* !__ASSEMBLY__ */
			
 
				 
			
 
				 #else /* CONFIG_SMP */
			
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -15,8 +15,38 @@ extern int no_sync_cmos_clock;
 
				 extern int recalibrate_cpu_khz(void);
			
 
				 
			
 
				 #ifndef CONFIG_PARAVIRT
			
 
				-#define get_scheduled_cycles(val) rdtscll(val)
			
 
				 #define calculate_cpu_khz() native_calculate_cpu_khz()
			
 
				 #endif
			
 
				 
			
 
				+/* Accelerators for sched_clock()
			
 
				+ * convert from cycles(64bits) => nanoseconds (64bits)
			
 
				+ *  basic equation:
			
 
				+ *		ns = cycles / (freq / ns_per_sec)
			
 
				+ *		ns = cycles * (ns_per_sec / freq)
			
 
				+ *		ns = cycles * (10^9 / (cpu_khz * 10^3))
			
 
				+ *		ns = cycles * (10^6 / cpu_khz)
			
 
				+ *
			
 
				+ *	Then we use scaling math (suggested by george@mvista.com) to get:
			
 
				+ *		ns = cycles * (10^6 * SC / cpu_khz) / SC
			
 
				+ *		ns = cycles * cyc2ns_scale / SC
			
 
				+ *
			
 
				+ *	And since SC is a constant power of two, we can convert the div
			
 
				+ *  into a shift.
			
 
				+ *
			
 
				+ *  We can use khz divisor instead of mhz to keep a better precision, since
			
 
				+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
			
 
				+ *  (mathieu.desnoyers@polymtl.ca)
			
 
				+ *
			
 
				+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
			
 
				+ */
			
 
				+extern unsigned long cyc2ns_scale __read_mostly;
			
 
				+
			
 
				+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
			
 
				+
			
 
				+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
			
 
				+{
			
 
				+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 #endif
			
--- a/include/asm-i386/vmi_time.h
+++ b/include/asm-i386/vmi_time.h
@@ -49,7 +49,7 @@ extern struct vmi_timer_ops {
 
				 extern void __init vmi_time_init(void);
			
 
				 extern unsigned long vmi_get_wallclock(void);
			
 
				 extern int vmi_set_wallclock(unsigned long now);
			
 
				-extern unsigned long long vmi_get_sched_cycles(void);
			
 
				+extern unsigned long long vmi_sched_clock(void);
			
 
				 extern unsigned long vmi_cpu_khz(void);
			
 
				 
			
 
				 #ifdef CONFIG_X86_LOCAL_APIC
			
--- a/include/asm-i386/xen/hypercall.h
+++ b/include/asm-i386/xen/hypercall.h
@@ -0,0 +1,413 @@
 
				+/******************************************************************************
			
 
				+ * hypercall.h
			
 
				+ *
			
 
				+ * Linux-specific hypervisor handling.
			
 
				+ *
			
 
				+ * Copyright (c) 2002-2004, K A Fraser
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __HYPERCALL_H__
			
 
				+#define __HYPERCALL_H__
			
 
				+
			
 
				+#include <linux/errno.h>
			
 
				+#include <linux/string.h>
			
 
				+
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/sched.h>
			
 
				+#include <xen/interface/physdev.h>
			
 
				+
			
 
				+extern struct { char _entry[32]; } hypercall_page[];
			
 
				+
			
 
				+#define _hypercall0(type, name)						\
			
 
				+({									\
			
 
				+	long __res;							\
			
 
				+	asm volatile (							\
			
 
				+		"call %[call]"						\
			
 
				+		: "=a" (__res)						\
			
 
				+		: [call] "m" (hypercall_page[__HYPERVISOR_##name])	\
			
 
				+		: "memory" );						\
			
 
				+	(type)__res;							\
			
 
				+})
			
 
				+
			
 
				+#define _hypercall1(type, name, a1)					\
			
 
				+({									\
			
 
				+	long __res, __ign1;						\
			
 
				+	asm volatile (							\
			
 
				+		"call %[call]"						\
			
 
				+		: "=a" (__res), "=b" (__ign1)				\
			
 
				+		: "1" ((long)(a1)),					\
			
 
				+		  [call] "m" (hypercall_page[__HYPERVISOR_##name])	\
			
 
				+		: "memory" );						\
			
 
				+	(type)__res;							\
			
 
				+})
			
 
				+
			
 
				+#define _hypercall2(type, name, a1, a2)					\
			
 
				+({									\
			
 
				+	long __res, __ign1, __ign2;					\
			
 
				+	asm volatile (							\
			
 
				+		"call %[call]"						\
			
 
				+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2)		\
			
 
				+		: "1" ((long)(a1)), "2" ((long)(a2)),			\
			
 
				+		  [call] "m" (hypercall_page[__HYPERVISOR_##name])	\
			
 
				+		: "memory" );						\
			
 
				+	(type)__res;							\
			
 
				+})
			
 
				+
			
 
				+#define _hypercall3(type, name, a1, a2, a3)				\
			
 
				+({									\
			
 
				+	long __res, __ign1, __ign2, __ign3;				\
			
 
				+	asm volatile (							\
			
 
				+		"call %[call]"						\
			
 
				+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2),		\
			
 
				+		"=d" (__ign3)						\
			
 
				+		: "1" ((long)(a1)), "2" ((long)(a2)),			\
			
 
				+		  "3" ((long)(a3)),					\
			
 
				+		  [call] "m" (hypercall_page[__HYPERVISOR_##name])	\
			
 
				+		: "memory" );						\
			
 
				+	(type)__res;							\
			
 
				+})
			
 
				+
			
 
				+#define _hypercall4(type, name, a1, a2, a3, a4)				\
			
 
				+({									\
			
 
				+	long __res, __ign1, __ign2, __ign3, __ign4;			\
			
 
				+	asm volatile (							\
			
 
				+		"call %[call]"						\
			
 
				+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2),		\
			
 
				+		"=d" (__ign3), "=S" (__ign4)				\
			
 
				+		: "1" ((long)(a1)), "2" ((long)(a2)),			\
			
 
				+		  "3" ((long)(a3)), "4" ((long)(a4)),			\
			
 
				+		  [call] "m" (hypercall_page[__HYPERVISOR_##name])	\
			
 
				+		: "memory" );						\
			
 
				+	(type)__res;							\
			
 
				+})
			
 
				+
			
 
				+#define _hypercall5(type, name, a1, a2, a3, a4, a5)			\
			
 
				+({									\
			
 
				+	long __res, __ign1, __ign2, __ign3, __ign4, __ign5;		\
			
 
				+	asm volatile (							\
			
 
				+		"call %[call]"						\
			
 
				+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2),		\
			
 
				+		"=d" (__ign3), "=S" (__ign4), "=D" (__ign5)		\
			
 
				+		: "1" ((long)(a1)), "2" ((long)(a2)),			\
			
 
				+		  "3" ((long)(a3)), "4" ((long)(a4)),			\
			
 
				+		  "5" ((long)(a5)),					\
			
 
				+		  [call] "m" (hypercall_page[__HYPERVISOR_##name])	\
			
 
				+		: "memory" );						\
			
 
				+	(type)__res;							\
			
 
				+})
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_set_trap_table(struct trap_info *table)
			
 
				+{
			
 
				+	return _hypercall1(int, set_trap_table, table);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_mmu_update(struct mmu_update *req, int count,
			
 
				+		      int *success_count, domid_t domid)
			
 
				+{
			
 
				+	return _hypercall4(int, mmu_update, req, count, success_count, domid);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
			
 
				+		     int *success_count, domid_t domid)
			
 
				+{
			
 
				+	return _hypercall4(int, mmuext_op, op, count, success_count, domid);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
			
 
				+{
			
 
				+	return _hypercall2(int, set_gdt, frame_list, entries);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
			
 
				+{
			
 
				+	return _hypercall2(int, stack_switch, ss, esp);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_set_callbacks(unsigned long event_selector,
			
 
				+			 unsigned long event_address,
			
 
				+			 unsigned long failsafe_selector,
			
 
				+			 unsigned long failsafe_address)
			
 
				+{
			
 
				+	return _hypercall4(int, set_callbacks,
			
 
				+			   event_selector, event_address,
			
 
				+			   failsafe_selector, failsafe_address);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_fpu_taskswitch(int set)
			
 
				+{
			
 
				+	return _hypercall1(int, fpu_taskswitch, set);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_sched_op(int cmd, unsigned long arg)
			
 
				+{
			
 
				+	return _hypercall2(int, sched_op, cmd, arg);
			
 
				+}
			
 
				+
			
 
				+static inline long
			
 
				+HYPERVISOR_set_timer_op(u64 timeout)
			
 
				+{
			
 
				+	unsigned long timeout_hi = (unsigned long)(timeout>>32);
			
 
				+	unsigned long timeout_lo = (unsigned long)timeout;
			
 
				+	return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_set_debugreg(int reg, unsigned long value)
			
 
				+{
			
 
				+	return _hypercall2(int, set_debugreg, reg, value);
			
 
				+}
			
 
				+
			
 
				+static inline unsigned long
			
 
				+HYPERVISOR_get_debugreg(int reg)
			
 
				+{
			
 
				+	return _hypercall1(unsigned long, get_debugreg, reg);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
			
 
				+{
			
 
				+	return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_memory_op(unsigned int cmd, void *arg)
			
 
				+{
			
 
				+	return _hypercall2(int, memory_op, cmd, arg);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_multicall(void *call_list, int nr_calls)
			
 
				+{
			
 
				+	return _hypercall2(int, multicall, call_list, nr_calls);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
			
 
				+			     unsigned long flags)
			
 
				+{
			
 
				+	unsigned long pte_hi = 0;
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+	pte_hi = new_val.pte_high;
			
 
				+#endif
			
 
				+	return _hypercall4(int, update_va_mapping, va,
			
 
				+			   new_val.pte_low, pte_hi, flags);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_event_channel_op(int cmd, void *arg)
			
 
				+{
			
 
				+	int rc = _hypercall2(int, event_channel_op, cmd, arg);
			
 
				+	if (unlikely(rc == -ENOSYS)) {
			
 
				+		struct evtchn_op op;
			
 
				+		op.cmd = cmd;
			
 
				+		memcpy(&op.u, arg, sizeof(op.u));
			
 
				+		rc = _hypercall1(int, event_channel_op_compat, &op);
			
 
				+		memcpy(arg, &op.u, sizeof(op.u));
			
 
				+	}
			
 
				+	return rc;
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_xen_version(int cmd, void *arg)
			
 
				+{
			
 
				+	return _hypercall2(int, xen_version, cmd, arg);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_console_io(int cmd, int count, char *str)
			
 
				+{
			
 
				+	return _hypercall3(int, console_io, cmd, count, str);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_physdev_op(int cmd, void *arg)
			
 
				+{
			
 
				+	int rc = _hypercall2(int, physdev_op, cmd, arg);
			
 
				+	if (unlikely(rc == -ENOSYS)) {
			
 
				+		struct physdev_op op;
			
 
				+		op.cmd = cmd;
			
 
				+		memcpy(&op.u, arg, sizeof(op.u));
			
 
				+		rc = _hypercall1(int, physdev_op_compat, &op);
			
 
				+		memcpy(arg, &op.u, sizeof(op.u));
			
 
				+	}
			
 
				+	return rc;
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
			
 
				+{
			
 
				+	return _hypercall3(int, grant_table_op, cmd, uop, count);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
			
 
				+					 unsigned long flags, domid_t domid)
			
 
				+{
			
 
				+	unsigned long pte_hi = 0;
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+	pte_hi = new_val.pte_high;
			
 
				+#endif
			
 
				+	return _hypercall5(int, update_va_mapping_otherdomain, va,
			
 
				+			   new_val.pte_low, pte_hi, flags, domid);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
			
 
				+{
			
 
				+	return _hypercall2(int, vm_assist, cmd, type);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
			
 
				+{
			
 
				+	return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_suspend(unsigned long srec)
			
 
				+{
			
 
				+	return _hypercall3(int, sched_op, SCHEDOP_shutdown,
			
 
				+			   SHUTDOWN_suspend, srec);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
			
 
				+{
			
 
				+	return _hypercall2(int, nmi_op, op, arg);
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
			
 
				+			pte_t new_val, unsigned long flags)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_update_va_mapping;
			
 
				+	mcl->args[0] = va;
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+	mcl->args[1] = new_val.pte_low;
			
 
				+	mcl->args[2] = new_val.pte_high;
			
 
				+#else
			
 
				+	mcl->args[1] = new_val.pte_low;
			
 
				+	mcl->args[2] = 0;
			
 
				+#endif
			
 
				+	mcl->args[3] = flags;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
			
 
				+		     void *uop, unsigned int count)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_grant_table_op;
			
 
				+	mcl->args[0] = cmd;
			
 
				+	mcl->args[1] = (unsigned long)uop;
			
 
				+	mcl->args[2] = count;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
			
 
				+				    pte_t new_val, unsigned long flags,
			
 
				+				    domid_t domid)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
			
 
				+	mcl->args[0] = va;
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+	mcl->args[1] = new_val.pte_low;
			
 
				+	mcl->args[2] = new_val.pte_high;
			
 
				+#else
			
 
				+	mcl->args[1] = new_val.pte_low;
			
 
				+	mcl->args[2] = 0;
			
 
				+#endif
			
 
				+	mcl->args[3] = flags;
			
 
				+	mcl->args[4] = domid;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
			
 
				+			struct desc_struct desc)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_update_descriptor;
			
 
				+	mcl->args[0] = maddr;
			
 
				+	mcl->args[1] = maddr >> 32;
			
 
				+	mcl->args[2] = desc.a;
			
 
				+	mcl->args[3] = desc.b;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_memory_op;
			
 
				+	mcl->args[0] = cmd;
			
 
				+	mcl->args[1] = (unsigned long)arg;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
			
 
				+		 int count, int *success_count, domid_t domid)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_mmu_update;
			
 
				+	mcl->args[0] = (unsigned long)req;
			
 
				+	mcl->args[1] = count;
			
 
				+	mcl->args[2] = (unsigned long)success_count;
			
 
				+	mcl->args[3] = domid;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
			
 
				+		int *success_count, domid_t domid)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_mmuext_op;
			
 
				+	mcl->args[0] = (unsigned long)op;
			
 
				+	mcl->args[1] = count;
			
 
				+	mcl->args[2] = (unsigned long)success_count;
			
 
				+	mcl->args[3] = domid;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_set_gdt;
			
 
				+	mcl->args[0] = (unsigned long)frames;
			
 
				+	mcl->args[1] = entries;
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+MULTI_stack_switch(struct multicall_entry *mcl,
			
 
				+		   unsigned long ss, unsigned long esp)
			
 
				+{
			
 
				+	mcl->op = __HYPERVISOR_stack_switch;
			
 
				+	mcl->args[0] = ss;
			
 
				+	mcl->args[1] = esp;
			
 
				+}
			
 
				+
			
 
				+#endif /* __HYPERCALL_H__ */
			
--- a/include/asm-i386/xen/hypervisor.h
+++ b/include/asm-i386/xen/hypervisor.h
@@ -0,0 +1,73 @@
 
				+/******************************************************************************
			
 
				+ * hypervisor.h
			
 
				+ *
			
 
				+ * Linux-specific hypervisor handling.
			
 
				+ *
			
 
				+ * Copyright (c) 2002-2004, K A Fraser
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __HYPERVISOR_H__
			
 
				+#define __HYPERVISOR_H__
			
 
				+
			
 
				+#include <linux/types.h>
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/version.h>
			
 
				+
			
 
				+#include <xen/interface/xen.h>
			
 
				+#include <xen/interface/version.h>
			
 
				+
			
 
				+#include <asm/ptrace.h>
			
 
				+#include <asm/page.h>
			
 
				+#include <asm/desc.h>
			
 
				+#if defined(__i386__)
			
 
				+#  ifdef CONFIG_X86_PAE
			
 
				+#   include <asm-generic/pgtable-nopud.h>
			
 
				+#  else
			
 
				+#   include <asm-generic/pgtable-nopmd.h>
			
 
				+#  endif
			
 
				+#endif
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+/* arch/i386/kernel/setup.c */
			
 
				+extern struct shared_info *HYPERVISOR_shared_info;
			
 
				+extern struct start_info *xen_start_info;
			
 
				+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
			
 
				+
			
 
				+/* arch/i386/mach-xen/evtchn.c */
			
 
				+/* Force a proper event-channel callback from Xen. */
			
 
				+extern void force_evtchn_callback(void);
			
 
				+
			
 
				+/* Turn jiffies into Xen system time. */
			
 
				+u64 jiffies_to_st(unsigned long jiffies);
			
 
				+
			
 
				+
			
 
				+#define MULTI_UVMFLAGS_INDEX 3
			
 
				+#define MULTI_UVMDOMID_INDEX 4
			
 
				+
			
 
				+#define is_running_on_xen()	(xen_start_info ? 1 : 0)
			
 
				+
			
 
				+#endif /* __HYPERVISOR_H__ */
			
--- a/include/asm-i386/xen/interface.h
+++ b/include/asm-i386/xen/interface.h
@@ -0,0 +1,188 @@
 
				+/******************************************************************************
			
 
				+ * arch-x86_32.h
			
 
				+ *
			
 
				+ * Guest OS interface to x86 32-bit Xen.
			
 
				+ *
			
 
				+ * Copyright (c) 2004, K A Fraser
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
			
 
				+#define __XEN_PUBLIC_ARCH_X86_32_H__
			
 
				+
			
 
				+#ifdef __XEN__
			
 
				+#define __DEFINE_GUEST_HANDLE(name, type) \
			
 
				+    typedef struct { type *p; } __guest_handle_ ## name
			
 
				+#else
			
 
				+#define __DEFINE_GUEST_HANDLE(name, type) \
			
 
				+    typedef type * __guest_handle_ ## name
			
 
				+#endif
			
 
				+
			
 
				+#define DEFINE_GUEST_HANDLE_STRUCT(name) \
			
 
				+	__DEFINE_GUEST_HANDLE(name, struct name)
			
 
				+#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
			
 
				+#define GUEST_HANDLE(name)        __guest_handle_ ## name
			
 
				+
			
 
				+#ifndef __ASSEMBLY__
			
 
				+/* Guest handles for primitive C types. */
			
 
				+__DEFINE_GUEST_HANDLE(uchar, unsigned char);
			
 
				+__DEFINE_GUEST_HANDLE(uint,  unsigned int);
			
 
				+__DEFINE_GUEST_HANDLE(ulong, unsigned long);
			
 
				+DEFINE_GUEST_HANDLE(char);
			
 
				+DEFINE_GUEST_HANDLE(int);
			
 
				+DEFINE_GUEST_HANDLE(long);
			
 
				+DEFINE_GUEST_HANDLE(void);
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ * SEGMENT DESCRIPTOR TABLES
			
 
				+ */
			
 
				+/*
			
 
				+ * A number of GDT entries are reserved by Xen. These are not situated at the
			
 
				+ * start of the GDT because some stupid OSes export hard-coded selector values
			
 
				+ * in their ABI. These hard-coded values are always near the start of the GDT,
			
 
				+ * so Xen places itself out of the way, at the far end of the GDT.
			
 
				+ */
			
 
				+#define FIRST_RESERVED_GDT_PAGE  14
			
 
				+#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
			
 
				+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
			
 
				+
			
 
				+/*
			
 
				+ * These flat segments are in the Xen-private section of every GDT. Since these
			
 
				+ * are also present in the initial GDT, many OSes will be able to avoid
			
 
				+ * installing their own GDT.
			
 
				+ */
			
 
				+#define FLAT_RING1_CS 0xe019    /* GDT index 7171 */
			
 
				+#define FLAT_RING1_DS 0xe021    /* GDT index 7172 */
			
 
				+#define FLAT_RING1_SS 0xe021    /* GDT index 7172 */
			
 
				+#define FLAT_RING3_CS 0xe02b    /* GDT index 7173 */
			
 
				+#define FLAT_RING3_DS 0xe033    /* GDT index 7174 */
			
 
				+#define FLAT_RING3_SS 0xe033    /* GDT index 7174 */
			
 
				+
			
 
				+#define FLAT_KERNEL_CS FLAT_RING1_CS
			
 
				+#define FLAT_KERNEL_DS FLAT_RING1_DS
			
 
				+#define FLAT_KERNEL_SS FLAT_RING1_SS
			
 
				+#define FLAT_USER_CS    FLAT_RING3_CS
			
 
				+#define FLAT_USER_DS    FLAT_RING3_DS
			
 
				+#define FLAT_USER_SS    FLAT_RING3_SS
			
 
				+
			
 
				+/* And the trap vector is... */
			
 
				+#define TRAP_INSTR "int $0x82"
			
 
				+
			
 
				+/*
			
 
				+ * Virtual addresses beyond this are not modifiable by guest OSes. The
			
 
				+ * machine->physical mapping table starts at this address, read-only.
			
 
				+ */
			
 
				+#ifdef CONFIG_X86_PAE
			
 
				+#define __HYPERVISOR_VIRT_START 0xF5800000
			
 
				+#else
			
 
				+#define __HYPERVISOR_VIRT_START 0xFC000000
			
 
				+#endif
			
 
				+
			
 
				+#ifndef HYPERVISOR_VIRT_START
			
 
				+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
			
 
				+#endif
			
 
				+
			
 
				+#ifndef machine_to_phys_mapping
			
 
				+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
			
 
				+#endif
			
 
				+
			
 
				+/* Maximum number of virtual CPUs in multi-processor guests. */
			
 
				+#define MAX_VIRT_CPUS 32
			
 
				+
			
 
				+#ifndef __ASSEMBLY__
			
 
				+
			
 
				+/*
			
 
				+ * Send an array of these to HYPERVISOR_set_trap_table()
			
 
				+ */
			
 
				+#define TI_GET_DPL(_ti)		((_ti)->flags & 3)
			
 
				+#define TI_GET_IF(_ti)		((_ti)->flags & 4)
			
 
				+#define TI_SET_DPL(_ti, _dpl)	((_ti)->flags |= (_dpl))
			
 
				+#define TI_SET_IF(_ti, _if)	((_ti)->flags |= ((!!(_if))<<2))
			
 
				+
			
 
				+struct trap_info {
			
 
				+    uint8_t       vector;  /* exception vector                              */
			
 
				+    uint8_t       flags;   /* bits 0-1: DPL; bit 2: clear event enable?     */
			
 
				+    uint16_t      cs;      /* code selector                                 */
			
 
				+    unsigned long address; /* code offset                                   */
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(trap_info);
			
 
				+
			
 
				+struct cpu_user_regs {
			
 
				+    uint32_t ebx;
			
 
				+    uint32_t ecx;
			
 
				+    uint32_t edx;
			
 
				+    uint32_t esi;
			
 
				+    uint32_t edi;
			
 
				+    uint32_t ebp;
			
 
				+    uint32_t eax;
			
 
				+    uint16_t error_code;    /* private */
			
 
				+    uint16_t entry_vector;  /* private */
			
 
				+    uint32_t eip;
			
 
				+    uint16_t cs;
			
 
				+    uint8_t  saved_upcall_mask;
			
 
				+    uint8_t  _pad0;
			
 
				+    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
			
 
				+    uint32_t esp;
			
 
				+    uint16_t ss, _pad1;
			
 
				+    uint16_t es, _pad2;
			
 
				+    uint16_t ds, _pad3;
			
 
				+    uint16_t fs, _pad4;
			
 
				+    uint16_t gs, _pad5;
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
			
 
				+
			
 
				+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
			
 
				+
			
 
				+/*
			
 
				+ * The following is all CPU context. Note that the fpu_ctxt block is filled
			
 
				+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
			
 
				+ */
			
 
				+struct vcpu_guest_context {
			
 
				+    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
			
 
				+    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
			
 
				+#define VGCF_I387_VALID (1<<0)
			
 
				+#define VGCF_HVM_GUEST  (1<<1)
			
 
				+#define VGCF_IN_KERNEL  (1<<2)
			
 
				+    unsigned long flags;                    /* VGCF_* flags                 */
			
 
				+    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
			
 
				+    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
			
 
				+    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
			
 
				+    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
			
 
				+    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
			
 
				+    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
			
 
				+    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
			
 
				+    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
			
 
				+    unsigned long event_callback_eip;
			
 
				+    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
			
 
				+    unsigned long failsafe_callback_eip;
			
 
				+    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
			
 
				+
			
 
				+struct arch_shared_info {
			
 
				+    unsigned long max_pfn;                  /* max pfn that appears in table */
			
 
				+    /* Frame containing list of mfns containing list of mfns containing p2m. */
			
 
				+    unsigned long pfn_to_mfn_frame_list_list;
			
 
				+    unsigned long nmi_reason;
			
 
				+};
			
 
				+
			
 
				+struct arch_vcpu_info {
			
 
				+    unsigned long cr2;
			
 
				+    unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
			
 
				+};
			
 
				+
			
 
				+#endif /* !__ASSEMBLY__ */
			
 
				+
			
 
				+/*
			
 
				+ * Prefix forces emulation of some non-trapping instructions.
			
 
				+ * Currently only CPUID.
			
 
				+ */
			
 
				+#ifdef __ASSEMBLY__
			
 
				+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
			
 
				+#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
			
 
				+#else
			
 
				+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
			
 
				+#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
			
 
				+#endif
			
 
				+
			
 
				+#endif
			
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -38,17 +38,25 @@
 
				  * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
			
 
				  *      ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
			
 
				  */
			
 
				-#define ELFNOTE(name, type, desctype, descdata)	\
			
 
				-.pushsection .note.name, "",@note	;	\
			
 
				-  .align 4				;	\
			
 
				+#define ELFNOTE_START(name, type, flags)	\
			
 
				+.pushsection .note.name, flags,@note	;	\
			
 
				+  .balign 4				;	\
			
 
				   .long 2f - 1f		/* namesz */	;	\
			
 
				-  .long 4f - 3f		/* descsz */	;	\
			
 
				+  .long 4484f - 3f	/* descsz */	;	\
			
 
				   .long type				;	\
			
 
				 1:.asciz #name				;	\
			
 
				-2:.align 4				;	\
			
 
				-3:desctype descdata			;	\
			
 
				-4:.align 4				;	\
			
 
				+2:.balign 4				;	\
			
 
				+3:
			
 
				+
			
 
				+#define ELFNOTE_END				\
			
 
				+4484:.balign 4				;	\
			
 
				 .popsection				;
			
 
				+
			
 
				+#define ELFNOTE(name, type, desc)		\
			
 
				+	ELFNOTE_START(name, type, "")		\
			
 
				+		desc			;	\
			
 
				+	ELFNOTE_END
			
 
				+
			
 
				 #else	/* !__ASSEMBLER__ */
			
 
				 #include <linux/elf.h>
			
 
				 /*
			
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -36,13 +36,57 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
 
				 #define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
			
 
				 
			
 
				 struct key;
			
 
				-extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[],
			
 
				-				    struct key *session_keyring, int wait);
			
 
				+struct file;
			
 
				+struct subprocess_info;
			
 
				+
			
 
				+/* Allocate a subprocess_info structure */
			
 
				+struct subprocess_info *call_usermodehelper_setup(char *path,
			
 
				+						  char **argv, char **envp);
			
 
				+
			
 
				+/* Set various pieces of state into the subprocess_info structure */
			
 
				+void call_usermodehelper_setkeys(struct subprocess_info *info,
			
 
				+				 struct key *session_keyring);
			
 
				+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
			
 
				+				  struct file **filp);
			
 
				+void call_usermodehelper_setcleanup(struct subprocess_info *info,
			
 
				+				    void (*cleanup)(char **argv, char **envp));
			
 
				+
			
 
				+enum umh_wait {
			
 
				+	UMH_NO_WAIT = -1,	/* don't wait at all */
			
 
				+	UMH_WAIT_EXEC = 0,	/* wait for the exec, but not the process */
			
 
				+	UMH_WAIT_PROC = 1,	/* wait for the process to complete */
			
 
				+};
			
 
				+
			
 
				+/* Actually execute the sub-process */
			
 
				+int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
			
 
				+
			
 
				+/* Free the subprocess_info. This is only needed if you're not going
			
 
				+   to call call_usermodehelper_exec */
			
 
				+void call_usermodehelper_freeinfo(struct subprocess_info *info);
			
 
				 
			
 
				 static inline int
			
 
				-call_usermodehelper(char *path, char **argv, char **envp, int wait)
			
 
				+call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
			
 
				 {
			
 
				-	return call_usermodehelper_keys(path, argv, envp, NULL, wait);
			
 
				+	struct subprocess_info *info;
			
 
				+
			
 
				+	info = call_usermodehelper_setup(path, argv, envp);
			
 
				+	if (info == NULL)
			
 
				+		return -ENOMEM;
			
 
				+	return call_usermodehelper_exec(info, wait);
			
 
				+}
			
 
				+
			
 
				+static inline int
			
 
				+call_usermodehelper_keys(char *path, char **argv, char **envp,
			
 
				+			 struct key *session_keyring, enum umh_wait wait)
			
 
				+{
			
 
				+	struct subprocess_info *info;
			
 
				+
			
 
				+	info = call_usermodehelper_setup(path, argv, envp);
			
 
				+	if (info == NULL)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	call_usermodehelper_setkeys(info, session_keyring);
			
 
				+	return call_usermodehelper_exec(info, wait);
			
 
				 }
			
 
				 
			
 
				 extern void usermodehelper_init(void);
			
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -158,6 +158,8 @@
 
				 #define VXSPEC_MAJOR		200	/* VERITAS volume config driver */
			
 
				 #define VXDMP_MAJOR		201	/* VERITAS volume multipath driver */
			
 
				 
			
 
				+#define XENVBD_MAJOR		202	/* Xen virtual block device */
			
 
				+
			
 
				 #define MSR_MAJOR		202
			
 
				 #define CPUID_MAJOR		203
			
 
				 
			
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -92,6 +92,7 @@
 
				 
			
 
				 /* PG_owner_priv_1 users should have descriptive aliases */
			
 
				 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
			
 
				+#define PG_pinned		PG_owner_priv_1	/* Xen pinned pagetable */
			
 
				 
			
 
				 #if (BITS_PER_LONG > 32)
			
 
				 /*
			
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
 
				 #define SetPageChecked(page)	set_bit(PG_checked, &(page)->flags)
			
 
				 #define ClearPageChecked(page)	clear_bit(PG_checked, &(page)->flags)
			
 
				 
			
 
				+#define PagePinned(page)	test_bit(PG_pinned, &(page)->flags)
			
 
				+#define SetPagePinned(page)	set_bit(PG_pinned, &(page)->flags)
			
 
				+#define ClearPagePinned(page)	clear_bit(PG_pinned, &(page)->flags)
			
 
				+
			
 
				 #define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
			
 
				 #define SetPageReserved(page)	set_bit(PG_reserved, &(page)->flags)
			
 
				 #define ClearPageReserved(page)	clear_bit(PG_reserved, &(page)->flags)
			
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -67,6 +67,11 @@ extern void kernel_power_off(void);
 
				 
			
 
				 void ctrl_alt_del(void);
			
 
				 
			
 
				+#define POWEROFF_CMD_PATH_LEN	256
			
 
				+extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
			
 
				+
			
 
				+extern int orderly_poweroff(bool force);
			
 
				+
			
 
				 /*
			
 
				  * Emergency restart, callable from an interrupt handler.
			
 
				  */
			
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -105,8 +105,12 @@ extern void * memchr(const void *,int,__kernel_size_t);
 
				 #endif
			
 
				 
			
 
				 extern char *kstrdup(const char *s, gfp_t gfp);
			
 
				+extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
			
 
				 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
			
 
				 
			
 
				+extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
			
 
				+extern void argv_free(char **argv);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 
				 			struct page ***pages);
			
 
				 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
			
 
				 
			
 
				+/* Allocate/destroy a 'vmalloc' VM area. */
			
 
				+extern struct vm_struct *alloc_vm_area(size_t size);
			
 
				+extern void free_vm_area(struct vm_struct *area);
			
 
				+
			
 
				 /*
			
 
				  *	Internals.  Dont't use..
			
 
				  */
			
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -0,0 +1,48 @@
 
				+#ifndef _XEN_EVENTS_H
			
 
				+#define _XEN_EVENTS_H
			
 
				+
			
 
				+#include <linux/interrupt.h>
			
 
				+
			
 
				+#include <xen/interface/event_channel.h>
			
 
				+#include <asm/xen/hypercall.h>
			
 
				+
			
 
				+enum ipi_vector {
			
 
				+	XEN_RESCHEDULE_VECTOR,
			
 
				+	XEN_CALL_FUNCTION_VECTOR,
			
 
				+
			
 
				+	XEN_NR_IPIS,
			
 
				+};
			
 
				+
			
 
				+int bind_evtchn_to_irq(unsigned int evtchn);
			
 
				+int bind_evtchn_to_irqhandler(unsigned int evtchn,
			
 
				+			      irq_handler_t handler,
			
 
				+			      unsigned long irqflags, const char *devname,
			
 
				+			      void *dev_id);
			
 
				+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
			
 
				+			    irq_handler_t handler,
			
 
				+			    unsigned long irqflags, const char *devname,
			
 
				+			    void *dev_id);
			
 
				+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
			
 
				+			   unsigned int cpu,
			
 
				+			   irq_handler_t handler,
			
 
				+			   unsigned long irqflags,
			
 
				+			   const char *devname,
			
 
				+			   void *dev_id);
			
 
				+
			
 
				+/*
			
 
				+ * Common unbind function for all event sources. Takes IRQ to unbind from.
			
 
				+ * Automatically closes the underlying event channel (even for bindings
			
 
				+ * made with bind_evtchn_to_irqhandler()).
			
 
				+ */
			
 
				+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
			
 
				+
			
 
				+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
			
 
				+
			
 
				+static inline void notify_remote_via_evtchn(int port)
			
 
				+{
			
 
				+	struct evtchn_send send = { .port = port };
			
 
				+	(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
			
 
				+}
			
 
				+
			
 
				+extern void notify_remote_via_irq(int irq);
			
 
				+#endif	/* _XEN_EVENTS_H */
			
--- a/include/xen/features.h
+++ b/include/xen/features.h
@@ -0,0 +1,23 @@
 
				+/******************************************************************************
			
 
				+ * features.h
			
 
				+ *
			
 
				+ * Query the features reported by Xen.
			
 
				+ *
			
 
				+ * Copyright (c) 2006, Ian Campbell
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_FEATURES_H__
			
 
				+#define __XEN_FEATURES_H__
			
 
				+
			
 
				+#include <xen/interface/features.h>
			
 
				+
			
 
				+void xen_setup_features(void);
			
 
				+
			
 
				+extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
			
 
				+
			
 
				+static inline int xen_feature(int flag)
			
 
				+{
			
 
				+	return xen_features[flag];
			
 
				+}
			
 
				+
			
 
				+#endif /* __XEN_FEATURES_H__ */
			
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -0,0 +1,107 @@
 
				+/******************************************************************************
			
 
				+ * grant_table.h
			
 
				+ *
			
 
				+ * Two sets of functionality:
			
 
				+ * 1. Granting foreign access to our memory reservation.
			
 
				+ * 2. Accessing others' memory reservations via grant references.
			
 
				+ * (i.e., mechanisms for both sender and recipient of grant references)
			
 
				+ *
			
 
				+ * Copyright (c) 2004-2005, K A Fraser
			
 
				+ * Copyright (c) 2005, Christopher Clark
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License version 2
			
 
				+ * as published by the Free Software Foundation; or, when distributed
			
 
				+ * separately from the Linux kernel or incorporated into other
			
 
				+ * software packages, subject to the following license:
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this source file (the "Software"), to deal in the Software without
			
 
				+ * restriction, including without limitation the rights to use, copy, modify,
			
 
				+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
			
 
				+ * and to permit persons to whom the Software is furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
			
 
				+ * IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __ASM_GNTTAB_H__
			
 
				+#define __ASM_GNTTAB_H__
			
 
				+
			
 
				+#include <asm/xen/hypervisor.h>
			
 
				+#include <xen/interface/grant_table.h>
			
 
				+
			
 
				+/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
			
 
				+#define NR_GRANT_FRAMES 4
			
 
				+
			
 
				+struct gnttab_free_callback {
			
 
				+	struct gnttab_free_callback *next;
			
 
				+	void (*fn)(void *);
			
 
				+	void *arg;
			
 
				+	u16 count;
			
 
				+};
			
 
				+
			
 
				+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
			
 
				+				int readonly);
			
 
				+
			
 
				+/*
			
 
				+ * End access through the given grant reference, iff the grant entry is no
			
 
				+ * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
			
 
				+ * use.
			
 
				+ */
			
 
				+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
			
 
				+
			
 
				+/*
			
 
				+ * Eventually end access through the given grant reference, and once that
			
 
				+ * access has been ended, free the given page too.  Access will be ended
			
 
				+ * immediately iff the grant entry is not in use, otherwise it will happen
			
 
				+ * some time later.  page may be 0, in which case no freeing will occur.
			
 
				+ */
			
 
				+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
			
 
				+			       unsigned long page);
			
 
				+
			
 
				+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
			
 
				+
			
 
				+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
			
 
				+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
			
 
				+
			
 
				+int gnttab_query_foreign_access(grant_ref_t ref);
			
 
				+
			
 
				+/*
			
 
				+ * operations on reserved batches of grant references
			
 
				+ */
			
 
				+int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
			
 
				+
			
 
				+void gnttab_free_grant_reference(grant_ref_t ref);
			
 
				+
			
 
				+void gnttab_free_grant_references(grant_ref_t head);
			
 
				+
			
 
				+int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
			
 
				+
			
 
				+int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
			
 
				+
			
 
				+void gnttab_release_grant_reference(grant_ref_t *private_head,
			
 
				+				    grant_ref_t release);
			
 
				+
			
 
				+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
			
 
				+				  void (*fn)(void *), void *arg, u16 count);
			
 
				+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
			
 
				+
			
 
				+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
			
 
				+				     unsigned long frame, int readonly);
			
 
				+
			
 
				+void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
			
 
				+				       unsigned long pfn);
			
 
				+
			
 
				+#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
			
 
				+
			
 
				+#endif /* __ASM_GNTTAB_H__ */
			
--- a/include/xen/hvc-console.h
+++ b/include/xen/hvc-console.h
@@ -0,0 +1,6 @@
 
				+#ifndef XEN_HVC_CONSOLE_H
			
 
				+#define XEN_HVC_CONSOLE_H
			
 
				+
			
 
				+extern struct console xenboot_console;
			
 
				+
			
 
				+#endif	/* XEN_HVC_CONSOLE_H */
			
--- a/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@ -0,0 +1,133 @@
 
				+/******************************************************************************
			
 
				+ * elfnote.h
			
 
				+ *
			
 
				+ * Definitions used for the Xen ELF notes.
			
 
				+ *
			
 
				+ * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_ELFNOTE_H__
			
 
				+#define __XEN_PUBLIC_ELFNOTE_H__
			
 
				+
			
 
				+/*
			
 
				+ * The notes should live in a SHT_NOTE segment and have "Xen" in the
			
 
				+ * name field.
			
 
				+ *
			
 
				+ * Numeric types are either 4 or 8 bytes depending on the content of
			
 
				+ * the desc field.
			
 
				+ *
			
 
				+ * LEGACY indicates the fields in the legacy __xen_guest string which
			
 
				+ * this note type replaces.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * NAME=VALUE pair (string).
			
 
				+ *
			
 
				+ * LEGACY: FEATURES and PAE
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_INFO           0
			
 
				+
			
 
				+/*
			
 
				+ * The virtual address of the entry point (numeric).
			
 
				+ *
			
 
				+ * LEGACY: VIRT_ENTRY
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_ENTRY          1
			
 
				+
			
 
				+/* The virtual address of the hypercall transfer page (numeric).
			
 
				+ *
			
 
				+ * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
			
 
				+ * number not a virtual address)
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_HYPERCALL_PAGE 2
			
 
				+
			
 
				+/* The virtual address where the kernel image should be mapped (numeric).
			
 
				+ *
			
 
				+ * Defaults to 0.
			
 
				+ *
			
 
				+ * LEGACY: VIRT_BASE
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_VIRT_BASE      3
			
 
				+
			
 
				+/*
			
 
				+ * The offset of the ELF paddr field from the actual required
			
 
				+ * pseudo-physical address (numeric).
			
 
				+ *
			
 
				+ * This is used to maintain backwards compatibility with older kernels
			
 
				+ * which wrote __PAGE_OFFSET into that field. This field defaults to 0
			
 
				+ * if not present.
			
 
				+ *
			
 
				+ * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_PADDR_OFFSET   4
			
 
				+
			
 
				+/*
			
 
				+ * The version of Xen that we work with (string).
			
 
				+ *
			
 
				+ * LEGACY: XEN_VER
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_XEN_VERSION    5
			
 
				+
			
 
				+/*
			
 
				+ * The name of the guest operating system (string).
			
 
				+ *
			
 
				+ * LEGACY: GUEST_OS
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_GUEST_OS       6
			
 
				+
			
 
				+/*
			
 
				+ * The version of the guest operating system (string).
			
 
				+ *
			
 
				+ * LEGACY: GUEST_VER
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_GUEST_VERSION  7
			
 
				+
			
 
				+/*
			
 
				+ * The loader type (string).
			
 
				+ *
			
 
				+ * LEGACY: LOADER
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_LOADER         8
			
 
				+
			
 
				+/*
			
 
				+ * The kernel supports PAE (x86/32 only, string = "yes" or "no").
			
 
				+ *
			
 
				+ * LEGACY: PAE (n.b. The legacy interface included a provision to
			
 
				+ * indicate 'extended-cr3' support allowing L3 page tables to be
			
 
				+ * placed above 4G. It is assumed that any kernel new enough to use
			
 
				+ * these ELF notes will include this and therefore "yes" here is
			
 
				+ * equivalent to "yes[extended-cr3]" in the __xen_guest interface).
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_PAE_MODE       9
			
 
				+
			
 
				+/*
			
 
				+ * The features supported/required by this kernel (string).
			
 
				+ *
			
 
				+ * The string must consist of a list of feature names (as given in
			
 
				+ * features.h, without the "XENFEAT_" prefix) separated by '|'
			
 
				+ * characters. If a feature is required for the kernel to function
			
 
				+ * then the feature name must be preceded by a '!' character.
			
 
				+ *
			
 
				+ * LEGACY: FEATURES
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_FEATURES      10
			
 
				+
			
 
				+/*
			
 
				+ * The kernel requires the symbol table to be loaded (string = "yes" or "no")
			
 
				+ * LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence
			
 
				+ * of this string as a boolean flag rather than requiring "yes" or
			
 
				+ * "no").
			
 
				+ */
			
 
				+#define XEN_ELFNOTE_BSD_SYMTAB    11
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
			
 
				+
			
 
				+/*
			
 
				+ * Local variables:
			
 
				+ * mode: C
			
 
				+ * c-set-style: "BSD"
			
 
				+ * c-basic-offset: 4
			
 
				+ * tab-width: 4
			
 
				+ * indent-tabs-mode: nil
			
 
				+ * End:
			
 
				+ */
			
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -0,0 +1,195 @@
 
				+/******************************************************************************
			
 
				+ * event_channel.h
			
 
				+ *
			
 
				+ * Event channels between domains.
			
 
				+ *
			
 
				+ * Copyright (c) 2003-2004, K A Fraser.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
			
 
				+#define __XEN_PUBLIC_EVENT_CHANNEL_H__
			
 
				+
			
 
				+typedef uint32_t evtchn_port_t;
			
 
				+DEFINE_GUEST_HANDLE(evtchn_port_t);
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
			
 
				+ * accepting interdomain bindings from domain <remote_dom>. A fresh port
			
 
				+ * is allocated in <dom> and returned as <port>.
			
 
				+ * NOTES:
			
 
				+ *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
			
 
				+ *  2. <rdom> may be DOMID_SELF, allowing loopback connections.
			
 
				+ */
			
 
				+#define EVTCHNOP_alloc_unbound	  6
			
 
				+struct evtchn_alloc_unbound {
			
 
				+	/* IN parameters */
			
 
				+	domid_t dom, remote_dom;
			
 
				+	/* OUT parameters */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
			
 
				+ * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
			
 
				+ * a port that is unbound and marked as accepting bindings from the calling
			
 
				+ * domain. A fresh port is allocated in the calling domain and returned as
			
 
				+ * <local_port>.
			
 
				+ * NOTES:
			
 
				+ *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
			
 
				+ */
			
 
				+#define EVTCHNOP_bind_interdomain 0
			
 
				+struct evtchn_bind_interdomain {
			
 
				+	/* IN parameters. */
			
 
				+	domid_t remote_dom;
			
 
				+	evtchn_port_t remote_port;
			
 
				+	/* OUT parameters. */
			
 
				+	evtchn_port_t local_port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
			
 
				+ * vcpu.
			
 
				+ * NOTES:
			
 
				+ *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
			
 
				+ *  2. The allocated event channel is bound to the specified vcpu. The binding
			
 
				+ *     may not be changed.
			
 
				+ */
			
 
				+#define EVTCHNOP_bind_virq	  1
			
 
				+struct evtchn_bind_virq {
			
 
				+	/* IN parameters. */
			
 
				+	uint32_t virq;
			
 
				+	uint32_t vcpu;
			
 
				+	/* OUT parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
			
 
				+ * NOTES:
			
 
				+ *  1. A physical IRQ may be bound to at most one event channel per domain.
			
 
				+ *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
			
 
				+ */
			
 
				+#define EVTCHNOP_bind_pirq	  2
			
 
				+struct evtchn_bind_pirq {
			
 
				+	/* IN parameters. */
			
 
				+	uint32_t pirq;
			
 
				+#define BIND_PIRQ__WILL_SHARE 1
			
 
				+	uint32_t flags; /* BIND_PIRQ__* */
			
 
				+	/* OUT parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
			
 
				+ * NOTES:
			
 
				+ *  1. The allocated event channel is bound to the specified vcpu. The binding
			
 
				+ *     may not be changed.
			
 
				+ */
			
 
				+#define EVTCHNOP_bind_ipi	  7
			
 
				+struct evtchn_bind_ipi {
			
 
				+	uint32_t vcpu;
			
 
				+	/* OUT parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_close: Close a local event channel <port>. If the channel is
			
 
				+ * interdomain then the remote end is placed in the unbound state
			
 
				+ * (EVTCHNSTAT_unbound), awaiting a new connection.
			
 
				+ */
			
 
				+#define EVTCHNOP_close		  3
			
 
				+struct evtchn_close {
			
 
				+	/* IN parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_send: Send an event to the remote end of the channel whose local
			
 
				+ * endpoint is <port>.
			
 
				+ */
			
 
				+#define EVTCHNOP_send		  4
			
 
				+struct evtchn_send {
			
 
				+	/* IN parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_status: Get the current status of the communication channel which
			
 
				+ * has an endpoint at <dom, port>.
			
 
				+ * NOTES:
			
 
				+ *  1. <dom> may be specified as DOMID_SELF.
			
 
				+ *  2. Only a sufficiently-privileged domain may obtain the status of an event
			
 
				+ *     channel for which <dom> is not DOMID_SELF.
			
 
				+ */
			
 
				+#define EVTCHNOP_status		  5
			
 
				+struct evtchn_status {
			
 
				+	/* IN parameters */
			
 
				+	domid_t  dom;
			
 
				+	evtchn_port_t port;
			
 
				+	/* OUT parameters */
			
 
				+#define EVTCHNSTAT_closed	0  /* Channel is not in use.		     */
			
 
				+#define EVTCHNSTAT_unbound	1  /* Channel is waiting interdom connection.*/
			
 
				+#define EVTCHNSTAT_interdomain	2  /* Channel is connected to remote domain. */
			
 
				+#define EVTCHNSTAT_pirq		3  /* Channel is bound to a phys IRQ line.   */
			
 
				+#define EVTCHNSTAT_virq		4  /* Channel is bound to a virtual IRQ line */
			
 
				+#define EVTCHNSTAT_ipi		5  /* Channel is bound to a virtual IPI line */
			
 
				+	uint32_t status;
			
 
				+	uint32_t vcpu;		   /* VCPU to which this channel is bound.   */
			
 
				+	union {
			
 
				+		struct {
			
 
				+			domid_t dom;
			
 
				+		} unbound; /* EVTCHNSTAT_unbound */
			
 
				+		struct {
			
 
				+			domid_t dom;
			
 
				+			evtchn_port_t port;
			
 
				+		} interdomain; /* EVTCHNSTAT_interdomain */
			
 
				+		uint32_t pirq;	    /* EVTCHNSTAT_pirq	      */
			
 
				+		uint32_t virq;	    /* EVTCHNSTAT_virq	      */
			
 
				+	} u;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
			
 
				+ * event is pending.
			
 
				+ * NOTES:
			
 
				+ *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
			
 
				+ *     the binding. This binding cannot be changed.
			
 
				+ *  2. All other channels notify vcpu0 by default. This default is set when
			
 
				+ *     the channel is allocated (a port that is freed and subsequently reused
			
 
				+ *     has its binding reset to vcpu0).
			
 
				+ */
			
 
				+#define EVTCHNOP_bind_vcpu	  8
			
 
				+struct evtchn_bind_vcpu {
			
 
				+	/* IN parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+	uint32_t vcpu;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
			
 
				+ * a notification to the appropriate VCPU if an event is pending.
			
 
				+ */
			
 
				+#define EVTCHNOP_unmask		  9
			
 
				+struct evtchn_unmask {
			
 
				+	/* IN parameters. */
			
 
				+	evtchn_port_t port;
			
 
				+};
			
 
				+
			
 
				+struct evtchn_op {
			
 
				+	uint32_t cmd; /* EVTCHNOP_* */
			
 
				+	union {
			
 
				+		struct evtchn_alloc_unbound    alloc_unbound;
			
 
				+		struct evtchn_bind_interdomain bind_interdomain;
			
 
				+		struct evtchn_bind_virq	       bind_virq;
			
 
				+		struct evtchn_bind_pirq	       bind_pirq;
			
 
				+		struct evtchn_bind_ipi	       bind_ipi;
			
 
				+		struct evtchn_close	       close;
			
 
				+		struct evtchn_send	       send;
			
 
				+		struct evtchn_status	       status;
			
 
				+		struct evtchn_bind_vcpu	       bind_vcpu;
			
 
				+		struct evtchn_unmask	       unmask;
			
 
				+	} u;
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
			
--- a/include/xen/interface/features.h
+++ b/include/xen/interface/features.h
@@ -0,0 +1,43 @@
 
				+/******************************************************************************
			
 
				+ * features.h
			
 
				+ *
			
 
				+ * Feature flags, reported by XENVER_get_features.
			
 
				+ *
			
 
				+ * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_FEATURES_H__
			
 
				+#define __XEN_PUBLIC_FEATURES_H__
			
 
				+
			
 
				+/*
			
 
				+ * If set, the guest does not need to write-protect its pagetables, and can
			
 
				+ * update them via direct writes.
			
 
				+ */
			
 
				+#define XENFEAT_writable_page_tables       0
			
 
				+
			
 
				+/*
			
 
				+ * If set, the guest does not need to write-protect its segment descriptor
			
 
				+ * tables, and can update them via direct writes.
			
 
				+ */
			
 
				+#define XENFEAT_writable_descriptor_tables 1
			
 
				+
			
 
				+/*
			
 
				+ * If set, translation between the guest's 'pseudo-physical' address space
			
 
				+ * and the host's machine address space are handled by the hypervisor. In this
			
 
				+ * mode the guest does not need to perform phys-to/from-machine translations
			
 
				+ * when performing page table operations.
			
 
				+ */
			
 
				+#define XENFEAT_auto_translated_physmap    2
			
 
				+
			
 
				+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
			
 
				+#define XENFEAT_supervisor_mode_kernel     3
			
 
				+
			
 
				+/*
			
 
				+ * If set, the guest does not need to allocate x86 PAE page directories
			
 
				+ * below 4GB. This flag is usually implied by auto_translated_physmap.
			
 
				+ */
			
 
				+#define XENFEAT_pae_pgdir_above_4gb        4
			
 
				+
			
 
				+#define XENFEAT_NR_SUBMAPS 1
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_FEATURES_H__ */
			
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -0,0 +1,375 @@
 
				+/******************************************************************************
			
 
				+ * grant_table.h
			
 
				+ *
			
 
				+ * Interface for granting foreign access to page frames, and receiving
			
 
				+ * page-ownership transfers.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this software and associated documentation files (the "Software"), to
			
 
				+ * deal in the Software without restriction, including without limitation the
			
 
				+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
			
 
				+ * sell copies of the Software, and to permit persons to whom the Software is
			
 
				+ * furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
			
 
				+ * DEALINGS IN THE SOFTWARE.
			
 
				+ *
			
 
				+ * Copyright (c) 2004, K A Fraser
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
			
 
				+#define __XEN_PUBLIC_GRANT_TABLE_H__
			
 
				+
			
 
				+
			
 
				+/***********************************
			
 
				+ * GRANT TABLE REPRESENTATION
			
 
				+ */
			
 
				+
			
 
				+/* Some rough guidelines on accessing and updating grant-table entries
			
 
				+ * in a concurrency-safe manner. For more information, Linux contains a
			
 
				+ * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
			
 
				+ *
			
 
				+ * NB. WMB is a no-op on current-generation x86 processors. However, a
			
 
				+ *     compiler barrier will still be required.
			
 
				+ *
			
 
				+ * Introducing a valid entry into the grant table:
			
 
				+ *  1. Write ent->domid.
			
 
				+ *  2. Write ent->frame:
			
 
				+ *      GTF_permit_access:   Frame to which access is permitted.
			
 
				+ *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
			
 
				+ *                           frame, or zero if none.
			
 
				+ *  3. Write memory barrier (WMB).
			
 
				+ *  4. Write ent->flags, inc. valid type.
			
 
				+ *
			
 
				+ * Invalidating an unused GTF_permit_access entry:
			
 
				+ *  1. flags = ent->flags.
			
 
				+ *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
			
 
				+ *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
			
 
				+ *  NB. No need for WMB as reuse of entry is control-dependent on success of
			
 
				+ *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
			
 
				+ *
			
 
				+ * Invalidating an in-use GTF_permit_access entry:
			
 
				+ *  This cannot be done directly. Request assistance from the domain controller
			
 
				+ *  which can set a timeout on the use of a grant entry and take necessary
			
 
				+ *  action. (NB. This is not yet implemented!).
			
 
				+ *
			
 
				+ * Invalidating an unused GTF_accept_transfer entry:
			
 
				+ *  1. flags = ent->flags.
			
 
				+ *  2. Observe that !(flags & GTF_transfer_committed). [*]
			
 
				+ *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
			
 
				+ *  NB. No need for WMB as reuse of entry is control-dependent on success of
			
 
				+ *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
			
 
				+ *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
			
 
				+ *      The guest must /not/ modify the grant entry until the address of the
			
 
				+ *      transferred frame is written. It is safe for the guest to spin waiting
			
 
				+ *      for this to occur (detect by observing GTF_transfer_completed in
			
 
				+ *      ent->flags).
			
 
				+ *
			
 
				+ * Invalidating a committed GTF_accept_transfer entry:
			
 
				+ *  1. Wait for (ent->flags & GTF_transfer_completed).
			
 
				+ *
			
 
				+ * Changing a GTF_permit_access from writable to read-only:
			
 
				+ *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
			
 
				+ *
			
 
				+ * Changing a GTF_permit_access from read-only to writable:
			
 
				+ *  Use SMP-safe bit-setting instruction.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * A grant table comprises a packed array of grant entries in one or more
			
 
				+ * page frames shared between Xen and a guest.
			
 
				+ * [XEN]: This field is written by Xen and read by the sharing guest.
			
 
				+ * [GST]: This field is written by the guest and read by Xen.
			
 
				+ */
			
 
				+struct grant_entry {
			
 
				+    /* GTF_xxx: various type and flag information.  [XEN,GST] */
			
 
				+    uint16_t flags;
			
 
				+    /* The domain being granted foreign privileges. [GST] */
			
 
				+    domid_t  domid;
			
 
				+    /*
			
 
				+     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
			
 
				+     * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
			
 
				+     */
			
 
				+    uint32_t frame;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Type of grant entry.
			
 
				+ *  GTF_invalid: This grant entry grants no privileges.
			
 
				+ *  GTF_permit_access: Allow @domid to map/access @frame.
			
 
				+ *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
			
 
				+ *                       to this guest. Xen writes the page number to @frame.
			
 
				+ */
			
 
				+#define GTF_invalid         (0U<<0)
			
 
				+#define GTF_permit_access   (1U<<0)
			
 
				+#define GTF_accept_transfer (2U<<0)
			
 
				+#define GTF_type_mask       (3U<<0)
			
 
				+
			
 
				+/*
			
 
				+ * Subflags for GTF_permit_access.
			
 
				+ *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
			
 
				+ *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
			
 
				+ *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
			
 
				+ */
			
 
				+#define _GTF_readonly       (2)
			
 
				+#define GTF_readonly        (1U<<_GTF_readonly)
			
 
				+#define _GTF_reading        (3)
			
 
				+#define GTF_reading         (1U<<_GTF_reading)
			
 
				+#define _GTF_writing        (4)
			
 
				+#define GTF_writing         (1U<<_GTF_writing)
			
 
				+
			
 
				+/*
			
 
				+ * Subflags for GTF_accept_transfer:
			
 
				+ *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
			
 
				+ *      to transferring ownership of a page frame. When a guest sees this flag
			
 
				+ *      it must /not/ modify the grant entry until GTF_transfer_completed is
			
 
				+ *      set by Xen.
			
 
				+ *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
			
 
				+ *      after reading GTF_transfer_committed. Xen will always write the frame
			
 
				+ *      address, followed by ORing this flag, in a timely manner.
			
 
				+ */
			
 
				+#define _GTF_transfer_committed (2)
			
 
				+#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
			
 
				+#define _GTF_transfer_completed (3)
			
 
				+#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
			
 
				+
			
 
				+
			
 
				+/***********************************
			
 
				+ * GRANT TABLE QUERIES AND USES
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Reference to a grant entry in a specified domain's grant table.
			
 
				+ */
			
 
				+typedef uint32_t grant_ref_t;
			
 
				+
			
 
				+/*
			
 
				+ * Handle to track a mapping created via a grant reference.
			
 
				+ */
			
 
				+typedef uint32_t grant_handle_t;
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
			
 
				+ * by devices and/or host CPUs. If successful, <handle> is a tracking number
			
 
				+ * that must be presented later to destroy the mapping(s). On error, <handle>
			
 
				+ * is a negative status code.
			
 
				+ * NOTES:
			
 
				+ *  1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
			
 
				+ *     via which I/O devices may access the granted frame.
			
 
				+ *  2. If GNTMAP_host_map is specified then a mapping will be added at
			
 
				+ *     either a host virtual address in the current address space, or at
			
 
				+ *     a PTE at the specified machine address.  The type of mapping to
			
 
				+ *     perform is selected through the GNTMAP_contains_pte flag, and the
			
 
				+ *     address is specified in <host_addr>.
			
 
				+ *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
			
 
				+ *     host mapping is destroyed by other means then it is *NOT* guaranteed
			
 
				+ *     to be accounted to the correct grant reference!
			
 
				+ */
			
 
				+#define GNTTABOP_map_grant_ref        0
			
 
				+struct gnttab_map_grant_ref {
			
 
				+    /* IN parameters. */
			
 
				+    uint64_t host_addr;
			
 
				+    uint32_t flags;               /* GNTMAP_* */
			
 
				+    grant_ref_t ref;
			
 
				+    domid_t  dom;
			
 
				+    /* OUT parameters. */
			
 
				+    int16_t  status;              /* GNTST_* */
			
 
				+    grant_handle_t handle;
			
 
				+    uint64_t dev_bus_addr;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
			
 
				+ * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
			
 
				+ * field is ignored. If non-zero, they must refer to a device/host mapping
			
 
				+ * that is tracked by <handle>
			
 
				+ * NOTES:
			
 
				+ *  1. The call may fail in an undefined manner if either mapping is not
			
 
				+ *     tracked by <handle>.
			
 
				+ *  2. After executing a batch of unmaps, it is guaranteed that no stale
			
 
				+ *     mappings will remain in the device or host TLBs.
			
 
				+ */
			
 
				+#define GNTTABOP_unmap_grant_ref      1
			
 
				+struct gnttab_unmap_grant_ref {
			
 
				+    /* IN parameters. */
			
 
				+    uint64_t host_addr;
			
 
				+    uint64_t dev_bus_addr;
			
 
				+    grant_handle_t handle;
			
 
				+    /* OUT parameters. */
			
 
				+    int16_t  status;              /* GNTST_* */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
			
 
				+ * <nr_frames> pages. The frame addresses are written to the <frame_list>.
			
 
				+ * Only <nr_frames> addresses are written, even if the table is larger.
			
 
				+ * NOTES:
			
 
				+ *  1. <dom> may be specified as DOMID_SELF.
			
 
				+ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
			
 
				+ *  3. Xen may not support more than a single grant-table page per domain.
			
 
				+ */
			
 
				+#define GNTTABOP_setup_table          2
			
 
				+struct gnttab_setup_table {
			
 
				+    /* IN parameters. */
			
 
				+    domid_t  dom;
			
 
				+    uint32_t nr_frames;
			
 
				+    /* OUT parameters. */
			
 
				+    int16_t  status;              /* GNTST_* */
			
 
				+    ulong *frame_list;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_dump_table: Dump the contents of the grant table to the
			
 
				+ * xen console. Debugging use only.
			
 
				+ */
			
 
				+#define GNTTABOP_dump_table           3
			
 
				+struct gnttab_dump_table {
			
 
				+    /* IN parameters. */
			
 
				+    domid_t dom;
			
 
				+    /* OUT parameters. */
			
 
				+    int16_t status;               /* GNTST_* */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_transfer: Transfer <frame> to a foreign domain. The
			
 
				+ * foreign domain has previously registered its interest in the transfer via
			
 
				+ * <domid, ref>.
			
 
				+ *
			
 
				+ * Note that, even if the transfer fails, the specified page no longer belongs
			
 
				+ * to the calling domain *unless* the error is GNTST_bad_page.
			
 
				+ */
			
 
				+#define GNTTABOP_transfer                4
			
 
				+struct gnttab_transfer {
			
 
				+    /* IN parameters. */
			
 
				+    unsigned long mfn;
			
 
				+    domid_t       domid;
			
 
				+    grant_ref_t   ref;
			
 
				+    /* OUT parameters. */
			
 
				+    int16_t       status;
			
 
				+};
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_copy: Hypervisor based copy
			
 
				+ * source and destinations can be either MFNs or, for foreign domains,
			
 
				+ * grant references. The foreign domain has to grant read/write access
			
 
				+ * in its grant table.
			
 
				+ *
			
 
				+ * The flags specify what type source and destinations are (either MFN
			
 
				+ * or grant reference).
			
 
				+ *
			
 
				+ * Note that this can also be used to copy data between two domains
			
 
				+ * via a third party if the source and destination domains had previously
			
 
				+ * granted appropriate access to their pages to the third party.
			
 
				+ *
			
 
				+ * source_offset specifies an offset in the source frame, dest_offset
			
 
				+ * the offset in the target frame and  len specifies the number of
			
 
				+ * bytes to be copied.
			
 
				+ */
			
 
				+
			
 
				+#define _GNTCOPY_source_gref      (0)
			
 
				+#define GNTCOPY_source_gref       (1<<_GNTCOPY_source_gref)
			
 
				+#define _GNTCOPY_dest_gref        (1)
			
 
				+#define GNTCOPY_dest_gref         (1<<_GNTCOPY_dest_gref)
			
 
				+
			
 
				+#define GNTTABOP_copy                 5
			
 
				+struct gnttab_copy {
			
 
				+	/* IN parameters. */
			
 
				+	struct {
			
 
				+		union {
			
 
				+			grant_ref_t ref;
			
 
				+			unsigned long   gmfn;
			
 
				+		} u;
			
 
				+		domid_t  domid;
			
 
				+		uint16_t offset;
			
 
				+	} source, dest;
			
 
				+	uint16_t      len;
			
 
				+	uint16_t      flags;          /* GNTCOPY_* */
			
 
				+	/* OUT parameters. */
			
 
				+	int16_t       status;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * GNTTABOP_query_size: Query the current and maximum sizes of the shared
			
 
				+ * grant table.
			
 
				+ * NOTES:
			
 
				+ *  1. <dom> may be specified as DOMID_SELF.
			
 
				+ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
			
 
				+ */
			
 
				+#define GNTTABOP_query_size           6
			
 
				+struct gnttab_query_size {
			
 
				+    /* IN parameters. */
			
 
				+    domid_t  dom;
			
 
				+    /* OUT parameters. */
			
 
				+    uint32_t nr_frames;
			
 
				+    uint32_t max_nr_frames;
			
 
				+    int16_t  status;              /* GNTST_* */
			
 
				+};
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Bitfield values for gnttab_map_grant_ref.flags.
			
 
				+ */
			
 
				+ /* Map the grant entry for access by I/O devices. */
			
 
				+#define _GNTMAP_device_map      (0)
			
 
				+#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
			
 
				+ /* Map the grant entry for access by host CPUs. */
			
 
				+#define _GNTMAP_host_map        (1)
			
 
				+#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
			
 
				+ /* Accesses to the granted frame will be restricted to read-only access. */
			
 
				+#define _GNTMAP_readonly        (2)
			
 
				+#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
			
 
				+ /*
			
 
				+  * GNTMAP_host_map subflag:
			
 
				+  *  0 => The host mapping is usable only by the guest OS.
			
 
				+  *  1 => The host mapping is usable by guest OS + current application.
			
 
				+  */
			
 
				+#define _GNTMAP_application_map (3)
			
 
				+#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
			
 
				+
			
 
				+ /*
			
 
				+  * GNTMAP_contains_pte subflag:
			
 
				+  *  0 => This map request contains a host virtual address.
			
 
				+  *  1 => This map request contains the machine address of the PTE to update.
			
 
				+  */
			
 
				+#define _GNTMAP_contains_pte    (4)
			
 
				+#define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
			
 
				+
			
 
				+/*
			
 
				+ * Values for error status returns. All errors are -ve.
			
 
				+ */
			
 
				+#define GNTST_okay             (0)  /* Normal return.                        */
			
 
				+#define GNTST_general_error    (-1) /* General undefined error.              */
			
 
				+#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
			
 
				+#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
			
 
				+#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
			
 
				+#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
			
 
				+#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
			
 
				+#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
			
 
				+#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
			
 
				+#define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
			
 
				+#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
			
 
				+
			
 
				+#define GNTTABOP_error_msgs {                   \
			
 
				+    "okay",                                     \
			
 
				+    "undefined error",                          \
			
 
				+    "unrecognised domain id",                   \
			
 
				+    "invalid grant reference",                  \
			
 
				+    "invalid mapping handle",                   \
			
 
				+    "invalid virtual address",                  \
			
 
				+    "invalid device address",                   \
			
 
				+    "no spare translation slot in the I/O MMU", \
			
 
				+    "permission denied",                        \
			
 
				+    "bad page",                                 \
			
 
				+    "copy arguments cross page boundary"        \
			
 
				+}
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
			
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -0,0 +1,94 @@
 
				+/******************************************************************************
			
 
				+ * blkif.h
			
 
				+ *
			
 
				+ * Unified block-device I/O interface for Xen guest OSes.
			
 
				+ *
			
 
				+ * Copyright (c) 2003-2004, Keir Fraser
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_IO_BLKIF_H__
			
 
				+#define __XEN_PUBLIC_IO_BLKIF_H__
			
 
				+
			
 
				+#include "ring.h"
			
 
				+#include "../grant_table.h"
			
 
				+
			
 
				+/*
			
 
				+ * Front->back notifications: When enqueuing a new request, sending a
			
 
				+ * notification can be made conditional on req_event (i.e., the generic
			
 
				+ * hold-off mechanism provided by the ring macros). Backends must set
			
 
				+ * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
			
 
				+ *
			
 
				+ * Back->front notifications: When enqueuing a new response, sending a
			
 
				+ * notification can be made conditional on rsp_event (i.e., the generic
			
 
				+ * hold-off mechanism provided by the ring macros). Frontends must set
			
 
				+ * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
			
 
				+ */
			
 
				+
			
 
				+typedef uint16_t blkif_vdev_t;
			
 
				+typedef uint64_t blkif_sector_t;
			
 
				+
			
 
				+/*
			
 
				+ * REQUEST CODES.
			
 
				+ */
			
 
				+#define BLKIF_OP_READ              0
			
 
				+#define BLKIF_OP_WRITE             1
			
 
				+/*
			
 
				+ * Recognised only if "feature-barrier" is present in backend xenbus info.
			
 
				+ * The "feature-barrier" node contains a boolean indicating whether barrier
			
 
				+ * requests are likely to succeed or fail. Either way, a barrier request
			
 
				+ * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
			
 
				+ * the underlying block-device hardware. The boolean simply indicates whether
			
 
				+ * or not it is worthwhile for the frontend to attempt barrier requests.
			
 
				+ * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
			
 
				+ * create the "feature-barrier" node!
			
 
				+ */
			
 
				+#define BLKIF_OP_WRITE_BARRIER     2
			
 
				+
			
 
				+/*
			
 
				+ * Maximum scatter/gather segments per request.
			
 
				+ * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
			
 
				+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
			
 
				+ */
			
 
				+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
			
 
				+
			
 
				+struct blkif_request {
			
 
				+	uint8_t        operation;    /* BLKIF_OP_???                         */
			
 
				+	uint8_t        nr_segments;  /* number of segments                   */
			
 
				+	blkif_vdev_t   handle;       /* only for read/write requests         */
			
 
				+	uint64_t       id;           /* private guest value, echoed in resp  */
			
 
				+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
			
 
				+	struct blkif_request_segment {
			
 
				+		grant_ref_t gref;        /* reference to I/O buffer frame        */
			
 
				+		/* @first_sect: first sector in frame to transfer (inclusive).   */
			
 
				+		/* @last_sect: last sector in frame to transfer (inclusive).     */
			
 
				+		uint8_t     first_sect, last_sect;
			
 
				+	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
			
 
				+};
			
 
				+
			
 
				+struct blkif_response {
			
 
				+	uint64_t        id;              /* copied from request */
			
 
				+	uint8_t         operation;       /* copied from request */
			
 
				+	int16_t         status;          /* BLKIF_RSP_???       */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * STATUS RETURN CODES.
			
 
				+ */
			
 
				+ /* Operation not supported (only happens on barrier writes). */
			
 
				+#define BLKIF_RSP_EOPNOTSUPP  -2
			
 
				+ /* Operation failed for some unspecified reason (-EIO). */
			
 
				+#define BLKIF_RSP_ERROR       -1
			
 
				+ /* Operation completed successfully. */
			
 
				+#define BLKIF_RSP_OKAY         0
			
 
				+
			
 
				+/*
			
 
				+ * Generate blkif ring structures and types.
			
 
				+ */
			
 
				+
			
 
				+DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
			
 
				+
			
 
				+#define VDISK_CDROM        0x1
			
 
				+#define VDISK_REMOVABLE    0x2
			
 
				+#define VDISK_READONLY     0x4
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
			
--- a/include/xen/interface/io/console.h
+++ b/include/xen/interface/io/console.h
@@ -0,0 +1,23 @@
 
				+/******************************************************************************
			
 
				+ * console.h
			
 
				+ *
			
 
				+ * Console I/O interface for Xen guest OSes.
			
 
				+ *
			
 
				+ * Copyright (c) 2005, Keir Fraser
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
			
 
				+#define __XEN_PUBLIC_IO_CONSOLE_H__
			
 
				+
			
 
				+typedef uint32_t XENCONS_RING_IDX;
			
 
				+
			
 
				+#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
			
 
				+
			
 
				+struct xencons_interface {
			
 
				+    char in[1024];
			
 
				+    char out[2048];
			
 
				+    XENCONS_RING_IDX in_cons, in_prod;
			
 
				+    XENCONS_RING_IDX out_cons, out_prod;
			
 
				+};
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
			
--- a/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@ -0,0 +1,158 @@
 
				+/******************************************************************************
			
 
				+ * netif.h
			
 
				+ *
			
 
				+ * Unified network-device I/O interface for Xen guest OSes.
			
 
				+ *
			
 
				+ * Copyright (c) 2003-2004, Keir Fraser
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_IO_NETIF_H__
			
 
				+#define __XEN_PUBLIC_IO_NETIF_H__
			
 
				+
			
 
				+#include "ring.h"
			
 
				+#include "../grant_table.h"
			
 
				+
			
 
				+/*
			
 
				+ * Notifications after enqueuing any type of message should be conditional on
			
 
				+ * the appropriate req_event or rsp_event field in the shared ring.
			
 
				+ * If the client sends notification for rx requests then it should specify
			
 
				+ * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
			
 
				+ * that it cannot safely queue packets (as it may not be kicked to send them).
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This is the 'wire' format for packets:
			
 
				+ *  Request 1: netif_tx_request -- NETTXF_* (any flags)
			
 
				+ * [Request 2: netif_tx_extra]  (only if request 1 has NETTXF_extra_info)
			
 
				+ * [Request 3: netif_tx_extra]  (only if request 2 has XEN_NETIF_EXTRA_FLAG_MORE)
			
 
				+ *  Request 4: netif_tx_request -- NETTXF_more_data
			
 
				+ *  Request 5: netif_tx_request -- NETTXF_more_data
			
 
				+ *  ...
			
 
				+ *  Request N: netif_tx_request -- 0
			
 
				+ */
			
 
				+
			
 
				+/* Protocol checksum field is blank in the packet (hardware offload)? */
			
 
				+#define _NETTXF_csum_blank     (0)
			
 
				+#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
			
 
				+
			
 
				+/* Packet data has been validated against protocol checksum. */
			
 
				+#define _NETTXF_data_validated (1)
			
 
				+#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
			
 
				+
			
 
				+/* Packet continues in the next request descriptor. */
			
 
				+#define _NETTXF_more_data      (2)
			
 
				+#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
			
 
				+
			
 
				+/* Packet to be followed by extra descriptor(s). */
			
 
				+#define _NETTXF_extra_info     (3)
			
 
				+#define  NETTXF_extra_info     (1U<<_NETTXF_extra_info)
			
 
				+
			
 
				+struct xen_netif_tx_request {
			
 
				+    grant_ref_t gref;      /* Reference to buffer page */
			
 
				+    uint16_t offset;       /* Offset within buffer page */
			
 
				+    uint16_t flags;        /* NETTXF_* */
			
 
				+    uint16_t id;           /* Echoed in response message. */
			
 
				+    uint16_t size;         /* Packet size in bytes.       */
			
 
				+};
			
 
				+
			
 
				+/* Types of netif_extra_info descriptors. */
			
 
				+#define XEN_NETIF_EXTRA_TYPE_NONE  (0)  /* Never used - invalid */
			
 
				+#define XEN_NETIF_EXTRA_TYPE_GSO   (1)  /* u.gso */
			
 
				+#define XEN_NETIF_EXTRA_TYPE_MAX   (2)
			
 
				+
			
 
				+/* netif_extra_info flags. */
			
 
				+#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
			
 
				+#define XEN_NETIF_EXTRA_FLAG_MORE  (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
			
 
				+
			
 
				+/* GSO types - only TCPv4 currently supported. */
			
 
				+#define XEN_NETIF_GSO_TYPE_TCPV4        (1)
			
 
				+
			
 
				+/*
			
 
				+ * This structure needs to fit within both netif_tx_request and
			
 
				+ * netif_rx_response for compatibility.
			
 
				+ */
			
 
				+struct xen_netif_extra_info {
			
 
				+	uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
			
 
				+	uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
			
 
				+
			
 
				+	union {
			
 
				+		struct {
			
 
				+			/*
			
 
				+			 * Maximum payload size of each segment. For
			
 
				+			 * example, for TCP this is just the path MSS.
			
 
				+			 */
			
 
				+			uint16_t size;
			
 
				+
			
 
				+			/*
			
 
				+			 * GSO type. This determines the protocol of
			
 
				+			 * the packet and any extra features required
			
 
				+			 * to segment the packet properly.
			
 
				+			 */
			
 
				+			uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
			
 
				+
			
 
				+			/* Future expansion. */
			
 
				+			uint8_t pad;
			
 
				+
			
 
				+			/*
			
 
				+			 * GSO features. This specifies any extra GSO
			
 
				+			 * features required to process this packet,
			
 
				+			 * such as ECN support for TCPv4.
			
 
				+			 */
			
 
				+			uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
			
 
				+		} gso;
			
 
				+
			
 
				+		uint16_t pad[3];
			
 
				+	} u;
			
 
				+};
			
 
				+
			
 
				+struct xen_netif_tx_response {
			
 
				+	uint16_t id;
			
 
				+	int16_t  status;       /* NETIF_RSP_* */
			
 
				+};
			
 
				+
			
 
				+struct xen_netif_rx_request {
			
 
				+	uint16_t    id;        /* Echoed in response message.        */
			
 
				+	grant_ref_t gref;      /* Reference to incoming granted frame */
			
 
				+};
			
 
				+
			
 
				+/* Packet data has been validated against protocol checksum. */
			
 
				+#define _NETRXF_data_validated (0)
			
 
				+#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
			
 
				+
			
 
				+/* Protocol checksum field is blank in the packet (hardware offload)? */
			
 
				+#define _NETRXF_csum_blank     (1)
			
 
				+#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
			
 
				+
			
 
				+/* Packet continues in the next request descriptor. */
			
 
				+#define _NETRXF_more_data      (2)
			
 
				+#define  NETRXF_more_data      (1U<<_NETRXF_more_data)
			
 
				+
			
 
				+/* Packet to be followed by extra descriptor(s). */
			
 
				+#define _NETRXF_extra_info     (3)
			
 
				+#define  NETRXF_extra_info     (1U<<_NETRXF_extra_info)
			
 
				+
			
 
				+struct xen_netif_rx_response {
			
 
				+    uint16_t id;
			
 
				+    uint16_t offset;       /* Offset in page of start of received packet  */
			
 
				+    uint16_t flags;        /* NETRXF_* */
			
 
				+    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Generate netif ring structures and types.
			
 
				+ */
			
 
				+
			
 
				+DEFINE_RING_TYPES(xen_netif_tx,
			
 
				+		  struct xen_netif_tx_request,
			
 
				+		  struct xen_netif_tx_response);
			
 
				+DEFINE_RING_TYPES(xen_netif_rx,
			
 
				+		  struct xen_netif_rx_request,
			
 
				+		  struct xen_netif_rx_response);
			
 
				+
			
 
				+#define NETIF_RSP_DROPPED         -2
			
 
				+#define NETIF_RSP_ERROR           -1
			
 
				+#define NETIF_RSP_OKAY             0
			
 
				+/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
			
 
				+#define NETIF_RSP_NULL             1
			
 
				+
			
 
				+#endif
			
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -0,0 +1,260 @@
 
				+/******************************************************************************
			
 
				+ * ring.h
			
 
				+ *
			
 
				+ * Shared producer-consumer ring macros.
			
 
				+ *
			
 
				+ * Tim Deegan and Andrew Warfield November 2004.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_IO_RING_H__
			
 
				+#define __XEN_PUBLIC_IO_RING_H__
			
 
				+
			
 
				+typedef unsigned int RING_IDX;
			
 
				+
			
 
				+/* Round a 32-bit unsigned constant down to the nearest power of two. */
			
 
				+#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2		       : ((_x) & 0x1))
			
 
				+#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
			
 
				+#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
			
 
				+#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
			
 
				+#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
			
 
				+
			
 
				+/*
			
 
				+ * Calculate size of a shared ring, given the total available space for the
			
 
				+ * ring and indexes (_sz), and the name tag of the request/response structure.
			
 
				+ * A ring contains as many entries as will fit, rounded down to the nearest
			
 
				+ * power of two (so we can mask with (size-1) to loop around).
			
 
				+ */
			
 
				+#define __RING_SIZE(_s, _sz) \
			
 
				+    (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
			
 
				+
			
 
				+/*
			
 
				+ * Macros to make the correct C datatypes for a new kind of ring.
			
 
				+ *
			
 
				+ * To make a new ring datatype, you need to have two message structures,
			
 
				+ * let's say struct request, and struct response already defined.
			
 
				+ *
			
 
				+ * In a header where you want the ring datatype declared, you then do:
			
 
				+ *
			
 
				+ *     DEFINE_RING_TYPES(mytag, struct request, struct response);
			
 
				+ *
			
 
				+ * These expand out to give you a set of types, as you can see below.
			
 
				+ * The most important of these are:
			
 
				+ *
			
 
				+ *     struct mytag_sring      - The shared ring.
			
 
				+ *     struct mytag_front_ring - The 'front' half of the ring.
			
 
				+ *     struct mytag_back_ring  - The 'back' half of the ring.
			
 
				+ *
			
 
				+ * To initialize a ring in your code you need to know the location and size
			
 
				+ * of the shared memory area (PAGE_SIZE, for instance). To initialise
			
 
				+ * the front half:
			
 
				+ *
			
 
				+ *     struct mytag_front_ring front_ring;
			
 
				+ *     SHARED_RING_INIT((struct mytag_sring *)shared_page);
			
 
				+ *     FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
			
 
				+ *		       PAGE_SIZE);
			
 
				+ *
			
 
				+ * Initializing the back follows similarly (note that only the front
			
 
				+ * initializes the shared ring):
			
 
				+ *
			
 
				+ *     struct mytag_back_ring back_ring;
			
 
				+ *     BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
			
 
				+ *		      PAGE_SIZE);
			
 
				+ */
			
 
				+
			
 
				+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)			\
			
 
				+									\
			
 
				+/* Shared ring entry */							\
			
 
				+union __name##_sring_entry {						\
			
 
				+    __req_t req;							\
			
 
				+    __rsp_t rsp;							\
			
 
				+};									\
			
 
				+									\
			
 
				+/* Shared ring page */							\
			
 
				+struct __name##_sring {							\
			
 
				+    RING_IDX req_prod, req_event;					\
			
 
				+    RING_IDX rsp_prod, rsp_event;					\
			
 
				+    uint8_t  pad[48];							\
			
 
				+    union __name##_sring_entry ring[1]; /* variable-length */		\
			
 
				+};									\
			
 
				+									\
			
 
				+/* "Front" end's private variables */					\
			
 
				+struct __name##_front_ring {						\
			
 
				+    RING_IDX req_prod_pvt;						\
			
 
				+    RING_IDX rsp_cons;							\
			
 
				+    unsigned int nr_ents;						\
			
 
				+    struct __name##_sring *sring;					\
			
 
				+};									\
			
 
				+									\
			
 
				+/* "Back" end's private variables */					\
			
 
				+struct __name##_back_ring {						\
			
 
				+    RING_IDX rsp_prod_pvt;						\
			
 
				+    RING_IDX req_cons;							\
			
 
				+    unsigned int nr_ents;						\
			
 
				+    struct __name##_sring *sring;					\
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Macros for manipulating rings.
			
 
				+ *
			
 
				+ * FRONT_RING_whatever works on the "front end" of a ring: here
			
 
				+ * requests are pushed on to the ring and responses taken off it.
			
 
				+ *
			
 
				+ * BACK_RING_whatever works on the "back end" of a ring: here
			
 
				+ * requests are taken off the ring and responses put on.
			
 
				+ *
			
 
				+ * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
			
 
				+ * This is OK in 1-for-1 request-response situations where the
			
 
				+ * requestor (front end) never has more than RING_SIZE()-1
			
 
				+ * outstanding requests.
			
 
				+ */
			
 
				+
			
 
				+/* Initialising empty rings */
			
 
				+#define SHARED_RING_INIT(_s) do {					\
			
 
				+    (_s)->req_prod  = (_s)->rsp_prod  = 0;				\
			
 
				+    (_s)->req_event = (_s)->rsp_event = 1;				\
			
 
				+    memset((_s)->pad, 0, sizeof((_s)->pad));				\
			
 
				+} while(0)
			
 
				+
			
 
				+#define FRONT_RING_INIT(_r, _s, __size) do {				\
			
 
				+    (_r)->req_prod_pvt = 0;						\
			
 
				+    (_r)->rsp_cons = 0;							\
			
 
				+    (_r)->nr_ents = __RING_SIZE(_s, __size);				\
			
 
				+    (_r)->sring = (_s);							\
			
 
				+} while (0)
			
 
				+
			
 
				+#define BACK_RING_INIT(_r, _s, __size) do {				\
			
 
				+    (_r)->rsp_prod_pvt = 0;						\
			
 
				+    (_r)->req_cons = 0;							\
			
 
				+    (_r)->nr_ents = __RING_SIZE(_s, __size);				\
			
 
				+    (_r)->sring = (_s);							\
			
 
				+} while (0)
			
 
				+
			
 
				+/* Initialize to existing shared indexes -- for recovery */
			
 
				+#define FRONT_RING_ATTACH(_r, _s, __size) do {				\
			
 
				+    (_r)->sring = (_s);							\
			
 
				+    (_r)->req_prod_pvt = (_s)->req_prod;				\
			
 
				+    (_r)->rsp_cons = (_s)->rsp_prod;					\
			
 
				+    (_r)->nr_ents = __RING_SIZE(_s, __size);				\
			
 
				+} while (0)
			
 
				+
			
 
				+#define BACK_RING_ATTACH(_r, _s, __size) do {				\
			
 
				+    (_r)->sring = (_s);							\
			
 
				+    (_r)->rsp_prod_pvt = (_s)->rsp_prod;				\
			
 
				+    (_r)->req_cons = (_s)->req_prod;					\
			
 
				+    (_r)->nr_ents = __RING_SIZE(_s, __size);				\
			
 
				+} while (0)
			
 
				+
			
 
				+/* How big is this ring? */
			
 
				+#define RING_SIZE(_r)							\
			
 
				+    ((_r)->nr_ents)
			
 
				+
			
 
				+/* Number of free requests (for use on front side only). */
			
 
				+#define RING_FREE_REQUESTS(_r)						\
			
 
				+    (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
			
 
				+
			
 
				+/* Test if there are no empty slots available on the front ring.
			
 
				+ * (This is only meaningful from the front. )
			
 
				+ */
			
 
				+#define RING_FULL(_r)							\
			
 
				+    (RING_FREE_REQUESTS(_r) == 0)
			
 
				+
			
 
				+/* Test if there are outstanding messages to be processed on a ring. */
			
 
				+#define RING_HAS_UNCONSUMED_RESPONSES(_r)				\
			
 
				+    ((_r)->sring->rsp_prod - (_r)->rsp_cons)
			
 
				+
			
 
				+#define RING_HAS_UNCONSUMED_REQUESTS(_r)				\
			
 
				+    ({									\
			
 
				+	unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;	\
			
 
				+	unsigned int rsp = RING_SIZE(_r) -				\
			
 
				+			   ((_r)->req_cons - (_r)->rsp_prod_pvt);	\
			
 
				+	req < rsp ? req : rsp;						\
			
 
				+    })
			
 
				+
			
 
				+/* Direct access to individual ring elements, by index. */
			
 
				+#define RING_GET_REQUEST(_r, _idx)					\
			
 
				+    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
			
 
				+
			
 
				+#define RING_GET_RESPONSE(_r, _idx)					\
			
 
				+    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
			
 
				+
			
 
				+/* Loop termination condition: Would the specified index overflow the ring? */
			
 
				+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)				\
			
 
				+    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
			
 
				+
			
 
				+#define RING_PUSH_REQUESTS(_r) do {					\
			
 
				+    wmb(); /* back sees requests /before/ updated producer index */	\
			
 
				+    (_r)->sring->req_prod = (_r)->req_prod_pvt;				\
			
 
				+} while (0)
			
 
				+
			
 
				+#define RING_PUSH_RESPONSES(_r) do {					\
			
 
				+    wmb(); /* front sees responses /before/ updated producer index */	\
			
 
				+    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;				\
			
 
				+} while (0)
			
 
				+
			
 
				+/*
			
 
				+ * Notification hold-off (req_event and rsp_event):
			
 
				+ *
			
 
				+ * When queueing requests or responses on a shared ring, it may not always be
			
 
				+ * necessary to notify the remote end. For example, if requests are in flight
			
 
				+ * in a backend, the front may be able to queue further requests without
			
 
				+ * notifying the back (if the back checks for new requests when it queues
			
 
				+ * responses).
			
 
				+ *
			
 
				+ * When enqueuing requests or responses:
			
 
				+ *
			
 
				+ *  Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
			
 
				+ *  is a boolean return value. True indicates that the receiver requires an
			
 
				+ *  asynchronous notification.
			
 
				+ *
			
 
				+ * After dequeuing requests or responses (before sleeping the connection):
			
 
				+ *
			
 
				+ *  Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
			
 
				+ *  The second argument is a boolean return value. True indicates that there
			
 
				+ *  are pending messages on the ring (i.e., the connection should not be put
			
 
				+ *  to sleep).
			
 
				+ *
			
 
				+ *  These macros will set the req_event/rsp_event field to trigger a
			
 
				+ *  notification on the very next message that is enqueued. If you want to
			
 
				+ *  create batches of work (i.e., only receive a notification after several
			
 
				+ *  messages have been enqueued) then you will need to create a customised
			
 
				+ *  version of the FINAL_CHECK macro in your own code, which sets the event
			
 
				+ *  field appropriately.
			
 
				+ */
			
 
				+
			
 
				+#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {		\
			
 
				+    RING_IDX __old = (_r)->sring->req_prod;				\
			
 
				+    RING_IDX __new = (_r)->req_prod_pvt;				\
			
 
				+    wmb(); /* back sees requests /before/ updated producer index */	\
			
 
				+    (_r)->sring->req_prod = __new;					\
			
 
				+    mb(); /* back sees new requests /before/ we check req_event */	\
			
 
				+    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <		\
			
 
				+		 (RING_IDX)(__new - __old));				\
			
 
				+} while (0)
			
 
				+
			
 
				+#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {		\
			
 
				+    RING_IDX __old = (_r)->sring->rsp_prod;				\
			
 
				+    RING_IDX __new = (_r)->rsp_prod_pvt;				\
			
 
				+    wmb(); /* front sees responses /before/ updated producer index */	\
			
 
				+    (_r)->sring->rsp_prod = __new;					\
			
 
				+    mb(); /* front sees new responses /before/ we check rsp_event */	\
			
 
				+    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <		\
			
 
				+		 (RING_IDX)(__new - __old));				\
			
 
				+} while (0)
			
 
				+
			
 
				+#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {		\
			
 
				+    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);			\
			
 
				+    if (_work_to_do) break;						\
			
 
				+    (_r)->sring->req_event = (_r)->req_cons + 1;			\
			
 
				+    mb();								\
			
 
				+    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);			\
			
 
				+} while (0)
			
 
				+
			
 
				+#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {		\
			
 
				+    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);			\
			
 
				+    if (_work_to_do) break;						\
			
 
				+    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;			\
			
 
				+    mb();								\
			
 
				+    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);			\
			
 
				+} while (0)
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_IO_RING_H__ */
			
--- a/include/xen/interface/io/xenbus.h
+++ b/include/xen/interface/io/xenbus.h
@@ -0,0 +1,44 @@
 
				+/*****************************************************************************
			
 
				+ * xenbus.h
			
 
				+ *
			
 
				+ * Xenbus protocol details.
			
 
				+ *
			
 
				+ * Copyright (C) 2005 XenSource Ltd.
			
 
				+ */
			
 
				+
			
 
				+#ifndef _XEN_PUBLIC_IO_XENBUS_H
			
 
				+#define _XEN_PUBLIC_IO_XENBUS_H
			
 
				+
			
 
				+/* The state of either end of the Xenbus, i.e. the current communication
			
 
				+   status of initialisation across the bus.  States here imply nothing about
			
 
				+   the state of the connection between the driver and the kernel's device
			
 
				+   layers.  */
			
 
				+enum xenbus_state
			
 
				+{
			
 
				+	XenbusStateUnknown      = 0,
			
 
				+	XenbusStateInitialising = 1,
			
 
				+	XenbusStateInitWait     = 2,  /* Finished early
			
 
				+					 initialisation, but waiting
			
 
				+					 for information from the peer
			
 
				+					 or hotplug scripts. */
			
 
				+	XenbusStateInitialised  = 3,  /* Initialised and waiting for a
			
 
				+					 connection from the peer. */
			
 
				+	XenbusStateConnected    = 4,
			
 
				+	XenbusStateClosing      = 5,  /* The device is being closed
			
 
				+					 due to an error or an unplug
			
 
				+					 event. */
			
 
				+	XenbusStateClosed       = 6
			
 
				+
			
 
				+};
			
 
				+
			
 
				+#endif /* _XEN_PUBLIC_IO_XENBUS_H */
			
 
				+
			
 
				+/*
			
 
				+ * Local variables:
			
 
				+ *  c-file-style: "linux"
			
 
				+ *  indent-tabs-mode: t
			
 
				+ *  c-indent-level: 8
			
 
				+ *  c-basic-offset: 8
			
 
				+ *  tab-width: 8
			
 
				+ * End:
			
 
				+ */
			
--- a/include/xen/interface/io/xs_wire.h
+++ b/include/xen/interface/io/xs_wire.h
@@ -0,0 +1,87 @@
 
				+/*
			
 
				+ * Details of the "wire" protocol between Xen Store Daemon and client
			
 
				+ * library or guest kernel.
			
 
				+ * Copyright (C) 2005 Rusty Russell IBM Corporation
			
 
				+ */
			
 
				+
			
 
				+#ifndef _XS_WIRE_H
			
 
				+#define _XS_WIRE_H
			
 
				+
			
 
				+enum xsd_sockmsg_type
			
 
				+{
			
 
				+    XS_DEBUG,
			
 
				+    XS_DIRECTORY,
			
 
				+    XS_READ,
			
 
				+    XS_GET_PERMS,
			
 
				+    XS_WATCH,
			
 
				+    XS_UNWATCH,
			
 
				+    XS_TRANSACTION_START,
			
 
				+    XS_TRANSACTION_END,
			
 
				+    XS_INTRODUCE,
			
 
				+    XS_RELEASE,
			
 
				+    XS_GET_DOMAIN_PATH,
			
 
				+    XS_WRITE,
			
 
				+    XS_MKDIR,
			
 
				+    XS_RM,
			
 
				+    XS_SET_PERMS,
			
 
				+    XS_WATCH_EVENT,
			
 
				+    XS_ERROR,
			
 
				+    XS_IS_DOMAIN_INTRODUCED
			
 
				+};
			
 
				+
			
 
				+#define XS_WRITE_NONE "NONE"
			
 
				+#define XS_WRITE_CREATE "CREATE"
			
 
				+#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
			
 
				+
			
 
				+/* We hand errors as strings, for portability. */
			
 
				+struct xsd_errors
			
 
				+{
			
 
				+    int errnum;
			
 
				+    const char *errstring;
			
 
				+};
			
 
				+#define XSD_ERROR(x) { x, #x }
			
 
				+static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
			
 
				+    XSD_ERROR(EINVAL),
			
 
				+    XSD_ERROR(EACCES),
			
 
				+    XSD_ERROR(EEXIST),
			
 
				+    XSD_ERROR(EISDIR),
			
 
				+    XSD_ERROR(ENOENT),
			
 
				+    XSD_ERROR(ENOMEM),
			
 
				+    XSD_ERROR(ENOSPC),
			
 
				+    XSD_ERROR(EIO),
			
 
				+    XSD_ERROR(ENOTEMPTY),
			
 
				+    XSD_ERROR(ENOSYS),
			
 
				+    XSD_ERROR(EROFS),
			
 
				+    XSD_ERROR(EBUSY),
			
 
				+    XSD_ERROR(EAGAIN),
			
 
				+    XSD_ERROR(EISCONN)
			
 
				+};
			
 
				+
			
 
				+struct xsd_sockmsg
			
 
				+{
			
 
				+    uint32_t type;  /* XS_??? */
			
 
				+    uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
			
 
				+    uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
			
 
				+    uint32_t len;   /* Length of data following this. */
			
 
				+
			
 
				+    /* Generally followed by nul-terminated string(s). */
			
 
				+};
			
 
				+
			
 
				+enum xs_watch_type
			
 
				+{
			
 
				+    XS_WATCH_PATH = 0,
			
 
				+    XS_WATCH_TOKEN
			
 
				+};
			
 
				+
			
 
				+/* Inter-domain shared memory communications. */
			
 
				+#define XENSTORE_RING_SIZE 1024
			
 
				+typedef uint32_t XENSTORE_RING_IDX;
			
 
				+#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
			
 
				+struct xenstore_domain_interface {
			
 
				+    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
			
 
				+    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
			
 
				+    XENSTORE_RING_IDX req_cons, req_prod;
			
 
				+    XENSTORE_RING_IDX rsp_cons, rsp_prod;
			
 
				+};
			
 
				+
			
 
				+#endif /* _XS_WIRE_H */
			
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -0,0 +1,145 @@
 
				+/******************************************************************************
			
 
				+ * memory.h
			
 
				+ *
			
 
				+ * Memory reservation and information.
			
 
				+ *
			
 
				+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_MEMORY_H__
			
 
				+#define __XEN_PUBLIC_MEMORY_H__
			
 
				+
			
 
				+/*
			
 
				+ * Increase or decrease the specified domain's memory reservation. Returns a
			
 
				+ * -ve errcode on failure, or the # extents successfully allocated or freed.
			
 
				+ * arg == addr of struct xen_memory_reservation.
			
 
				+ */
			
 
				+#define XENMEM_increase_reservation 0
			
 
				+#define XENMEM_decrease_reservation 1
			
 
				+#define XENMEM_populate_physmap     6
			
 
				+struct xen_memory_reservation {
			
 
				+
			
 
				+    /*
			
 
				+     * XENMEM_increase_reservation:
			
 
				+     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
			
 
				+     * XENMEM_decrease_reservation:
			
 
				+     *   IN:  GMFN bases of extents to free
			
 
				+     * XENMEM_populate_physmap:
			
 
				+     *   IN:  GPFN bases of extents to populate with memory
			
 
				+     *   OUT: GMFN bases of extents that were allocated
			
 
				+     *   (NB. This command also updates the mach_to_phys translation table)
			
 
				+     */
			
 
				+    GUEST_HANDLE(ulong) extent_start;
			
 
				+
			
 
				+    /* Number of extents, and size/alignment of each (2^extent_order pages). */
			
 
				+    unsigned long  nr_extents;
			
 
				+    unsigned int   extent_order;
			
 
				+
			
 
				+    /*
			
 
				+     * Maximum # bits addressable by the user of the allocated region (e.g.,
			
 
				+     * I/O devices often have a 32-bit limitation even in 64-bit systems). If
			
 
				+     * zero then the user has no addressing restriction.
			
 
				+     * This field is not used by XENMEM_decrease_reservation.
			
 
				+     */
			
 
				+    unsigned int   address_bits;
			
 
				+
			
 
				+    /*
			
 
				+     * Domain whose reservation is being changed.
			
 
				+     * Unprivileged domains can specify only DOMID_SELF.
			
 
				+     */
			
 
				+    domid_t        domid;
			
 
				+
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
			
 
				+
			
 
				+/*
			
 
				+ * Returns the maximum machine frame number of mapped RAM in this system.
			
 
				+ * This command always succeeds (it never returns an error code).
			
 
				+ * arg == NULL.
			
 
				+ */
			
 
				+#define XENMEM_maximum_ram_page     2
			
 
				+
			
 
				+/*
			
 
				+ * Returns the current or maximum memory reservation, in pages, of the
			
 
				+ * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
			
 
				+ * arg == addr of domid_t.
			
 
				+ */
			
 
				+#define XENMEM_current_reservation  3
			
 
				+#define XENMEM_maximum_reservation  4
			
 
				+
			
 
				+/*
			
 
				+ * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
			
 
				+ * mapping table. Architectures which do not have a m2p table do not implement
			
 
				+ * this command.
			
 
				+ * arg == addr of struct xen_machphys_mfn_list.
			
 
				+ */
			
 
				+#define XENMEM_machphys_mfn_list    5
			
 
				+struct xen_machphys_mfn_list {
			
 
				+    /*
			
 
				+     * Size of the 'extent_start' array. Fewer entries will be filled if the
			
 
				+     * machphys table is smaller than max_extents * 2MB.
			
 
				+     */
			
 
				+    unsigned int max_extents;
			
 
				+
			
 
				+    /*
			
 
				+     * Pointer to buffer to fill with list of extent starts. If there are
			
 
				+     * any large discontiguities in the machine address space, 2MB gaps in
			
 
				+     * the machphys table will be represented by an MFN base of zero.
			
 
				+     */
			
 
				+    GUEST_HANDLE(ulong) extent_start;
			
 
				+
			
 
				+    /*
			
 
				+     * Number of extents written to the above array. This will be smaller
			
 
				+     * than 'max_extents' if the machphys table is under max_extents * 2MB.
			
 
				+     */
			
 
				+    unsigned int nr_extents;
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
			
 
				+
			
 
				+/*
			
 
				+ * Sets the GPFN at which a particular page appears in the specified guest's
			
 
				+ * pseudophysical address space.
			
 
				+ * arg == addr of struct xen_add_to_physmap.
			
 
				+ */
			
 
				+#define XENMEM_add_to_physmap      7
			
 
				+struct xen_add_to_physmap {
			
 
				+    /* Which domain to change the mapping for. */
			
 
				+    domid_t domid;
			
 
				+
			
 
				+    /* Source mapping space. */
			
 
				+#define XENMAPSPACE_shared_info 0 /* shared info page */
			
 
				+#define XENMAPSPACE_grant_table 1 /* grant table page */
			
 
				+    unsigned int space;
			
 
				+
			
 
				+    /* Index into source mapping space. */
			
 
				+    unsigned long idx;
			
 
				+
			
 
				+    /* GPFN where the source mapping page should appear. */
			
 
				+    unsigned long gpfn;
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
			
 
				+
			
 
				+/*
			
 
				+ * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
			
 
				+ * code on failure. This call only works for auto-translated guests.
			
 
				+ */
			
 
				+#define XENMEM_translate_gpfn_list  8
			
 
				+struct xen_translate_gpfn_list {
			
 
				+    /* Which domain to translate for? */
			
 
				+    domid_t domid;
			
 
				+
			
 
				+    /* Length of list. */
			
 
				+    unsigned long nr_gpfns;
			
 
				+
			
 
				+    /* List of GPFNs to translate. */
			
 
				+    GUEST_HANDLE(ulong) gpfn_list;
			
 
				+
			
 
				+    /*
			
 
				+     * Output list to contain MFN translations. May be the same as the input
			
 
				+     * list (in which case each input GPFN is overwritten with the output MFN).
			
 
				+     */
			
 
				+    GUEST_HANDLE(ulong) mfn_list;
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_MEMORY_H__ */
			
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -0,0 +1,145 @@
 
				+/*
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this software and associated documentation files (the "Software"), to
			
 
				+ * deal in the Software without restriction, including without limitation the
			
 
				+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
			
 
				+ * sell copies of the Software, and to permit persons to whom the Software is
			
 
				+ * furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
			
 
				+ * DEALINGS IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_PHYSDEV_H__
			
 
				+#define __XEN_PUBLIC_PHYSDEV_H__
			
 
				+
			
 
				+/*
			
 
				+ * Prototype for this hypercall is:
			
 
				+ *  int physdev_op(int cmd, void *args)
			
 
				+ * @cmd	 == PHYSDEVOP_??? (physdev operation).
			
 
				+ * @args == Operation-specific extra arguments (NULL if none).
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Notify end-of-interrupt (EOI) for the specified IRQ.
			
 
				+ * @arg == pointer to physdev_eoi structure.
			
 
				+ */
			
 
				+#define PHYSDEVOP_eoi			12
			
 
				+struct physdev_eoi {
			
 
				+	/* IN */
			
 
				+	uint32_t irq;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Query the status of an IRQ line.
			
 
				+ * @arg == pointer to physdev_irq_status_query structure.
			
 
				+ */
			
 
				+#define PHYSDEVOP_irq_status_query	 5
			
 
				+struct physdev_irq_status_query {
			
 
				+	/* IN */
			
 
				+	uint32_t irq;
			
 
				+	/* OUT */
			
 
				+	uint32_t flags; /* XENIRQSTAT_* */
			
 
				+};
			
 
				+
			
 
				+/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
			
 
				+#define _XENIRQSTAT_needs_eoi	(0)
			
 
				+#define	 XENIRQSTAT_needs_eoi	(1U<<_XENIRQSTAT_needs_eoi)
			
 
				+
			
 
				+/* IRQ shared by multiple guests? */
			
 
				+#define _XENIRQSTAT_shared	(1)
			
 
				+#define	 XENIRQSTAT_shared	(1U<<_XENIRQSTAT_shared)
			
 
				+
			
 
				+/*
			
 
				+ * Set the current VCPU's I/O privilege level.
			
 
				+ * @arg == pointer to physdev_set_iopl structure.
			
 
				+ */
			
 
				+#define PHYSDEVOP_set_iopl		 6
			
 
				+struct physdev_set_iopl {
			
 
				+	/* IN */
			
 
				+	uint32_t iopl;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Set the current VCPU's I/O-port permissions bitmap.
			
 
				+ * @arg == pointer to physdev_set_iobitmap structure.
			
 
				+ */
			
 
				+#define PHYSDEVOP_set_iobitmap		 7
			
 
				+struct physdev_set_iobitmap {
			
 
				+	/* IN */
			
 
				+	uint8_t * bitmap;
			
 
				+	uint32_t nr_ports;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Read or write an IO-APIC register.
			
 
				+ * @arg == pointer to physdev_apic structure.
			
 
				+ */
			
 
				+#define PHYSDEVOP_apic_read		 8
			
 
				+#define PHYSDEVOP_apic_write		 9
			
 
				+struct physdev_apic {
			
 
				+	/* IN */
			
 
				+	unsigned long apic_physbase;
			
 
				+	uint32_t reg;
			
 
				+	/* IN or OUT */
			
 
				+	uint32_t value;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Allocate or free a physical upcall vector for the specified IRQ line.
			
 
				+ * @arg == pointer to physdev_irq structure.
			
 
				+ */
			
 
				+#define PHYSDEVOP_alloc_irq_vector	10
			
 
				+#define PHYSDEVOP_free_irq_vector	11
			
 
				+struct physdev_irq {
			
 
				+	/* IN */
			
 
				+	uint32_t irq;
			
 
				+	/* IN or OUT */
			
 
				+	uint32_t vector;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Argument to physdev_op_compat() hypercall. Superseded by new physdev_op()
			
 
				+ * hypercall since 0x00030202.
			
 
				+ */
			
 
				+struct physdev_op {
			
 
				+	uint32_t cmd;
			
 
				+	union {
			
 
				+		struct physdev_irq_status_query	     irq_status_query;
			
 
				+		struct physdev_set_iopl		     set_iopl;
			
 
				+		struct physdev_set_iobitmap	     set_iobitmap;
			
 
				+		struct physdev_apic		     apic_op;
			
 
				+		struct physdev_irq		     irq_op;
			
 
				+	} u;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Notify that some PIRQ-bound event channels have been unmasked.
			
 
				+ * ** This command is obsolete since interface version 0x00030202 and is **
			
 
				+ * ** unsupported by newer versions of Xen.				 **
			
 
				+ */
			
 
				+#define PHYSDEVOP_IRQ_UNMASK_NOTIFY	 4
			
 
				+
			
 
				+/*
			
 
				+ * These all-capitals physdev operation names are superseded by the new names
			
 
				+ * (defined above) since interface version 0x00030202.
			
 
				+ */
			
 
				+#define PHYSDEVOP_IRQ_STATUS_QUERY	 PHYSDEVOP_irq_status_query
			
 
				+#define PHYSDEVOP_SET_IOPL		 PHYSDEVOP_set_iopl
			
 
				+#define PHYSDEVOP_SET_IOBITMAP		 PHYSDEVOP_set_iobitmap
			
 
				+#define PHYSDEVOP_APIC_READ		 PHYSDEVOP_apic_read
			
 
				+#define PHYSDEVOP_APIC_WRITE		 PHYSDEVOP_apic_write
			
 
				+#define PHYSDEVOP_ASSIGN_VECTOR		 PHYSDEVOP_alloc_irq_vector
			
 
				+#define PHYSDEVOP_FREE_VECTOR		 PHYSDEVOP_free_irq_vector
			
 
				+#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
			
 
				+#define PHYSDEVOP_IRQ_SHARED		 XENIRQSTAT_shared
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
			
--- a/include/xen/interface/sched.h
+++ b/include/xen/interface/sched.h
@@ -0,0 +1,77 @@
 
				+/******************************************************************************
			
 
				+ * sched.h
			
 
				+ *
			
 
				+ * Scheduler state interactions
			
 
				+ *
			
 
				+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_SCHED_H__
			
 
				+#define __XEN_PUBLIC_SCHED_H__
			
 
				+
			
 
				+#include "event_channel.h"
			
 
				+
			
 
				+/*
			
 
				+ * The prototype for this hypercall is:
			
 
				+ *  long sched_op_new(int cmd, void *arg)
			
 
				+ * @cmd == SCHEDOP_??? (scheduler operation).
			
 
				+ * @arg == Operation-specific extra argument(s), as described below.
			
 
				+ *
			
 
				+ * **NOTE**:
			
 
				+ * Versions of Xen prior to 3.0.2 provide only the following legacy version
			
 
				+ * of this hypercall, supporting only the commands yield, block and shutdown:
			
 
				+ *  long sched_op(int cmd, unsigned long arg)
			
 
				+ * @cmd == SCHEDOP_??? (scheduler operation).
			
 
				+ * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
			
 
				+ *      == SHUTDOWN_* code (SCHEDOP_shutdown)
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Voluntarily yield the CPU.
			
 
				+ * @arg == NULL.
			
 
				+ */
			
 
				+#define SCHEDOP_yield       0
			
 
				+
			
 
				+/*
			
 
				+ * Block execution of this VCPU until an event is received for processing.
			
 
				+ * If called with event upcalls masked, this operation will atomically
			
 
				+ * reenable event delivery and check for pending events before blocking the
			
 
				+ * VCPU. This avoids a "wakeup waiting" race.
			
 
				+ * @arg == NULL.
			
 
				+ */
			
 
				+#define SCHEDOP_block       1
			
 
				+
			
 
				+/*
			
 
				+ * Halt execution of this domain (all VCPUs) and notify the system controller.
			
 
				+ * @arg == pointer to sched_shutdown structure.
			
 
				+ */
			
 
				+#define SCHEDOP_shutdown    2
			
 
				+struct sched_shutdown {
			
 
				+    unsigned int reason; /* SHUTDOWN_* */
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
			
 
				+
			
 
				+/*
			
 
				+ * Poll a set of event-channel ports. Return when one or more are pending. An
			
 
				+ * optional timeout may be specified.
			
 
				+ * @arg == pointer to sched_poll structure.
			
 
				+ */
			
 
				+#define SCHEDOP_poll        3
			
 
				+struct sched_poll {
			
 
				+    GUEST_HANDLE(evtchn_port_t) ports;
			
 
				+    unsigned int nr_ports;
			
 
				+    uint64_t timeout;
			
 
				+};
			
 
				+DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
			
 
				+
			
 
				+/*
			
 
				+ * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
			
 
				+ * software to determine the appropriate action. For the most part, Xen does
			
 
				+ * not care about the shutdown code.
			
 
				+ */
			
 
				+#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
			
 
				+#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
			
 
				+#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
			
 
				+#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_SCHED_H__ */
			
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -0,0 +1,167 @@
 
				+/******************************************************************************
			
 
				+ * vcpu.h
			
 
				+ *
			
 
				+ * VCPU initialisation, query, and hotplug.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this software and associated documentation files (the "Software"), to
			
 
				+ * deal in the Software without restriction, including without limitation the
			
 
				+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
			
 
				+ * sell copies of the Software, and to permit persons to whom the Software is
			
 
				+ * furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
			
 
				+ * DEALINGS IN THE SOFTWARE.
			
 
				+ *
			
 
				+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
			
 
				+ */
			
 
				+
			
 
				+#ifndef __XEN_PUBLIC_VCPU_H__
			
 
				+#define __XEN_PUBLIC_VCPU_H__
			
 
				+
			
 
				+/*
			
 
				+ * Prototype for this hypercall is:
			
 
				+ *	int vcpu_op(int cmd, int vcpuid, void *extra_args)
			
 
				+ * @cmd		   == VCPUOP_??? (VCPU operation).
			
 
				+ * @vcpuid	   == VCPU to operate on.
			
 
				+ * @extra_args == Operation-specific extra arguments (NULL if none).
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Initialise a VCPU. Each VCPU can be initialised only once. A
			
 
				+ * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
			
 
				+ *
			
 
				+ * @extra_arg == pointer to vcpu_guest_context structure containing initial
			
 
				+ *				 state for the VCPU.
			
 
				+ */
			
 
				+#define VCPUOP_initialise			 0
			
 
				+
			
 
				+/*
			
 
				+ * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
			
 
				+ * if the VCPU has not been initialised (VCPUOP_initialise).
			
 
				+ */
			
 
				+#define VCPUOP_up					 1
			
 
				+
			
 
				+/*
			
 
				+ * Bring down a VCPU (i.e., make it non-runnable).
			
 
				+ * There are a few caveats that callers should observe:
			
 
				+ *	1. This operation may return, and VCPUOP_is_up may return false, before the
			
 
				+ *	   VCPU stops running (i.e., the command is asynchronous). It is a good
			
 
				+ *	   idea to ensure that the VCPU has entered a non-critical loop before
			
 
				+ *	   bringing it down. Alternatively, this operation is guaranteed
			
 
				+ *	   synchronous if invoked by the VCPU itself.
			
 
				+ *	2. After a VCPU is initialised, there is currently no way to drop all its
			
 
				+ *	   references to domain memory. Even a VCPU that is down still holds
			
 
				+ *	   memory references via its pagetable base pointer and GDT. It is good
			
 
				+ *	   practice to move a VCPU onto an 'idle' or default page table, LDT and
			
 
				+ *	   GDT before bringing it down.
			
 
				+ */
			
 
				+#define VCPUOP_down					 2
			
 
				+
			
 
				+/* Returns 1 if the given VCPU is up. */
			
 
				+#define VCPUOP_is_up				 3
			
 
				+
			
 
				+/*
			
 
				+ * Return information about the state and running time of a VCPU.
			
 
				+ * @extra_arg == pointer to vcpu_runstate_info structure.
			
 
				+ */
			
 
				+#define VCPUOP_get_runstate_info	 4
			
 
				+struct vcpu_runstate_info {
			
 
				+		/* VCPU's current state (RUNSTATE_*). */
			
 
				+		int		 state;
			
 
				+		/* When was current state entered (system time, ns)? */
			
 
				+		uint64_t state_entry_time;
			
 
				+		/*
			
 
				+		 * Time spent in each RUNSTATE_* (ns). The sum of these times is
			
 
				+		 * guaranteed not to drift from system time.
			
 
				+		 */
			
 
				+		uint64_t time[4];
			
 
				+};
			
 
				+
			
 
				+/* VCPU is currently running on a physical CPU. */
			
 
				+#define RUNSTATE_running  0
			
 
				+
			
 
				+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
			
 
				+#define RUNSTATE_runnable 1
			
 
				+
			
 
				+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
			
 
				+#define RUNSTATE_blocked  2
			
 
				+
			
 
				+/*
			
 
				+ * VCPU is not runnable, but it is not blocked.
			
 
				+ * This is a 'catch all' state for things like hotplug and pauses by the
			
 
				+ * system administrator (or for critical sections in the hypervisor).
			
 
				+ * RUNSTATE_blocked dominates this state (it is the preferred state).
			
 
				+ */
			
 
				+#define RUNSTATE_offline  3
			
 
				+
			
 
				+/*
			
 
				+ * Register a shared memory area from which the guest may obtain its own
			
 
				+ * runstate information without needing to execute a hypercall.
			
 
				+ * Notes:
			
 
				+ *	1. The registered address may be virtual or physical, depending on the
			
 
				+ *	   platform. The virtual address should be registered on x86 systems.
			
 
				+ *	2. Only one shared area may be registered per VCPU. The shared area is
			
 
				+ *	   updated by the hypervisor each time the VCPU is scheduled. Thus
			
 
				+ *	   runstate.state will always be RUNSTATE_running and
			
 
				+ *	   runstate.state_entry_time will indicate the system time at which the
			
 
				+ *	   VCPU was last scheduled to run.
			
 
				+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
			
 
				+ */
			
 
				+#define VCPUOP_register_runstate_memory_area 5
			
 
				+struct vcpu_register_runstate_memory_area {
			
 
				+		union {
			
 
				+				struct vcpu_runstate_info *v;
			
 
				+				uint64_t p;
			
 
				+		} addr;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer
			
 
				+ * which can be set via these commands. Periods smaller than one millisecond
			
 
				+ * may not be supported.
			
 
				+ */
			
 
				+#define VCPUOP_set_periodic_timer	 6 /* arg == vcpu_set_periodic_timer_t */
			
 
				+#define VCPUOP_stop_periodic_timer	 7 /* arg == NULL */
			
 
				+struct vcpu_set_periodic_timer {
			
 
				+		uint64_t period_ns;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
			
 
				+ * timer which can be set via these commands.
			
 
				+ */
			
 
				+#define VCPUOP_set_singleshot_timer	 8 /* arg == vcpu_set_singleshot_timer_t */
			
 
				+#define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */
			
 
				+struct vcpu_set_singleshot_timer {
			
 
				+		uint64_t timeout_abs_ns;
			
 
				+		uint32_t flags;			   /* VCPU_SSHOTTMR_??? */
			
 
				+};
			
 
				+
			
 
				+/* Flags to VCPUOP_set_singleshot_timer. */
			
 
				+ /* Require the timeout to be in the future (return -ETIME if it's passed). */
			
 
				+#define _VCPU_SSHOTTMR_future (0)
			
 
				+#define VCPU_SSHOTTMR_future  (1U << _VCPU_SSHOTTMR_future)
			
 
				+
			
 
				+/*
			
 
				+ * Register a memory location in the guest address space for the
			
 
				+ * vcpu_info structure.  This allows the guest to place the vcpu_info
			
 
				+ * structure in a convenient place, such as in a per-cpu data area.
			
 
				+ * The pointer need not be page aligned, but the structure must not
			
 
				+ * cross a page boundary.
			
 
				+ */
			
 
				+#define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_register_vcpu_info */
			
 
				+struct vcpu_register_vcpu_info {
			
 
				+    uint32_t mfn;               /* mfn of page to place vcpu_info */
			
 
				+    uint32_t offset;            /* offset within page */
			
 
				+};
			
 
				+
			
 
				+#endif /* __XEN_PUBLIC_VCPU_H__ */