Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core', 'x86/cpu', 'x86/fixmap', 'x86/gart', 'x86/kprobes', 'x86/memtest', 'x86/modules', 'x86/nmi', 'x86/pat', 'x86/reboot', 'x86/setup', 'x86/step', 'x86/unify-pci', 'x86/uv', 'x86/xen' and 'xen-64bit' into x86/for-linus

100 changed files with 3078 additions and 1467 deletions
  1. Documentation/kernel-parameters.txt (+5 -1)
  2. arch/x86/Kconfig (+0 -1)
  3. arch/x86/Kconfig.cpu (+0 -4)
  4. arch/x86/Kconfig.debug (+2 -1)
  5. arch/x86/boot/edd.c (+2 -3)
  6. arch/x86/boot/pm.c (+0 -6)
  7. arch/x86/ia32/ia32_signal.c (+7 -4)
  8. arch/x86/ia32/ia32entry.S (+10 -8)
  9. arch/x86/kernel/Makefile (+1 -0)
  10. arch/x86/kernel/acpi/sleep.c (+1 -9)
  11. arch/x86/kernel/amd_iommu.c (+218 -13)
  12. arch/x86/kernel/amd_iommu_init.c (+271 -86)
  13. arch/x86/kernel/aperture_64.c (+1 -0)
  14. arch/x86/kernel/apic_32.c (+88 -87)
  15. arch/x86/kernel/apic_64.c (+15 -11)
  16. arch/x86/kernel/asm-offsets_64.c (+11 -0)
  17. arch/x86/kernel/bios_uv.c (+48 -0)
  18. arch/x86/kernel/cpu/amd.c (+0 -2)
  19. arch/x86/kernel/cpu/amd_64.c (+2 -0)
  20. arch/x86/kernel/cpu/bugs.c (+1 -22)
  21. arch/x86/kernel/cpu/common_64.c (+2 -13)
  22. arch/x86/kernel/cpu/intel.c (+10 -0)
  23. arch/x86/kernel/cpu/mcheck/p4.c (+2 -2)
  24. arch/x86/kernel/e820.c (+4 -29)
  25. arch/x86/kernel/early-quirks.c (+1 -4)
  26. arch/x86/kernel/entry_32.S (+8 -16)
  27. arch/x86/kernel/entry_64.S (+115 -5)
  28. arch/x86/kernel/genx2apic_uv_x.c (+23 -0)
  29. arch/x86/kernel/head64.c (+8 -3)
  30. arch/x86/kernel/head_64.S (+1 -0)
  31. arch/x86/kernel/io_apic_32.c (+29 -24)
  32. arch/x86/kernel/io_apic_64.c (+23 -18)
  33. arch/x86/kernel/io_delay.c (+3 -0)
  34. arch/x86/kernel/ipi.c (+3 -3)
  35. arch/x86/kernel/irq_32.c (+2 -5)
  36. arch/x86/kernel/kdebugfs.c (+8 -0)
  37. arch/x86/kernel/kprobes.c (+0 -1)
  38. arch/x86/kernel/module_64.c (+9 -1)
  39. arch/x86/kernel/mpparse.c (+26 -182)
  40. arch/x86/kernel/nmi.c (+9 -2)
  41. arch/x86/kernel/numaq_32.c (+186 -11)
  42. arch/x86/kernel/paravirt.c (+4 -1)
  43. arch/x86/kernel/pci-calgary_64.c (+1 -1)
  44. arch/x86/kernel/pci-dma.c (+2 -15)
  45. arch/x86/kernel/pci-gart_64.c (+1 -0)
  46. arch/x86/kernel/pci-nommu.c (+1 -1)
  47. arch/x86/kernel/pci-swiotlb_64.c (+1 -1)
  48. arch/x86/kernel/process.c (+5 -0)
  49. arch/x86/kernel/process_64.c (+28 -28)
  50. arch/x86/kernel/ptrace.c (+56 -95)
  51. arch/x86/kernel/reboot.c (+8 -0)
  52. arch/x86/kernel/setup.c (+8 -14)
  53. arch/x86/kernel/signal_32.c (+1 -7)
  54. arch/x86/kernel/signal_64.c (+0 -6)
  55. arch/x86/kernel/smpboot.c (+22 -32)
  56. arch/x86/kernel/step.c (+29 -6)
  57. arch/x86/kernel/time_32.c (+1 -0)
  58. arch/x86/kernel/traps_32.c (+53 -65)
  59. arch/x86/kernel/traps_64.c (+18 -30)
  60. arch/x86/kernel/visws_quirks.c (+20 -22)
  61. arch/x86/kernel/vmi_32.c (+0 -1)
  62. arch/x86/lguest/boot.c (+0 -1)
  63. arch/x86/mach-default/setup.c (+18 -16)
  64. arch/x86/mm/Makefile (+1 -0)
  65. arch/x86/mm/init_32.c (+3 -2)
  66. arch/x86/mm/init_64.c (+0 -112)
  67. arch/x86/mm/memtest.c (+123 -0)
  68. arch/x86/mm/pat.c (+88 -0)
  69. arch/x86/pci/Makefile (+6 -6)
  70. arch/x86/pci/legacy.c (+6 -3)
  71. arch/x86/pci/numaq_32.c (+2 -2)
  72. arch/x86/pci/pci.h (+2 -1)
  73. arch/x86/pci/visws.c (+7 -16)
  74. arch/x86/vdso/Makefile (+1 -1)
  75. arch/x86/vdso/vdso32-setup.c (+9 -10)
  76. arch/x86/vdso/vdso32.S (+8 -5)
  77. arch/x86/vdso/vma.c (+6 -5)
  78. arch/x86/xen/Kconfig (+10 -4)
  79. arch/x86/xen/Makefile (+1 -1)
  80. arch/x86/xen/enlighten.c (+534 -148)
  81. arch/x86/xen/mmu.c (+227 -89)
  82. arch/x86/xen/mmu.h (+15 -14)
  83. arch/x86/xen/multicalls.c (+1 -0)
  84. arch/x86/xen/setup.c (+59 -20)
  85. arch/x86/xen/smp.c (+87 -50)
  86. arch/x86/xen/suspend.c (+4 -1)
  87. arch/x86/xen/xen-asm_32.S (+0 -0)
  88. arch/x86/xen/xen-asm_64.S (+271 -0)
  89. arch/x86/xen/xen-head.S (+21 -7)
  90. arch/x86/xen/xen-ops.h (+10 -11)
  91. drivers/net/xen-netfront.c (+15 -4)
  92. drivers/pci/intel-iommu.c (+1 -1)
  93. drivers/xen/manage.c (+6 -4)
  94. include/asm-x86/amd_iommu_types.h (+106 -8)
  95. include/asm-x86/apic.h (+9 -19)
  96. include/asm-x86/arch_hooks.h (+1 -0)
  97. include/asm-x86/bitops.h (+1 -1)
  98. include/asm-x86/calling.h (+4 -2)
  99. include/asm-x86/cpufeature.h (+1 -0)
  100. include/asm-x86/dma-mapping.h (+0 -1)

+ 5 - 1
Documentation/kernel-parameters.txt

@@ -1206,7 +1206,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			         or
 			         memmap=0x10000$0x18690000

-	memtest=	[KNL,X86_64] Enable memtest
+	memtest=	[KNL,X86] Enable memtest
 			Format: <integer>
 			range: 0,4 : pattern number
 			default : 0 <disable>
@@ -2158,6 +2158,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			Note that genuine overcurrent events won't be
 			reported either.

+	unknown_nmi_panic
+			[X86-32,X86-64]
+			Set unknown_nmi_panic=1 early on boot.
+
 	usbcore.autosuspend=
 			[USB] The autosuspend time delay (in seconds) used
 			for newly-detected USB devices (default 2).  This

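With the X86_64 dependency dropped in the arch/x86/Kconfig hunk below, the memtest= parameter now applies to 32-bit kernels as well. Going by the Format/range text above, booting with, for example:

	memtest=4

runs the early memory test with four patterns, while omitting the parameter (or passing memtest=0, the default) leaves it disabled.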
+ 0 - 1
arch/x86/Kconfig

@@ -447,7 +447,6 @@ config PARAVIRT_DEBUG

 config MEMTEST
 	bool "Memtest"
-	depends on X86_64
 	help
 	  This option adds a kernel parameter 'memtest', which allows memtest
 	  to be set.

+ 0 - 4
arch/x86/Kconfig.cpu

@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
 	def_bool y
 	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1

-config X86_GOOD_APIC
-	def_bool y
-	depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
-
 config X86_INTEL_USERCOPY
 	def_bool y
 	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2

+ 2 - 1
arch/x86/Kconfig.debug

@@ -289,7 +289,6 @@ config CPA_DEBUG

 config OPTIMIZE_INLINING
 	bool "Allow gcc to uninline functions marked 'inline'"
-	depends on BROKEN
 	help
 	  This option determines if the kernel forces gcc to inline the functions
 	  developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -300,5 +299,7 @@ config OPTIMIZE_INLINING
 	  become the default in the future, until then this option is there to
 	  test gcc for this.

+	  If unsure, say N.
+
 endmenu


+ 2 - 3
arch/x86/boot/edd.c

@@ -167,9 +167,8 @@ void query_edd(void)
 		 * Scan the BIOS-supported hard disks and query EDD
 		 * information...
 		 */
-		get_edd_info(devno, &ei);
-
-		if (boot_params.eddbuf_entries < EDDMAXNR) {
+		if (!get_edd_info(devno, &ei)
+		    && boot_params.eddbuf_entries < EDDMAXNR) {
 			memcpy(edp, &ei, sizeof ei);
 			edp++;
 			boot_params.eddbuf_entries++;

+ 0 - 6
arch/x86/boot/pm.c

@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
 /*
  * Set up the GDT
  */
-#define GDT_ENTRY(flags, base, limit)		\
-	(((u64)(base & 0xff000000) << 32) |	\
-	 ((u64)flags << 40) |			\
-	 ((u64)(limit & 0x00ff0000) << 32) |	\
-	 ((u64)(base & 0x00ffffff) << 16) |	\
-	 ((u64)(limit & 0x0000ffff)))

 struct gdt_ptr {
 	u16 len;

+ 7 - 4
arch/x86/ia32/ia32_signal.c

@@ -36,6 +36,11 @@

 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))

+#define FIX_EFLAGS	(X86_EFLAGS_AC | X86_EFLAGS_OF | \
+			 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+			 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+			 X86_EFLAGS_CF)
+
 asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);

@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 	regs->ss |= 3;

 	err |= __get_user(tmpflags, &sc->flags);
-	regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
 	/* disable syscall checks */
 	regs->orig_ax = -1;

@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 			compat_sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	struct exec_domain *ed = current_thread_info()->exec_domain;
 	void __user *restorer;
 	int err = 0;

@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;

-	err |= __put_user((ed && ed->signal_invmap && sig < 32
-			   ? ed->signal_invmap[sig] : sig), &frame->sig);
+	err |= __put_user(sig, &frame->sig);
 	err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
 	err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
 	err |= copy_siginfo_to_user32(&frame->info, info);

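For reference, the FIX_EFLAGS mask introduced above is exactly the magic constant it replaces in ia32_restore_sigcontext(). A stand-alone sketch (not kernel code) using the architectural EFLAGS bit positions, which is what the kernel's X86_EFLAGS_* constants encode, checks the arithmetic:

	#include <stdio.h>

	/* Architectural x86 EFLAGS bits (same values as the kernel's X86_EFLAGS_*). */
	#define CF 0x00001  /* carry           */
	#define PF 0x00004  /* parity          */
	#define AF 0x00010  /* auxiliary carry */
	#define ZF 0x00040  /* zero            */
	#define SF 0x00080  /* sign            */
	#define TF 0x00100  /* trap            */
	#define DF 0x00400  /* direction       */
	#define OF 0x00800  /* overflow        */
	#define AC 0x40000  /* alignment check */

	int main(void)
	{
		unsigned long fix_eflags = AC | OF | DF | TF | SF | ZF | AF | PF | CF;

		/* Prints 0x40dd5 - the literal the old code masked with. */
		printf("FIX_EFLAGS = %#lx\n", fix_eflags);
		return 0;
	}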
+ 10 - 8
arch/x86/ia32/ia32entry.S

@@ -37,6 +37,11 @@
 	movq	%rax,R8(%rsp)
 	.endm

+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * We don't reload %eax because syscall_trace_enter() returned
+	 * the value it wants us to use in the table lookup.
+	 */
 	.macro LOAD_ARGS32 offset
 	movl \offset(%rsp),%r11d
 	movl \offset+8(%rsp),%r10d
@@ -46,7 +51,6 @@
 	movl \offset+48(%rsp),%edx
 	movl \offset+56(%rsp),%esi
 	movl \offset+64(%rsp),%edi
-	movl \offset+72(%rsp),%eax
 	.endm
 	
 	.macro CFI_STARTPROC32 simple
@@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target)
  	.previous	
 	GET_THREAD_INFO(%r10)
 	orl    $TS_COMPAT,TI_status(%r10)
-	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		 TI_flags(%r10)
+	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
-sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls-1),%eax
 	ja	ia32_badsys
+sysenter_do_call:
 	IA32_ARG_FIXUP 1
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX-ARGOFFSET(%rsp)
@@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target)
 	.previous	
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 cstar_do_call:	
@@ -321,6 +323,7 @@ ENTRY(ia32_syscall)
 	/*CFI_REL_OFFSET	rflags,EFLAGS-RIP*/
 	/*CFI_REL_OFFSET	cs,CS-RIP*/
 	CFI_REL_OFFSET	rip,RIP-RIP
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	SWAPGS
 	/*
 	 * No need to follow this irqs on/off section: the syscall
@@ -336,8 +339,7 @@ ENTRY(ia32_syscall)
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls-1),%eax

+ 1 - 0
arch/x86/kernel/Makefile

@@ -102,6 +102,7 @@ obj-$(CONFIG_OLPC)		+= olpc.o
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
         obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+	obj-y				+= bios_uv.o
         obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
         obj-$(CONFIG_AUDIT)		+= audit_64.o


+ 1 - 9
arch/x86/kernel/acpi/sleep.c

@@ -9,6 +9,7 @@
 #include <linux/bootmem.h>
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
+#include <asm/segment.h>

 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
 static char temp_stack[10240];
 #endif

-/* XXX: this macro should move to asm-x86/segment.h and be shared with the
-   boot code... */
-#define GDT_ENTRY(flags, base, limit)		\
-	(((u64)(base & 0xff000000) << 32) |	\
-	 ((u64)flags << 40) |			\
-	 ((u64)(limit & 0x00ff0000) << 32) |	\
-	 ((u64)(base & 0x00ffffff) << 16) |	\
-	 ((u64)(limit & 0x0000ffff)))
-
 /**
  * acpi_save_state_mem - save kernel state
  *

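This removal pairs with the identical deletion in arch/x86/boot/pm.c above: both files carried private copies of GDT_ENTRY(), and the dropped XXX comment already asked for a shared definition. The new #include <asm/segment.h> suggests the single copy now lives there; assuming it is the same macro that was deleted, it packs a GDT descriptor like this:

	/* Assumed consolidated definition, identical to the copies removed above. */
	#define GDT_ENTRY(flags, base, limit)		\
		(((u64)(base & 0xff000000) << 32) |	\
		 ((u64)flags << 40) |			\
		 ((u64)(limit & 0x00ff0000) << 32) |	\
		 ((u64)(base & 0x00ffffff) << 16) |	\
		 ((u64)(limit & 0x0000ffff)))

	/* Illustration: a flat 4 GiB 32-bit code segment (4 KiB granularity)
	 * would be encoded as GDT_ENTRY(0xc09b, 0, 0xfffff). */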
+ 218 - 13
arch/x86/kernel/amd_iommu.c

@@ -23,7 +23,7 @@
 #include <linux/scatterlist.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
 #include <linux/iommu-helper.h>
 #include <asm/proto.h>
 #include <asm/proto.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 #include <asm/amd_iommu.h>
 
 
@@ -32,21 +32,37 @@
 #define to_pages(addr, size) \
 #define to_pages(addr, size) \
 	 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
 	 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
 
 
+#define EXIT_LOOP_COUNT 10000000
+
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
 
-struct command {
+/*
+ * general struct to manage commands send to an IOMMU
+ */
+struct iommu_cmd {
 	u32 data[4];
 	u32 data[4];
 };
 };
 
 
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 			     struct unity_map_entry *e);
 
 
+/* returns !0 if the IOMMU is caching non-present entries in its TLB */
 static int iommu_has_npcache(struct amd_iommu *iommu)
 static int iommu_has_npcache(struct amd_iommu *iommu)
 {
 {
 	return iommu->cap & IOMMU_CAP_NPCACHE;
 	return iommu->cap & IOMMU_CAP_NPCACHE;
 }
 }
 
 
-static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+/*
+ * Writes the command to the IOMMUs command buffer and informs the
+ * hardware about the new command. Must be called with iommu->lock held.
+ */
+static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
 {
 	u32 tail, head;
 	u32 tail, head;
 	u8 *target;
 	u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 	return 0;
 	return 0;
 }
 }
 
 
-static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/*
+ * General queuing function for commands. Takes iommu->lock and calls
+ * __iommu_queue_command().
+ */
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
 {
 	unsigned long flags;
 	unsigned long flags;
 	int ret;
 	int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 	return ret;
 	return ret;
 }
 }
 
 
+/*
+ * This function is called whenever we need to ensure that the IOMMU has
+ * completed execution of all commands we sent. It sends a
+ * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
+ * us about that by writing a value to a physical address we pass with
+ * the command.
+ */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
 {
 	int ret;
 	int ret;
-	struct command cmd;
+	struct iommu_cmd cmd;
 	volatile u64 ready = 0;
 	volatile u64 ready = 0;
 	unsigned long ready_phys = virt_to_phys(&ready);
 	unsigned long ready_phys = virt_to_phys(&ready);
+	unsigned long i = 0;
 
 
 	memset(&cmd, 0, sizeof(cmd));
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
 	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd.data[1] = HIGH_U32(ready_phys);
+	cmd.data[1] = upper_32_bits(ready_phys);
 	cmd.data[2] = 1; /* value written to 'ready' */
 	cmd.data[2] = 1; /* value written to 'ready' */
 	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
 	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
 
 
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
-	while (!ready)
+	while (!ready && (i < EXIT_LOOP_COUNT)) {
+		++i;
 		cpu_relax();
 		cpu_relax();
+	}
+
+	if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
+		printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
 
 
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Command send function for invalidating a device table entry
+ */
 static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 {
 {
-	struct command cmd;
+	struct iommu_cmd cmd;
 
 
 	BUG_ON(iommu == NULL);
 	BUG_ON(iommu == NULL);
 
 
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 	return iommu_queue_command(iommu, &cmd);
 	return iommu_queue_command(iommu, &cmd);
 }
 }
 
 
+/*
+ * Generic command send function for invalidaing TLB entries
+ */
 static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 		u64 address, u16 domid, int pde, int s)
 		u64 address, u16 domid, int pde, int s)
 {
 {
-	struct command cmd;
+	struct iommu_cmd cmd;
 
 
 	memset(&cmd, 0, sizeof(cmd));
 	memset(&cmd, 0, sizeof(cmd));
 	address &= PAGE_MASK;
 	address &= PAGE_MASK;
 	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
 	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
 	cmd.data[1] |= domid;
 	cmd.data[1] |= domid;
 	cmd.data[2] = LOW_U32(address);
 	cmd.data[2] = LOW_U32(address);
-	cmd.data[3] = HIGH_U32(address);
-	if (s)
+	cmd.data[3] = upper_32_bits(address);
+	if (s) /* size bit - we flush more than one 4kb page */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-	if (pde)
+	if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 
 
 	iommu->need_sync = 1;
 	iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 	return iommu_queue_command(iommu, &cmd);
 	return iommu_queue_command(iommu, &cmd);
 }
 }
 
 
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
 static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 		u64 address, size_t size)
 		u64 address, size_t size)
 {
 {
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 	return 0;
 	return 0;
 }
 }
 
 
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * Generic mapping functions. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
 static int iommu_map(struct protection_domain *dom,
 static int iommu_map(struct protection_domain *dom,
 		     unsigned long bus_addr,
 		     unsigned long bus_addr,
 		     unsigned long phys_addr,
 		     unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * This function checks if a specific unity mapping entry is needed for
+ * this specific IOMMU.
+ */
 static int iommu_for_unity_map(struct amd_iommu *iommu,
 static int iommu_for_unity_map(struct amd_iommu *iommu,
 			       struct unity_map_entry *entry)
 			       struct unity_map_entry *entry)
 {
 {
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default domain DMA of that IOMMU if necessary.
+ */
 static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 {
 {
 	struct unity_map_entry *entry;
 	struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * This function actually applies the mapping to the page table of the
+ * dma_ops domain.
+ */
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e)
 			     struct unity_map_entry *e)
 {
 {
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Inits the unity mappings required for a specific device
+ */
 static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 					  u16 devid)
 					  u16 devid)
 {
 {
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 	return 0;
 	return 0;
 }
 }
 
 
+/****************************************************************************
+ *
+ * The next functions belong to the address allocator for the dma_ops
+ * interface functions. They work like the allocators in the other IOMMU
+ * drivers. Its basically a bitmap which marks the allocated pages in
+ * the aperture. Maybe it could be enhanced in the future to a more
+ * efficient allocator.
+ *
+ ****************************************************************************/
 static unsigned long dma_mask_to_pages(unsigned long mask)
 static unsigned long dma_mask_to_pages(unsigned long mask)
 {
 {
 	return (mask >> PAGE_SHIFT) +
 	return (mask >> PAGE_SHIFT) +
 		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
 		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
 }
 }
 
 
+/*
+ * The address allocator core function.
+ *
+ * called with domain->lock held
+ */
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
 					     struct dma_ops_domain *dom,
 					     unsigned int pages)
 					     unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 	return address;
 	return address;
 }
 }
 
 
+/*
+ * The address free function.
+ *
+ * called with domain->lock held
+ */
 static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned long address,
 				   unsigned long address,
 				   unsigned int pages)
 				   unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 	iommu_area_free(dom->bitmap, address, pages);
 	iommu_area_free(dom->bitmap, address, pages);
 }
 }
 
 
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device get its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
 static u16 domain_id_alloc(void)
 static u16 domain_id_alloc(void)
 {
 {
 	unsigned long flags;
 	unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
 	return id;
 	return id;
 }
 }
 
 
+/*
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges.
+ */
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned long start_page,
 				      unsigned int pages)
 				      unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
 	free_page((unsigned long)p1);
 	free_page((unsigned long)p1);
 }
 }
 
 
+/*
+ * Free a domain, only used if something went wrong in the
+ * allocation path and we need to free an already allocated page table
+ */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
 {
 	if (!dom)
 	if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
 	kfree(dom);
 	kfree(dom);
 }
 }
 
 
+/*
+ * Allocates a new protection domain usable for the dma_ops functions.
+ * It also intializes the page table and the address allocator data
+ * structures required for the dma_ops interface
+ */
 static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 						   unsigned order)
 						   unsigned order)
 {
 {
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->bitmap[0] = 1;
 	dma_dom->bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 	dma_dom->next_bit = 0;
 
 
+	/* Intialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
 	if (iommu->exclusion_start &&
 	    iommu->exclusion_start < dma_dom->aperture_size) {
 	    iommu->exclusion_start < dma_dom->aperture_size) {
 		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
 		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 	}
 	}
 
 
+	/*
+	 * At the last step, build the page tables so we don't need to
+	 * allocate page table pages in the dma_ops mapping/unmapping
+	 * path.
+	 */
 	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
 	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
 	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
 	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
 			GFP_KERNEL);
 			GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
 	return NULL;
 	return NULL;
 }
 }
 
 
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
 static struct protection_domain *domain_for_device(u16 devid)
 static struct protection_domain *domain_for_device(u16 devid)
 {
 {
 	struct protection_domain *dom;
 	struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
 	return dom;
 	return dom;
 }
 }
 
 
+/*
+ * If a device is not yet associated with a domain, this function does
+ * assigns it visible for the hardware
+ */
 static void set_device_domain(struct amd_iommu *iommu,
 static void set_device_domain(struct amd_iommu *iommu,
 			      struct protection_domain *domain,
 			      struct protection_domain *domain,
 			      u16 devid)
 			      u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
 	iommu->need_sync = 1;
 	iommu->need_sync = 1;
 }
 }
 
 
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+/*
+ * In the dma_ops path we only have the struct device. This function
+ * finds the corresponding IOMMU, the protection domain and the
+ * requestor id for a given device.
+ * If the device is not yet associated with a domain this is also done
+ * in this function.
+ */
 static int get_device_resources(struct device *dev,
 static int get_device_resources(struct device *dev,
 				struct amd_iommu **iommu,
 				struct amd_iommu **iommu,
 				struct protection_domain **domain,
 				struct protection_domain **domain,
@@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev,
 	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
 	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
 
 
 	pcidev = to_pci_dev(dev);
 	pcidev = to_pci_dev(dev);
-	_bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+	_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
 
 
+	/* device not translated by any IOMMU in the system? */
 	if (_bdf >= amd_iommu_last_bdf) {
 	if (_bdf >= amd_iommu_last_bdf) {
 		*iommu = NULL;
 		*iommu = NULL;
 		*domain = NULL;
 		*domain = NULL;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
 	return 1;
 	return 1;
 }
 }
 
 
+/*
+ * This is the generic map function. It maps one 4kb page at paddr to
+ * the given address in the DMA address space for the domain.
+ */
 static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 				     struct dma_ops_domain *dom,
 				     struct dma_ops_domain *dom,
 				     unsigned long address,
 				     unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 	return (dma_addr_t)address;
 	return (dma_addr_t)address;
 }
 }
 
 
+/*
+ * The generic unmapping function for on page in the DMA address space.
+ */
 static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 				 struct dma_ops_domain *dom,
 				 struct dma_ops_domain *dom,
 				 unsigned long address)
 				 unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 	*pte = 0ULL;
 	*pte = 0ULL;
 }
 }
 
 
+/*
+ * This function contains common code for mapping of a physically
+ * contiguous memory region into DMA address space. It is uses by all
+ * mapping functions provided by this IOMMU driver.
+ * Must be called with the domain lock held.
+ */
 static dma_addr_t __map_single(struct device *dev,
 static dma_addr_t __map_single(struct device *dev,
 			       struct amd_iommu *iommu,
 			       struct amd_iommu *iommu,
 			       struct dma_ops_domain *dma_dom,
 			       struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
 	return address;
 	return address;
 }
 }
 
 
+/*
+ * Does the reverse of the __map_single function. Must be called with
+ * the domain lock held too
+ */
 static void __unmap_single(struct amd_iommu *iommu,
 static void __unmap_single(struct amd_iommu *iommu,
 			   struct dma_ops_domain *dma_dom,
 			   struct dma_ops_domain *dma_dom,
 			   dma_addr_t dma_addr,
 			   dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
 }
 }
 
 
+/*
+ * The exported map_single function for dma_ops.
+ */
 static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 			     size_t size, int dir)
 			     size_t size, int dir)
 {
 {
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	get_device_resources(dev, &iommu, &domain, &devid);
 	get_device_resources(dev, &iommu, &domain, &devid);
 
 
 	if (iommu == NULL || domain == NULL)
 	if (iommu == NULL || domain == NULL)
+		/* device not handled by any AMD IOMMU */
 		return (dma_addr_t)paddr;
 		return (dma_addr_t)paddr;
 
 
 	spin_lock_irqsave(&domain->lock, flags);
 	spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
 	return addr;
 	return addr;
 }
 }
 
 
+/*
+ * The exported unmap_single function for dma_ops.
+ */
 static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 			 size_t size, int dir)
 			 size_t size, int dir)
 {
 {
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	u16 devid;
 	u16 devid;
 
 
 	if (!get_device_resources(dev, &iommu, &domain, &devid))
 	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		/* device not handled by any AMD IOMMU */
 		return;
 		return;
 
 
 	spin_lock_irqsave(&domain->lock, flags);
 	spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	spin_unlock_irqrestore(&domain->lock, flags);
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 }
 
 
+/*
+ * This is a special map_sg function which is used if we should map a
+ * device which is not handled by an AMD IOMMU in the system.
+ */
 static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 			   int nelems, int dir)
 			   int nelems, int dir)
 {
 {
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 	return nelems;
 	return nelems;
 }
 }
 
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static int map_sg(struct device *dev, struct scatterlist *sglist,
 static int map_sg(struct device *dev, struct scatterlist *sglist,
 		  int nelems, int dir)
 		  int nelems, int dir)
 {
 {
@@ -775,6 +953,10 @@ unmap:
 	goto out;
 	goto out;
 }
 }
 
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 		     int nelems, int dir)
 		     int nelems, int dir)
 {
 {
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	spin_unlock_irqrestore(&domain->lock, flags);
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 }
 
 
+/*
+ * The exported alloc_coherent function for dma_ops.
+ */
 static void *alloc_coherent(struct device *dev, size_t size,
 static void *alloc_coherent(struct device *dev, size_t size,
 			    dma_addr_t *dma_addr, gfp_t flag)
 			    dma_addr_t *dma_addr, gfp_t flag)
 {
 {
@@ -851,6 +1036,11 @@ out:
 	return virt_addr;
 	return virt_addr;
 }
 }
 
 
+/*
+ * The exported free_coherent function for dma_ops.
+ * FIXME: fix the generic x86 DMA layer so that it actually calls that
+ *        function.
+ */
 static void free_coherent(struct device *dev, size_t size,
 static void free_coherent(struct device *dev, size_t size,
 			  void *virt_addr, dma_addr_t dma_addr)
 			  void *virt_addr, dma_addr_t dma_addr)
 {
 {
@@ -879,6 +1069,8 @@ free_mem:
 }
 }
 
 
 /*
 /*
+ * The function for pre-allocating protection domains.
+ *
  * If the driver core informs the DMA layer if a driver grabs a device
  * If the driver core informs the DMA layer if a driver grabs a device
  * we don't need to preallocate the protection domains anymore.
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
  * For now we have to.
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
 	.unmap_sg = unmap_sg,
 	.unmap_sg = unmap_sg,
 };
 };
 
 
+/*
+ * The function which clues the AMD IOMMU driver into dma_ops.
+ */
 int __init amd_iommu_init_dma_ops(void)
 int __init amd_iommu_init_dma_ops(void)
 {
 {
 	struct amd_iommu *iommu;
 	struct amd_iommu *iommu;
 	int order = amd_iommu_aperture_order;
 	int order = amd_iommu_aperture_order;
 	int ret;
 	int ret;
 
 
+	/*
+	 * first allocate a default protection domain for every IOMMU we
+	 * found in the system. Devices not assigned to any other
+	 * protection domain will be assigned to the default one.
+	 */
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		if (iommu->default_dom == NULL)
 		if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
 			goto free_domains;
 			goto free_domains;
 	}
 	}
 
 
+	/*
+	 * If device isolation is enabled, pre-allocate the protection
+	 * domains for each device.
+	 */
 	if (amd_iommu_isolate)
 	if (amd_iommu_isolate)
 		prealloc_protection_domains();
 		prealloc_protection_domains();
 
 
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
 	gart_iommu_aperture = 0;
 	gart_iommu_aperture = 0;
 #endif
 #endif
 
 
+	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
 	dma_ops = &amd_iommu_dma_ops;
 
 
 	return 0;
 	return 0;

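Besides the renames and the added comments, the main behavioural change in this file is in iommu_completion_wait(): the previously unbounded while (!ready) poll is now capped at EXIT_LOOP_COUNT iterations and emits a rate-limited warning instead of hanging if the IOMMU never signals completion. A minimal stand-alone sketch of that bounded-polling pattern follows (illustrative names, not the kernel's helpers):

	#include <stdio.h>

	#define EXIT_LOOP_COUNT 10000000UL

	/* Stands in for the flag the hardware sets via the COMPLETION_WAIT command. */
	static volatile unsigned long ready;

	static void cpu_relax(void)
	{
		/* in the kernel this is a pause/backoff hint to the CPU */
	}

	static int wait_for_completion_bounded(void)
	{
		unsigned long i = 0;

		while (!ready && i < EXIT_LOOP_COUNT) {
			++i;
			cpu_relax();
		}

		if (i == EXIT_LOOP_COUNT)
			fprintf(stderr, "completion wait loop failed\n");

		return 0;	/* like the kernel code: warn, but do not fail hard */
	}

	int main(void)
	{
		return wait_for_completion_bounded();
	}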
+ 271 - 86
arch/x86/kernel/amd_iommu_init.c

@@ -25,20 +25,13 @@
 #include <asm/pci-direct.h>
 #include <asm/pci-direct.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 #include <asm/amd_iommu.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 
 
 /*
 /*
  * definitions for the ACPI scanning code
  * definitions for the ACPI scanning code
  */
  */
-#define UPDATE_LAST_BDF(x) do {\
-	if ((x) > amd_iommu_last_bdf) \
-		amd_iommu_last_bdf = (x); \
-	} while (0);
-
-#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
 #define PCI_BUS(x) (((x) >> 8) & 0xff)
 #define PCI_BUS(x) (((x) >> 8) & 0xff)
 #define IVRS_HEADER_LENGTH 48
 #define IVRS_HEADER_LENGTH 48
-#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
 
 
 #define ACPI_IVHD_TYPE                  0x10
 #define ACPI_IVHD_TYPE                  0x10
 #define ACPI_IVMD_TYPE_ALL              0x20
 #define ACPI_IVMD_TYPE_ALL              0x20
@@ -71,6 +64,17 @@
 #define ACPI_DEVFLAG_LINT1              0x80
 #define ACPI_DEVFLAG_LINT1              0x80
 #define ACPI_DEVFLAG_ATSDIS             0x10000000
 #define ACPI_DEVFLAG_ATSDIS             0x10000000
 
 
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entrys.
+ */
 struct ivhd_header {
 struct ivhd_header {
 	u8 type;
 	u8 type;
 	u8 flags;
 	u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
 	u32 reserved;
 	u32 reserved;
 } __attribute__((packed));
 } __attribute__((packed));
 
 
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
 struct ivhd_entry {
 struct ivhd_entry {
 	u8 type;
 	u8 type;
 	u16 devid;
 	u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
 	u32 ext;
 	u32 ext;
 } __attribute__((packed));
 } __attribute__((packed));
 
 
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
 struct ivmd_header {
 struct ivmd_header {
 	u8 type;
 	u8 type;
 	u8 flags;
 	u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
 
 
 static int __initdata amd_iommu_detected;
 static int __initdata amd_iommu_detected;
 
 
-u16 amd_iommu_last_bdf;
-struct list_head amd_iommu_unity_map;
-unsigned amd_iommu_aperture_order = 26;
-int amd_iommu_isolate;
+u16 amd_iommu_last_bdf;			/* largest PCI device id we have
+					   to handle */
+LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
+					   we find in ACPI */
+unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+int amd_iommu_isolate;			/* if 1, device isolation is enabled */
+
+LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
+					   system */
 
 
-struct list_head amd_iommu_list;
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
 struct dev_table_entry *amd_iommu_dev_table;
 struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
 u16 *amd_iommu_alias_table;
 u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
 struct amd_iommu **amd_iommu_rlookup_table;
 struct amd_iommu **amd_iommu_rlookup_table;
+
+/*
+ * The pd table (protection domain table) is used to find the protection domain
+ * data structure a device belongs to. Indexed with the PCI device id too.
+ */
 struct protection_domain **amd_iommu_pd_table;
 struct protection_domain **amd_iommu_pd_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
 unsigned long *amd_iommu_pd_alloc_bitmap;
 unsigned long *amd_iommu_pd_alloc_bitmap;
 
 
-static u32 dev_table_size;
-static u32 alias_table_size;
-static u32 rlookup_table_size;
+static u32 dev_table_size;	/* size of the device table */
+static u32 alias_table_size;	/* size of the alias table */
+static u32 rlookup_table_size;	/* size if the rlookup table */
 
 
+static inline void update_last_devid(u16 devid)
+{
+	if (devid > amd_iommu_last_bdf)
+		amd_iommu_last_bdf = devid;
+}
+
+static inline unsigned long tbl_size(int entry_size)
+{
+	unsigned shift = PAGE_SHIFT +
+			 get_order(amd_iommu_last_bdf * entry_size);
+
+	return 1UL << shift;
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function set the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated
+ */
 static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
 static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
 {
 {
 	u64 start = iommu->exclusion_start & PAGE_MASK;
 	u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
 			&entry, sizeof(entry));
 			&entry, sizeof(entry));
 }
 }
 
 
+/* Programs the physical address of the device table into the IOMMU hardware */
 static void __init iommu_set_device_table(struct amd_iommu *iommu)
 static void __init iommu_set_device_table(struct amd_iommu *iommu)
 {
 {
 	u32 entry;
 	u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
 			&entry, sizeof(entry));
 			&entry, sizeof(entry));
 }
 }
 
 
+/* Generic functions to enable/disable certain features of the IOMMU. */
 static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 {
 {
 	u32 ctrl;
 	u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
 	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
 }
 }
 
 
+/* Function to enable the hardware */
 void __init iommu_enable(struct amd_iommu *iommu)
 void __init iommu_enable(struct amd_iommu *iommu)
 {
 {
 	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
 	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 }
 
 
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
 static u8 * __init iommu_map_mmio_space(u64 address)
 static u8 * __init iommu_map_mmio_space(u64 address)
 {
 {
 	u8 *ret;
 	u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
 	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
 	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
 }
 }
 
 
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function reads the last device id the IOMMU has to handle from the PCI
+ * capability header for this IOMMU
+ */
 static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
 static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
 {
 {
 	u32 cap;
 	u32 cap;
 
 
 	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
 	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
 
 
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
 static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 {
 {
 	u8 *p = (void *)h, *end = (void *)h;
 	u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 		case IVHD_DEV_RANGE_END:
 		case IVHD_DEV_RANGE_END:
 		case IVHD_DEV_ALIAS:
 		case IVHD_DEV_ALIAS:
 		case IVHD_DEV_EXT_SELECT:
 		case IVHD_DEV_EXT_SELECT:
-			UPDATE_LAST_BDF(dev->devid);
+			/* all the above subfield types refer to device ids */
+			update_last_devid(dev->devid);
 			break;
 			break;
 		default:
 		default:
 			break;
 			break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
 static int __init find_last_devid_acpi(struct acpi_table_header *table)
 static int __init find_last_devid_acpi(struct acpi_table_header *table)
 {
 {
 	int i;
 	int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
 	return 0;
 	return 0;
 }
 }
 
 
+/****************************************************************************
+ *
+ * The following functions belong the the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
 static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 {
 {
-	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 			get_order(CMD_BUFFER_SIZE));
 			get_order(CMD_BUFFER_SIZE));
-	u64 entry = 0;
+	u64 entry;
 
 
 	if (cmd_buf == NULL)
 	if (cmd_buf == NULL)
 		return NULL;
 		return NULL;
 
 
 	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 
 
-	memset(cmd_buf, 0, CMD_BUFFER_SIZE);
-
 	entry = (u64)virt_to_phys(cmd_buf);
 	entry = (u64)virt_to_phys(cmd_buf);
 	entry |= MMIO_CMD_SIZE_512;
 	entry |= MMIO_CMD_SIZE_512;
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
 {
-	if (iommu->cmd_buf)
-		free_pages((unsigned long)iommu->cmd_buf,
-				get_order(CMD_BUFFER_SIZE));
+	free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
 }
 }
 
 
+/* sets a specific bit in the device table entry. */
 static void set_dev_entry_bit(u16 devid, u8 bit)
 static void set_dev_entry_bit(u16 devid, u8 bit)
 {
 {
 	int i = (bit >> 5) & 0x07;
 	int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
 	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
 	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
 }
 }
 
 
-static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+	amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+					   u16 devid, u32 flags, u32 ext_flags)
 {
 {
 	if (flags & ACPI_DEVFLAG_INITPASS)
 	if (flags & ACPI_DEVFLAG_INITPASS)
 		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
 		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
 		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
 		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
 	if (flags & ACPI_DEVFLAG_LINT1)
 	if (flags & ACPI_DEVFLAG_LINT1)
 		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
 		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-}
 
 
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-	amd_iommu_rlookup_table[devid] = iommu;
+	set_iommu_for_device(iommu, devid);
 }
 }
 
 
+/*
+ * Reads the device exclusion range from ACPI and initialize IOMMU with
+ * it
+ */
 static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 {
 {
 	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
 	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 		return;
 		return;
 
 
 	if (iommu) {
 	if (iommu) {
+		/*
+		 * We only can configure exclusion ranges per IOMMU, not
+		 * per device. But we can enable the exclusion range per
+		 * device. This is done here
+		 */
 		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
 		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
 		iommu->exclusion_start = m->range_start;
 		iommu->exclusion_start = m->range_start;
 		iommu->exclusion_length = m->range_length;
 		iommu->exclusion_length = m->range_length;
 	}
 	}
 }
 }
 
 
+/*
+ * This function reads some important data from the IOMMU PCI space and
+ * initializes the driver data structure with it. It reads the hardware
+ * capabilities and the first/last device entries
+ */
 static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 {
 {
 	int bus = PCI_BUS(iommu->devid);
 	int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 	iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
 	iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
 
 
 	range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
 	range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
-	iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
+					 MMIO_GET_FD(range));
+	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
+					MMIO_GET_LD(range));
 }
 }
 
 
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
 static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 					struct ivhd_header *h)
 					struct ivhd_header *h)
 {
 {
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	u8 *end = p, flags = 0;
 	u8 *end = p, flags = 0;
 	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
 	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
 	u32 ext_flags = 0;
 	u32 ext_flags = 0;
-	bool alias = 0;
+	bool alias = false;
 	struct ivhd_entry *e;
 	struct ivhd_entry *e;
 
 
 	/*
 	/*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 		case IVHD_DEV_ALL:
 		case IVHD_DEV_ALL:
 			for (dev_i = iommu->first_device;
 			for (dev_i = iommu->first_device;
 					dev_i <= iommu->last_device; ++dev_i)
 					dev_i <= iommu->last_device; ++dev_i)
-				set_dev_entry_from_acpi(dev_i, e->flags, 0);
+				set_dev_entry_from_acpi(iommu, dev_i,
+							e->flags, 0);
 			break;
 			break;
 		case IVHD_DEV_SELECT:
 		case IVHD_DEV_SELECT:
 			devid = e->devid;
 			devid = e->devid;
-			set_dev_entry_from_acpi(devid, e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			break;
 			break;
 		case IVHD_DEV_SELECT_RANGE_START:
 		case IVHD_DEV_SELECT_RANGE_START:
 			devid_start = e->devid;
 			devid_start = e->devid;
 			flags = e->flags;
 			flags = e->flags;
 			ext_flags = 0;
 			ext_flags = 0;
-			alias = 0;
+			alias = false;
 			break;
 			break;
 		case IVHD_DEV_ALIAS:
 		case IVHD_DEV_ALIAS:
 			devid = e->devid;
 			devid = e->devid;
 			devid_to = e->ext >> 8;
 			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(devid, e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
 			break;
 		case IVHD_DEV_ALIAS_RANGE:
 		case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 			flags = e->flags;
 			flags = e->flags;
 			devid_to = e->ext >> 8;
 			devid_to = e->ext >> 8;
 			ext_flags = 0;
 			ext_flags = 0;
-			alias = 1;
+			alias = true;
 			break;
 			break;
 		case IVHD_DEV_EXT_SELECT:
 		case IVHD_DEV_EXT_SELECT:
 			devid = e->devid;
 			devid = e->devid;
-			set_dev_entry_from_acpi(devid, e->flags, e->ext);
+			set_dev_entry_from_acpi(iommu, devid, e->flags,
+						e->ext);
 			break;
 			break;
 		case IVHD_DEV_EXT_SELECT_RANGE:
 		case IVHD_DEV_EXT_SELECT_RANGE:
 			devid_start = e->devid;
 			devid_start = e->devid;
 			flags = e->flags;
 			flags = e->flags;
 			ext_flags = e->ext;
 			ext_flags = e->ext;
-			alias = 0;
+			alias = false;
 			break;
 			break;
 		case IVHD_DEV_RANGE_END:
 		case IVHD_DEV_RANGE_END:
 			devid = e->devid;
 			devid = e->devid;
 			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
 			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
 				if (alias)
 				if (alias)
 					amd_iommu_alias_table[dev_i] = devid_to;
 					amd_iommu_alias_table[dev_i] = devid_to;
-				set_dev_entry_from_acpi(
+				set_dev_entry_from_acpi(iommu,
 						amd_iommu_alias_table[dev_i],
 						amd_iommu_alias_table[dev_i],
 						flags, ext_flags);
 						flags, ext_flags);
 			}
 			}
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	}
 	}
 }
 }
 
 
+/* Initializes the device->iommu mapping for the driver */
 static int __init init_iommu_devices(struct amd_iommu *iommu)
 static int __init init_iommu_devices(struct amd_iommu *iommu)
 {
 {
 	u16 i;
 	u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
 	}
 	}
 }
 }
 
 
+/*
+ * This function clues the initialization function for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
 static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 {
 {
 	spin_lock_init(&iommu->lock);
 	spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
 static int __init init_iommu_all(struct acpi_table_header *table)
 static int __init init_iommu_all(struct acpi_table_header *table)
 {
 {
 	u8 *p = (u8 *)table, *end = (u8 *)table;
 	u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	struct amd_iommu *iommu;
 	struct amd_iommu *iommu;
 	int ret;
 	int ret;
 
 
-	INIT_LIST_HEAD(&amd_iommu_list);
-
 	end += table->length;
 	end += table->length;
 	p += IVRS_HEADER_LENGTH;
 	p += IVRS_HEADER_LENGTH;
 
 
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	return 0;
 	return 0;
 }
 }
 
 
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping reanges).
+ *
+ ****************************************************************************/
+
 static void __init free_unity_maps(void)
 static void __init free_unity_maps(void)
 {
 {
 	struct unity_map_entry *entry, *next;
 	struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
 	}
 	}
 }
 }
 
 
+/* called when we find an exclusion range definition in ACPI */
 static int __init init_exclusion_range(struct ivmd_header *m)
 static int __init init_exclusion_range(struct ivmd_header *m)
 {
 {
 	int i;
 	int i;
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
 	return 0;
 	return 0;
 }
 }
 
 
+/* called for unity map ACPI definition */
 static int __init init_unity_map_range(struct ivmd_header *m)
 static int __init init_unity_map_range(struct ivmd_header *m)
 {
 {
 	struct unity_map_entry *e = 0;
 	struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 	return 0;
 	return 0;
 }
 }
 
 
+/* iterates over all memory definitions we find in the ACPI table */
 static int __init init_memory_definitions(struct acpi_table_header *table)
 static int __init init_memory_definitions(struct acpi_table_header *table)
 {
 {
 	u8 *p = (u8 *)table, *end = (u8 *)table;
 	u8 *p = (u8 *)table, *end = (u8 *)table;
 	struct ivmd_header *m;
 	struct ivmd_header *m;
 
 
-	INIT_LIST_HEAD(&amd_iommu_unity_map);
-
 	end += table->length;
 	end += table->length;
 	p += IVRS_HEADER_LENGTH;
 	p += IVRS_HEADER_LENGTH;
 
 
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized
+ */
 static void __init enable_iommus(void)
 static void __init enable_iommus(void)
 {
 {
 	struct amd_iommu *iommu;
 	struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
 	.cls = &amd_iommu_sysdev_class,
 	.cls = &amd_iommu_sysdev_class,
 };
 };
 
 
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * three times:
+ *
+ *	1 pass) Find the highest PCI device id the driver has to handle.
+ *		Upon this information the size of the data structures is
+ *		determined that needs to be allocated.
+ *
+ *	2 pass) Initialize the data structures just allocated with the
+ *		information in the ACPI table about available AMD IOMMUs
+ *		in the system. It also maps the PCI devices in the
+ *		system to specific IOMMUs
+ *
+ *	3 pass) After the basic data structures are allocated and
+ *		initialized we update them with information about memory
+ *		remapping requirements parsed out of the ACPI table in
+ *		this last pass.
+ *
+ * After that the hardware is initialized and ready to go. In the last
+ * step we do some Linux specific things like registering the driver in
+ * the dma_ops interface and initializing the suspend/resume support
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
 int __init amd_iommu_init(void)
 int __init amd_iommu_init(void)
 {
 {
 	int i, ret = 0;
 	int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
 	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
 		return -ENODEV;
 		return -ENODEV;
 
 
-	dev_table_size     = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
-	alias_table_size   = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
-	rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
+	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
 
 
 	ret = -ENOMEM;
 	ret = -ENOMEM;
 
 
 	/* Device table - directly used by all IOMMUs */
 	/* Device table - directly used by all IOMMUs */
-	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 				      get_order(dev_table_size));
 				      get_order(dev_table_size));
 	if (amd_iommu_dev_table == NULL)
 	if (amd_iommu_dev_table == NULL)
 		goto out;
 		goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
 	 * Protection Domain table - maps devices to protection domains
 	 * Protection Domain table - maps devices to protection domains
 	 * This table has the same size as the rlookup_table
 	 * This table has the same size as the rlookup_table
 	 */
 	 */
-	amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+	amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 				     get_order(rlookup_table_size));
 				     get_order(rlookup_table_size));
 	if (amd_iommu_pd_table == NULL)
 	if (amd_iommu_pd_table == NULL)
 		goto free;
 		goto free;
 
 
-	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+					    GFP_KERNEL | __GFP_ZERO,
 					    get_order(MAX_DOMAIN_ID/8));
 					    get_order(MAX_DOMAIN_ID/8));
 	if (amd_iommu_pd_alloc_bitmap == NULL)
 	if (amd_iommu_pd_alloc_bitmap == NULL)
 		goto free;
 		goto free;
 
 
 	/*
 	/*
-	 * memory is allocated now; initialize the device table with all zeroes
-	 * and let all alias entries point to itself
+	 * let all alias entries point to itself
 	 */
 	 */
-	memset(amd_iommu_dev_table, 0, dev_table_size);
 	for (i = 0; i < amd_iommu_last_bdf; ++i)
 	for (i = 0; i < amd_iommu_last_bdf; ++i)
 		amd_iommu_alias_table[i] = i;
 		amd_iommu_alias_table[i] = i;
 
 
-	memset(amd_iommu_pd_table, 0, rlookup_table_size);
-	memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
-
 	/*
 	/*
 	 * never allocate domain 0 because its used as the non-allocated and
 	 * never allocate domain 0 because its used as the non-allocated and
 	 * error value placeholder
 	 * error value placeholder
@@ -795,24 +981,19 @@ out:
 	return ret;
 	return ret;
 
 
 free:
 free:
-	if (amd_iommu_pd_alloc_bitmap)
-		free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
 
 
-	if (amd_iommu_pd_table)
-		free_pages((unsigned long)amd_iommu_pd_table,
-				get_order(rlookup_table_size));
+	free_pages((unsigned long)amd_iommu_pd_table,
+		   get_order(rlookup_table_size));
 
 
-	if (amd_iommu_rlookup_table)
-		free_pages((unsigned long)amd_iommu_rlookup_table,
-				get_order(rlookup_table_size));
+	free_pages((unsigned long)amd_iommu_rlookup_table,
+		   get_order(rlookup_table_size));
 
 
-	if (amd_iommu_alias_table)
-		free_pages((unsigned long)amd_iommu_alias_table,
-				get_order(alias_table_size));
+	free_pages((unsigned long)amd_iommu_alias_table,
+		   get_order(alias_table_size));
 
 
-	if (amd_iommu_dev_table)
-		free_pages((unsigned long)amd_iommu_dev_table,
-				get_order(dev_table_size));
+	free_pages((unsigned long)amd_iommu_dev_table,
+		   get_order(dev_table_size));
 
 
 	free_iommu_all();
 	free_iommu_all();
 
 
@@ -821,6 +1002,13 @@ free:
 	goto out;
 	goto out;
 }
 }
 
 
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
 static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 {
 {
 	return 0;
 	return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 
 
 void __init amd_iommu_detect(void)
 void __init amd_iommu_detect(void)
 {
 {
-	if (swiotlb || no_iommu || iommu_detected)
+	if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 		return;
 
 
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
 	}
 	}
 }
 }
 
 
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
 static int __init parse_amd_iommu_options(char *str)
 static int __init parse_amd_iommu_options(char *str)
 {
 {
 	for (; *str; ++str) {
 	for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
 
 
 static int __init parse_amd_iommu_size_options(char *str)
 static int __init parse_amd_iommu_size_options(char *str)
 {
 {
-	for (; *str; ++str) {
-		if (strcmp(str, "32M") == 0)
-			amd_iommu_aperture_order = 25;
-		if (strcmp(str, "64M") == 0)
-			amd_iommu_aperture_order = 26;
-		if (strcmp(str, "128M") == 0)
-			amd_iommu_aperture_order = 27;
-		if (strcmp(str, "256M") == 0)
-			amd_iommu_aperture_order = 28;
-		if (strcmp(str, "512M") == 0)
-			amd_iommu_aperture_order = 29;
-		if (strcmp(str, "1G") == 0)
-			amd_iommu_aperture_order = 30;
-	}
+	unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
+
+	if ((order > 24) && (order < 31))
+		amd_iommu_aperture_order = order;
 
 
 	return 1;
 	return 1;
 }
 }

+ 1 - 0
arch/x86/kernel/aperture_64.c

@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <linux/suspend.h>
 #include <asm/e820.h>
 #include <asm/e820.h>
 #include <asm/io.h>
 #include <asm/io.h>
+#include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/gart.h>
 #include <asm/pci-direct.h>
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
 #include <asm/dma.h>

+ 88 - 87
arch/x86/kernel/apic_32.c

@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 /*
 /*
  * Debug level, exported for io_apic.c
  * Debug level, exported for io_apic.c
  */
  */
-int apic_verbosity;
+unsigned int apic_verbosity;
 
 
 int pic_mode;
 int pic_mode;
 
 
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
 	/* Level triggered for 82489DX */
 	/* Level triggered for 82489DX */
 	if (!lapic_is_integrated())
 	if (!lapic_is_integrated())
 		v |= APIC_LVT_LEVEL_TRIGGER;
 		v |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT0, v);
+	apic_write(APIC_LVT0, v);
 }
 }
 
 
 /**
 /**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
  * this function twice on the boot CPU, once with a bogus timeout
  * this function twice on the boot CPU, once with a bogus timeout
  * value, second time for real. The other (noncalibrating) CPUs
  * value, second time for real. The other (noncalibrating) CPUs
  * call this function only once, with the real, calibrated value.
  * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
  */
  */
 static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 {
 {
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 	if (!irqen)
 	if (!irqen)
 		lvtt_value |= APIC_LVT_MASKED;
 		lvtt_value |= APIC_LVT_MASKED;
 
 
-	apic_write_around(APIC_LVTT, lvtt_value);
+	apic_write(APIC_LVTT, lvtt_value);
 
 
 	/*
 	/*
 	 * Divide PICLK by 16
 	 * Divide PICLK by 16
 	 */
 	 */
 	tmp_value = apic_read(APIC_TDCR);
 	tmp_value = apic_read(APIC_TDCR);
-	apic_write_around(APIC_TDCR, (tmp_value
-				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-				| APIC_TDR_DIV_16);
+	apic_write(APIC_TDCR,
+		   (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+		   APIC_TDR_DIV_16);
 
 
 	if (!oneshot)
 	if (!oneshot)
-		apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+		apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
 }
 }
 
 
 /*
 /*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 static int lapic_next_event(unsigned long delta,
 static int lapic_next_event(unsigned long delta,
 			    struct clock_event_device *evt)
 			    struct clock_event_device *evt)
 {
 {
-	apic_write_around(APIC_TMICT, delta);
+	apic_write(APIC_TMICT, delta);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_SHUTDOWN:
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		v = apic_read(APIC_LVTT);
 		v = apic_read(APIC_LVTT);
 		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
 		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write_around(APIC_LVTT, v);
+		apic_write(APIC_LVTT, v);
 		break;
 		break;
 	case CLOCK_EVT_MODE_RESUME:
 	case CLOCK_EVT_MODE_RESUME:
 		/* Nothing to do here */
 		/* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	}
 	}
 }
 }
 
 
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
 {
 {
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 	const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
 	const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
 	long delta, deltapm;
 	long delta, deltapm;
 	int pm_referenced = 0;
 	int pm_referenced = 0;
 
 
-	/*
-	 * The local apic timer can be disabled via the kernel
-	 * commandline or from the CPU detection code. Register the lapic
-	 * timer as a dummy clock event source on SMP systems, so the
-	 * broadcast mechanism is used. On UP systems simply ignore it.
-	 */
-	if (local_apic_timer_disabled) {
-		/* No broadcast on UP ! */
-		if (num_possible_cpus() > 1) {
-			lapic_clockevent.mult = 1;
-			setup_APIC_timer();
-		}
-		return;
-	}
-
-	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
-		    "calibrating APIC timer ...\n");
-
 	local_irq_disable();
 	local_irq_disable();
 
 
 	/* Replace the global interrupt handler */
 	/* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
 		    calibration_result / (1000000 / HZ),
 		    calibration_result / (1000000 / HZ),
 		    calibration_result % (1000000 / HZ));
 		    calibration_result % (1000000 / HZ));
 
 
-	local_apic_timer_verify_ok = 1;
-
 	/*
 	/*
 	 * Do a sanity check on the APIC calibration result
 	 * Do a sanity check on the APIC calibration result
 	 */
 	 */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
 		local_irq_enable();
 		local_irq_enable();
 		printk(KERN_WARNING
 		printk(KERN_WARNING
 		       "APIC frequency too slow, disabling apic timer\n");
 		       "APIC frequency too slow, disabling apic timer\n");
-		/* No broadcast on UP ! */
-		if (num_possible_cpus() > 1)
-			setup_APIC_timer();
-		return;
+		return -1;
 	}
 	}
 
 
+	local_apic_timer_verify_ok = 1;
+
 	/* We trust the pm timer based calibration */
 	/* We trust the pm timer based calibration */
 	if (!pm_referenced) {
 	if (!pm_referenced) {
 		apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
 		apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
 	if (!local_apic_timer_verify_ok) {
 	if (!local_apic_timer_verify_ok) {
 		printk(KERN_WARNING
 		printk(KERN_WARNING
 		       "APIC timer disabled due to verification failure.\n");
 		       "APIC timer disabled due to verification failure.\n");
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+	/*
+	 * The local apic timer can be disabled via the kernel
+	 * commandline or from the CPU detection code. Register the lapic
+	 * timer as a dummy clock event source on SMP systems, so the
+	 * broadcast mechanism is used. On UP systems simply ignore it.
+	 */
+	if (local_apic_timer_disabled) {
 		/* No broadcast on UP ! */
 		/* No broadcast on UP ! */
-		if (num_possible_cpus() == 1)
-			return;
-	} else {
-		/*
-		 * If nmi_watchdog is set to IO_APIC, we need the
-		 * PIT/HPET going.  Otherwise register lapic as a dummy
-		 * device.
-		 */
-		if (nmi_watchdog != NMI_IO_APIC)
-			lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-		else
-			printk(KERN_WARNING "APIC timer registered as dummy,"
-				" due to nmi_watchdog=%d!\n", nmi_watchdog);
+		if (num_possible_cpus() > 1) {
+			lapic_clockevent.mult = 1;
+			setup_APIC_timer();
+		}
+		return;
 	}
 	}
 
 
+	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+		    "calibrating APIC timer ...\n");
+
+	if (calibrate_APIC_clock()) {
+		/* No broadcast on UP ! */
+		if (num_possible_cpus() > 1)
+			setup_APIC_timer();
+		return;
+	}
+
+	/*
+	 * If nmi_watchdog is set to IO_APIC, we need the
+	 * PIT/HPET going.  Otherwise register lapic as a dummy
+	 * device.
+	 */
+	if (nmi_watchdog != NMI_IO_APIC)
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+	else
+		printk(KERN_WARNING "APIC timer registered as dummy,"
+			" due to nmi_watchdog=%d!\n", nmi_watchdog);
+
 	/* Setup the lapic or request the broadcast */
 	/* Setup the lapic or request the broadcast */
 	setup_APIC_timer();
 	setup_APIC_timer();
 }
 }
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
 	 */
 	 */
 	if (maxlvt >= 3) {
 	if (maxlvt >= 3) {
 		v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
 		v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-		apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+		apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
 	}
 	}
 	/*
 	/*
 	 * Careful: we have to set masks only first to deassert
 	 * Careful: we have to set masks only first to deassert
 	 * any level-triggered sources.
 	 * any level-triggered sources.
 	 */
 	 */
 	v = apic_read(APIC_LVTT);
 	v = apic_read(APIC_LVTT);
-	apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
 	v = apic_read(APIC_LVT0);
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 	v = apic_read(APIC_LVT1);
 	v = apic_read(APIC_LVT1);
-	apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
 	if (maxlvt >= 4) {
 	if (maxlvt >= 4) {
 		v = apic_read(APIC_LVTPC);
 		v = apic_read(APIC_LVTPC);
-		apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+		apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
 	}
 	}
 
 
 	/* lets not touch this if we didn't frob it */
 	/* lets not touch this if we didn't frob it */
 #ifdef CONFIG_X86_MCE_P4THERMAL
 #ifdef CONFIG_X86_MCE_P4THERMAL
 	if (maxlvt >= 5) {
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
 		v = apic_read(APIC_LVTTHMR);
-		apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
 	}
 	}
 #endif
 #endif
 	/*
 	/*
 	 * Clean APIC state for other OSs:
 	 * Clean APIC state for other OSs:
 	 */
 	 */
-	apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-	apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+	apic_write(APIC_LVTT, APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED);
+	apic_write(APIC_LVT1, APIC_LVT_MASKED);
 	if (maxlvt >= 3)
 	if (maxlvt >= 3)
-		apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+		apic_write(APIC_LVTERR, APIC_LVT_MASKED);
 	if (maxlvt >= 4)
 	if (maxlvt >= 4)
-		apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
 #ifdef CONFIG_X86_MCE_P4THERMAL
 	if (maxlvt >= 5)
 	if (maxlvt >= 5)
-		apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
+		apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
 #endif
 #endif
 	/* Integrated APIC (!82489DX) ? */
 	/* Integrated APIC (!82489DX) ? */
 	if (lapic_is_integrated()) {
 	if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
 	 */
 	 */
 	value = apic_read(APIC_SPIV);
 	value = apic_read(APIC_SPIV);
 	value &= ~APIC_SPIV_APIC_ENABLED;
 	value &= ~APIC_SPIV_APIC_ENABLED;
-	apic_write_around(APIC_SPIV, value);
+	apic_write(APIC_SPIV, value);
 
 
 	/*
 	/*
 	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
 	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
 	apic_wait_icr_idle();
 	apic_wait_icr_idle();
 
 
 	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
 	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-	apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-				| APIC_DM_INIT);
+	apic_write(APIC_ICR,
+		   APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
 }
 }
 
 
 /*
 /*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
 	else
 	else
 		value |= APIC_SPIV_FOCUS_DISABLED;
 		value |= APIC_SPIV_FOCUS_DISABLED;
 	value |= SPURIOUS_APIC_VECTOR;
 	value |= SPURIOUS_APIC_VECTOR;
-	apic_write_around(APIC_SPIV, value);
+	apic_write(APIC_SPIV, value);
 
 
 	/*
 	/*
 	 * Set up the virtual wire mode.
 	 * Set up the virtual wire mode.
 	 */
 	 */
-	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_DM_EXTINT);
 	value = APIC_DM_NMI;
 	value = APIC_DM_NMI;
 	if (!lapic_is_integrated())		/* 82489DX */
 	if (!lapic_is_integrated())		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
 		value |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT1, value);
+	apic_write(APIC_LVT1, value);
 }
 }
 
 
 static void __cpuinit lapic_setup_esr(void)
 static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
 
 
 		/* enables sending errors */
 		/* enables sending errors */
 		value = ERROR_APIC_VECTOR;
 		value = ERROR_APIC_VECTOR;
-		apic_write_around(APIC_LVTERR, value);
+		apic_write(APIC_LVTERR, value);
 		/*
 		/*
 		 * spec says clear errors after enabling vector.
 		 * spec says clear errors after enabling vector.
 		 */
 		 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
 	 */
 	 */
 	value = apic_read(APIC_TASKPRI);
 	value = apic_read(APIC_TASKPRI);
 	value &= ~APIC_TPRI_MASK;
 	value &= ~APIC_TPRI_MASK;
-	apic_write_around(APIC_TASKPRI, value);
+	apic_write(APIC_TASKPRI, value);
 
 
 	/*
 	/*
 	 * After a crash, we no longer service the interrupts and a pending
 	 * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
 	 * Set spurious IRQ vector
 	 * Set spurious IRQ vector
 	 */
 	 */
 	value |= SPURIOUS_APIC_VECTOR;
 	value |= SPURIOUS_APIC_VECTOR;
-	apic_write_around(APIC_SPIV, value);
+	apic_write(APIC_SPIV, value);
 
 
 	/*
 	/*
 	 * Set up LVT0, LVT1:
 	 * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
 		apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
 				smp_processor_id());
 				smp_processor_id());
 	}
 	}
-	apic_write_around(APIC_LVT0, value);
+	apic_write(APIC_LVT0, value);
 
 
 	/*
 	/*
 	 * only the BP should see the LINT1 NMI signal, obviously.
 	 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
 		value = APIC_DM_NMI | APIC_LVT_MASKED;
 		value = APIC_DM_NMI | APIC_LVT_MASKED;
 	if (!integrated)		/* 82489DX */
 	if (!integrated)		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
 		value |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT1, value);
+	apic_write(APIC_LVT1, value);
 }
 }
 
 
 void __cpuinit end_local_APIC_setup(void)
 void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
 	/* Disable the local apic timer */
 	/* Disable the local apic timer */
 	value = apic_read(APIC_LVTT);
 	value = apic_read(APIC_LVTT);
 	value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
 	value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-	apic_write_around(APIC_LVTT, value);
+	apic_write(APIC_LVTT, value);
 
 
 	setup_apic_nmi_watchdog(NULL);
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 	apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
 
 
 int __init APIC_init_uniprocessor(void)
 int __init APIC_init_uniprocessor(void)
 {
 {
-	if (disable_apic)
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-
 	if (!smp_found_config && !cpu_has_apic)
 	if (!smp_found_config && !cpu_has_apic)
 		return -1;
 		return -1;
 
 
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 		value &= ~APIC_VECTOR_MASK;
 		value &= ~APIC_VECTOR_MASK;
 		value |= APIC_SPIV_APIC_ENABLED;
 		value |= APIC_SPIV_APIC_ENABLED;
 		value |= 0xf;
 		value |= 0xf;
-		apic_write_around(APIC_SPIV, value);
+		apic_write(APIC_SPIV, value);
 
 
 		if (!virt_wire_setup) {
 		if (!virt_wire_setup) {
 			/*
 			/*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
 				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
 			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
 			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
 			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
 			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-			apic_write_around(APIC_LVT0, value);
+			apic_write(APIC_LVT0, value);
 		} else {
 		} else {
 			/* Disable LVT0 */
 			/* Disable LVT0 */
-			apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+			apic_write(APIC_LVT0, APIC_LVT_MASKED);
 		}
 		}
 
 
 		/*
 		/*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
 			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
 		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
 		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
 		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
 		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-		apic_write_around(APIC_LVT1, value);
+		apic_write(APIC_LVT1, value);
 	}
 	}
 }
 }
 
 
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
 static int __init parse_nolapic(char *arg)
 static int __init parse_nolapic(char *arg)
 {
 {
 	disable_apic = 1;
 	disable_apic = 1;
-	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	setup_clear_cpu_cap(X86_FEATURE_APIC);
 	return 0;
 	return 0;
 }
 }
 early_param("nolapic", parse_nolapic);
 early_param("nolapic", parse_nolapic);

+ 15 - 11
arch/x86/kernel/apic_64.c

@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 /*
 /*
  * Debug level, exported for io_apic.c
  * Debug level, exported for io_apic.c
  */
  */
-int apic_verbosity;
+unsigned int apic_verbosity;
 
 
 /* Have we found an MP table */
 /* Have we found an MP table */
 int smp_found_config;
 int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
 
 
 #define TICK_COUNT 100000000
 #define TICK_COUNT 100000000
 
 
-static void __init calibrate_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
 {
 {
 	unsigned apic, apic_start;
 	unsigned apic, apic_start;
 	unsigned long tsc, tsc_start;
 	unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
 		clockevent_delta2ns(0xF, &lapic_clockevent);
 		clockevent_delta2ns(0xF, &lapic_clockevent);
 
 
 	calibration_result = result / HZ;
 	calibration_result = result / HZ;
+
+	/*
+	 * Do a sanity check on the APIC calibration result
+	 */
+	if (calibration_result < (1000000 / HZ)) {
+		printk(KERN_WARNING
+			"APIC frequency too slow, disabling apic timer\n");
+		return -1;
+	}
+
+	return 0;
 }
 }
 
 
 /*
 /*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
 	}
 	}
 
 
 	printk(KERN_INFO "Using local APIC timer interrupts.\n");
 	printk(KERN_INFO "Using local APIC timer interrupts.\n");
-	calibrate_APIC_clock();
-
-	/*
-	 * Do a sanity check on the APIC calibration result
-	 */
-	if (calibration_result < (1000000 / HZ)) {
-		printk(KERN_WARNING
-		       "APIC frequency too slow, disabling apic timer\n");
+	if (calibrate_APIC_clock()) {
 		/* No broadcast on UP ! */
 		/* No broadcast on UP ! */
 		if (num_possible_cpus() > 1)
 		if (num_possible_cpus() > 1)
 			setup_APIC_timer();
 			setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
 static __init int setup_disableapic(char *str)
 static __init int setup_disableapic(char *str)
 {
 {
 	disable_apic = 1;
 	disable_apic = 1;
-	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	setup_clear_cpu_cap(X86_FEATURE_APIC);
 	return 0;
 	return 0;
 }
 }
 early_param("disableapic", setup_disableapic);
 early_param("disableapic", setup_disableapic);

+ 11 - 0
arch/x86/kernel/asm-offsets_64.c

@@ -18,6 +18,8 @@
 #include <asm/ia32.h>
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
 #include <asm/bootparam.h>
 
 
+#include <xen/interface/xen.h>
+
 #define __NO_STUBS 1
 #define __NO_STUBS 1
 #undef __SYSCALL
 #undef __SYSCALL
 #undef _ASM_X86_64_UNISTD_H_
 #undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
 	OFFSET(BP_version, boot_params, hdr.version);
+
+	BLANK();
+	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
 	return 0;
 	return 0;
 }
 }

+ 48 - 0
arch/x86/kernel/bios_uv.c

@@ -0,0 +1,48 @@
+/*
+ * BIOS run time interface routines.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/uv/bios.h>
+
+const char *
+x86_bios_strerror(long status)
+{
+	const char *str;
+	switch (status) {
+	case  0: str = "Call completed without error"; break;
+	case -1: str = "Not implemented"; break;
+	case -2: str = "Invalid argument"; break;
+	case -3: str = "Call completed with error"; break;
+	default: str = "Unknown BIOS status code"; break;
+	}
+	return str;
+}
+
+long
+x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
+		   unsigned long *drift_info)
+{
+	struct uv_bios_retval isrv;
+
+	BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
+	*ticks_per_second = isrv.v0;
+	*drift_info = isrv.v1;
+	return isrv.status;
+}
+EXPORT_SYMBOL_GPL(x86_bios_freq_base);

+ 0 - 2
arch/x86/kernel/cpu/amd.c

@@ -24,8 +24,6 @@
 extern void vide(void);
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 __asm__(".align 4\nvide: ret");
 
 
-int force_mwait __cpuinitdata;
-
 static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 {
 {
 	if (cpuid_eax(0x80000000) >= 0x80000007) {
 	if (cpuid_eax(0x80000000) >= 0x80000007) {

+ 2 - 0
arch/x86/kernel/cpu/amd_64.c

@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
 	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
 	if (c->x86_power & (1<<8))
 	if (c->x86_power & (1<<8))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
 }
 }
 
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)

+ 1 - 22
arch/x86/kernel/cpu/bugs.c

@@ -131,13 +131,7 @@ static void __init check_popad(void)
  *   (for due to lack of "invlpg" and working WP on a i386)
  *   (for due to lack of "invlpg" and working WP on a i386)
  * - In order to run on anything without a TSC, we need to be
  * - In order to run on anything without a TSC, we need to be
  *   compiled for a i486.
  *   compiled for a i486.
- * - In order to support the local APIC on a buggy Pentium machine,
- *   we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
- *   which happens implicitly if compiled for a Pentium or lower
- *   (unless an advanced selection of CPU features is used) as an
- *   otherwise config implies a properly working local APIC without
- *   the need to do extra reads from the APIC.
-*/
+ */
 
 
 static void __init check_config(void)
 static void __init check_config(void)
 {
 {
@@ -151,21 +145,6 @@ static void __init check_config(void)
 	if (boot_cpu_data.x86 == 3)
 	if (boot_cpu_data.x86 == 3)
 		panic("Kernel requires i486+ for 'invlpg' and other features");
 		panic("Kernel requires i486+ for 'invlpg' and other features");
 #endif
 #endif
-
-/*
- * If we were told we had a good local APIC, check for buggy Pentia,
- * i.e. all B steppings and the C2 stepping of P54C when using their
- * integrated APIC (see 11AP erratum in "Pentium Processor
- * Specification Update").
- */
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
-	    && cpu_has_apic
-	    && boot_cpu_data.x86 == 5
-	    && boot_cpu_data.x86_model == 2
-	    && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
-		panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
-#endif
 }
 }
 
 
 
 

+ 2 - 13
arch/x86/kernel/cpu/common_64.c

@@ -7,15 +7,13 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/kgdb.h>
 #include <linux/kgdb.h>
 #include <linux/topology.h>
 #include <linux/topology.h>
-#include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
 #include <linux/smp.h>
-#include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/percpu.h>
-#include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
 #include <asm/msr.h>
 #include <asm/io.h>
 #include <asm/io.h>
+#include <asm/linkage.h>
 #include <asm/mmu_context.h>
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 			c->x86_capability[2] = cpuid_edx(0x80860001);
 			c->x86_capability[2] = cpuid_edx(0x80860001);
 	}
 	}
 
 
-	c->extended_cpuid_level = cpuid_eax(0x80000000);
 	if (c->extended_cpuid_level >= 0x80000007)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 		c->x86_power = cpuid_edx(0x80000007);
 
 
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 		c->x86_phys_bits = eax & 0xff;
 		c->x86_phys_bits = eax & 0xff;
 	}
 	}
 
 
-	/* Assume all 64-bit CPUs support 32-bit syscall */
-	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-
 	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
 	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
 	    cpu_devs[c->x86_vendor]->c_early_init)
 	    cpu_devs[c->x86_vendor]->c_early_init)
 		cpu_devs[c->x86_vendor]->c_early_init(c);
 		cpu_devs[c->x86_vendor]->c_early_init(c);
 
 
 	validate_pat_support(c);
 	validate_pat_support(c);
-
-	/* early_param could clear that, but recall get it set again */
-	if (disable_apic)
-		clear_cpu_cap(c, X86_FEATURE_APIC);
 }
 }
 
 
 /*
 /*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
 }
 }
 
 
 char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
 char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-			   DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
+			   DEBUG_STKSZ] __page_aligned_bss;
 
 
 extern asmlinkage void ignore_sysret(void);
 extern asmlinkage void ignore_sysret(void);
 
 

+ 10 - 0
arch/x86/kernel/cpu/intel.c

@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has_bts)
 	if (cpu_has_bts)
 		ds_init_intel(c);
 		ds_init_intel(c);
 
 
+	/*
+	 * See if we have a good local APIC by checking for buggy Pentia,
+	 * i.e. all B steppings and the C2 stepping of P54C when using their
+	 * integrated APIC (see 11AP erratum in "Pentium Processor
+	 * Specification Update").
+	 */
+	if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+	    (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+		set_cpu_cap(c, X86_FEATURE_11AP);
+
 #ifdef CONFIG_X86_NUMAQ
 #ifdef CONFIG_X86_NUMAQ
 	numaq_tsc_disable();
 	numaq_tsc_disable();
 #endif
 #endif

+ 2 - 2
arch/x86/kernel/cpu/mcheck/p4.c

@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	/* The temperature transition interrupt handler setup */
 	/* The temperature transition interrupt handler setup */
 	h = THERMAL_APIC_VECTOR;		/* our delivery vector */
 	h = THERMAL_APIC_VECTOR;		/* our delivery vector */
 	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
 	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
-	apic_write_around(APIC_LVTTHMR, h);
+	apic_write(APIC_LVTTHMR, h);
 
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
 	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
 	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
 	wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
 
 
 	l = apic_read(APIC_LVTTHMR);
 	l = apic_read(APIC_LVTTHMR);
-	apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 
 
 	/* enable thermal throttle processing */
 	/* enable thermal throttle processing */

+ 4 - 29
arch/x86/kernel/e820.c

@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
 	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
 		count++;
 		count++;
 
 
-	printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+			 count, start, end);
 	for (i = 0; i < count; i++) {
 	for (i = 0; i < count; i++) {
 		struct early_res *r = &early_res[i];
 		struct early_res *r = &early_res[i];
 		printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
 		printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
 	}
 	}
 }
 }
 
 
-/*
- * Non-standard memory setup can be specified via this quirk:
- */
-char * (*arch_memory_setup_quirk)(void);
-
 char *__init default_machine_specific_memory_setup(void)
 char *__init default_machine_specific_memory_setup(void)
 {
 {
 	char *who = "BIOS-e820";
 	char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
 
 
 char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 {
 {
-	if (arch_memory_setup_quirk) {
-		char *who = arch_memory_setup_quirk();
+	if (x86_quirks->arch_memory_setup) {
+		char *who = x86_quirks->arch_memory_setup();
 
 
 		if (who)
 		if (who)
 			return who;
 			return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 	e820_print_map(who);
 }
 }
-
-#ifdef CONFIG_X86_64
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
-	int i;
-
-	if (slot < 0 || slot >= e820.nr_map)
-		return -1;
-	for (i = slot; i < e820.nr_map; i++) {
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		break;
-	}
-	if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
-		return -1;
-	*addr = e820.map[i].addr;
-	*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
-		max_pfn << PAGE_SHIFT) - *addr;
-	return i + 1;
-}
-#endif

+ 1 - 4
arch/x86/kernel/early-quirks.c

@@ -16,10 +16,7 @@
 #include <asm/dma.h>
 #include <asm/dma.h>
 #include <asm/io_apic.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
 #include <asm/apic.h>
-
-#ifdef CONFIG_GART_IOMMU
-#include <asm/gart.h>
-#endif
+#include <asm/iommu.h>
 
 
 static void __init fix_hypertransport_config(int num, int slot, int func)
 static void __init fix_hypertransport_config(int num, int slot, int func)
 {
 {

+ 8 - 16
arch/x86/kernel/entry_32.S

@@ -332,7 +332,7 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 	GET_THREAD_INFO(%ebp)
 
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 	jae syscall_badsys
@@ -370,7 +370,7 @@ ENTRY(system_call)
 	GET_THREAD_INFO(%ebp)
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
 					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 	jae syscall_badsys
@@ -383,10 +383,6 @@ syscall_exit:
 					# setting need_resched or sigpending
 					# setting need_resched or sigpending
 					# between sampling and the iret
 					# between sampling and the iret
 	TRACE_IRQS_OFF
 	TRACE_IRQS_OFF
-	testl $X86_EFLAGS_TF,PT_EFLAGS(%esp)	# If tracing set singlestep flag on exit
-	jz no_singlestep
-	orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
 	movl TI_flags(%ebp), %ecx
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
 	jne syscall_exit_work
@@ -514,12 +510,8 @@ END(work_pending)
 syscall_trace_entry:
 syscall_trace_entry:
 	movl $-ENOSYS,PT_EAX(%esp)
 	movl $-ENOSYS,PT_EAX(%esp)
 	movl %esp, %eax
 	movl %esp, %eax
-	xorl %edx,%edx
-	call do_syscall_trace
-	cmpl $0, %eax
-	jne resume_userspace		# ret != 0 -> running under PTRACE_SYSEMU,
-					# so must skip actual syscall
-	movl PT_ORIG_EAX(%esp), %eax
+	call syscall_trace_enter
+	/* What it returned is what we'll actually use.  */
 	cmpl $(nr_syscalls), %eax
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jnae syscall_call
 	jmp syscall_exit
 	jmp syscall_exit
@@ -528,14 +520,13 @@ END(syscall_trace_entry)
 	# perform syscall exit tracing
 	# perform syscall exit tracing
 	ALIGN
 	ALIGN
 syscall_exit_work:
 syscall_exit_work:
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+	testb $_TIF_WORK_SYSCALL_EXIT, %cl
 	jz work_pending
 	jz work_pending
 	TRACE_IRQS_ON
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
 					# schedule() instead
 					# schedule() instead
 	movl %esp, %eax
 	movl %esp, %eax
-	movl $1, %edx
-	call do_syscall_trace
+	call syscall_trace_leave
 	jmp resume_userspace
 	jmp resume_userspace
 END(syscall_exit_work)
 END(syscall_exit_work)
 	CFI_ENDPROC
 	CFI_ENDPROC
@@ -1024,6 +1015,7 @@ ENDPROC(kernel_thread_helper)
 ENTRY(xen_sysenter_target)
 ENTRY(xen_sysenter_target)
 	RING0_INT_FRAME
 	RING0_INT_FRAME
 	addl $5*4, %esp		/* remove xen-provided frame */
 	addl $5*4, %esp		/* remove xen-provided frame */
+	CFI_ADJUST_CFA_OFFSET -5*4
 	jmp sysenter_past_esp
 	jmp sysenter_past_esp
 	CFI_ENDPROC
 	CFI_ENDPROC
 
 

+ 115 - 5
arch/x86/kernel/entry_64.S

@@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	ja badsys
@@ -430,7 +429,12 @@ tracesys:
 	FIXUP_TOP_OF_STACK %rdi
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
 	movq %rsp,%rdi
 	call syscall_trace_enter
 	call syscall_trace_enter
-	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * We don't reload %rax because syscall_trace_enter() returned
+	 * the value it wants us to use in the table lookup.
+	 */
+	LOAD_ARGS ARGOFFSET, 1
 	RESTORE_REST
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
 	cmpq $__NR_syscall_max,%rax
 	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
 	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
@@ -483,7 +487,7 @@ int_very_careful:
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	SAVE_REST
 	/* Check for syscall exit trace */	
 	/* Check for syscall exit trace */	
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
 	jz int_signal
 	pushq %rdi
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
 	CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +495,7 @@ int_very_careful:
 	call syscall_trace_leave
 	call syscall_trace_leave
 	popq %rdi
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
 	CFI_ADJUST_CFA_OFFSET -8
-	andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
 	jmp int_restore_rest
 	jmp int_restore_rest
 	
 	
 int_signal:
 int_signal:
@@ -1189,6 +1193,7 @@ END(device_not_available)
 	/* runs on exception stack */
 	/* runs on exception stack */
 KPROBE_ENTRY(debug)
 KPROBE_ENTRY(debug)
  	INTR_FRAME
  	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8		
 	CFI_ADJUST_CFA_OFFSET 8		
 	paranoidentry do_debug, DEBUG_STACK
 	paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1203,7 @@ KPROBE_END(debug)
 	/* runs on exception stack */	
 	/* runs on exception stack */	
 KPROBE_ENTRY(nmi)
 KPROBE_ENTRY(nmi)
 	INTR_FRAME
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $-1
 	pushq $-1
 	CFI_ADJUST_CFA_OFFSET 8
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_nmi, 0, 0
 	paranoidentry do_nmi, 0, 0
@@ -1211,6 +1217,7 @@ KPROBE_END(nmi)
 
 
 KPROBE_ENTRY(int3)
 KPROBE_ENTRY(int3)
  	INTR_FRAME
  	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
  	pushq $0
  	pushq $0
  	CFI_ADJUST_CFA_OFFSET 8
  	CFI_ADJUST_CFA_OFFSET 8
  	paranoidentry do_int3, DEBUG_STACK
  	paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun)
 	/* runs on exception stack */
 	/* runs on exception stack */
 ENTRY(double_fault)
 ENTRY(double_fault)
 	XCPT_FRAME
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_double_fault
 	paranoidentry do_double_fault
 	jmp paranoid_exit1
 	jmp paranoid_exit1
 	CFI_ENDPROC
 	CFI_ENDPROC
@@ -1253,6 +1261,7 @@ END(segment_not_present)
 	/* runs on exception stack */
 	/* runs on exception stack */
 ENTRY(stack_segment)
 ENTRY(stack_segment)
 	XCPT_FRAME
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_stack_segment
 	paranoidentry do_stack_segment
 	jmp paranoid_exit1
 	jmp paranoid_exit1
 	CFI_ENDPROC
 	CFI_ENDPROC
@@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug)
 	/* runs on exception stack */
 	/* runs on exception stack */
 ENTRY(machine_check)
 ENTRY(machine_check)
 	INTR_FRAME
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8	
 	CFI_ADJUST_CFA_OFFSET 8	
 	paranoidentry do_machine_check
 	paranoidentry do_machine_check
@@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret)
 	sysret
 	sysret
 	CFI_ENDPROC
 	CFI_ENDPROC
 ENDPROC(ignore_sysret)
 ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+	CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+   see the correct pointer to the pt_regs */
+	movq %rdi, %rsp            # we don't return, adjust the stack frame
+	CFI_ENDPROC
+	CFI_DEFAULT_STACK
+11:	incl %gs:pda_irqcount
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
+	cmovzq %gs:pda_irqstackptr,%rsp
+	pushq %rbp			# backlink for old unwinder
+	call xen_evtchn_do_upcall
+	popq %rsp
+	CFI_DEF_CFA_REGISTER rsp
+	decl %gs:pda_irqcount
+	jmp  error_exit
+	CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+	framesz = (RIP-0x30)	/* workaround buggy gas */
+	_frame framesz
+	CFI_REL_OFFSET rcx, 0
+	CFI_REL_OFFSET r11, 8
+	movw %ds,%cx
+	cmpw %cx,0x10(%rsp)
+	CFI_REMEMBER_STATE
+	jne 1f
+	movw %es,%cx
+	cmpw %cx,0x18(%rsp)
+	jne 1f
+	movw %fs,%cx
+	cmpw %cx,0x20(%rsp)
+	jne 1f
+	movw %gs,%cx
+	cmpw %cx,0x28(%rsp)
+	jne 1f
+	/* All segments match their saved values => Category 2 (Bad IRET). */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %r11
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %rcx
+	CFI_ADJUST_CFA_OFFSET 8
+	jmp general_protection
+	CFI_RESTORE_STATE
+1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	SAVE_ALL
+	jmp error_exit
+	CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */

+ 23 - 0
arch/x86/kernel/genx2apic_uv_x.c

@@ -24,6 +24,7 @@
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
 
 
 DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
 EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
 short uv_possible_blades;
 EXPORT_SYMBOL_GPL(uv_possible_blades);
 
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
 static cpumask_t uv_target_cpus(void)
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
 		map_high("MMIOH", mmioh.s.base, shift, map_uc);
 }
 
+static __init void uv_rtc_init(void)
+{
+	long status, ticks_per_sec, drift;
+
+	status =
+	    x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
+					&drift);
+	if (status != 0 || ticks_per_sec < 100000) {
+		printk(KERN_WARNING
+			"unable to determine platform RTC clock frequency, "
+			"guessing.\n");
+		/* BIOS gives wrong value for clock freq. so guess */
+		sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+	} else
+		sn_rtc_cycles_per_second = ticks_per_sec;
+}
+
 static __init void uv_system_init(void)
 {
 	union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
 	gnode_upper = (((unsigned long)node_id.s.node_id) &
 		       ~((1 << n_val) - 1)) << m_val;
 
+	uv_rtc_init();
+
 	for_each_present_cpu(cpu) {
 		nid = cpu_to_node(cpu);
 		pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
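For reference, the hard-coded fallback used by uv_rtc_init() above works out to roughly 33.3 MHz. A trivial, compilable user-space check of that arithmetic, purely illustrative:

	#include <stdio.h>

	int main(void)
	{
		/* Same constant expression as the fallback in uv_rtc_init() above. */
		unsigned long guess = 1000000000000UL / 30000UL;

		printf("fallback sn_rtc_cycles_per_second = %lu (~%.2f MHz)\n",
		       guess, guess / 1e6);
		return 0;
	}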

+ 8 - 3
arch/x86/kernel/head64.c

@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
 static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
 #endif
 
+void __init x86_64_init_pda(void)
+{
+	_cpu_pda = __cpu_pda;
+	cpu_pda(0) = &_boot_cpu_pda;
+	pda_init(0);
+}
+
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 	early_printk("Kernel alive\n");
 
-	_cpu_pda = __cpu_pda;
-	cpu_pda(0) = &_boot_cpu_pda;
-	pda_init(0);
+	x86_64_init_pda();
 
 
 	early_printk("Kernel really alive\n");
 

+ 1 - 0
arch/x86/kernel/head_64.S

@@ -407,6 +407,7 @@ ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
 	.quad   0x0000000000000000
 
+#include "../../x86/xen/xen-head.S"
 	
 	.section .bss, "aw", @nobits
 	.align L1_CACHE_BYTES

+ 29 - 24
arch/x86/kernel/io_apic_32.c

@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
 	/*
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 }
 #endif /* !CONFIG_SMP */
 #endif /* !CONFIG_SMP */
 
 
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
 	unsigned long v;
 	unsigned long v;
 
 
 	v = apic_read(APIC_LVT0);
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 }
 
 
 static void unmask_lapic_irq(unsigned int irq)
 static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
 	unsigned long v;
 	unsigned long v;
 
 
 	v = apic_read(APIC_LVT0);
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 }
 
 
 static struct irq_chip lapic_chip __read_mostly = {
 static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
 	 * The AEOI mode will finish them in the 8259A
 	 * The AEOI mode will finish them in the 8259A
 	 * automatically.
 	 * automatically.
 	 */
 	 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
 	init_8259A(1);
 	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 
 
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
 	pin2  = ioapic_i8259.pin;
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 	apic2 = ioapic_i8259.apic;
 
 
-	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		vector, apic1, pin1, apic2, pin2);
+	apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+		    "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+		    vector, apic1, pin1, apic2, pin2);
 
 
 	/*
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
 	 * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
 		}
 		}
 		clear_IO_APIC_pin(apic1, pin1);
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
 		if (!no_pin1)
-			printk(KERN_ERR "..MP-BIOS bug: "
-			       "8254 timer not connected to IO-APIC\n");
+			apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+				    "8254 timer not connected to IO-APIC\n");
 
 
-		printk(KERN_INFO "...trying to set up timer (IRQ0) "
-		       "through the 8259A ... ");
-		printk("\n..... (found pin %d) ...", pin2);
+		apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+			    "(IRQ0) through the 8259A ...\n");
+		apic_printk(APIC_QUIET, KERN_INFO
+			    "..... (found apic %d pin %d) ...\n", apic2, pin2);
 		/*
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		 */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
 		unmask_IO_APIC_irq(0);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 		if (timer_irq_works()) {
-			printk("works.\n");
+			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
 			timer_through_8259 = 1;
 			timer_through_8259 = 1;
 			if (nmi_watchdog == NMI_IO_APIC) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
 				disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
 		 */
 		 */
 		disable_8259A_irq(0);
 		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
 		clear_IO_APIC_pin(apic2, pin2);
-		printk(" failed.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
 	}
 	}
 
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 	if (nmi_watchdog == NMI_IO_APIC) {
-		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+		apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+			    "through the IO-APIC - disabling NMI Watchdog!\n");
 		nmi_watchdog = NMI_NONE;
 		nmi_watchdog = NMI_NONE;
 	}
 	}
 	timer_ack = 0;
 	timer_ack = 0;
 
 
-	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
 
 	lapic_register_intr(0, vector);
 	lapic_register_intr(0, vector);
-	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
+	apic_write(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 	enable_8259A_irq(0);
 
 
 	if (timer_irq_works()) {
 	if (timer_irq_works()) {
-		printk(" works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 		goto out;
 	}
 	}
 	disable_8259A_irq(0);
 	disable_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
-	printk(" failed.\n");
+	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
 
-	printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as ExtINT IRQ...\n");
 
 
 	init_8259A(0);
 	init_8259A(0);
 	make_8259A_irq(0);
 	make_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_DM_EXTINT);
 
 
 	unlock_ExtINT_logic();
 	unlock_ExtINT_logic();
 
 
 	if (timer_irq_works()) {
 	if (timer_irq_works()) {
-		printk(" works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 		goto out;
 	}
 	}
-	printk(" failed :(.\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
 	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
 	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
-		"report.  Then try booting with the 'noapic' option");
+		"report.  Then try booting with the 'noapic' option.\n");
 out:
 out:
 	local_irq_restore(flags);
 	local_irq_restore(flags);
 }
 }
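The printk() to apic_printk(APIC_QUIET, ...) conversions in this file rely on the verbosity gate inside the apic_printk() macro: APIC_QUIET messages always print, APIC_VERBOSE ones only when the user boots with apic=verbose. Roughly, simplified from the kernel's APIC header (treat the exact constants as illustrative):

	/* Simplified sketch of the gating used by apic_printk(). */
	#define APIC_QUIET	0
	#define APIC_VERBOSE	1
	#define APIC_DEBUG	2

	static int apic_verbosity;	/* 0 by default, raised by the "apic=" option */

	#define apic_printk(v, s, a...)			\
	do {						\
		if ((v) <= apic_verbosity)		\
			printk(s, ##a);			\
	} while (0)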

+ 23 - 18
arch/x86/kernel/io_apic_64.c

@@ -45,6 +45,7 @@
 #include <asm/proto.h>
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/acpi.h>
 #include <asm/dma.h>
 #include <asm/dma.h>
+#include <asm/i8259.h>
 #include <asm/nmi.h>
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
 #include <asm/hypertransport.h>
@@ -1696,8 +1697,9 @@ static inline void __init check_timer(void)
 	pin2  = ioapic_i8259.pin;
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 	apic2 = ioapic_i8259.apic;
 
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		cfg->vector, apic1, pin1, apic2, pin2);
+	apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+		    "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+		    cfg->vector, apic1, pin1, apic2, pin2);
 
 
 	/*
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
 	 * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1737,13 @@ static inline void __init check_timer(void)
 		}
 		}
 		clear_IO_APIC_pin(apic1, pin1);
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
 		if (!no_pin1)
-			apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+			apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
 				    "8254 timer not connected to IO-APIC\n");
 				    "8254 timer not connected to IO-APIC\n");
 
 
-		apic_printk(APIC_VERBOSE,KERN_INFO
-			"...trying to set up timer (IRQ0) "
-			"through the 8259A ... ");
-		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-			apic2, pin2);
+		apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+			    "(IRQ0) through the 8259A ...\n");
+		apic_printk(APIC_QUIET, KERN_INFO
+			    "..... (found apic %d pin %d) ...\n", apic2, pin2);
 		/*
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		 */
@@ -1751,7 +1752,7 @@ static inline void __init check_timer(void)
 		unmask_IO_APIC_irq(0);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 		if (timer_irq_works()) {
-			apic_printk(APIC_VERBOSE," works.\n");
+			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
 			timer_through_8259 = 1;
 			timer_through_8259 = 1;
 			if (nmi_watchdog == NMI_IO_APIC) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
 				disable_8259A_irq(0);
@@ -1765,29 +1766,32 @@ static inline void __init check_timer(void)
 		 */
 		 */
 		disable_8259A_irq(0);
 		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
 		clear_IO_APIC_pin(apic2, pin2);
-		apic_printk(APIC_VERBOSE," failed.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
 	}
 	}
 
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 	if (nmi_watchdog == NMI_IO_APIC) {
-		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+		apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+			    "through the IO-APIC - disabling NMI Watchdog!\n");
 		nmi_watchdog = NMI_NONE;
 		nmi_watchdog = NMI_NONE;
 	}
 	}
 
 
-	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
 
 	lapic_register_intr(0);
 	lapic_register_intr(0);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 	enable_8259A_irq(0);
 
 
 	if (timer_irq_works()) {
 	if (timer_irq_works()) {
-		apic_printk(APIC_VERBOSE," works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 		goto out;
 	}
 	}
 	disable_8259A_irq(0);
 	disable_8259A_irq(0);
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
-	apic_printk(APIC_VERBOSE," failed.\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
 
-	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as ExtINT IRQ...\n");
 
 
 	init_8259A(0);
 	init_8259A(0);
 	make_8259A_irq(0);
 	make_8259A_irq(0);
@@ -1796,11 +1800,12 @@ static inline void __init check_timer(void)
 	unlock_ExtINT_logic();
 	unlock_ExtINT_logic();
 
 
 	if (timer_irq_works()) {
 	if (timer_irq_works()) {
-		apic_printk(APIC_VERBOSE," works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 		goto out;
 	}
 	}
-	apic_printk(APIC_VERBOSE," failed :(.\n");
-	panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+		"report.  Then try booting with the 'noapic' option.\n");
 out:
 out:
 	local_irq_restore(flags);
 	local_irq_restore(flags);
 }
 }

+ 3 - 0
arch/x86/kernel/io_delay.c

@@ -103,6 +103,9 @@ void __init io_delay_init(void)
 
 static int __init io_delay_param(char *s)
 {
+	if (!s)
+		return -EINVAL;
+
 	if (!strcmp(s, "0x80"))
 		io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
 	else if (!strcmp(s, "0xed"))
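The new NULL check guards against the option being given without a value: a bare "io_delay" on the command line hands the handler a NULL string. The same defensive pattern as a self-contained user-space sketch (parse_io_delay is a hypothetical analogue, not kernel code):

	#include <errno.h>
	#include <string.h>

	static int parse_io_delay(const char *s)
	{
		if (!s)				/* option given with no "=value" */
			return -EINVAL;
		if (!strcmp(s, "0x80") || !strcmp(s, "0xed") ||
		    !strcmp(s, "udelay") || !strcmp(s, "none"))
			return 0;
		return -EINVAL;
	}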

+ 3 - 3
arch/x86/kernel/ipi.c

@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
 	/*
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 }
 
 
 void send_IPI_self(int vector)
 void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 	 * prepare target chip field
 	 * prepare target chip field
 	 */
 	 */
 	cfg = __prepare_ICR2(mask);
 	cfg = __prepare_ICR2(mask);
-	apic_write_around(APIC_ICR2, cfg);
+	apic_write(APIC_ICR2, cfg);
 
 
 	/*
 	/*
 	 * program the ICR
 	 * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 	/*
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 }
 
 
 /*
 /*
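The apic_write_around() calls replaced in this file and several others in this merge were, on 32-bit, a macro that either aliased apic_write() or forced an xchg-based write for early Pentium APIC errata, depending on CONFIG_X86_GOOD_APIC. A rough, simplified sketch of the pre-merge wrappers being retired (illustrative only; not taken verbatim from the headers):

	/* Simplified sketch of the old 32-bit wrappers. */
	#ifdef CONFIG_X86_GOOD_APIC
	# define apic_write_around(x, y)	apic_write((x), (y))
	#else
	# define apic_write_around(x, y)	apic_write_atomic((x), (y))
	#endif

	static inline void apic_write(unsigned long reg, u32 v)
	{
		*((volatile u32 *)(APIC_BASE + reg)) = v;	/* plain MMIO store */
	}

	static inline void apic_write_atomic(unsigned long reg, u32 v)
	{
		(void)xchg((u32 *)(APIC_BASE + reg), v);	/* serialising store */
	}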

+ 2 - 5
arch/x86/kernel/irq_32.c

@@ -83,11 +83,8 @@ union irq_ctx {
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
 static void call_on_stack(void *func, void *stack)
 {
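The __page_aligned_bss annotation used above replaces the open-coded section attribute and, unlike the old code, also requests explicit PAGE_SIZE alignment. Roughly (the authoritative definition lives in <linux/linkage.h>):

	/* Approximate expansion of the helper used above. */
	#define __page_aligned_bss \
		__attribute__((__section__(".bss.page_aligned"))) \
		__attribute__((__aligned__(PAGE_SIZE)))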

+ 8 - 0
arch/x86/kernel/kdebugfs.c

@@ -12,9 +12,13 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/setup.h>
 
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
 struct setup_data_node {
 	u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
 {
 	int error = 0;
 
+	arch_debugfs_dir = debugfs_create_dir("x86", NULL);
+	if (!arch_debugfs_dir)
+		return -ENOMEM;
+
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
 	error = boot_params_kdebugfs_init();
 #endif
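With arch_debugfs_dir exported, other x86 code can parent its entries under the shared debugfs "x86" directory instead of the debugfs root. A minimal sketch of a consumer; the "mydriver" name is hypothetical:

	#include <linux/debugfs.h>
	#include <linux/errno.h>
	#include <linux/init.h>

	extern struct dentry *arch_debugfs_dir;

	static struct dentry *mydriver_dir;	/* hypothetical consumer */

	static int __init mydriver_debugfs_init(void)
	{
		mydriver_dir = debugfs_create_dir("mydriver", arch_debugfs_dir);
		if (!mydriver_dir)
			return -ENOMEM;
		return 0;
	}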

+ 0 - 1
arch/x86/kernel/kprobes.c

@@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
 
 	resume_execution(cur, regs, kcb);
 	regs->flags |= kcb->kprobe_saved_flags;
-	trace_hardirqs_fixup_flags(regs->flags);
 
 	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
 		kcb->kprobe_status = KPROBE_HIT_SSDONE;

+ 9 - 1
arch/x86/kernel/module_64.c

@@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr,
                     const Elf_Shdr *sechdrs,
                     const Elf_Shdr *sechdrs,
                     struct module *me)
                     struct module *me)
 {
 {
-	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+		*para = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			alt = s;
 			alt = s;
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 			locks= s;
 			locks= s;
+		if (!strcmp(".parainstructions", secstrings + s->sh_name))
+			para = s;
 	}
 	}
 
 
 	if (alt) {
 	if (alt) {
@@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    tseg, tseg + text->sh_size);
 					    tseg, tseg + text->sh_size);
 	}
 	}
 
 
+	if (para) {
+		void *pseg = (void *)para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
+
 	return module_bug_finalize(hdr, sechdrs, me);
 	return module_bug_finalize(hdr, sechdrs, me);
 }
 }
 
 

+ 26 - 182
arch/x86/kernel/mpparse.c

@@ -27,6 +27,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
 #include <asm/e820.h>
 #include <asm/trampoline.h>
 #include <asm/trampoline.h>
+#include <asm/setup.h>
 
 
 #include <mach_apic.h>
 #include <mach_apic.h>
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 	return sum & 0xFF;
 	return sum & 0xFF;
 }
 }
 
 
-#ifdef CONFIG_X86_NUMAQ
-int found_numaq;
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_config_translation {
-	unsigned char mpc_type;
-	unsigned char trans_len;
-	unsigned char trans_type;
-	unsigned char trans_quad;
-	unsigned char trans_global;
-	unsigned char trans_local;
-	unsigned short trans_reserved;
-};
-
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
-    __cpuinitdata;
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
-	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-
-static inline int mpc_apic_id(struct mpc_config_processor *m,
-			struct mpc_config_translation *translation_record)
-{
-	int quad = translation_record->trans_quad;
-	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
-
-	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
-	       m->mpc_apicid,
-	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-	       m->mpc_apicver, quad, logical_apicid);
-	return logical_apicid;
-}
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-
-static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	mp_bus_id_to_node[m->mpc_busid] = quad;
-	mp_bus_id_to_local[m->mpc_busid] = local;
-	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
-	       m->mpc_busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-static void mpc_oem_pci_bus(struct mpc_config_bus *m,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
-}
-
-#endif
-
 static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 {
 {
 	int apicid;
 	int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 		disabled_cpus++;
 		disabled_cpus++;
 		return;
 		return;
 	}
 	}
-#ifdef CONFIG_X86_NUMAQ
-	if (found_numaq)
-		apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+	if (x86_quirks->mpc_apic_id)
+		apicid = x86_quirks->mpc_apic_id(m);
 	else
 	else
 		apicid = m->mpc_apicid;
 		apicid = m->mpc_apicid;
-#else
-	apicid = m->mpc_apicid;
-#endif
+
 	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
 	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
 		bootup_cpu = " (Bootup-CPU)";
 		bootup_cpu = " (Bootup-CPU)";
 		boot_cpu_physical_apicid = m->mpc_apicid;
 		boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 	memcpy(str, m->mpc_bustype, 6);
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 	str[6] = 0;
 
 
-#ifdef CONFIG_X86_NUMAQ
-	if (found_numaq)
-		mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-#else
-	printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
-#endif
+	if (x86_quirks->mpc_oem_bus_info)
+		x86_quirks->mpc_oem_bus_info(m, str);
+	else
+		printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
 
 
 #if MAX_MP_BUSSES < 256
 #if MAX_MP_BUSSES < 256
 	if (m->mpc_busid >= MAX_MP_BUSSES) {
 	if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
 #endif
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-#ifdef CONFIG_X86_NUMAQ
-		if (found_numaq)
-			mpc_oem_pci_bus(m, translation_table[mpc_record]);
-#endif
+		if (x86_quirks->mpc_oem_pci_bus)
+			x86_quirks->mpc_oem_pci_bus(m);
+
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
 		m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
 		m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
 }
 }
 
 
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info(struct mpc_config_translation *m)
-{
-	printk(KERN_INFO
-	       "Translation: record %d, type %d, quad %d, global %d, local %d\n",
-	       mpc_record, m->trans_type, m->trans_quad, m->trans_global,
-	       m->trans_local);
-
-	if (mpc_record >= MAX_MPC_ENTRY)
-		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-	else
-		translation_table[mpc_record] = m;	/* stash this for later */
-	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-		node_set_online(m->trans_quad);
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
-				    unsigned short oemsize)
-{
-	int count = sizeof(*oemtable);	/* the header size */
-	unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
-	mpc_record = 0;
-	printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
-	       oemtable);
-	if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
-		printk(KERN_WARNING
-		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-		       oemtable->oem_signature[0], oemtable->oem_signature[1],
-		       oemtable->oem_signature[2], oemtable->oem_signature[3]);
-		return;
-	}
-	if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
-		printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-		return;
-	}
-	while (count < oemtable->oem_length) {
-		switch (*oemptr) {
-		case MP_TRANSLATION:
-			{
-				struct mpc_config_translation *m =
-				    (struct mpc_config_translation *)oemptr;
-				MP_translation_info(m);
-				oemptr += sizeof(*m);
-				count += sizeof(*m);
-				++mpc_record;
-				break;
-			}
-		default:
-			{
-				printk(KERN_WARNING
-				       "Unrecognised OEM table entry type! - %d\n",
-				       (int)*oemptr);
-				return;
-			}
-		}
-	}
-}
-
-void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
-				 char *productid)
-{
-	if (strncmp(oem, "IBM NUMA", 8))
-		printk("Warning!  Not a NUMA-Q system!\n");
-	else
-		found_numaq = 1;
-
-	if (mpc->mpc_oemptr)
-		smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
-				 mpc->mpc_oemsize);
-}
-#endif /* CONFIG_X86_NUMAQ */
-
 /*
 /*
  * Read/parse the MPC
  * Read/parse the MPC
  */
  */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	} else
 	} else
 		mps_oem_check(mpc, oem, str);
 		mps_oem_check(mpc, oem, str);
 #endif
 #endif
-
 	/* save the local APIC address, it might be non-default */
 	/* save the local APIC address, it might be non-default */
 	if (!acpi_lapic)
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
 		mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	if (early)
 	if (early)
 		return 1;
 		return 1;
 
 
+	if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
+		struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
+		x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+	}
+
 	/*
 	/*
 	 *      Now process the configuration blocks.
 	 *      Now process the configuration blocks.
 	 */
 	 */
-#ifdef CONFIG_X86_NUMAQ
-	mpc_record = 0;
-#endif
+	if (x86_quirks->mpc_record)
+		*x86_quirks->mpc_record = 0;
+
 	while (count < mpc->mpc_length) {
 	while (count < mpc->mpc_length) {
 		switch (*mpt) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
 		case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 			count = mpc->mpc_length;
 			count = mpc->mpc_length;
 			break;
 			break;
 		}
 		}
-#ifdef CONFIG_X86_NUMAQ
-		++mpc_record;
-#endif
+		if (x86_quirks->mpc_record)
+			(*x86_quirks->mpc_record)++;
 	}
 	}
 
 
 #ifdef CONFIG_X86_GENERICARCH
 #ifdef CONFIG_X86_GENERICARCH
@@ -725,12 +577,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 
 static struct intel_mp_floating *mpf_found;
 static struct intel_mp_floating *mpf_found;
 
 
-/*
- * Machine specific quirk for finding the SMP config before other setup
- * activities destroy the table:
- */
-int (*mach_get_smp_config_quirk)(unsigned int early);
-
 /*
 /*
  * Scan the memory blocks for an SMP configuration block.
  * Scan the memory blocks for an SMP configuration block.
  */
  */
@@ -738,8 +584,8 @@ static void __init __get_smp_config(unsigned int early)
 {
 {
 	struct intel_mp_floating *mpf = mpf_found;
 	struct intel_mp_floating *mpf = mpf_found;
 
 
-	if (mach_get_smp_config_quirk) {
-		if (mach_get_smp_config_quirk(early))
+	if (x86_quirks->mach_get_smp_config) {
+		if (x86_quirks->mach_get_smp_config(early))
 			return;
 			return;
 	}
 	}
 	if (acpi_lapic && early)
 	if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	return 0;
 	return 0;
 }
 }
 
 
-int (*mach_find_smp_config_quirk)(unsigned int reserve);
-
 static void __init __find_smp_config(unsigned int reserve)
 static void __init __find_smp_config(unsigned int reserve)
 {
 {
 	unsigned int address;
 	unsigned int address;
 
 
-	if (mach_find_smp_config_quirk) {
-		if (mach_find_smp_config_quirk(reserve))
+	if (x86_quirks->mach_find_smp_config) {
+		if (x86_quirks->mach_find_smp_config(reserve))
 			return;
 			return;
 	}
 	}
 	/*
 	/*
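The NUMAQ-specific #ifdef blocks removed above are replaced by optional callbacks hung off a single quirks structure; the generic MP-table parser calls a hook only when a platform has installed one. An abridged sketch of that pattern using the field names from this patch (the full structure lives in asm/setup.h, and the example call site is illustrative):

	/* Abridged: optional platform hooks consulted by the MP-table parser. */
	struct x86_quirks {
		int  (*mach_get_smp_config)(unsigned int early);
		int  (*mach_find_smp_config)(unsigned int reserve);
		int   *mpc_record;
		int  (*mpc_apic_id)(struct mpc_config_processor *m);
		void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
		void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
		void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
					 unsigned short oemsize);
	};

	/* Typical call site: prefer the platform override, else the MP-table value. */
	static int mp_apicid(struct x86_quirks *q, struct mpc_config_processor *m)
	{
		return q->mpc_apic_id ? q->mpc_apic_id(m) : m->mpc_apicid;
	}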

+ 9 - 2
arch/x86/kernel/nmi.c

@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
 
 
 static void __acpi_nmi_enable(void *__unused)
 static void __acpi_nmi_enable(void *__unused)
 {
 {
-	apic_write_around(APIC_LVT0, APIC_DM_NMI);
+	apic_write(APIC_LVT0, APIC_DM_NMI);
 }
 }
 
 
 /*
 /*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
 
 
 static void __acpi_nmi_disable(void *__unused)
 static void __acpi_nmi_disable(void *__unused)
 {
 {
-	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
 }
 }
 
 
 /*
 /*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 
 
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_SYSCTL
 
 
+static int __init setup_unknown_nmi_panic(char *str)
+{
+	unknown_nmi_panic = 1;
+	return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 {
 {
 	unsigned char reason = get_nmi_reason();
 	unsigned char reason = get_nmi_reason();

+ 186 - 11
arch/x86/kernel/numaq_32.c

@@ -33,6 +33,7 @@
 #include <asm/processor.h>
 #include <asm/processor.h>
 #include <asm/mpspec.h>
 #include <asm/mpspec.h>
 #include <asm/e820.h>
 #include <asm/e820.h>
+#include <asm/setup.h>
 
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
 
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
 	}
 	}
 }
 }
 
 
+
+void __init numaq_tsc_disable(void)
+{
+	if (!found_numaq)
+		return;
+
+	if (num_online_nodes() > 1) {
+		printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+		setup_clear_cpu_cap(X86_FEATURE_TSC);
+	}
+}
+
+static int __init numaq_pre_time_init(void)
+{
+	numaq_tsc_disable();
+	return 0;
+}
+
+int found_numaq;
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+struct mpc_config_translation {
+	unsigned char mpc_type;
+	unsigned char trans_len;
+	unsigned char trans_type;
+	unsigned char trans_quad;
+	unsigned char trans_global;
+	unsigned char trans_local;
+	unsigned short trans_reserved;
+};
+
+/* x86_quirks member */
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+    __cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+/* x86_quirks member */
+static int mpc_apic_id(struct mpc_config_processor *m)
+{
+	int quad = translation_table[mpc_record]->trans_quad;
+	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver, quad, logical_apicid);
+	return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+/* x86_quirks member */
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
+{
+	int quad = translation_table[mpc_record]->trans_quad;
+	int local = translation_table[mpc_record]->trans_local;
+
+	mp_bus_id_to_node[m->mpc_busid] = quad;
+	mp_bus_id_to_local[m->mpc_busid] = local;
+	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+	       m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+
+/* x86_quirks member */
+static void mpc_oem_pci_bus(struct mpc_config_bus *m)
+{
+	int quad = translation_table[mpc_record]->trans_quad;
+	int local = translation_table[mpc_record]->trans_local;
+
+	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
+static void __init MP_translation_info(struct mpc_config_translation *m)
+{
+	printk(KERN_INFO
+	       "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+	       mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+	       m->trans_local);
+
+	if (mpc_record >= MAX_MPC_ENTRY)
+		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+	else
+		translation_table[mpc_record] = m;	/* stash this for later */
+	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+		node_set_online(m->trans_quad);
+}
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+	int sum = 0;
+
+	while (len--)
+		sum += *mp++;
+
+	return sum & 0xFF;
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+				    unsigned short oemsize)
+{
+	int count = sizeof(*oemtable);	/* the header size */
+	unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+	mpc_record = 0;
+	printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+	       oemtable);
+	if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+		printk(KERN_WARNING
+		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+		       oemtable->oem_signature[0], oemtable->oem_signature[1],
+		       oemtable->oem_signature[2], oemtable->oem_signature[3]);
+		return;
+	}
+	if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+		printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+		return;
+	}
+	while (count < oemtable->oem_length) {
+		switch (*oemptr) {
+		case MP_TRANSLATION:
+			{
+				struct mpc_config_translation *m =
+				    (struct mpc_config_translation *)oemptr;
+				MP_translation_info(m);
+				oemptr += sizeof(*m);
+				count += sizeof(*m);
+				++mpc_record;
+				break;
+			}
+		default:
+			{
+				printk(KERN_WARNING
+				       "Unrecognised OEM table entry type! - %d\n",
+				       (int)*oemptr);
+				return;
+			}
+		}
+	}
+}
+
+static struct x86_quirks numaq_x86_quirks __initdata = {
+	.arch_pre_time_init	= numaq_pre_time_init,
+	.arch_time_init		= NULL,
+	.arch_pre_intr_init	= NULL,
+	.arch_memory_setup	= NULL,
+	.arch_intr_init		= NULL,
+	.arch_trap_init		= NULL,
+	.mach_get_smp_config	= NULL,
+	.mach_find_smp_config	= NULL,
+	.mpc_record		= &mpc_record,
+	.mpc_apic_id		= mpc_apic_id,
+	.mpc_oem_bus_info	= mpc_oem_bus_info,
+	.mpc_oem_pci_bus	= mpc_oem_pci_bus,
+	.smp_read_mpc_oem	= smp_read_mpc_oem,
+};
+
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+				 char *productid)
+{
+	if (strncmp(oem, "IBM NUMA", 8))
+		printk("Warning!  Not a NUMA-Q system!\n");
+	else
+		found_numaq = 1;
+}
+
 static __init void early_check_numaq(void)
 static __init void early_check_numaq(void)
 {
 {
 	/*
 	/*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
 	 */
 	 */
 	if (smp_found_config)
 	if (smp_found_config)
 		early_get_smp_config();
 		early_get_smp_config();
+
+	if (found_numaq)
+		x86_quirks = &numaq_x86_quirks;
 }
 }
 
 
 int __init get_memcfg_numaq(void)
 int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
 	smp_dump_qct();
 	smp_dump_qct();
 	return 1;
 	return 1;
 }
 }
-
-void __init numaq_tsc_disable(void)
-{
-	if (!found_numaq)
-		return;
-
-	if (num_online_nodes() > 1) {
-		printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
-		setup_clear_cpu_cap(X86_FEATURE_TSC);
-	}
-}

+ 4 - 1
arch/x86/kernel/paravirt.c

@@ -29,6 +29,7 @@
 #include <asm/desc.h>
 #include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
 #include <asm/time.h>
 #include <asm/time.h>
 #include <asm/pgalloc.h>
 #include <asm/pgalloc.h>
 #include <asm/irq.h>
 #include <asm/irq.h>
@@ -361,7 +362,6 @@ struct pv_cpu_ops pv_cpu_ops = {
 struct pv_apic_ops pv_apic_ops = {
 struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
 #ifdef CONFIG_X86_LOCAL_APIC
 	.apic_write = native_apic_write,
 	.apic_write = native_apic_write,
-	.apic_write_atomic = native_apic_write_atomic,
 	.apic_read = native_apic_read,
 	.apic_read = native_apic_read,
 	.setup_boot_clock = setup_boot_APIC_clock,
 	.setup_boot_clock = setup_boot_APIC_clock,
 	.setup_secondary_clock = setup_secondary_APIC_clock,
 	.setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +373,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
 #ifndef CONFIG_X86_64
 	.pagetable_setup_start = native_pagetable_setup_start,
 	.pagetable_setup_start = native_pagetable_setup_start,
 	.pagetable_setup_done = native_pagetable_setup_done,
 	.pagetable_setup_done = native_pagetable_setup_done,
+#else
+	.pagetable_setup_start = paravirt_nop,
+	.pagetable_setup_done = paravirt_nop,
 #endif
 #endif
 
 
 	.read_cr2 = native_read_cr2,
 	.read_cr2 = native_read_cr2,
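Filling in pagetable_setup_start/done with paravirt_nop on 64-bit lets setup_arch() call the wrappers unconditionally (see the setup.c hunk later in this merge). The wrappers themselves are thin indirections, roughly of this shape (sketch, not copied from the headers):

	/* Sketch of the wrappers that setup_arch() now calls around paging_init(). */
	static inline void paravirt_pagetable_setup_start(pgd_t *base)
	{
		pv_mmu_ops.pagetable_setup_start(base);
	}

	static inline void paravirt_pagetable_setup_done(pgd_t *base)
	{
		pv_mmu_ops.pagetable_setup_done(base);
	}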

+ 1 - 1
arch/x86/kernel/pci-calgary_64.c

@@ -36,7 +36,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/tce.h>
 #include <asm/pci-direct.h>

+ 2 - 15
arch/x86/kernel/pci-dma.c

@@ -5,12 +5,11 @@
 
 
 #include <asm/proto.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
 #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
 #include <asm/amd_iommu.h>
 
 
-int forbid_dac __read_mostly;
-EXPORT_SYMBOL(forbid_dac);
+static int forbid_dac __read_mostly;
 
 
 const struct dma_mapping_ops *dma_ops;
 const struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 EXPORT_SYMBOL(dma_ops);
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
 	 * The order of these functions is important for
 	 * The order of these functions is important for
 	 * fall-back/fail-over reasons
 	 * fall-back/fail-over reasons
 	 */
 	 */
-#ifdef CONFIG_GART_IOMMU
 	gart_iommu_hole_init();
 	gart_iommu_hole_init();
-#endif
 
 
-#ifdef CONFIG_CALGARY_IOMMU
 	detect_calgary();
 	detect_calgary();
-#endif
 
 
 	detect_intel_iommu();
 	detect_intel_iommu();
 
 
 	amd_iommu_detect();
 	amd_iommu_detect();
 
 
-#ifdef CONFIG_SWIOTLB
 	pci_swiotlb_init();
 	pci_swiotlb_init();
-#endif
 }
 }
 #endif
 #endif
 
 
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
 			swiotlb = 1;
 			swiotlb = 1;
 #endif
 #endif
 
 
-#ifdef CONFIG_GART_IOMMU
 		gart_parse_options(p);
 		gart_parse_options(p);
-#endif
 
 
 #ifdef CONFIG_CALGARY_IOMMU
 #ifdef CONFIG_CALGARY_IOMMU
 		if (!strncmp(p, "calgary", 7))
 		if (!strncmp(p, "calgary", 7))
@@ -500,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent);
 
 
 static int __init pci_iommu_init(void)
 static int __init pci_iommu_init(void)
 {
 {
-#ifdef CONFIG_CALGARY_IOMMU
 	calgary_iommu_init();
 	calgary_iommu_init();
-#endif
 
 
 	intel_iommu_init();
 	intel_iommu_init();
 
 
 	amd_iommu_init();
 	amd_iommu_init();
 
 
-#ifdef CONFIG_GART_IOMMU
 	gart_iommu_init();
 	gart_iommu_init();
-#endif
 
 
 	no_iommu_init();
 	no_iommu_init();
 	return 0;
 	return 0;
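The #ifdef CONFIG_* guards could be dropped above because the per-IOMMU headers now provide empty inline stubs when the respective option is disabled, so the unconditional calls compile away. The general shape of such a header (illustrative; the concrete stubs live in asm/calgary.h, asm/gart.h and friends, and the stub return value here is an assumption):

	/* Illustrative stub-header pattern that makes unconditional calls safe. */
	#ifdef CONFIG_CALGARY_IOMMU
	extern int  calgary_iommu_init(void);
	extern void detect_calgary(void);
	#else
	static inline int  calgary_iommu_init(void) { return 0; }
	static inline void detect_calgary(void) { }
	#endif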

+ 1 - 0
arch/x86/kernel/pci-gart_64.c

@@ -32,6 +32,7 @@
 #include <asm/mtrr.h>
 #include <asm/pgtable.h>
 #include <asm/proto.h>
+#include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/cacheflush.h>
 #include <asm/swiotlb.h>

+ 1 - 1
arch/x86/kernel/pci-nommu.c

@@ -7,7 +7,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
 
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/processor.h>
 #include <asm/dma.h>
 

+ 1 - 1
arch/x86/kernel/pci-swiotlb_64.c

@@ -5,7 +5,7 @@
 #include <linux/module.h>
 #include <linux/dma-mapping.h>
 
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 

+ 5 - 0
arch/x86/kernel/process.c

@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 EXPORT_SYMBOL(idle_nomwait);
 
 
 struct kmem_cache *task_xstate_cachep;
 struct kmem_cache *task_xstate_cachep;
+static int force_mwait __cpuinitdata;
 
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 {
@@ -199,6 +200,7 @@ static void poll_idle(void)
  *
  *
  * idle=mwait overrides this decision and forces the usage of mwait.
  * idle=mwait overrides this decision and forces the usage of mwait.
  */
  */
+static int __cpuinitdata force_mwait;
 
 
 #define MWAIT_INFO			0x05
 #define MWAIT_INFO			0x05
 #define MWAIT_ECX_EXTENDED_INFO		0x01
 #define MWAIT_ECX_EXTENDED_INFO		0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 
 
 static int __init idle_setup(char *str)
 static int __init idle_setup(char *str)
 {
 {
+	if (!str)
+		return -EINVAL;
+
 	if (!strcmp(str, "poll")) {
 	if (!strcmp(str, "poll")) {
 		printk("using polling idle threads.\n");
 		printk("using polling idle threads.\n");
 		pm_idle = poll_idle;
 		pm_idle = poll_idle;

+ 28 - 28
arch/x86/kernel/process_64.c

@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 struct task_struct *
 struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 {
-	struct thread_struct *prev = &prev_p->thread,
-				 *next = &next_p->thread;
+	struct thread_struct *prev = &prev_p->thread;
+	struct thread_struct *next = &next_p->thread;
 	int cpu = smp_processor_id();
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	unsigned fsindex, gsindex;
 	unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 
 	/* 
 	/* 
 	 * Switch FS and GS.
 	 * Switch FS and GS.
+	 *
+	 * Segment register != 0 always requires a reload.  Also
+	 * reload when it has changed.  When prev process used 64bit
+	 * base always reload to avoid an information leak.
 	 */
 	 */
-	{ 
-		/* segment register != 0 always requires a reload. 
-		   also reload when it has changed. 
-		   when prev process used 64bit base always reload
-		   to avoid an information leak. */
-		if (unlikely(fsindex | next->fsindex | prev->fs)) {
-			loadsegment(fs, next->fsindex);
-			/* check if the user used a selector != 0
-	                 * if yes clear 64bit base, since overloaded base
-                         * is always mapped to the Null selector
-                         */
-			if (fsindex)
+	if (unlikely(fsindex | next->fsindex | prev->fs)) {
+		loadsegment(fs, next->fsindex);
+		/* 
+		 * Check if the user used a selector != 0; if yes
+		 *  clear 64bit base, since overloaded base is always
+		 *  mapped to the Null selector
+		 */
+		if (fsindex)
 			prev->fs = 0;				
 			prev->fs = 0;				
-		}
-		/* when next process has a 64bit base use it */
-		if (next->fs) 
-			wrmsrl(MSR_FS_BASE, next->fs); 
-		prev->fsindex = fsindex;
-
-		if (unlikely(gsindex | next->gsindex | prev->gs)) {
-			load_gs_index(next->gsindex);
-			if (gsindex)
+	}
+	/* when next process has a 64bit base use it */
+	if (next->fs)
+		wrmsrl(MSR_FS_BASE, next->fs);
+	prev->fsindex = fsindex;
+
+	if (unlikely(gsindex | next->gsindex | prev->gs)) {
+		load_gs_index(next->gsindex);
+		if (gsindex)
 			prev->gs = 0;				
 			prev->gs = 0;				
-		}
-		if (next->gs)
-			wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
-		prev->gsindex = gsindex;
 	}
 	}
+	if (next->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+	prev->gsindex = gsindex;
 
 
 	/* Must be after DS reload */
 	/* Must be after DS reload */
 	unlazy_fpu(prev_p);
 	unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	write_pda(pcurrent, next_p); 
 	write_pda(pcurrent, next_p); 
 
 
 	write_pda(kernelstack,
 	write_pda(kernelstack,
-	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+		  (unsigned long)task_stack_page(next_p) +
+		  THREAD_SIZE - PDA_STACKOFFSET);
 #ifdef CONFIG_CC_STACKPROTECTOR
 #ifdef CONFIG_CC_STACKPROTECTOR
 	write_pda(stack_canary, next_p->stack_canary);
 	write_pda(stack_canary, next_p->stack_canary);
 	/*
 	/*

+ 56 - 95
arch/x86/kernel/ptrace.c

@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 #endif
 #endif
 }
 }
 
 
-#ifdef CONFIG_X86_32
-
 void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 {
 {
 	struct siginfo info;
 	struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 	force_sig_info(SIGTRAP, &info, tsk);
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 }
 
 
-/* notification of system call entry/exit
- * - triggered by current->work.syscall_trace
- */
-int do_syscall_trace(struct pt_regs *regs, int entryexit)
-{
-	int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-	/*
-	 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
-	 * interception
-	 */
-	int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
-	int ret = 0;
-
-	/* do the secure computing check first */
-	if (!entryexit)
-		secure_computing(regs->orig_ax);
-
-	if (unlikely(current->audit_context)) {
-		if (entryexit)
-			audit_syscall_exit(AUDITSC_RESULT(regs->ax),
-						regs->ax);
-		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-		 * not used, entry.S will call us only on syscall exit, not
-		 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
-		 * calling send_sigtrap() on syscall entry.
-		 *
-		 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
-		 * is_singlestep is false, despite his name, so we will still do
-		 * the correct thing.
-		 */
-		else if (is_singlestep)
-			goto out;
-	}
-
-	if (!(current->ptrace & PT_PTRACED))
-		goto out;
-
-	/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
-	 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
-	 * here. We have to check this and return */
-	if (is_sysemu && entryexit)
-		return 0;
-
-	/* Fake a debug trap */
-	if (is_singlestep)
-		send_sigtrap(current, regs, 0);
-
- 	if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
-		goto out;
-
-	/* the 0x80 provides a way for the tracing parent to distinguish
-	   between a syscall stop and SIGTRAP delivery */
-	/* Note that the debugger could change the result of test_thread_flag!*/
-	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
-
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
-	ret = is_sysemu;
-out:
-	if (unlikely(current->audit_context) && !entryexit)
-		audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
-				    regs->bx, regs->cx, regs->dx, regs->si);
-	if (ret == 0)
-		return 0;
-
-	regs->orig_ax = -1; /* force skip of syscall restarting */
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
-	return 1;
-}
-
-#else  /* CONFIG_X86_64 */
-
 static void syscall_trace(struct pt_regs *regs)
 static void syscall_trace(struct pt_regs *regs)
 {
 {
+	if (!(current->ptrace & PT_PTRACED))
+		return;
 
 
 #if 0
 #if 0
 	printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
 	printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
 	}
 	}
 }
 }
 
 
-asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+#ifdef CONFIG_X86_32
+# define IS_IA32	1
+#elif defined CONFIG_IA32_EMULATION
+# define IS_IA32	test_thread_flag(TIF_IA32)
+#else
+# define IS_IA32	0
+#endif
+
+/*
+ * We must return the syscall number to actually look up in the table.
+ * This can be -1L to skip running any syscall at all.
+ */
+asmregparm long syscall_trace_enter(struct pt_regs *regs)
 {
 {
+	long ret = 0;
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state.  If we entered on the slow path, TF was already set.
+	 */
+	if (test_thread_flag(TIF_SINGLESTEP))
+		regs->flags |= X86_EFLAGS_TF;
+
 	/* do the secure computing check first */
 	/* do the secure computing check first */
 	secure_computing(regs->orig_ax);
 	secure_computing(regs->orig_ax);
 
 
-	if (test_thread_flag(TIF_SYSCALL_TRACE)
-	    && (current->ptrace & PT_PTRACED))
+	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+		ret = -1L;
+
+	if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
 		syscall_trace(regs);
 		syscall_trace(regs);
 
 
 	if (unlikely(current->audit_context)) {
 	if (unlikely(current->audit_context)) {
-		if (test_thread_flag(TIF_IA32)) {
+		if (IS_IA32)
 			audit_syscall_entry(AUDIT_ARCH_I386,
 			audit_syscall_entry(AUDIT_ARCH_I386,
 					    regs->orig_ax,
 					    regs->orig_ax,
 					    regs->bx, regs->cx,
 					    regs->bx, regs->cx,
 					    regs->dx, regs->si);
 					    regs->dx, regs->si);
-		} else {
+#ifdef CONFIG_X86_64
+		else
 			audit_syscall_entry(AUDIT_ARCH_X86_64,
 			audit_syscall_entry(AUDIT_ARCH_X86_64,
 					    regs->orig_ax,
 					    regs->orig_ax,
 					    regs->di, regs->si,
 					    regs->di, regs->si,
 					    regs->dx, regs->r10);
 					    regs->dx, regs->r10);
-		}
+#endif
 	}
 	}
+
+	return ret ?: regs->orig_ax;
 }
 }
 
 
-asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+asmregparm void syscall_trace_leave(struct pt_regs *regs)
 {
 {
 	if (unlikely(current->audit_context))
 	if (unlikely(current->audit_context))
 		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 
 
-	if ((test_thread_flag(TIF_SYSCALL_TRACE)
-	     || test_thread_flag(TIF_SINGLESTEP))
-	    && (current->ptrace & PT_PTRACED))
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		syscall_trace(regs);
 		syscall_trace(regs);
-}
 
 
-#endif	/* CONFIG_X86_32 */
+	/*
+	 * If TIF_SYSCALL_EMU is set, we only get here because of
+	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+	 * We already reported this syscall instruction in
+	 * syscall_trace_enter(), so don't do any more now.
+	 */
+	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+		return;
+
+	/*
+	 * If we are single-stepping, synthesize a trap to follow the
+	 * system call instruction.
+	 */
+	if (test_thread_flag(TIF_SINGLESTEP) &&
+	    (current->ptrace & PT_PTRACED))
+		send_sigtrap(current, regs, 0);
+}
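The new asmregparm entry points change the contract with entry_32.S/entry_64.S: syscall_trace_enter() now hands back the syscall number to dispatch, and -1 tells the assembly to skip the syscall table entirely (the PTRACE_SYSEMU case). In C-like pseudocode the caller side looks roughly as follows; call_syscall() is a hypothetical stand-in for the assembly dispatch, and the real exit path only calls syscall_trace_leave() when the trace flags are set:

	/* Sketch of how the entry path consumes the new return value. */
	static void traced_syscall(struct pt_regs *regs)
	{
		long nr = syscall_trace_enter(regs);	/* may rewrite regs, or return -1 */

		if (nr != -1L)
			regs->ax = call_syscall(nr, regs);	/* hypothetical dispatcher */

		syscall_trace_leave(regs);
	}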

+ 8 - 0
arch/x86/kernel/reboot.c

@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
 		},
 	},
+	{	/* Handle problems with rebooting on Dell T5400's */
+		.callback = set_bios_reboot,
+		.ident = "Dell Precision T5400",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+		},
+	},
 	{	/* Handle problems with rebooting on HP laptops */
 		.callback = set_bios_reboot,
 		.ident = "HP Compaq Laptop",

+ 8 - 14
arch/x86/kernel/setup.c

@@ -57,12 +57,8 @@
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/user.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
-#include <linux/highmem.h>
 
 
 #include <linux/kallsyms.h>
 #include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/kexec.h>
 #include <linux/cpufreq.h>
 #include <linux/cpufreq.h>
 #include <linux/dma-mapping.h>
 #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
 #include <linux/ctype.h>
@@ -96,7 +92,7 @@
 #include <asm/smp.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/desc.h>
 #include <asm/dma.h>
 #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/mmu_context.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/proto.h>
 
 
@@ -104,7 +100,6 @@
 #include <asm/paravirt.h>
 #include <asm/paravirt.h>
 
 
 #include <asm/percpu.h>
 #include <asm/percpu.h>
-#include <asm/sections.h>
 #include <asm/topology.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
 #include <asm/apicdef.h>
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
 early_param("elfcorehdr", setup_elfcorehdr);
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 #endif
 
 
+static struct x86_quirks default_x86_quirks __initdata;
+
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+
 /*
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -824,7 +823,10 @@ void __init setup_arch(char **cmdline_p)
 	vmi_init();
 	vmi_init();
 #endif
 #endif
 
 
+	paravirt_pagetable_setup_start(swapper_pg_dir);
 	paging_init();
 	paging_init();
+	paravirt_pagetable_setup_done(swapper_pg_dir);
+	paravirt_post_allocator_init();
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	map_vsyscall();
 	map_vsyscall();
@@ -854,14 +856,6 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();
 	init_cpu_to_node();
 #endif
 #endif
 
 
-#ifdef CONFIG_X86_NUMAQ
-	/*
-	 * need to check online nodes num, call it
-	 * here before time_init/tsc_init
-	 */
-	numaq_tsc_disable();
-#endif
-
 	init_apic_mappings();
 	init_apic_mappings();
 	ioapic_init_mappings();
 	ioapic_init_mappings();
 
 

+ 1 - 7
arch/x86/kernel/signal_32.c

@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
 
 badframe:
 	if (show_unhandled_signals && printk_ratelimit()) {
-		printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:"
+		printk("%s%s[%d] bad frame in sigreturn frame:"
 			"%p ip:%lx sp:%lx oeax:%lx",
 		    task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
 		    current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-	/* Pending single-step? */
-	if (thread_info_flags & _TIF_SINGLESTEP) {
-		regs->flags |= X86_EFLAGS_TF;
-		clear_thread_flag(TIF_SINGLESTEP);
-	}
-
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);

+ 0 - 6
arch/x86/kernel/signal_64.c

@@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs)
 void do_notify_resume(struct pt_regs *regs, void *unused,
 		      __u32 thread_info_flags)
 {
-	/* Pending single-step? */
-	if (thread_info_flags & _TIF_SINGLESTEP) {
-		regs->flags |= X86_EFLAGS_TF;
-		clear_thread_flag(TIF_SINGLESTEP);
-	}
-
 #ifdef CONFIG_X86_MCE
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)

+ 22 - 32
arch/x86/kernel/smpboot.c

@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
 			printk(KERN_CONT
 			       "a previous APIC delivery may have failed\n");
 
-		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+		apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
 
 		timeout = 0;
 		do {
@@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	int maxlvt;
 
 	/* Target chip */
-	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
 
 	/* Boot on the stack */
 	/* Kick the second */
-	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+	apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
 
 	Dprintk("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	 * Give the other CPU some time to accept the IPI.
 	 */
 	udelay(200);
-	/*
-	 * Due to the Pentium erratum 3AP.
-	 */
 	maxlvt = lapic_get_maxlvt();
-	if (maxlvt > 3) {
-		apic_read_around(APIC_SPIV);
+	if (maxlvt > 3)			/* Due to the Pentium erratum 3AP.  */
 		apic_write(APIC_ESR, 0);
-	}
 	accept_status = (apic_read(APIC_ESR) & 0xEF);
 	Dprintk("NMI sent.\n");
 
@@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 		return send_status;
 	}
 
+	maxlvt = lapic_get_maxlvt();
+
 	/*
 	 * Be paranoid about clearing APIC errors.
 	 */
 	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
-		apic_read_around(APIC_SPIV);
-		apic_write(APIC_ESR, 0);
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
+			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
 	}
 
@@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	/*
 	 * Turn INIT on target chip
 	 */
-	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 
 	/*
 	 * Send IPI
 	 */
-	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-				| APIC_DM_INIT);
+	apic_write(APIC_ICR,
+		   APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
 
 	Dprintk("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	Dprintk("Deasserting INIT.\n");
 
 	/* Target chip */
-	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 
 	/* Send IPI */
-	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+	apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
 
 	Dprintk("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	 */
 	Dprintk("#startup loops: %d.\n", num_starts);
 
-	maxlvt = lapic_get_maxlvt();
-
 	for (j = 1; j <= num_starts; j++) {
 		Dprintk("Sending STARTUP #%d.\n", j);
-		apic_read_around(APIC_SPIV);
-		apic_write(APIC_ESR, 0);
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
+			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
 		Dprintk("After apic_write.\n");
 
@@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 		 */
 
 		/* Target chip */
-		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 
 		/* Boot on the stack */
 		/* Kick the second */
-		apic_write_around(APIC_ICR, APIC_DM_STARTUP
-					| (start_eip >> 12));
+		apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
 
 		/*
 		 * Give the other CPU some time to accept the IPI.
@@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 		 * Give the other CPU some time to accept the IPI.
 		 */
 		udelay(200);
-		/*
-		 * Due to the Pentium erratum 3AP.
-		 */
-		if (maxlvt > 3) {
-			apic_read_around(APIC_SPIV);
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
-		}
 		accept_status = (apic_read(APIC_ESR) & 0xEF);
 		if (send_status || accept_status)
 			break;
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
  *
  * Must be called after the _cpu_pda pointer table is initialized.
  */
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
 {
 	struct x8664_pda *oldpda, *newpda;
 	unsigned long size = sizeof(struct x8664_pda);
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
 {
 	extern unsigned int maxcpus;
 
-	maxcpus = simple_strtoul(arg, NULL, 0);
+	if (arg)
+		maxcpus = simple_strtoul(arg, NULL, 0);
 	return 0;
 }
 early_param("maxcpus", parse_maxcpus);

+ 29 - 6
arch/x86/kernel/step.c

@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
 static int enable_single_step(struct task_struct *child)
 {
 	struct pt_regs *regs = task_pt_regs(child);
+	unsigned long oflags;
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state so we don't wrongly set TIF_FORCED_TF below.
+	 * If enable_single_step() was used last and that is what
+	 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
+	 * already set and our bookkeeping is fine.
+	 */
+	if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
+		regs->flags |= X86_EFLAGS_TF;
 
 	/*
 	 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
 	 */
 	set_tsk_thread_flag(child, TIF_SINGLESTEP);
 
-	/*
-	 * If TF was already set, don't do anything else
-	 */
-	if (regs->flags & X86_EFLAGS_TF)
-		return 0;
+	oflags = regs->flags;
 
 	/* Set TF on the kernel stack.. */
 	regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
 	 * ..but if TF is changed by the instruction we will trace,
 	 * don't mark it as being "us" that set it, so that we
 	 * won't clear it by hand later.
+	 *
+	 * Note that if we don't actually execute the popf because
+	 * of a signal arriving right now or suchlike, we will lose
+	 * track of the fact that it really was "us" that set it.
 	 */
-	if (is_setting_trap_flag(child, regs))
+	if (is_setting_trap_flag(child, regs)) {
+		clear_tsk_thread_flag(child, TIF_FORCED_TF);
 		return 0;
+	}
+
+	/*
+	 * If TF was already set, check whether it was us who set it.
+	 * If not, we should never attempt a block step.
+	 */
+	if (oflags & X86_EFLAGS_TF)
+		return test_tsk_thread_flag(child, TIF_FORCED_TF);
 
 	set_tsk_thread_flag(child, TIF_FORCED_TF);
 

+ 1 - 0
arch/x86/kernel/time_32.c

@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
  */
 void __init time_init(void)
 {
+	pre_time_init_hook();
 	tsc_init();
 	late_time_init = choose_time_init();
 }

+ 53 - 65
arch/x86/kernel/traps_32.c

@@ -58,6 +58,7 @@
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/io.h>
+#include <asm/traps.h>
 
 #include "mach_traps.h"
 
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
 gate_desc idt_table[256]
 	__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
 
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
 int panic_on_unrecovered_nmi;
 int kstack_depth_to_print = 24;
 static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
 
 static void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *stack, unsigned long bp, char *log_lvl)
+		unsigned long *stack, unsigned long bp, char *log_lvl)
 {
 	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 	printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
 	return ud2 == 0x0b0f;
 }
 
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+	unsigned long flags;
+
+	oops_enter();
+
+	if (die_owner != raw_smp_processor_id()) {
+		console_verbose();
+		raw_local_irq_save(flags);
+		__raw_spin_lock(&die_lock);
+		die_owner = smp_processor_id();
+		die_nest_count = 0;
+		bust_spinlocks(1);
+	} else {
+		raw_local_irq_save(flags);
+	}
+	die_nest_count++;
+	return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+	bust_spinlocks(0);
+	die_owner = -1;
+	add_taint(TAINT_DIE);
+	__raw_spin_unlock(&die_lock);
+	raw_local_irq_restore(flags);
+
+	if (!regs)
+		return;
+
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
+
+	if (panic_on_oops)
+		panic("Fatal exception");
+
+	oops_exit();
+	do_exit(signr);
+}
+
 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 {
 	unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
  */
 void die(const char *str, struct pt_regs *regs, long err)
 {
-	static struct {
-		raw_spinlock_t lock;
-		u32 lock_owner;
-		int lock_owner_depth;
-	} die = {
-		.lock =			__RAW_SPIN_LOCK_UNLOCKED,
-		.lock_owner =		-1,
-		.lock_owner_depth =	0
-	};
-	unsigned long flags;
-
-	oops_enter();
-
-	if (die.lock_owner != raw_smp_processor_id()) {
-		console_verbose();
-		raw_local_irq_save(flags);
-		__raw_spin_lock(&die.lock);
-		die.lock_owner = smp_processor_id();
-		die.lock_owner_depth = 0;
-		bust_spinlocks(1);
-	} else {
-		raw_local_irq_save(flags);
-	}
+	unsigned long flags = oops_begin();
 
 
-	if (++die.lock_owner_depth < 3) {
+	if (die_nest_count < 3) {
 		report_bug(regs->ip, regs);
 
 		if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
 		printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
 	}
 
-	bust_spinlocks(0);
-	die.lock_owner = -1;
-	add_taint(TAINT_DIE);
-	__raw_spin_unlock(&die.lock);
-	raw_local_irq_restore(flags);
-
-	if (!regs)
-		return;
-
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-
-	if (in_interrupt())
-		panic("Fatal exception in interrupt");
-
-	if (panic_on_oops)
-		panic("Fatal exception");
-
-	oops_exit();
-	do_exit(SIGSEGV);
+	oops_end(flags, regs, SIGSEGV);
 }
 
 static inline void

+ 18 - 30
arch/x86/kernel/traps_64.c

@@ -51,30 +51,10 @@
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
 #include <asm/pda.h>
+#include <asm/traps.h>
 
 #include <mach_traps.h>
 
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
 int panic_on_unrecovered_nmi;
 int kstack_depth_to_print = 12;
 static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
 	.address = print_trace_address,
 };
 
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp)
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl)
 {
 	printk("\nCall Trace:\n");
-	dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 	printk("\n");
 }
 
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp)
+{
+	show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
 static void
-_show_stack(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *sp, unsigned long bp)
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp, char *log_lvl)
 {
 	unsigned long *stack;
 	int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
 		printk(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace(task, regs, sp, bp);
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-	_show_stack(task, NULL, sp, 0);
+	show_stack_log_lvl(task, NULL, sp, 0, "");
 }
 
 /*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
 		u8 *ip;
 
 		printk("Stack: ");
-		_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+		show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+				regs->bp, "");
 		printk("\n");
 
 		printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
 }
 
 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{ 
+{
 	die_owner = -1;
 	bust_spinlocks(0);
 	die_nest_count--;

+ 20 - 22
arch/x86/kernel/visws_quirks.c

@@ -73,7 +73,7 @@ int is_visws_box(void)
 	return visws_board_type >= 0;
 }
 
-static int __init visws_time_init_quirk(void)
+static int __init visws_time_init(void)
 {
 	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
 
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
 	return 0;
 }
 
-static int __init visws_pre_intr_init_quirk(void)
+static int __init visws_pre_intr_init(void)
 {
 	init_VISWS_APIC_irqs();
 
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
 
 long long mem_size __initdata = 0;
 
-static char * __init visws_memory_setup_quirk(void)
+static char * __init visws_memory_setup(void)
 {
 	long long gfx_mem_size = 8 * MB;
 
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
 	outl(PIIX_SPECIAL_STOP, 0xCFC);
 }
 
-static int __init visws_get_smp_config_quirk(unsigned int early)
+static int __init visws_get_smp_config(unsigned int early)
 {
 	/*
 	 * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
  * No problem for Linux.
  */
 
-static void __init MP_processor_info (struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
 {
 	int ver, logical_apicid;
 	physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 	apic_version[m->mpc_apicid] = ver;
 }
 
-int __init visws_find_smp_config_quirk(unsigned int reserve)
+static int __init visws_find_smp_config(unsigned int reserve)
 {
 	struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
 	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
 	return 1;
 }
 
-extern int visws_trap_init_quirk(void);
+static int visws_trap_init(void);
+
+static struct x86_quirks visws_x86_quirks __initdata = {
+	.arch_time_init		= visws_time_init,
+	.arch_pre_intr_init	= visws_pre_intr_init,
+	.arch_memory_setup	= visws_memory_setup,
+	.arch_intr_init		= NULL,
+	.arch_trap_init		= visws_trap_init,
+	.mach_get_smp_config	= visws_get_smp_config,
+	.mach_find_smp_config	= visws_find_smp_config,
+};
 
 void __init visws_early_detect(void)
 {
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
 
 	/*
 	 * Install special quirks for timer, interrupt and memory setup:
-	 */
-	arch_time_init_quirk		= visws_time_init_quirk;
-	arch_pre_intr_init_quirk	= visws_pre_intr_init_quirk;
-	arch_memory_setup_quirk		= visws_memory_setup_quirk;
-
-	/*
 	 * Fall back to generic behavior for traps:
+	 * Override generic MP-table parsing:
 	 */
-	arch_intr_init_quirk		= NULL;
-	arch_trap_init_quirk		= visws_trap_init_quirk;
+	x86_quirks = &visws_x86_quirks;
 
 	/*
 	 * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
 	 */
 	no_broadcast = 0;
 
-	/*
-	 * Override generic MP-table parsing:
-	 */
-	mach_get_smp_config_quirk	= visws_get_smp_config_quirk;
-	mach_find_smp_config_quirk	= visws_find_smp_config_quirk;
-
 #ifdef CONFIG_X86_IO_APIC
 	/*
 	 * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
 		co_apic_read(CO_APIC_ID));
 }
 
-int __init visws_trap_init_quirk(void)
+static int __init visws_trap_init(void)
 {
 	lithium_init();
 	cobalt_init();

+ 0 - 1
arch/x86/kernel/vmi_32.c

@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
 #ifdef CONFIG_X86_LOCAL_APIC
 	para_fill(pv_apic_ops.apic_read, APICRead);
 	para_fill(pv_apic_ops.apic_write, APICWrite);
-	para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
 #endif
 
 	/*

+ 0 - 1
arch/x86/lguest/boot.c

@@ -991,7 +991,6 @@ __init void lguest_init(void)
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
 	pv_apic_ops.apic_write = lguest_apic_write;
-	pv_apic_ops.apic_write_atomic = lguest_apic_write;
 	pv_apic_ops.apic_read = lguest_apic_read;
 #endif
 

+ 18 - 16
arch/x86/mach-default/setup.c

@@ -10,14 +10,6 @@
 #include <asm/e820.h>
 #include <asm/setup.h>
 
-/*
- * Any quirks to be performed to initialize timers/irqs/etc?
- */
-int (*arch_time_init_quirk)(void);
-int (*arch_pre_intr_init_quirk)(void);
-int (*arch_intr_init_quirk)(void);
-int (*arch_trap_init_quirk)(void);
-
 #ifdef CONFIG_HOTPLUG_CPU
 #define DEFAULT_SEND_IPI	(1)
 #else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
  **/
 void __init pre_intr_init_hook(void)
 {
-	if (arch_pre_intr_init_quirk) {
-		if (arch_pre_intr_init_quirk())
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
 			return;
 	}
 	init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
  **/
 void __init intr_init_hook(void)
 {
-	if (arch_intr_init_quirk) {
-		if (arch_intr_init_quirk())
+	if (x86_quirks->arch_intr_init) {
+		if (x86_quirks->arch_intr_init())
 			return;
 	}
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
 **/
 void __init trap_init_hook(void)
 {
-	if (arch_trap_init_quirk) {
-		if (arch_trap_init_quirk())
+	if (x86_quirks->arch_trap_init) {
+		if (x86_quirks->arch_trap_init())
 			return;
 	}
 }
@@ -110,6 +102,16 @@ static struct irqaction irq0  = {
 	.name = "timer"
 };
 
+/**
+ * pre_time_init_hook - do any specific initialisations before.
+ *
+ **/
+void __init pre_time_init_hook(void)
+{
+	if (x86_quirks->arch_pre_time_init)
+		x86_quirks->arch_pre_time_init();
+}
+
 /**
  * time_init_hook - do any specific initialisations for the system timer.
 *
@@ -119,13 +121,13 @@ static struct irqaction irq0  = {
 **/
 void __init time_init_hook(void)
 {
-	if (arch_time_init_quirk) {
+	if (x86_quirks->arch_time_init) {
 		/*
 		 * A nonzero return code does not mean failure, it means
 		 * that the architecture quirk does not want any
 		 * generic (timer) setup to be performed after this:
 		 */
-		if (arch_time_init_quirk())
+		if (x86_quirks->arch_time_init())
 			return;
 	}
 

+ 1 - 0
arch/x86/mm/Makefile

@@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
 endif
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
 
+obj-$(CONFIG_MEMTEST)		+= memtest.o

+ 3 - 2
arch/x86/mm/init_32.c

@@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		reserve_early(table_start << PAGE_SHIFT,
 				 table_end << PAGE_SHIFT, "PGTABLE");
 
+	if (!after_init_bootmem)
+		early_memtest(start, end);
+
 	return end >> PAGE_SHIFT;
 }
 
@@ -868,8 +871,6 @@ void __init paging_init(void)
 	 */
 	sparse_init();
 	zone_sizes_init();
-
-	paravirt_post_allocator_init();
 }
 
 /*

+ 0 - 112
arch/x86/mm/init_64.c

@@ -517,118 +517,6 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-#ifdef CONFIG_MEMTEST
-
-static void __init memtest(unsigned long start_phys, unsigned long size,
-				 unsigned pattern)
-{
-	unsigned long i;
-	unsigned long *start;
-	unsigned long start_bad;
-	unsigned long last_bad;
-	unsigned long val;
-	unsigned long start_phys_aligned;
-	unsigned long count;
-	unsigned long incr;
-
-	switch (pattern) {
-	case 0:
-		val = 0UL;
-		break;
-	case 1:
-		val = -1UL;
-		break;
-	case 2:
-		val = 0x5555555555555555UL;
-		break;
-	case 3:
-		val = 0xaaaaaaaaaaaaaaaaUL;
-		break;
-	default:
-		return;
-	}
-
-	incr = sizeof(unsigned long);
-	start_phys_aligned = ALIGN(start_phys, incr);
-	count = (size - (start_phys_aligned - start_phys))/incr;
-	start = __va(start_phys_aligned);
-	start_bad = 0;
-	last_bad = 0;
-
-	for (i = 0; i < count; i++)
-		start[i] = val;
-	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-		if (*start != val) {
-			if (start_phys_aligned == last_bad + incr) {
-				last_bad += incr;
-			} else {
-				if (start_bad) {
-					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
-						val, start_bad, last_bad + incr);
-					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-				}
-				start_bad = last_bad = start_phys_aligned;
-			}
-		}
-	}
-	if (start_bad) {
-		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
-			val, start_bad, last_bad + incr);
-		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-	}
-
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
-	if (arg)
-		memtest_pattern = simple_strtoul(arg, NULL, 0);
-	return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
-	u64 t_start, t_size;
-	unsigned pattern;
-
-	if (!memtest_pattern)
-		return;
-
-	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
-	for (pattern = 0; pattern < memtest_pattern; pattern++) {
-		t_start = start;
-		t_size = 0;
-		while (t_start < end) {
-			t_start = find_e820_area_size(t_start, &t_size, 1);
-
-			/* done ? */
-			if (t_start >= end)
-				break;
-			if (t_start + t_size > end)
-				t_size = end - t_start;
-
-			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
-				(unsigned long long)t_start,
-				(unsigned long long)t_start + t_size, pattern);
-
-			memtest(t_start, t_size, pattern);
-
-			t_start += t_size;
-		}
-	}
-	printk(KERN_CONT "\n");
-}
-#else
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)

+ 123 - 0
arch/x86/mm/memtest.c

@@ -0,0 +1,123 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+
+#include <asm/e820.h>
+
+static void __init memtest(unsigned long start_phys, unsigned long size,
+				 unsigned pattern)
+{
+	unsigned long i;
+	unsigned long *start;
+	unsigned long start_bad;
+	unsigned long last_bad;
+	unsigned long val;
+	unsigned long start_phys_aligned;
+	unsigned long count;
+	unsigned long incr;
+
+	switch (pattern) {
+	case 0:
+		val = 0UL;
+		break;
+	case 1:
+		val = -1UL;
+		break;
+	case 2:
+#ifdef CONFIG_X86_64
+		val = 0x5555555555555555UL;
+#else
+		val = 0x55555555UL;
+#endif
+		break;
+	case 3:
+#ifdef CONFIG_X86_64
+		val = 0xaaaaaaaaaaaaaaaaUL;
+#else
+		val = 0xaaaaaaaaUL;
+#endif
+		break;
+	default:
+		return;
+	}
+
+	incr = sizeof(unsigned long);
+	start_phys_aligned = ALIGN(start_phys, incr);
+	count = (size - (start_phys_aligned - start_phys))/incr;
+	start = __va(start_phys_aligned);
+	start_bad = 0;
+	last_bad = 0;
+
+	for (i = 0; i < count; i++)
+		start[i] = val;
+	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+		if (*start != val) {
+			if (start_phys_aligned == last_bad + incr) {
+				last_bad += incr;
+			} else {
+				if (start_bad) {
+					printk(KERN_CONT "\n  %010lx bad mem addr %010lx - %010lx reserved",
+						val, start_bad, last_bad + incr);
+					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+				}
+				start_bad = last_bad = start_phys_aligned;
+			}
+		}
+	}
+	if (start_bad) {
+		printk(KERN_CONT "\n  %016lx bad mem addr %010lx - %010lx reserved",
+			val, start_bad, last_bad + incr);
+		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+	}
+
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+	if (arg)
+		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(unsigned long start, unsigned long end)
+{
+	u64 t_start, t_size;
+	unsigned pattern;
+
+	if (!memtest_pattern)
+		return;
+
+	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+	for (pattern = 0; pattern < memtest_pattern; pattern++) {
+		t_start = start;
+		t_size = 0;
+		while (t_start < end) {
+			t_start = find_e820_area_size(t_start, &t_size, 1);
+
+			/* done ? */
+			if (t_start >= end)
+				break;
+			if (t_start + t_size > end)
+				t_size = end - t_start;
+
+			printk(KERN_CONT "\n  %010llx - %010llx pattern %d",
+				(unsigned long long)t_start,
+				(unsigned long long)t_start + t_size, pattern);
+
+			memtest(t_start, t_size, pattern);
+
+			t_start += t_size;
+		}
+	}
+	printk(KERN_CONT "\n");
+}

+ 88 - 0
arch/x86/mm/pat.c

@@ -12,6 +12,8 @@
 #include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
@@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
 
 	free_memtype(addr, addr + size);
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+/* get Nth element of the linked list */
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+	struct memtype *list_node, *print_entry;
+	int i = 1;
+
+	print_entry  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+	if (!print_entry)
+		return NULL;
+
+	spin_lock(&memtype_lock);
+	list_for_each_entry(list_node, &memtype_list, nd) {
+		if (pos == i) {
+			*print_entry = *list_node;
+			spin_unlock(&memtype_lock);
+			return print_entry;
+		}
+		++i;
+	}
+	spin_unlock(&memtype_lock);
+	kfree(print_entry);
+	return NULL;
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos == 0) {
+		++*pos;
+		seq_printf(seq, "PAT memtype list:\n");
+	}
+
+	return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+	struct memtype *print_entry = (struct memtype *)v;
+
+	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+			print_entry->start, print_entry->end);
+	kfree(print_entry);
+	return 0;
+}
+
+static struct seq_operations memtype_seq_ops = {
+	.start = memtype_seq_start,
+	.next  = memtype_seq_next,
+	.stop  = memtype_seq_stop,
+	.show  = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+	.open    = memtype_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+	debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
+				NULL, &memtype_fops);
+	return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS */

+ 6 - 6
arch/x86/pci/Makefile

@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig_$(BITS).o direct.o mmconfig-shared.o
 obj-$(CONFIG_PCI_DIRECT)	+= direct.o
 obj-$(CONFIG_PCI_OLPC)		+= olpc.o
 
-pci-y				:= fixup.o
-pci-$(CONFIG_ACPI)		+= acpi.o
-pci-y				+= legacy.o irq.o
+obj-y				+= fixup.o
+obj-$(CONFIG_ACPI)		+= acpi.o
+obj-y				+= legacy.o irq.o
 
-pci-$(CONFIG_X86_VISWS)		+= visws.o
+obj-$(CONFIG_X86_VISWS)		+= visws.o
 
-pci-$(CONFIG_X86_NUMAQ)		+= numa.o
+obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 
-obj-y				+= $(pci-y) common.o early.o
+obj-y				+= common.o early.o
 obj-y				+= amd_bus.o

+ 6 - 3
arch/x86/pci/legacy.c

@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
 
 int __init pci_subsys_init(void)
 {
+#ifdef CONFIG_X86_NUMAQ
+	pci_numaq_init();
+#endif
 #ifdef CONFIG_ACPI
 	pci_acpi_init();
+#endif
+#ifdef CONFIG_X86_VISWS
+	pci_visws_init();
 #endif
 	pci_legacy_init();
 	pcibios_irq_init();
-#ifdef CONFIG_X86_NUMAQ
-	pci_numa_init();
-#endif
 	pcibios_init();
 
 	return 0;

+ 2 - 2
arch/x86/pci/numa.c → arch/x86/pci/numaq_32.c

@@ -1,5 +1,5 @@
 /*
- * numa.c - Low-level PCI access for NUMA-Q machines
+ * numaq_32.c - Low-level PCI access for NUMA-Q machines
  */
 
 #include <linux/pci.h>
@@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
 
-int __init pci_numa_init(void)
+int __init pci_numaq_init(void)
 {
 	int quad;
 

+ 2 - 1
arch/x86/pci/pci.h

@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
 /* some common used subsys_initcalls */
 extern int __init pci_acpi_init(void);
 extern int __init pcibios_irq_init(void);
-extern int __init pci_numa_init(void);
+extern int __init pci_visws_init(void);
+extern int __init pci_numaq_init(void);
 extern int __init pcibios_init(void);
 
 /* pci-mmconfig.c */

+ 7 - 16
arch/x86/pci/visws.c

@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
-static int __init pci_visws_init(void)
+int __init pci_visws_init(void)
 {
+	if (!is_visws_box())
+		return -1;
+
+	pcibios_enable_irq = &pci_visws_enable_irq;
+	pcibios_disable_irq = &pci_visws_disable_irq;
+
 	/* The VISWS supports configuration access type 1 only */
 	pci_probe = (pci_probe | PCI_PROBE_CONF1) &
 		    ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
 	pcibios_resource_survey();
 	return 0;
 }
-
-static __init int pci_subsys_init(void)
-{
-	if (!is_visws_box())
-		return -1;
-
-	pcibios_enable_irq = &pci_visws_enable_irq;
-	pcibios_disable_irq = &pci_visws_disable_irq;
-
-	pci_visws_init();
-	pcibios_init();
-
-	return 0;
-}
-subsys_initcall(pci_subsys_init);

+ 1 - 1
arch/x86/vdso/Makefile

@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
 obj-$(VDSO32-y)			+= vdso32-syms.lds
-vdso32.so-$(CONFIG_X86_32)	+= int80
+vdso32.so-$(VDSO32-y)		+= int80
 vdso32.so-$(CONFIG_COMPAT)	+= syscall
 vdso32.so-$(VDSO32-y)		+= sysenter
 

+ 9 - 10
arch/x86/vdso/vdso32-setup.c

@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
 	}
 }
 
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
 static struct page *vdso32_pages[1];
 
 #ifdef CONFIG_X86_64
 
 #define	vdso32_sysenter()	(boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define	vdso32_syscall()	(boot_cpu_has(X86_FEATURE_SYSCALL32))
 
 /* May not be __init: called during resume */
 void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
 #else  /* CONFIG_X86_32 */
 
 #define vdso32_sysenter()	(boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall()	(0)
 
 void enable_sep_cpu(void)
 {
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
 	gate_vma_init();
 #endif
 
-	if (!vdso32_sysenter()) {
-		vsyscall = &vdso32_default_start;
-		vsyscall_len = &vdso32_default_end - &vdso32_default_start;
-	} else {
+	if (vdso32_syscall()) {
+		vsyscall = &vdso32_syscall_start;
+		vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+	} else if (vdso32_sysenter()){
 		vsyscall = &vdso32_sysenter_start;
 		vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+	} else {
+		vsyscall = &vdso32_int80_start;
+		vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
 	}
 
 	memcpy(syscall_page, vsyscall, vsyscall_len);

+ 8 - 5
arch/x86/vdso/vdso32.S

@@ -2,14 +2,17 @@
 
 __INITDATA
 
-	.globl vdso32_default_start, vdso32_default_end
-vdso32_default_start:
-#ifdef CONFIG_X86_32
+	.globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
 	.incbin "arch/x86/vdso/vdso32-int80.so"
-#else
+vdso32_int80_end:
+
+	.globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+#ifdef CONFIG_COMPAT
 	.incbin "arch/x86/vdso/vdso32-syscall.so"
 #endif
-vdso32_default_end:
+vdso32_syscall_end:
 
 	.globl vdso32_sysenter_start, vdso32_sysenter_end
 vdso32_sysenter_start:

+ 6 - 5
arch/x86/vdso/vma.c

@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
 extern char vdso_start[], vdso_end[];
 extern unsigned short vdso_sync_cpuid;
 
-struct page **vdso_pages;
+static struct page **vdso_pages;
+static unsigned vdso_size;
 
 static inline void *var_ref(void *p, char *name)
 {
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
 	int i;
 	char *vbase;
 
+	vdso_size = npages << PAGE_SHIFT;
 	vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
 	if (!vdso_pages)
 		goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
 	int ret;
-	unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
 
 	if (!vdso_enabled)
 		return 0;
 
 	down_write(&mm->mmap_sem);
-	addr = vdso_addr(mm->start_stack, len);
-	addr = get_unmapped_area(NULL, addr, len, 0, 0);
+	addr = vdso_addr(mm->start_stack, vdso_size);
+	addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
 		goto up_fail;
 	}
 
-	ret = install_special_mapping(mm, addr, len,
+	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,

+ 10 - 4
arch/x86/xen/Kconfig

@@ -6,8 +6,8 @@ config XEN
 	bool "Xen guest support"
 	select PARAVIRT
 	select PARAVIRT_CLOCK
-	depends on X86_32
-	depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+	depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+	depends on X86_CMPXCHG && X86_TSC
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
 
 config XEN_MAX_DOMAIN_MEMORY
        int "Maximum allowed size of a domain in gigabytes"
-       default 8
+       default 8 if X86_32
+       default 32 if X86_64
       depends on XEN
       help
         The pseudo-physical to machine address array is sized
         according to the maximum possible memory size of a Xen
         domain.  This array uses 1 page per gigabyte, so there's no
-         need to be too stingy here.
+         need to be too stingy here.
+
+config XEN_SAVE_RESTORE
+       bool
+       depends on PM
+       default y

+ 1 - 1
arch/x86/xen/Makefile

@@ -1,4 +1,4 @@
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o \
-			time.o xen-asm.o grant-table.o suspend.o
+			time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)	+= smp.o

File diff suppressed because it is too large
+ 534 - 148
arch/x86/xen/enlighten.c


+ 227 - 89
arch/x86/xen/mmu.c

@@ -44,8 +44,10 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/fixmap.h>
 #include <asm/mmu_context.h>
 #include <asm/paravirt.h>
+#include <asm/linkage.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
 #include "multicalls.h"
 #include "mmu.h"
 
+/*
+ * Just beyond the highest usermode address.  STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+
 #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 
 /* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
-	__attribute__((section(".data.page_aligned"))) =
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
 
  /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES]
-	__attribute__((section(".data.page_aligned"))) =
+static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
 
 /* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES]
-	__attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
 
-static unsigned long p2m_top_mfn_list[
-			PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
-	__attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+	__page_aligned_bss;
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	p2m_top[topidx][idx] = mfn;
 }
 
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
+	unsigned long address = (unsigned long)vaddr;
 	unsigned int level;
 	pte_t *pte = lookup_address(address, &level);
 	unsigned offset = address & ~PAGE_MASK;
 
 	BUG_ON(pte == NULL);
 
-	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 }
 
 void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 	xen_mc_batch();
 
-	u.ptr = virt_to_machine(ptr).maddr;
+	/* ptr may be ioremapped for 64-bit pagetable setup */
+	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 	u.val = pmd_val_ma(val);
 	extend_mmu_update(&u);
 
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
  */
 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	if (pgd_none(*pgd)) {
-		BUG();
-		return;
-	}
-	pud = pud_offset(pgd, vaddr);
-	if (pud_none(*pud)) {
-		BUG();
-		return;
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (pmd_none(*pmd)) {
-		BUG();
-		return;
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
-	/* <mfn,flags> stored as-is, to permit clearing entries */
-	xen_set_pte(pte, mfn_pte(mfn, flags));
-
-	/*
-	 * It's enough to flush this one mapping.
-	 * (PGE mappings get flushed as well)
-	 */
-	__flush_tlb_one(vaddr);
+	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 }
 
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 	xen_mc_batch();
 
-	u.ptr = virt_to_machine(ptr).maddr;
+	/* ptr may be ioremapped for 64-bit pagetable setup */
+	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 	u.val = pud_val_ma(val);
 	extend_mmu_update(&u);
 
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
+#ifdef CONFIG_X86_PAE
 	ptep->pte_high = pte.pte_high;
 	smp_wmb();
 	ptep->pte_low = pte.pte_low;
+#else
+	*ptep = pte;
+#endif
 }
 
+#ifdef CONFIG_X86_PAE
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-	set_64bit((u64 *)ptep, pte_val_ma(pte));
+	set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
 {
 	set_pmd(pmdp, __pmd(0));
 }
+#endif	/* CONFIG_X86_PAE */
 
 pmd_t xen_make_pmd(pmdval_t pmd)
 {
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
 	return native_make_pmd(pmd);
 }
 
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud)
+{
+	return pte_mfn_to_pfn(pud.pud);
+}
+
+pud_t xen_make_pud(pudval_t pud)
+{
+	pud = pte_pfn_to_mfn(pud);
+
+	return native_make_pud(pud);
+}
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+	unsigned offset = pgd - pgd_page;
+	pgd_t *user_ptr = NULL;
+
+	if (offset < pgd_index(USER_LIMIT)) {
+		struct page *page = virt_to_page(pgd_page);
+		user_ptr = (pgd_t *)page->private;
+		if (user_ptr)
+			user_ptr += offset;
+	}
+
+	return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pgd_val_ma(val);
+	extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure.  This implies:
+ *  1. The only existing pagetable is the kernel's
+ *  2. It is always pinned
+ *  3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+	preempt_disable();
+
+	xen_mc_batch();
+
+	__xen_set_pgd_hyper(ptr, val);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
+}
+
+void xen_set_pgd(pgd_t *ptr, pgd_t val)
+{
+	pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
+	/* If page is not pinned, we can just update the entry
+	   directly */
+	if (!page_pinned(ptr)) {
+		*ptr = val;
+		if (user_ptr) {
+			WARN_ON(page_pinned(user_ptr));
+			*user_ptr = val;
+		}
+		return;
+	}
+
+	/* If it's pinned, then we can at least batch the kernel and
+	   user updates together. */
+	xen_mc_batch();
+
+	__xen_set_pgd_hyper(ptr, val);
+	if (user_ptr)
+		__xen_set_pgd_hyper(user_ptr, val);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+#endif	/* PAGETABLE_LEVELS == 4 */
+
 /*
-  (Yet another) pagetable walker.  This one is intended for pinning a
-  pagetable.  This means that it walks a pagetable and calls the
-  callback function on each page it finds making up the page table,
-  at every level.  It walks the entire pagetable, but it only bothers
-  pinning pte pages which are below pte_limit.  In the normal case
-  this will be TASK_SIZE, but at boot we need to pin up to
-  FIXADDR_TOP.  But the important bit is that we don't pin beyond
-  there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * (Yet another) pagetable walker.  This one is intended for pinning a
+ * pagetable.  This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level.  It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit.  In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
-	pgd_t *pgd = pgd_base;
 	int flush = 0;
-	unsigned long addr = 0;
-	unsigned long pgd_next;
+	unsigned hole_low, hole_high;
+	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+	unsigned pgdidx, pudidx, pmdidx;
 
-	BUG_ON(limit > FIXADDR_TOP);
+	/* The limit is the last byte to be touched */
+	limit--;
+	BUG_ON(limit >= FIXADDR_TOP);
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 0;
 
-	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+	/*
+	 * 64-bit has a great big hole in the middle of the address
+	 * space, which contains the Xen mappings.  On 32-bit these
+	 * will end up making a zero-sized hole, so this is a no-op.
+	 */
+	hole_low = pgd_index(USER_LIMIT);
+	hole_high = pgd_index(PAGE_OFFSET);
+
+	pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+	pudidx_limit = pud_index(limit);
+#else
+	pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+	pmdidx_limit = pmd_index(limit);
+#else
+	pmdidx_limit = 0;
+#endif
+
+	flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 		pud_t *pud;
 		pud_t *pud;
-		unsigned long pud_limit, pud_next;
 
 
-		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+		if (pgdidx >= hole_low && pgdidx < hole_high)
+			continue;
 
 
-		if (!pgd_val(*pgd))
+		if (!pgd_val(pgd[pgdidx]))
 			continue;
 			continue;
 
 
-		pud = pud_offset(pgd, 0);
+		pud = pud_offset(&pgd[pgdidx], 0);
 
 
 		if (PTRS_PER_PUD > 1) /* not folded */
 		if (PTRS_PER_PUD > 1) /* not folded */
 			flush |= (*func)(virt_to_page(pud), PT_PUD);
 			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 
-		for (; addr != pud_limit; pud++, addr = pud_next) {
+		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 			pmd_t *pmd;
 			pmd_t *pmd;
-			unsigned long pmd_limit;
 
 
-			pud_next = pud_addr_end(addr, pud_limit);
-
-			if (pud_next < limit)
-				pmd_limit = pud_next;
-			else
-				pmd_limit = limit;
+			if (pgdidx == pgdidx_limit &&
+			    pudidx > pudidx_limit)
+				goto out;
 
 
-			if (pud_none(*pud))
+			if (pud_none(pud[pudidx]))
 				continue;
 				continue;
 
 
-			pmd = pmd_offset(pud, 0);
+			pmd = pmd_offset(&pud[pudidx], 0);
 
 
 			if (PTRS_PER_PMD > 1) /* not folded */
 			if (PTRS_PER_PMD > 1) /* not folded */
 				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 
-			for (; addr != pmd_limit; pmd++) {
-				addr += (PAGE_SIZE * PTRS_PER_PTE);
-				if ((pmd_limit-1) < (addr-1)) {
-					addr = pmd_limit;
-					break;
-				}
+			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+				struct page *pte;
+
+				if (pgdidx == pgdidx_limit &&
+				    pudidx == pudidx_limit &&
+				    pmdidx > pmdidx_limit)
+					goto out;
 
 
-				if (pmd_none(*pmd))
+				if (pmd_none(pmd[pmdidx]))
 					continue;
 					continue;
 
 
-				flush |= (*func)(pmd_page(*pmd), PT_PTE);
+				pte = pmd_page(pmd[pmdidx]);
+				flush |= (*func)(pte, PT_PTE);
 			}
 			}
 		}
 		}
 	}
 	}
-
-	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
 
 
 	return flush;
 	return flush;
 }
 }
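
The rewritten walker above iterates by table index rather than by address. For readers who want to sanity-check the bounds (pgd_index(limit) and friends), here is a stand-alone user-space sketch of the standard x86-64 4-level index split the walker relies on; the constants are architectural, not taken from this patch:

	#include <stdio.h>

	/* x86-64, 4 KiB pages: 9 index bits per level above the 12-bit page offset. */
	#define LVL_SHIFT(l)	(12 + 9 * (l))		/* 0 = pte, 1 = pmd, 2 = pud, 3 = pgd */
	#define LVL_INDEX(va, l)	(((va) >> LVL_SHIFT(l)) & 0x1ffUL)

	int main(void)
	{
		unsigned long va = 0xffffffff80000000UL;	/* example kernel-text address */

		printf("pgd=%lu pud=%lu pmd=%lu pte=%lu\n",
		       LVL_INDEX(va, 3), LVL_INDEX(va, 2),
		       LVL_INDEX(va, 1), LVL_INDEX(va, 0));
		return 0;
	}
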
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
 {
 {
 	xen_mc_batch();
 	xen_mc_batch();
 
 
-	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+	if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
 		/* re-enable interrupts for kmap_flush_unused */
 		/* re-enable interrupts for kmap_flush_unused */
 		xen_mc_issue(0);
 		xen_mc_issue(0);
 		kmap_flush_unused();
 		kmap_flush_unused();
 		xen_mc_batch();
 		xen_mc_batch();
 	}
 	}
 
 
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+		if (user_pgd) {
+			pin_page(virt_to_page(user_pgd), PT_PGD);
+			xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+		}
+	}
+#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_PAE
+	/* Need to make sure unshared kernel PMD is pinnable */
+	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
 	xen_mc_issue(0);
 	xen_mc_issue(0);
 }
 }
 
 
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
 	spin_unlock_irqrestore(&pgd_lock, flags);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 }
 
 
-/* The init_mm pagetable is really pinned as soon as its created, but
-   that's before we have page structures to store the bits.  So do all
-   the book-keeping now. */
+/*
+ * The init_mm pagetable is really pinned as soon as it's created, but
+ * that's before we have page structures to store the bits.  So do all
+ * the book-keeping now.
+ */
 static __init int mark_pinned(struct page *page, enum pt_level level)
 static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 {
 	SetPagePinned(page);
 	SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
 
 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 
-	pgd_walk(pgd, unpin_page, TASK_SIZE);
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+		if (user_pgd) {
+			xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+			unpin_page(virt_to_page(user_pgd), PT_PGD);
+		}
+	}
+#endif
+
+#ifdef CONFIG_X86_PAE
+	/* Need to make sure unshared kernel PMD is unpinned */
+	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
+	pgd_walk(pgd, unpin_page, USER_LIMIT);
 
 
 	xen_mc_issue(0);
 	xen_mc_issue(0);
 }
 }
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
 	list_for_each_entry(page, &pgd_list, lru) {
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
 		if (PageSavePinned(page)) {
 			BUG_ON(!PagePinned(page));
 			BUG_ON(!PagePinned(page));
-			printk("unpinning pinned %p\n", page_address(page));
 			xen_pgd_unpin((pgd_t *)page_address(page));
 			xen_pgd_unpin((pgd_t *)page_address(page));
 			ClearPageSavePinned(page);
 			ClearPageSavePinned(page);
 		}
 		}
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static void drop_other_mm_ref(void *info)
 static void drop_other_mm_ref(void *info)
 {
 {
 	struct mm_struct *mm = info;
 	struct mm_struct *mm = info;
+	struct mm_struct *active_mm;
+
+#ifdef CONFIG_X86_64
+	active_mm = read_pda(active_mm);
+#else
+	active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
+#endif
 
 
-	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+	if (active_mm == mm)
 		leave_mm(smp_processor_id());
 		leave_mm(smp_processor_id());
 
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	/* If this cpu still has a stale cr3 reference, then make sure

+ 15 - 14
arch/x86/xen/mmu.h

@@ -10,18 +10,6 @@ enum pt_level {
 	PT_PTE
 	PT_PTE
 };
 };
 
 
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
 
 
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
 void xen_set_pte(pte_t *ptep, pte_t pteval);
 void xen_set_pte(pte_t *ptep, pte_t pteval);
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval);
 		    pte_t *ptep, pte_t pteval);
+
+#ifdef CONFIG_X86_PAE
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+#endif	/* CONFIG_X86_PAE */
+
 void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pud(pud_t *ptr, pud_t val);
 void xen_set_pud(pud_t *ptr, pud_t val);
 void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pud_hyper(pud_t *ptr, pud_t val);
 void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud);
+pud_t xen_make_pud(pudval_t pudval);
+void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
+void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
+#endif
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
 
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
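
The PAGETABLE_LEVELS == 4 prototypes added above are only useful once they are plugged into the paravirt mmu ops; that wiring lives in xen/enlighten.c, which is not shown in this excerpt. As an assumption about what the companion change looks like (field names follow the usual pv_mmu_ops layout, not quoted from this patch):

	#if PAGETABLE_LEVELS == 4
		/* Assumed wiring in the pv_mmu_ops initializer (sketch, not shown here). */
		.pud_val  = xen_pud_val,
		.make_pud = xen_make_pud,
		.set_pgd  = xen_set_pgd,	/* xen_set_pgd_hyper during early boot is an assumption */
	#endif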

+ 1 - 0
arch/x86/xen/multicalls.c

@@ -76,6 +76,7 @@ void xen_mc_flush(void)
 		if (ret) {
 		if (ret) {
 			printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
 			printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
 			       ret, smp_processor_id());
 			       ret, smp_processor_id());
+			dump_stack();
 			for (i = 0; i < b->mcidx; i++) {
 			for (i = 0; i < b->mcidx; i++) {
 				printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
 				printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
 				       i+1, b->mcidx,
 				       i+1, b->mcidx,

+ 59 - 20
arch/x86/xen/setup.c

@@ -83,30 +83,72 @@ static void xen_idle(void)
 
 
 /*
 /*
  * Set the bit indicating "nosegneg" library variants should be used.
  * Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
  */
  */
 static void __init fiddle_vdso(void)
 static void __init fiddle_vdso(void)
 {
 {
-	extern const char vdso32_default_start;
-	u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
+#ifdef CONFIG_X86_32
+	u32 *mask;
+	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
 	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
 }
 }
 
 
-void xen_enable_sysenter(void)
+static __cpuinit int register_callback(unsigned type, const void *func)
 {
 {
-	int cpu = smp_processor_id();
-	extern void xen_sysenter_target(void);
-	/* Mask events on entry, even though they get enabled immediately */
-	static struct callback_register sysenter = {
-		.type = CALLBACKTYPE_sysenter,
-		.address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+	struct callback_register callback = {
+		.type = type,
+		.address = XEN_CALLBACK(__KERNEL_CS, func),
 		.flags = CALLBACKF_mask_events,
 		.flags = CALLBACKF_mask_events,
 	};
 	};
 
 
-	if (!boot_cpu_has(X86_FEATURE_SEP) ||
-	    HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
-		clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void __cpuinit xen_enable_sysenter(void)
+{
+	extern void xen_sysenter_target(void);
+	int ret;
+	unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+	sysenter_feature = X86_FEATURE_SEP;
+#else
+	sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+	if (!boot_cpu_has(sysenter_feature))
+		return;
+
+	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+	if (ret != 0)
+		setup_clear_cpu_cap(sysenter_feature);
+}
+
+void __cpuinit xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+	int ret;
+	extern void xen_syscall_target(void);
+	extern void xen_syscall32_target(void);
+
+	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+	if (ret != 0) {
+		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+		/* Pretty fatal; 64-bit userspace has no other
+		   mechanism for syscalls. */
 	}
 	}
+
+	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+		ret = register_callback(CALLBACKTYPE_syscall32,
+					xen_syscall32_target);
+		if (ret != 0)
+			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+	}
+#endif /* CONFIG_X86_64 */
 }
 }
 
 
 void __init xen_arch_setup(void)
 void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
 		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
 
 
-	HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
-				 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+		BUG();
 
 
 	xen_enable_sysenter();
 	xen_enable_sysenter();
+	xen_enable_syscall();
 
 
 	set_iopl.iopl = 1;
 	set_iopl.iopl = 1;
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
 
 
 	pm_idle = xen_idle;
 	pm_idle = xen_idle;
 
 
-#ifdef CONFIG_SMP
-	/* fill cpus_possible with all available cpus */
-	xen_fill_possible_map();
-#endif
-
 	paravirt_disable_iospace();
 	paravirt_disable_iospace();
 
 
 	fiddle_vdso();
 	fiddle_vdso();
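
Since register_callback() above is just a typed wrapper around HYPERVISOR_callback_op(CALLBACKOP_register, ...), any further callback type would be registered the same way. A hypothetical extra registration, shown only for the calling pattern (CALLBACKTYPE_nmi and the handler name here are illustrative assumptions, not part of this patch):

	extern void xen_example_nmi_target(void);	/* hypothetical handler */
	int err;

	err = register_callback(CALLBACKTYPE_nmi, xen_example_nmi_target);
	if (err)
		printk(KERN_WARNING "Xen: callback registration failed: %d\n", err);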

+ 87 - 50
arch/x86/xen/smp.c

@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
 	int cpu = smp_processor_id();
 	int cpu = smp_processor_id();
 
 
 	cpu_init();
 	cpu_init();
+	preempt_disable();
+
 	xen_enable_sysenter();
 	xen_enable_sysenter();
+	xen_enable_syscall();
 
 
-	preempt_disable();
-	per_cpu(cpu_state, cpu) = CPU_ONLINE;
+	cpu = smp_processor_id();
+	smp_store_cpu_info(cpu);
+	cpu_data(cpu).x86_max_cores = 1;
+	set_cpu_sibling_map(cpu);
 
 
 	xen_setup_cpu_clockevents();
 	xen_setup_cpu_clockevents();
 
 
+	cpu_set(cpu, cpu_online_map);
+	x86_write_percpu(cpu_state, CPU_ONLINE);
+	wmb();
+
 	/* We can take interrupts now: we're officially "up". */
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
 	local_irq_enable();
 
 
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
 	return rc;
 	return rc;
 }
 }
 
 
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
 {
 {
 	int i, rc;
 	int i, rc;
 
 
 	for (i = 0; i < NR_CPUS; i++) {
 	for (i = 0; i < NR_CPUS; i++) {
 		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
 		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-		if (rc >= 0)
+		if (rc >= 0) {
+			num_processors++;
 			cpu_set(i, cpu_possible_map);
 			cpu_set(i, cpu_possible_map);
+		}
 	}
 	}
 }
 }
 
 
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_smp_prepare_boot_cpu(void)
 {
 {
-	int cpu;
-
 	BUG_ON(smp_processor_id() != 0);
 	BUG_ON(smp_processor_id() != 0);
 	native_smp_prepare_boot_cpu();
 	native_smp_prepare_boot_cpu();
 
 
 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	   old memory can be recycled */
 	   old memory can be recycled */
-	make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
-	for_each_possible_cpu(cpu) {
-		cpus_clear(per_cpu(cpu_sibling_map, cpu));
-		/*
-		 * cpu_core_map lives in a per cpu area that is cleared
-		 * when the per cpu array is allocated.
-		 *
-		 * cpus_clear(per_cpu(cpu_core_map, cpu));
-		 */
-	}
+	make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
 
 
 	xen_setup_vcpu_info_placement();
 	xen_setup_vcpu_info_placement();
 }
 }
 
 
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
 {
 	unsigned cpu;
 	unsigned cpu;
 
 
-	for_each_possible_cpu(cpu) {
-		cpus_clear(per_cpu(cpu_sibling_map, cpu));
-		/*
-		 * cpu_core_ map will be zeroed when the per
-		 * cpu area is allocated.
-		 *
-		 * cpus_clear(per_cpu(cpu_core_map, cpu));
-		 */
-	}
-
 	smp_store_cpu_info(0);
 	smp_store_cpu_info(0);
+	cpu_data(0).x86_max_cores = 1;
 	set_cpu_sibling_map(0);
 	set_cpu_sibling_map(0);
 
 
 	if (xen_smp_intr_init(0))
 	if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 {
 	struct vcpu_guest_context *ctxt;
 	struct vcpu_guest_context *ctxt;
-	struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+	struct desc_struct *gdt;
 
 
 	if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
 	if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
 		return 0;
 		return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	if (ctxt == NULL)
 	if (ctxt == NULL)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
+	gdt = get_cpu_gdt_table(cpu);
+
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
-	ctxt->user_regs.fs = __KERNEL_PERCPU;
-	ctxt->user_regs.gs = 0;
 	ctxt->user_regs.ss = __KERNEL_DS;
 	ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+	ctxt->user_regs.fs = __KERNEL_PERCPU;
+#endif
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 
 
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 
 
 	ctxt->ldt_ents = 0;
 	ctxt->ldt_ents = 0;
 
 
-	BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
-	make_lowmem_page_readonly(gdt->gdt);
+	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+	make_lowmem_page_readonly(gdt);
 
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
-	ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
+	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_ents      = GDT_ENTRIES;
 
 
 	ctxt->user_regs.cs = __KERNEL_CS;
 	ctxt->user_regs.cs = __KERNEL_CS;
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->kernel_ss = __KERNEL_DS;
 	ctxt->kernel_ss = __KERNEL_DS;
 	ctxt->kernel_sp = idle->thread.sp0;
 	ctxt->kernel_sp = idle->thread.sp0;
 
 
+#ifdef CONFIG_X86_32
 	ctxt->event_callback_cs     = __KERNEL_CS;
 	ctxt->event_callback_cs     = __KERNEL_CS;
-	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_cs  = __KERNEL_CS;
 	ctxt->failsafe_callback_cs  = __KERNEL_CS;
+#endif
+	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
 	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
 
 
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	return 0;
 	return 0;
 }
 }
 
 
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
 {
 {
 	struct task_struct *idle = idle_task(cpu);
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
 	int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 		return rc;
 		return rc;
 #endif
 #endif
 
 
+#ifdef CONFIG_X86_64
+	/* Allocate node local memory for AP pdas */
+	WARN_ON(cpu == 0);
+	if (cpu > 0) {
+		rc = get_local_pda(cpu);
+		if (rc)
+			return rc;
+	}
+#endif
+
+#ifdef CONFIG_X86_32
 	init_gdt(cpu);
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
 	per_cpu(current_task, cpu) = idle;
 	irq_ctx_init(cpu);
 	irq_ctx_init(cpu);
+#else
+	cpu_pda(cpu)->pcurrent = idle;
+	clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
 	xen_setup_timer(cpu);
 	xen_setup_timer(cpu);
 
 
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+
 	/* make sure interrupts start blocked */
 	/* make sure interrupts start blocked */
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
 
 
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 	if (rc)
 	if (rc)
 		return rc;
 		return rc;
 
 
-	smp_store_cpu_info(cpu);
-	set_cpu_sibling_map(cpu);
-	/* This must be done before setting cpu_online_map */
-	wmb();
-
-	cpu_set(cpu, cpu_online_map);
-
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	BUG_ON(rc);
 	BUG_ON(rc);
 
 
+	while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+		barrier();
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
 {
 {
 }
 }
 
 
@@ -335,12 +345,12 @@ static void stop_self(void *v)
 	BUG();
 	BUG();
 }
 }
 
 
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
 {
 {
 	smp_call_function(stop_self, NULL, 0);
 	smp_call_function(stop_self, NULL, 0);
 }
 }
 
 
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
 {
 {
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
 }
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
 		xen_send_IPI_one(cpu, vector);
 		xen_send_IPI_one(cpu, vector);
 }
 }
 
 
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
 {
 {
 	int cpu;
 	int cpu;
 
 
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
 	}
 	}
 }
 }
 
 
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
 {
 {
 	xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
 	xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
 }
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
 {
 	irq_enter();
 	irq_enter();
 	generic_smp_call_function_interrupt();
 	generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
 	__get_cpu_var(irq_stat).irq_call_count++;
 	__get_cpu_var(irq_stat).irq_call_count++;
+#else
+	add_pda(irq_call_count, 1);
+#endif
 	irq_exit();
 	irq_exit();
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
 {
 	irq_enter();
 	irq_enter();
 	generic_smp_call_function_single_interrupt();
 	generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
 	__get_cpu_var(irq_stat).irq_call_count++;
 	__get_cpu_var(irq_stat).irq_call_count++;
+#else
+	add_pda(irq_call_count, 1);
+#endif
 	irq_exit();
 	irq_exit();
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
+
+static const struct smp_ops xen_smp_ops __initdata = {
+	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = xen_smp_prepare_cpus,
+	.cpu_up = xen_cpu_up,
+	.smp_cpus_done = xen_smp_cpus_done,
+
+	.smp_send_stop = xen_smp_send_stop,
+	.smp_send_reschedule = xen_smp_send_reschedule,
+
+	.send_call_func_ipi = xen_smp_send_call_function_ipi,
+	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+	smp_ops = xen_smp_ops;
+	xen_fill_possible_map();
+}
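
Taken together, the smp.c changes above invert the bringup handshake: the new CPU publishes its own sibling and online state, and the boot CPU merely waits for it. Condensed from the code above (no new behaviour, just the two sides put next to each other):

	/* AP side, in cpu_bringup_and_idle(): */
	smp_store_cpu_info(cpu);
	set_cpu_sibling_map(cpu);
	cpu_set(cpu, cpu_online_map);
	x86_write_percpu(cpu_state, CPU_ONLINE);
	wmb();					/* publish state before taking interrupts */
	local_irq_enable();

	/* BSP side, in xen_cpu_up(), after VCPUOP_up: */
	while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
		HYPERVISOR_sched_op(SCHEDOP_yield, 0);	/* let the new vcpu run */
		barrier();
	}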

+ 4 - 1
arch/x86/xen/suspend.c

@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
 		xen_cpu_initialized_map = cpu_online_map;
 		xen_cpu_initialized_map = cpu_online_map;
 #endif
 #endif
 		xen_vcpu_restore();
 		xen_vcpu_restore();
-		xen_timer_resume();
 	}
 	}
 
 
 }
 }
 
 
+void xen_arch_resume(void)
+{
+	/* nothing */
+}

+ 0 - 0
arch/x86/xen/xen-asm.S → arch/x86/xen/xen-asm_32.S


+ 271 - 0
arch/x86/xen/xen-asm_64.S

@@ -0,0 +1,271 @@
+/*
+	Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+	The inline versions are the same as the direct-use versions, with the
+	pre- and post-amble chopped off.
+
+	This code is encoded for size rather than absolute efficiency,
+	with a view to being able to inline as much as possible.
+
+	We only bother with direct forms (ie, vcpu in pda) of the operations
+	here; the indirect forms are better handled in C, since they're
+	generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)	.globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI	0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+	Enable events.  This clears the event mask and tests the pending
+	event status with one and operation.  If there are pending
+	events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+	/* Unmask events */
+	movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* Test for pending */
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+	jz 1f
+
+2:	call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+	ret
+	ENDPROC(xen_irq_enable_direct)
+	RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+	Disabling events is simply a matter of making the event mask
+	non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+	movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+	ret
+	ENDPROC(xen_irq_disable_direct)
+	RELOC(xen_irq_disable_direct, 0)
+
+/*
+	(xen_)save_fl is used to get the current interrupt enable status.
+	Callers expect the status to be in X86_EFLAGS_IF, and other bits
+	may be set in the return value.  We take advantage of this by
+	making sure that X86_EFLAGS_IF has the right value (and other bits
+	in that byte are 0), but other bits in the return value are
+	undefined.  We need to toggle the state of the bit, because
+	Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+	setz %ah
+	addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+	ret
+	ENDPROC(xen_save_fl_direct)
+	RELOC(xen_save_fl_direct, 0)
+
+/*
+	In principle the caller should be passing us a value returned
+	from xen_save_fl_direct, but for robustness' sake we test only
+	the X86_EFLAGS_IF flag rather than the whole byte. After
+	setting the interrupt mask state, it checks for unmasked
+	pending events and enters the hypervisor to get them delivered
+	if so.
+ */
+ENTRY(xen_restore_fl_direct)
+	testb $X86_EFLAGS_IF>>8, %ah
+	setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* check for unmasked and pending */
+	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+	jz 1f
+2:	call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+	ret
+	ENDPROC(xen_restore_fl_direct)
+	RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+	Force an event check by making a hypercall,
+	but preserve regs before making the call.
+ */
+check_events:
+	push %rax
+	push %rcx
+	push %rdx
+	push %rsi
+	push %rdi
+	push %r8
+	push %r9
+	push %r10
+	push %r11
+	call force_evtchn_callback
+	pop %r11
+	pop %r10
+	pop %r9
+	pop %r8
+	pop %rdi
+	pop %rsi
+	pop %rdx
+	pop %rcx
+	pop %rax
+	ret
+#endif
+
+ENTRY(xen_adjust_exception_frame)
+	mov 8+0(%rsp),%rcx
+	mov 8+8(%rsp),%r11
+	ret $16
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+	Xen64 iret frame:
+
+	ss
+	rsp
+	rflags
+	cs
+	rip		<-- standard iret frame
+
+	flags
+
+	rcx		}
+	r11		}<-- pushed by hypercall page
+rsp ->	rax		}
+ */
+ENTRY(xen_iret)
+	pushq $0
+1:	jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+	sysexit is not used for 64-bit processes, so it's
+	only ever used to return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+	pushq $__USER32_DS
+	pushq %rcx
+	pushq $X86_EFLAGS_IF
+	pushq $__USER32_CS
+	pushq %rdx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+	/* We're already on the usermode stack at this point, but still
+	   with the kernel gs, so we can easily switch back */
+	movq %rsp, %gs:pda_oldrsp
+	movq %gs:pda_kernelstack,%rsp
+
+	pushq $__USER_DS
+	pushq %gs:pda_oldrsp
+	pushq %r11
+	pushq $__USER_CS
+	pushq %rcx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+	/* We're already on the usermode stack at this point, but still
+	   with the kernel gs, so we can easily switch back */
+	movq %rsp, %gs:pda_oldrsp
+	movq %gs:pda_kernelstack, %rsp
+
+	pushq $__USER32_DS
+	pushq %gs:pda_oldrsp
+	pushq %r11
+	pushq $__USER32_CS
+	pushq %rcx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+	Xen handles syscall callbacks much like ordinary exceptions,
+	which means we have:
+	 - kernel gs
+	 - kernel rsp
+	 - an iret-like stack frame on the stack (including rcx and r11):
+		ss
+		rsp
+		rflags
+		cs
+		rip
+		r11
+	rsp->	rcx
+
+	In all the entrypoints, we undo all that to make it look
+	like a CPU-generated syscall/sysenter and jump to the normal
+	entrypoint.
+ */
+
+.macro undo_xen_syscall
+	mov 0*8(%rsp),%rcx
+	mov 1*8(%rsp),%r11
+	mov 5*8(%rsp),%rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+	undo_xen_syscall
+	jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+	undo_xen_syscall
+	jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+	undo_xen_syscall
+	jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+	lea 16(%rsp), %rsp	/* strip %rcx,%r11 */
+	mov $-ENOSYS, %rax
+	pushq $VGCF_in_syscall
+	jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif	/* CONFIG_IA32_EMULATION */

+ 21 - 7
arch/x86/xen/xen-head.S

@@ -5,15 +5,24 @@
 
 
 #include <linux/elfnote.h>
 #include <linux/elfnote.h>
 #include <linux/init.h>
 #include <linux/init.h>
+
 #include <asm/boot.h>
 #include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
 #include <xen/interface/elfnote.h>
 #include <xen/interface/elfnote.h>
 #include <asm/xen/interface.h>
 #include <asm/xen/interface.h>
 
 
 	__INIT
 	__INIT
 ENTRY(startup_xen)
 ENTRY(startup_xen)
-	movl %esi,xen_start_info
 	cld
 	cld
-	movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+	mov %esi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%esp
+#else
+	mov %rsi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
 	jmp xen_start_kernel
 	jmp xen_start_kernel
 
 
 	__FINIT
 	__FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
 .pushsection .text
 .pushsection .text
 	.align PAGE_SIZE_asm
 	.align PAGE_SIZE_asm
 ENTRY(hypercall_page)
 ENTRY(hypercall_page)
-	.skip 0x1000
+	.skip PAGE_SIZE_asm
 .popsection
 .popsection
 
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
-	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
-	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
-	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
+#ifdef CONFIG_X86_32
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
-	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
 
 
 #endif /*CONFIG_XEN */
 #endif /*CONFIG_XEN */

+ 10 - 11
arch/x86/xen/xen-ops.h

@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
 void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 void xen_vcpu_restore(void);
 
 
 void __init xen_build_dynamic_phys_to_machine(void);
 void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
 unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
 
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
 
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
 
 
 void xen_mark_init_mm_pinned(void);
 void xen_mark_init_mm_pinned(void);
 
 
-void __init xen_fill_possible_map(void);
-
 void __init xen_setup_vcpu_info_placement(void);
 void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
 
 
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
 
 
 extern cpumask_t xen_cpu_initialized_map;
 extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
 
 
 
 
 /* Declare an asm function, along with symbols needed to make it
 /* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
 
+/* These are not functions, and cannot be called normally */
 void xen_iret(void);
 void xen_iret(void);
 void xen_sysexit(void);
 void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
 
 
 #endif /* XEN_OPS_H */
 #endif /* XEN_OPS_H */

+ 15 - 4
drivers/net/xen-netfront.c

@@ -92,7 +92,7 @@ struct netfront_info {
 	 */
 	 */
 	union skb_entry {
 	union skb_entry {
 		struct sk_buff *skb;
 		struct sk_buff *skb;
-		unsigned link;
+		unsigned long link;
 	} tx_skbs[NET_TX_RING_SIZE];
 	} tx_skbs[NET_TX_RING_SIZE];
 	grant_ref_t gref_tx_head;
 	grant_ref_t gref_tx_head;
 	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
 	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@@ -125,6 +125,17 @@ struct netfront_rx_info {
 	struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
 	struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
 };
 };
 
 
+static void skb_entry_set_link(union skb_entry *list, unsigned short id)
+{
+	list->link = id;
+}
+
+static int skb_entry_is_link(const union skb_entry *list)
+{
+	BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
+	return ((unsigned long)list->skb < PAGE_OFFSET);
+}
+
 /*
 /*
  * Access macros for acquiring freeing slots in tx_skbs[].
  * Access macros for acquiring freeing slots in tx_skbs[].
  */
  */
@@ -132,7 +143,7 @@ struct netfront_rx_info {
 static void add_id_to_freelist(unsigned *head, union skb_entry *list,
 static void add_id_to_freelist(unsigned *head, union skb_entry *list,
 			       unsigned short id)
 			       unsigned short id)
 {
 {
-	list[id].link = *head;
+	skb_entry_set_link(&list[id], *head);
 	*head = id;
 	*head = id;
 }
 }
 
 
@@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np)
 
 
 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
 		/* Skip over entries which are actually freelist references */
 		/* Skip over entries which are actually freelist references */
-		if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
+		if (skb_entry_is_link(&np->tx_skbs[i]))
 			continue;
 			continue;
 
 
 		skb = np->tx_skbs[i].skb;
 		skb = np->tx_skbs[i].skb;
@@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev
 	/* Initialise tx_skbs as a free chain containing every entry. */
 	/* Initialise tx_skbs as a free chain containing every entry. */
 	np->tx_skb_freelist = 0;
 	np->tx_skb_freelist = 0;
 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
-		np->tx_skbs[i].link = i+1;
+		skb_entry_set_link(&np->tx_skbs[i], i+1);
 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
 	}
 	}
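
The helpers added above make the long-standing tx_skbs[] trick explicit: an entry is either a real sk_buff pointer (a kernel virtual address, so >= PAGE_OFFSET) or a small free-list index. A compact restatement of that invariant, for clarity only:

	/* Sketch: the same union as in netfront_info, with the invariant
	 * spelled out.  Indices are always < PAGE_OFFSET; pointers never are. */
	union skb_entry {
		struct sk_buff *skb;	/* valid when the entry is in use */
		unsigned long link;	/* next free slot when the entry is free */
	};

	static int entry_is_in_use(const union skb_entry *e)
	{
		return (unsigned long)e->skb >= PAGE_OFFSET;
	}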
 
 

+ 1 - 1
drivers/pci/intel-iommu.c

@@ -37,7 +37,7 @@
 #include "intel-iommu.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
 #include <asm/cacheflush.h>
 #include <asm/cacheflush.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include "pci.h"
 #include "pci.h"
 
 
 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)

+ 6 - 4
drivers/xen/manage.c

@@ -63,11 +63,12 @@ static int xen_suspend(void *data)
 	gnttab_resume();
 	gnttab_resume();
 	xen_mm_unpin_all();
 	xen_mm_unpin_all();
 
 
-	device_power_up();
+	device_power_up(PMSG_RESUME);
 
 
 	if (!*cancelled) {
 	if (!*cancelled) {
 		xen_irq_resume();
 		xen_irq_resume();
 		xen_console_resume();
 		xen_console_resume();
+		xen_timer_resume();
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -107,12 +108,13 @@ static void do_suspend(void)
 		goto out;
 		goto out;
 	}
 	}
 
 
-	if (!cancelled)
+	if (!cancelled) {
+		xen_arch_resume();
 		xenbus_resume();
 		xenbus_resume();
-	else
+	} else
 		xenbus_suspend_cancel();
 		xenbus_suspend_cancel();
 
 
-	device_resume();
+	device_resume(PMSG_RESUME);
 
 
 	/* Make sure timer events get retriggered on all CPUs */
 	/* Make sure timer events get retriggered on all CPUs */
 	clock_was_set();
 	clock_was_set();

+ 106 - 8
include/asm-x86/amd_iommu_types.h

@@ -27,13 +27,12 @@
 /*
 /*
  * some size calculation constants
  * some size calculation constants
  */
  */
-#define DEV_TABLE_ENTRY_SIZE		256
+#define DEV_TABLE_ENTRY_SIZE		32
 #define ALIAS_TABLE_ENTRY_SIZE		2
 #define ALIAS_TABLE_ENTRY_SIZE		2
 #define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
 #define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
 
 
 /* helper macros */
 /* helper macros */
 #define LOW_U32(x) ((x) & ((1ULL << 32)-1))
 #define LOW_U32(x) ((x) & ((1ULL << 32)-1))
-#define HIGH_U32(x) (LOW_U32((x) >> 32))
 
 
 /* Length of the MMIO region for the AMD IOMMU */
 /* Length of the MMIO region for the AMD IOMMU */
 #define MMIO_REGION_LENGTH       0x4000
 #define MMIO_REGION_LENGTH       0x4000
@@ -158,78 +157,170 @@
 
 
 #define MAX_DOMAIN_ID 65536
 #define MAX_DOMAIN_ID 65536
 
 
+/*
+ * This structure contains generic data for IOMMU protection domains
+ * independent of their use.
+ */
 struct protection_domain {
 struct protection_domain {
-	spinlock_t lock;
-	u16 id;
-	int mode;
-	u64 *pt_root;
-	void *priv;
+	spinlock_t lock; /* mostly used to lock the page table*/
+	u16 id;		 /* the domain id written to the device table */
+	int mode;	 /* paging mode (0-6 levels) */
+	u64 *pt_root;	 /* page table root pointer */
+	void *priv;	 /* private data */
 };
 };
 
 
+/*
+ * Data container for a dma_ops specific protection domain
+ */
 struct dma_ops_domain {
 struct dma_ops_domain {
 	struct list_head list;
 	struct list_head list;
+
+	/* generic protection domain information */
 	struct protection_domain domain;
 	struct protection_domain domain;
+
+	/* size of the aperture for the mappings */
 	unsigned long aperture_size;
 	unsigned long aperture_size;
+
+	/* address we start to search for free addresses */
 	unsigned long next_bit;
 	unsigned long next_bit;
+
+	/* address allocation bitmap */
 	unsigned long *bitmap;
 	unsigned long *bitmap;
+
+	/*
+	 * Array of PTE pages for the aperture. In this array we save all the
+	 * leaf pages of the domain page table used for the aperture. This way
+	 * we don't need to walk the page table to find a specific PTE. We can
+	 * just calculate its address in constant time.
+	 */
 	u64 **pte_pages;
 	u64 **pte_pages;
 };
 };
 
 
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
 struct amd_iommu {
 struct amd_iommu {
 	struct list_head list;
 	struct list_head list;
+
+	/* locks the accesses to the hardware */
 	spinlock_t lock;
 	spinlock_t lock;
 
 
+	/* device id of this IOMMU */
 	u16 devid;
 	u16 devid;
+	/*
+	 * Capability pointer. There could be more than one IOMMU per PCI
+	 * device function if there is more than one AMD IOMMU capability
+	 * pointer.
+	 */
 	u16 cap_ptr;
 	u16 cap_ptr;
 
 
+	/* physical address of MMIO space */
 	u64 mmio_phys;
 	u64 mmio_phys;
+	/* virtual address of MMIO space */
 	u8 *mmio_base;
 	u8 *mmio_base;
+
+	/* capabilities of that IOMMU read from ACPI */
 	u32 cap;
 	u32 cap;
+
+	/* first device this IOMMU handles. read from PCI */
 	u16 first_device;
 	u16 first_device;
+	/* last device this IOMMU handles. read from PCI */
 	u16 last_device;
 	u16 last_device;
+
+	/* start of exclusion range of that IOMMU */
 	u64 exclusion_start;
 	u64 exclusion_start;
+	/* length of exclusion range of that IOMMU */
 	u64 exclusion_length;
 	u64 exclusion_length;
 
 
+	/* command buffer virtual address */
 	u8 *cmd_buf;
 	u8 *cmd_buf;
+	/* size of command buffer */
 	u32 cmd_buf_size;
 	u32 cmd_buf_size;
 
 
+	/* if one, we need to send a completion wait command */
 	int need_sync;
 	int need_sync;
 
 
+	/* default dma_ops domain for that IOMMU */
 	struct dma_ops_domain *default_dom;
 	struct dma_ops_domain *default_dom;
 };
 };
 
 
+/*
+ * List with all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
 extern struct list_head amd_iommu_list;
 extern struct list_head amd_iommu_list;
 
 
+/*
+ * Structure defining one entry in the device table
+ */
 struct dev_table_entry {
 struct dev_table_entry {
 	u32 data[8];
 	u32 data[8];
 };
 };
 
 
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
 struct unity_map_entry {
 struct unity_map_entry {
 	struct list_head list;
 	struct list_head list;
+
+	/* starting device id this entry is used for (inclusive) */
 	u16 devid_start;
 	u16 devid_start;
+	/* end device id this entry is used for (inclusive) */
 	u16 devid_end;
 	u16 devid_end;
+
+	/* start address to unity map (inclusive) */
 	u64 address_start;
 	u64 address_start;
+	/* end address to unity map (inclusive) */
 	u64 address_end;
 	u64 address_end;
+
+	/* required protection */
 	int prot;
 	int prot;
 };
 };
 
 
+/*
+ * List of all unity mappings. It is not locked because at runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
 extern struct list_head amd_iommu_unity_map;
 extern struct list_head amd_iommu_unity_map;
 
 
-/* data structures for device handling */
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
 extern struct dev_table_entry *amd_iommu_dev_table;
 extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to map requestor ids to device ids. Not locked because it is
+ * only read at runtime.
+ */
 extern u16 *amd_iommu_alias_table;
 extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
 extern struct amd_iommu **amd_iommu_rlookup_table;
 extern struct amd_iommu **amd_iommu_rlookup_table;
 
 
+/* size of the dma_ops aperture as power of 2 */
 extern unsigned amd_iommu_aperture_order;
 extern unsigned amd_iommu_aperture_order;
 
 
+/* largest PCI device id we expect translation requests for */
 extern u16 amd_iommu_last_bdf;
 extern u16 amd_iommu_last_bdf;
 
 
 /* data structures for protection domain handling */
 /* data structures for protection domain handling */
 extern struct protection_domain **amd_iommu_pd_table;
 extern struct protection_domain **amd_iommu_pd_table;
+
+/* allocation bitmap for domain ids */
 extern unsigned long *amd_iommu_pd_alloc_bitmap;
 extern unsigned long *amd_iommu_pd_alloc_bitmap;
 
 
+/* will be 1 if device isolation is enabled */
 extern int amd_iommu_isolate;
 extern int amd_iommu_isolate;
 
 
+/* takes a PCI device id and prints it out in a readable form */
 static inline void print_devid(u16 devid, int nl)
 static inline void print_devid(u16 devid, int nl)
 {
 {
 	int bus = devid >> 8;
 	int bus = devid >> 8;
@@ -241,4 +332,11 @@ static inline void print_devid(u16 devid, int nl)
 		printk("\n");
 		printk("\n");
 }
 }
 
 
+/* takes bus and device/function and returns the device id
+ * FIXME: should that be in generic PCI code? */
+static inline u16 calc_devid(u8 bus, u8 devfn)
+{
+	return (((u16)bus) << 8) | devfn;
+}
+
 #endif
 #endif
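
calc_devid() above is the inverse of the unpacking that print_devid() performs: a requestor id is simply bus:devfn in 16 bits. A short round-trip example using the standard PCI helpers from <linux/pci.h>:

	u16 devid = calc_devid(0x01, PCI_DEVFN(2, 3));	/* device 01:02.3 */

	/* unpack again, along the lines of what print_devid() does */
	int bus  = devid >> 8;
	int slot = PCI_SLOT(devid & 0xff);
	int func = PCI_FUNC(devid & 0xff);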

+ 9 - 19
include/asm-x86/apic.h

@@ -3,6 +3,8 @@
 
 
 #include <linux/pm.h>
 #include <linux/pm.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
+
+#include <asm/alternative.h>
 #include <asm/fixmap.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <asm/apicdef.h>
 #include <asm/processor.h>
 #include <asm/processor.h>
@@ -10,7 +12,7 @@
 
 
 #define ARCH_APICTIMER_STOPS_ON_C3	1
 #define ARCH_APICTIMER_STOPS_ON_C3	1
 
 
-#define Dprintk(x...)
+#define Dprintk printk
 
 
 /*
 /*
  * Debugging macros
  * Debugging macros
@@ -35,7 +37,7 @@ extern void generic_apic_probe(void);
 
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #ifdef CONFIG_X86_LOCAL_APIC
 
 
-extern int apic_verbosity;
+extern unsigned int apic_verbosity;
 extern int local_apic_timer_c2_ok;
 extern int local_apic_timer_c2_ok;
 
 
 extern int ioapic_force;
 extern int ioapic_force;
@@ -48,7 +50,6 @@ extern int disable_apic;
 #include <asm/paravirt.h>
 #include <asm/paravirt.h>
 #else
 #else
 #define apic_write native_apic_write
 #define apic_write native_apic_write
-#define apic_write_atomic native_apic_write_atomic
 #define apic_read native_apic_read
 #define apic_read native_apic_read
 #define setup_boot_clock setup_boot_APIC_clock
 #define setup_boot_clock setup_boot_APIC_clock
 #define setup_secondary_clock setup_secondary_APIC_clock
 #define setup_secondary_clock setup_secondary_APIC_clock
@@ -58,12 +59,11 @@ extern int is_vsmp_box(void);
 
 
 static inline void native_apic_write(unsigned long reg, u32 v)
 static inline void native_apic_write(unsigned long reg, u32 v)
 {
 {
-	*((volatile u32 *)(APIC_BASE + reg)) = v;
-}
+	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
 
-static inline void native_apic_write_atomic(unsigned long reg, u32 v)
-{
-	(void)xchg((u32 *)(APIC_BASE + reg), v);
+	alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+		       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
+		       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
 }
 
 
 static inline u32 native_apic_read(unsigned long reg)
 static inline u32 native_apic_read(unsigned long reg)
@@ -75,16 +75,6 @@ extern void apic_wait_icr_idle(void);
 extern u32 safe_apic_wait_icr_idle(void);
 extern u32 safe_apic_wait_icr_idle(void);
 extern int get_physical_broadcast(void);
 extern int get_physical_broadcast(void);
 
 
-#ifdef CONFIG_X86_GOOD_APIC
-# define FORCE_READ_AROUND_WRITE 0
-# define apic_read_around(x)
-# define apic_write_around(x, y) apic_write((x), (y))
-#else
-# define FORCE_READ_AROUND_WRITE 1
-# define apic_read_around(x) apic_read(x)
-# define apic_write_around(x, y) apic_write_atomic((x), (y))
-#endif
-
 static inline void ack_APIC_irq(void)
 static inline void ack_APIC_irq(void)
 {
 {
 	/*
 	/*
@@ -95,7 +85,7 @@ static inline void ack_APIC_irq(void)
 	 */
 	 */
 
 
 	/* Docs say use 0 for future compatibility */
 	/* Docs say use 0 for future compatibility */
-	apic_write_around(APIC_EOI, 0);
+	apic_write(APIC_EOI, 0);
 }
 }
 
 
 extern int lapic_get_maxlvt(void);
 extern int lapic_get_maxlvt(void);
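
The alternative_io() form of native_apic_write() above patches a single instruction at boot: a plain store normally, an xchg on CPUs flagged with X86_FEATURE_11AP (the case apic_write_atomic() used to handle unconditionally). Its behaviour is equivalent to the branchy sketch below; the real code avoids the runtime test by patching:

	static inline void apic_write_equiv(unsigned long reg, u32 v)
	{
		volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);

		if (boot_cpu_has(X86_FEATURE_11AP))
			xchg((u32 *)addr, v);	/* serialising write for the 11AP erratum */
		else
			*addr = v;		/* ordinary MMIO store */
	}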

+ 1 - 0
include/asm-x86/arch_hooks.h

@@ -21,6 +21,7 @@ extern void intr_init_hook(void);
 extern void pre_intr_init_hook(void);
 extern void pre_intr_init_hook(void);
 extern void pre_setup_arch_hook(void);
 extern void pre_setup_arch_hook(void);
 extern void trap_init_hook(void);
 extern void trap_init_hook(void);
+extern void pre_time_init_hook(void);
 extern void time_init_hook(void);
 extern void time_init_hook(void);
 extern void mca_nmi_hook(void);
 extern void mca_nmi_hook(void);
 
 

+ 1 - 1
include/asm-x86/bitops.h

@@ -356,7 +356,7 @@ static inline unsigned long ffz(unsigned long word)
  * __fls: find last set bit in word
  * __fls: find last set bit in word
  * @word: The word to search
  * @word: The word to search
  *
  *
- * Undefined if no zero exists, so code should check against ~0UL first.
+ * Undefined if no set bit exists, so code should check against 0 first.
  */
  */
 static inline unsigned long __fls(unsigned long word)
 static inline unsigned long __fls(unsigned long word)
 {
 {
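
The corrected comment above is the whole point of __fls(): with no bits set the result is undefined, so callers must test for zero first. A minimal usage sketch (bits_of_interest() is a hypothetical source of bits):

	unsigned long word = bits_of_interest();	/* hypothetical */

	if (word) {
		unsigned long msb = __fls(word);	/* index of the highest set bit */
		/* ... use msb ... */
	} else {
		/* word == 0: __fls(word) would be undefined, handle separately */
	}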

+ 4 - 2
include/asm-x86/calling.h

@@ -104,7 +104,7 @@
 	.endif
 	.endif
 	.endm
 	.endm
 
 
-	.macro LOAD_ARGS offset
+	.macro LOAD_ARGS offset, skiprax=0
 	movq \offset(%rsp),    %r11
 	movq \offset(%rsp),    %r11
 	movq \offset+8(%rsp),  %r10
 	movq \offset+8(%rsp),  %r10
 	movq \offset+16(%rsp), %r9
 	movq \offset+16(%rsp), %r9
@@ -113,7 +113,10 @@
 	movq \offset+48(%rsp), %rdx
 	movq \offset+48(%rsp), %rdx
 	movq \offset+56(%rsp), %rsi
 	movq \offset+56(%rsp), %rsi
 	movq \offset+64(%rsp), %rdi
 	movq \offset+64(%rsp), %rdi
+	.if \skiprax
+	.else
 	movq \offset+72(%rsp), %rax
 	movq \offset+72(%rsp), %rax
+	.endif
 	.endm
 	.endm
 
 
 #define REST_SKIP	6*8
 #define REST_SKIP	6*8
@@ -165,4 +168,3 @@
 	.macro icebp
 	.macro icebp
 	.byte 0xf1
 	.byte 0xf1
 	.endm
 	.endm
-

+ 1 - 0
include/asm-x86/cpufeature.h

@@ -79,6 +79,7 @@
 #define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well on this CPU */
 #define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well on this CPU */
 #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
 #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
 #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
 #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
+#define X86_FEATURE_11AP	(3*32+19)  /* Bad local APIC aka 11AP */
 
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */

+ 0 - 1
include/asm-x86/dma-mapping.h

@@ -14,7 +14,6 @@ extern dma_addr_t bad_dma_address;
 extern int iommu_merge;
 extern int iommu_merge;
 extern struct device fallback_dev;
 extern struct device fallback_dev;
 extern int panic_on_overflow;
 extern int panic_on_overflow;
-extern int forbid_dac;
 extern int force_iommu;
 extern int force_iommu;
 
 
 struct dma_mapping_ops {
 struct dma_mapping_ops {

Some files were not shown because too many files changed in this diff