
Merge git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86

* git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86: (78 commits)
  x86: fix RTC lockdep warning: potential hardirq recursion
  x86: cpa, micro-optimization
  x86: cpa, clean up code flow
  x86: cpa, eliminate CPA_ enum
  x86: cpa, cleanups
  x86: implement gbpages support in change_page_attr()
  x86: support gbpages in pagetable dump
  x86: add gbpages support to lookup_address
  x86: add pgtable accessor functions for gbpages
  x86: add PUD_PAGE_SIZE
  x86: add feature macros for the gbpages cpuid bit
  x86: switch direct mapping setup over to set_pte
  x86: fix page-present check in cpa_flush_range
  x86: remove cpa warning
  x86: remove now unused clear_kernel_mapping
  x86: switch pci-gart over to using set_memory_np() instead of clear_kernel_mapping()
  x86: cpa selftest, skip non present entries
  x86: CPA fix pagetable split
  x86: rename LARGE_PAGE_SIZE to PMD_PAGE_SIZE
  x86: cpa, fix lookup_address
  ...
Linus Torvalds 17 years ago
parent
commit
d2fc0bacd5
68 changed files with 821 additions and 814 deletions
  1. Documentation/x86_64/00-INDEX (+16 -0)
  2. arch/x86/Kconfig (+1 -0)
  3. arch/x86/Makefile (+0 -1)
  4. arch/x86/boot/Makefile (+13 -5)
  5. arch/x86/boot/compressed/Makefile (+1 -0)
  6. arch/x86/boot/compressed/head_64.S (+4 -4)
  7. arch/x86/boot/cpu.c (+21 -5)
  8. arch/x86/boot/mkcpustr.c (+49 -0)
  9. arch/x86/kernel/Makefile (+4 -2)
  10. arch/x86/kernel/cpu/Makefile (+1 -0)
  11. arch/x86/kernel/cpu/common.c (+0 -10)
  12. arch/x86/kernel/cpu/cpu.h (+9 -0)
  13. arch/x86/kernel/cpu/feature_names.c (+83 -0)
  14. arch/x86/kernel/cpu/intel.c (+1 -0)
  15. arch/x86/kernel/cpu/mtrr/cyrix.c (+0 -107)
  16. arch/x86/kernel/cpu/mtrr/main.c (+0 -16)
  17. arch/x86/kernel/cpu/mtrr/mtrr.h (+4 -0)
  18. arch/x86/kernel/cpu/proc.c (+0 -74)
  19. arch/x86/kernel/cpuid.c (+23 -29)
  20. arch/x86/kernel/efi.c (+30 -27)
  21. arch/x86/kernel/efi_64.c (+11 -11)
  22. arch/x86/kernel/head_64.S (+2 -2)
  23. arch/x86/kernel/ldt.c (+2 -1)
  24. arch/x86/kernel/msr.c (+9 -5)
  25. arch/x86/kernel/pci-gart_64.c (+3 -2)
  26. arch/x86/kernel/process_32.c (+1 -1)
  27. arch/x86/kernel/setup_64.c (+0 -76)
  28. arch/x86/kernel/test_nx.c (+2 -10)
  29. arch/x86/kernel/trampoline_32.S (+1 -6)
  30. arch/x86/kernel/trampoline_64.S (+0 -3)
  31. arch/x86/kernel/vmi_32.c (+3 -3)
  32. arch/x86/lib/mmx_32.c (+7 -24)
  33. arch/x86/lib/usercopy_32.c (+3 -9)
  34. arch/x86/lib/usercopy_64.c (+3 -9)
  35. arch/x86/mm/fault.c (+11 -23)
  36. arch/x86/mm/init_32.c (+2 -4)
  37. arch/x86/mm/init_64.c (+2 -47)
  38. arch/x86/mm/ioremap.c (+12 -29)
  39. arch/x86/mm/numa_64.c (+6 -1)
  40. arch/x86/mm/pageattr-test.c (+2 -1)
  41. arch/x86/mm/pageattr.c (+317 -83)
  42. arch/x86/mm/pgtable_32.c (+20 -41)
  43. arch/x86/pci/numa.c (+44 -8)
  44. include/asm-generic/rtc.h (+7 -4)
  45. include/asm-generic/tlb.h (+0 -1)
  46. include/asm-x86/asm.h (+7 -0)
  47. include/asm-x86/bugs.h (+1 -1)
  48. include/asm-x86/cpufeature.h (+11 -3)
  49. include/asm-x86/efi.h (+2 -2)
  50. include/asm-x86/futex.h (+6 -17)
  51. include/asm-x86/highmem.h (+2 -2)
  52. include/asm-x86/hw_irq_32.h (+1 -1)
  53. include/asm-x86/i387.h (+4 -12)
  54. include/asm-x86/io_32.h (+0 -25)
  55. include/asm-x86/mach-numaq/mach_apic.h (+2 -0)
  56. include/asm-x86/msr.h (+2 -8)
  57. include/asm-x86/page.h (+2 -2)
  58. include/asm-x86/page_64.h (+3 -0)
  59. include/asm-x86/pgalloc_32.h (+4 -2)
  60. include/asm-x86/pgtable-3level.h (+11 -15)
  61. include/asm-x86/pgtable.h (+4 -0)
  62. include/asm-x86/pgtable_32.h (+2 -0)
  63. include/asm-x86/pgtable_64.h (+6 -1)
  64. include/asm-x86/string_32.h (+4 -4)
  65. include/asm-x86/system.h (+9 -14)
  66. include/asm-x86/uaccess_32.h (+5 -13)
  67. include/asm-x86/uaccess_64.h (+2 -8)
  68. include/asm-x86/vm86.h (+1 -0)

+ 16 - 0
Documentation/x86_64/00-INDEX

@@ -0,0 +1,16 @@
+00-INDEX
+	- This file
+boot-options.txt
+	- AMD64-specific boot options.
+cpu-hotplug-spec
+	- Firmware support for CPU hotplug under Linux/x86-64
+fake-numa-for-cpusets
+	- Using numa=fake and CPUSets for Resource Management
+kernel-stacks
+	- Context-specific per-processor interrupt stacks.
+machinecheck
+	- Configurable sysfs parameters for the x86-64 machine check code.
+mm.txt
+	- Memory layout of x86-64 (4 level page tables, 46 bits physical).
+uefi.txt
+	- Booting Linux via Unified Extensible Firmware Interface.

+ 1 - 0
arch/x86/Kconfig

@@ -306,6 +306,7 @@ config X86_RDC321X
 	select M486
 	select X86_REBOOTFIXUPS
 	select GENERIC_GPIO
+	select LEDS_CLASS
 	select LEDS_GPIO
 	help
 	  This option is needed for RDC R-321x system-on-chip, also known

+ 0 - 1
arch/x86/Makefile

@@ -92,7 +92,6 @@ KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
 KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
-OBJCOPYFLAGS := -O binary -R .note -R .comment -S
 
 # Speed up the build
 KBUILD_CFLAGS += -pipe

+ 13 - 5
arch/x86/boot/Makefile

@@ -26,7 +26,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
 #RAMDISK := -DRAMDISK=512
 
 targets		:= vmlinux.bin setup.bin setup.elf zImage bzImage
-subdir- 	:= compressed
+subdir-		:= compressed
 
 setup-y		+= a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
 setup-y		+= header.o main.o mca.o memory.o pm.o pmjump.o
@@ -43,9 +43,17 @@ setup-y		+= video-vesa.o
 setup-y		+= video-bios.o
 
 targets		+= $(setup-y)
-hostprogs-y	:= tools/build
+hostprogs-y	:= mkcpustr tools/build
 
-HOSTCFLAGS_build.o := $(LINUXINCLUDE)
+HOST_EXTRACFLAGS += $(LINUXINCLUDE)
+
+$(obj)/cpu.o: $(obj)/cpustr.h
+
+quiet_cmd_cpustr = CPUSTR  $@
+      cmd_cpustr = $(obj)/mkcpustr > $@
+targets		+= cpustr.h
+$(obj)/cpustr.h: $(obj)/mkcpustr FORCE
+	$(call if_changed,cpustr)
 
 # ---------------------------------------------------------------------------
 
@@ -80,6 +88,7 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
 	$(call if_changed,image)
 	@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
 
+OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
 $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 	$(call if_changed,objcopy)
 
@@ -90,7 +99,6 @@ $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
 	$(call if_changed,ld)
 
 OBJCOPYFLAGS_setup.bin	:= -O binary
-
 $(obj)/setup.bin: $(obj)/setup.elf FORCE
 	$(call if_changed,objcopy)
 
@@ -98,7 +106,7 @@ $(obj)/compressed/vmlinux: FORCE
 	$(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
 
 # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
-FDARGS = 
+FDARGS =
 # Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
 FDINITRD =
 

+ 1 - 0
arch/x86/boot/compressed/Makefile

@@ -22,6 +22,7 @@ $(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $
 	$(call if_changed,ld)
 	@:
 
+OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
 $(obj)/vmlinux.bin: vmlinux FORCE
 	$(call if_changed,objcopy)
 

+ 4 - 4
arch/x86/boot/compressed/head_64.S

@@ -80,8 +80,8 @@ startup_32:
 
 #ifdef CONFIG_RELOCATABLE
 	movl	%ebp, %ebx
-	addl	$(LARGE_PAGE_SIZE -1), %ebx
-	andl	$LARGE_PAGE_MASK, %ebx
+	addl	$(PMD_PAGE_SIZE -1), %ebx
+	andl	$PMD_PAGE_MASK, %ebx
 #else
 	movl	$CONFIG_PHYSICAL_START, %ebx
 #endif
@@ -220,8 +220,8 @@ ENTRY(startup_64)
 	/* Start with the delta to where the kernel will run at. */
 #ifdef CONFIG_RELOCATABLE
 	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
-	addq	$(LARGE_PAGE_SIZE - 1), %rbp
-	andq	$LARGE_PAGE_MASK, %rbp
+	addq	$(PMD_PAGE_SIZE - 1), %rbp
+	andq	$PMD_PAGE_MASK, %rbp
 	movq	%rbp, %rbx
 #else
 	movq	$CONFIG_PHYSICAL_START, %rbp

+ 21 - 5
arch/x86/boot/cpu.c

@@ -1,7 +1,7 @@
 /* -*- linux-c -*- ------------------------------------------------------- *
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
- *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2007-2008 rPath, Inc. - All Rights Reserved
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -9,7 +9,7 @@
  * ----------------------------------------------------------------------- */
 
 /*
- * arch/i386/boot/cpu.c
+ * arch/x86/boot/cpu.c
  *
  * Check for obligatory CPU features and abort if the features are not
  * present.
@@ -19,6 +19,8 @@
 #include "bitops.h"
 #include <asm/cpufeature.h>
 
+#include "cpustr.h"
+
 static char *cpu_name(int level)
 {
 	static char buf[6];
@@ -35,6 +37,7 @@ int validate_cpu(void)
 {
 	u32 *err_flags;
 	int cpu_level, req_level;
+	const unsigned char *msg_strs;
 
 	check_cpu(&cpu_level, &req_level, &err_flags);
 
@@ -51,13 +54,26 @@ int validate_cpu(void)
 		puts("This kernel requires the following features "
 		     "not present on the CPU:\n");
 
+		msg_strs = (const unsigned char *)x86_cap_strs;
+
 		for (i = 0; i < NCAPINTS; i++) {
 			u32 e = err_flags[i];
 
 			for (j = 0; j < 32; j++) {
-				if (e & 1)
-					printf("%d:%d ", i, j);
-
+				int n = (i << 5)+j;
+				if (*msg_strs < n) {
+					/* Skip to the next string */
+					do {
+						msg_strs++;
+					} while (*msg_strs);
+					msg_strs++;
+				}
+				if (e & 1) {
+					if (*msg_strs == n && msg_strs[1])
+						printf("%s ", msg_strs+1);
+					else
+						printf("%d:%d ", i, j);
+				}
 				e >>= 1;
 			}
 		}

+ 49 - 0
arch/x86/boot/mkcpustr.c

@@ -0,0 +1,49 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2008 rPath, Inc. - All Rights Reserved
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * This is a host program to preprocess the CPU strings into a
+ * compact format suitable for the setup code.
+ */
+
+#include <stdio.h>
+
+#include "../kernel/cpu/feature_names.c"
+
+#if NCAPFLAGS > 8
+# error "Need to adjust the boot code handling of CPUID strings"
+#endif
+
+int main(void)
+{
+	int i;
+	const char *str;
+
+	printf("static const char x86_cap_strs[] = \n");
+
+	for (i = 0; i < NCAPINTS*32; i++) {
+		str = x86_cap_flags[i];
+
+		if (i == NCAPINTS*32-1) {
+			/* The last entry must be unconditional; this
+			   also consumes the compiler-added null character */
+			if (!str)
+				str = "";
+			printf("\t\"\\x%02x\"\"%s\"\n", i, str);
+		} else if (str) {
+			printf("#if REQUIRED_MASK%d & (1 << %d)\n"
+			       "\t\"\\x%02x\"\"%s\\0\"\n"
+			       "#endif\n",
+			       i >> 5, i & 31, i, str);
+		}
+	}
+	printf("\t;\n");
+	return 0;
+}
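
[Editor's note, not part of the patch: mkcpustr emits cpustr.h as a single char array of records, each record being a one-byte feature index followed by a NUL-terminated feature name, sorted by index, and boot-time validate_cpu() in cpu.c walks it forward. The following is a minimal stand-alone sketch of that format and lookup; the sample table entries are hypothetical, the real table is generated from x86_cap_flags filtered by the REQUIRED_MASK* bits.]

	#include <stdio.h>

	/*
	 * Hypothetical excerpt of a generated cpustr.h table: each record is a
	 * one-byte feature-bit index followed by a NUL-terminated name, in
	 * ascending index order.
	 */
	static const char x86_cap_strs[] =
		"\x00" "fpu\0"
		"\x06" "pae\0"
		"\x19" "sse";	/* last record uses the string's implicit trailing NUL */

	/* Return the name for feature bit n, or NULL if the table has no entry. */
	static const char *cap_name(int n)
	{
		const unsigned char *p = (const unsigned char *)x86_cap_strs;

		/*
		 * Like the kernel table, this one ends with a record for the
		 * highest valid bit, so a valid n never scans past the end.
		 */
		while (*p < n) {
			do
				p++;
			while (*p);	/* skip the current name */
			p++;		/* and its terminating NUL */
		}
		return (*p == n && p[1]) ? (const char *)(p + 1) : NULL;
	}

	int main(void)
	{
		printf("%s\n", cap_name(6));	/* prints "pae" */
		return 0;
	}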

+ 4 - 2
arch/x86/kernel/Makefile

@@ -37,7 +37,8 @@ obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_PCI)		+= early-quirks.o
-obj-$(CONFIG_APM)		+= apm_32.o
+apm-y				:= apm_32.o
+obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_X86_SMP)		+= smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
 obj-$(CONFIG_X86_32_SMP)	+= smpcommon_32.o
 obj-$(CONFIG_X86_64_SMP)	+= smp_64.o smpboot_64.o tsc_sync.o
@@ -74,7 +75,8 @@ ifdef CONFIG_INPUT_PCSPKR
 obj-y				+= pcspeaker.o
 endif
 
-obj-$(CONFIG_SCx200)		+= scx200_32.o
+obj-$(CONFIG_SCx200)		+= scx200.o
+scx200-y			+= scx200_32.o
 
 ###
 # 64 bit specific files

+ 1 - 0
arch/x86/kernel/cpu/Makefile

@@ -3,6 +3,7 @@
 #
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
+obj-y			+= feature_names.o
 
 obj-$(CONFIG_X86_32)	+= common.o proc.o bugs.o
 obj-$(CONFIG_X86_32)	+= amd.o

+ 0 - 10
arch/x86/kernel/cpu/common.c

@@ -623,16 +623,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
  * They will insert themselves into the cpu_devs structure.
  * Then, when cpu_init() is called, we can just iterate over that array.
  */
-
-extern int intel_cpu_init(void);
-extern int cyrix_init_cpu(void);
-extern int nsc_init_cpu(void);
-extern int amd_init_cpu(void);
-extern int centaur_init_cpu(void);
-extern int transmeta_init_cpu(void);
-extern int nexgen_init_cpu(void);
-extern int umc_init_cpu(void);
-
 void __init early_cpu_init(void)
 {
 	intel_cpu_init();

+ 9 - 0
arch/x86/kernel/cpu/cpu.h

@@ -27,3 +27,12 @@ extern void display_cacheinfo(struct cpuinfo_x86 *c);
 extern void early_init_intel(struct cpuinfo_x86 *c);
 extern void early_init_amd(struct cpuinfo_x86 *c);
 
+/* Specific CPU type init functions */
+int intel_cpu_init(void);
+int amd_init_cpu(void);
+int cyrix_init_cpu(void);
+int nsc_init_cpu(void);
+int centaur_init_cpu(void);
+int transmeta_init_cpu(void);
+int nexgen_init_cpu(void);
+int umc_init_cpu(void);

+ 83 - 0
arch/x86/kernel/cpu/feature_names.c

@@ -0,0 +1,83 @@
+/*
+ * Strings for the various x86 capability flags.
+ *
+ * This file must not contain any executable code.
+ */
+
+#include "asm/cpufeature.h"
+
+/*
+ * These flag bits must match the definitions in <asm/cpufeature.h>.
+ * NULL means this bit is undefined or reserved; either way it doesn't
+ * have meaning as far as Linux is concerned.  Note that it's important
+ * to realize there is a difference between this table and CPUID -- if
+ * applications want to get the raw CPUID data, they should access
+ * /dev/cpu/<cpu_nr>/cpuid instead.
+ */
+const char * const x86_cap_flags[NCAPINTS*32] = {
+	/* Intel-defined */
+	"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+	"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
+	"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
+	"fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
+
+	/* AMD-defined */
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
+	NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+	"3dnowext", "3dnow",
+
+	/* Transmeta-defined */
+	"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+	/* Other (Linux-defined) */
+	"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
+	NULL, NULL, NULL, NULL,
+	"constant_tsc", "up", NULL, "arch_perfmon",
+	"pebs", "bts", NULL, NULL,
+	"rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+	/* Intel-defined (#2) */
+	"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
+	"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+	NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+	/* VIA/Cyrix/Centaur-defined */
+	NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
+	"ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+	/* AMD-defined (#2) */
+	"lahf_lm", "cmp_legacy", "svm", "extapic",
+	"cr8_legacy", "abm", "sse4a", "misalignsse",
+	"3dnowprefetch", "osvw", "ibs", "sse5",
+	"skinit", "wdt", NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+	/* Auxiliary (Linux-defined) */
+	"ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+};
+
+const char *const x86_power_flags[32] = {
+	"ts",	/* temperature sensor */
+	"fid",  /* frequency id control */
+	"vid",  /* voltage id control */
+	"ttp",  /* thermal trip */
+	"tm",
+	"stc",
+	"100mhzsteps",
+	"hwpstate",
+	"",	/* tsc invariant mapped to constant_tsc */
+		/* nothing */
+};

+ 1 - 0
arch/x86/kernel/cpu/intel.c

@@ -13,6 +13,7 @@
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/ds.h>
+#include <asm/bugs.h>
 
 #include "cpu.h"
 

+ 0 - 107
arch/x86/kernel/cpu/mtrr/cyrix.c

@@ -7,8 +7,6 @@
 #include <asm/processor-flags.h>
 #include "mtrr.h"
 
-int arr3_protected;
-
 static void
 cyrix_get_arr(unsigned int reg, unsigned long *base,
 	      unsigned long *size, mtrr_type * type)
@@ -99,8 +97,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
 	case 4:
 		return replace_reg;
 	case 3:
-		if (arr3_protected)
-			break;
 	case 2:
 	case 1:
 	case 0:
@@ -115,8 +111,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
 	} else {
 		for (i = 0; i < 7; i++) {
 			cyrix_get_arr(i, &lbase, &lsize, &ltype);
-			if ((i == 3) && arr3_protected)
-				continue;
 			if (lsize == 0)
 				return i;
 		}
@@ -260,107 +254,6 @@ static void cyrix_set_all(void)
 	post_set();
 }
 
-#if 0
-/*
- * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection
- * with the SMM (System Management Mode) mode. So we need the following:
- * Check whether SMI_LOCK (CCR3 bit 0) is set
- *   if it is set, write a warning message: ARR3 cannot be changed!
- *     (it cannot be changed until the next processor reset)
- *   if it is reset, then we can change it, set all the needed bits:
- *   - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset)
- *   - disable access to SMM memory (CCR1 bit 2 reset)
- *   - disable SMM mode (CCR1 bit 1 reset)
- *   - disable write protection of ARR3 (CCR6 bit 1 reset)
- *   - (maybe) disable ARR3
- * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set)
- */
-static void __init
-cyrix_arr_init(void)
-{
-	struct set_mtrr_context ctxt;
-	unsigned char ccr[7];
-	int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 };
-#ifdef CONFIG_SMP
-	int i;
-#endif
-
-	/* flush cache and enable MAPEN */
-	set_mtrr_prepare_save(&ctxt);
-	set_mtrr_cache_disable(&ctxt);
-
-	/* Save all CCRs locally */
-	ccr[0] = getCx86(CX86_CCR0);
-	ccr[1] = getCx86(CX86_CCR1);
-	ccr[2] = getCx86(CX86_CCR2);
-	ccr[3] = ctxt.ccr3;
-	ccr[4] = getCx86(CX86_CCR4);
-	ccr[5] = getCx86(CX86_CCR5);
-	ccr[6] = getCx86(CX86_CCR6);
-
-	if (ccr[3] & 1) {
-		ccrc[3] = 1;
-		arr3_protected = 1;
-	} else {
-		/* Disable SMM mode (bit 1), access to SMM memory (bit 2) and
-		 * access to SMM memory through ARR3 (bit 7).
-		 */
-		if (ccr[1] & 0x80) {
-			ccr[1] &= 0x7f;
-			ccrc[1] |= 0x80;
-		}
-		if (ccr[1] & 0x04) {
-			ccr[1] &= 0xfb;
-			ccrc[1] |= 0x04;
-		}
-		if (ccr[1] & 0x02) {
-			ccr[1] &= 0xfd;
-			ccrc[1] |= 0x02;
-		}
-		arr3_protected = 0;
-		if (ccr[6] & 0x02) {
-			ccr[6] &= 0xfd;
-			ccrc[6] = 1;	/* Disable write protection of ARR3 */
-			setCx86(CX86_CCR6, ccr[6]);
-		}
-		/* Disable ARR3. This is safe now that we disabled SMM. */
-		/* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */
-	}
-	/* If we changed CCR1 in memory, change it in the processor, too. */
-	if (ccrc[1])
-		setCx86(CX86_CCR1, ccr[1]);
-
-	/* Enable ARR usage by the processor */
-	if (!(ccr[5] & 0x20)) {
-		ccr[5] |= 0x20;
-		ccrc[5] = 1;
-		setCx86(CX86_CCR5, ccr[5]);
-	}
-#ifdef CONFIG_SMP
-	for (i = 0; i < 7; i++)
-		ccr_state[i] = ccr[i];
-	for (i = 0; i < 8; i++)
-		cyrix_get_arr(i,
-			      &arr_state[i].base, &arr_state[i].size,
-			      &arr_state[i].type);
-#endif
-
-	set_mtrr_done(&ctxt);	/* flush cache and disable MAPEN */
-
-	if (ccrc[5])
-		printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n");
-	if (ccrc[3])
-		printk(KERN_INFO "mtrr: ARR3 cannot be changed\n");
-/*
-    if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n");
-    if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n");
-    if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n");
-*/
-	if (ccrc[6])
-		printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n");
-}
-#endif
-
 static struct mtrr_ops cyrix_mtrr_ops = {
 	.vendor            = X86_VENDOR_CYRIX,
 //	.init              = cyrix_arr_init,

+ 0 - 16
arch/x86/kernel/cpu/mtrr/main.c

@@ -59,12 +59,6 @@ struct mtrr_ops * mtrr_if = NULL;
 static void set_mtrr(unsigned int reg, unsigned long base,
 		     unsigned long size, mtrr_type type);
 
-#ifndef CONFIG_X86_64
-extern int arr3_protected;
-#else
-#define arr3_protected 0
-#endif
-
 void set_mtrr_ops(struct mtrr_ops * ops)
 {
 	if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
@@ -513,12 +507,6 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 		printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
 		goto out;
 	}
-	if (is_cpu(CYRIX) && !use_intel()) {
-		if ((reg == 3) && arr3_protected) {
-			printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
-			goto out;
-		}
-	}
 	mtrr_if->get(reg, &lbase, &lsize, &ltype);
 	if (lsize < 1) {
 		printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
@@ -566,10 +554,6 @@ EXPORT_SYMBOL(mtrr_del);
  * These should be called implicitly, but we can't yet until all the initcall
  * stuff is done...
  */
-extern void amd_init_mtrr(void);
-extern void cyrix_init_mtrr(void);
-extern void centaur_init_mtrr(void);
-
 static void __init init_ifs(void)
 {
 #ifndef CONFIG_X86_64

+ 4 - 0
arch/x86/kernel/cpu/mtrr/mtrr.h

@@ -97,3 +97,7 @@ void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);
 void mtrr_wrmsr(unsigned, unsigned, unsigned);
 
+/* CPU specific mtrr init functions */
+int amd_init_mtrr(void);
+int cyrix_init_mtrr(void);
+int centaur_init_mtrr(void);

+ 0 - 74
arch/x86/kernel/cpu/proc.c

@@ -10,80 +10,6 @@
  */
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
-	/* 
-	 * These flag bits must match the definitions in <asm/cpufeature.h>.
-	 * NULL means this bit is undefined or reserved; either way it doesn't
-	 * have meaning as far as Linux is concerned.  Note that it's important
-	 * to realize there is a difference between this table and CPUID -- if
-	 * applications want to get the raw CPUID data, they should access
-	 * /dev/cpu/<cpu_nr>/cpuid instead.
-	 */
-	static const char * const x86_cap_flags[] = {
-		/* Intel-defined */
-	        "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-	        "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-	        "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-	        "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
-		/* AMD-defined */
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
-		"3dnowext", "3dnow",
-
-		/* Transmeta-defined */
-		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Other (Linux-defined) */
-		"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
-		NULL, NULL, NULL, NULL,
-		"constant_tsc", "up", NULL, "arch_perfmon",
-		"pebs", "bts", NULL, "sync_rdtsc",
-		"rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Intel-defined (#2) */
-		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* VIA/Cyrix/Centaur-defined */
-		NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
-		"ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", "extapic",
-		"cr8_legacy", "abm", "sse4a", "misalignsse",
-		"3dnowprefetch", "osvw", "ibs", "sse5",
-		"skinit", "wdt", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Auxiliary (Linux-defined) */
-		"ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	};
-	static const char * const x86_power_flags[] = {
-		"ts",	/* temperature sensor */
-		"fid",  /* frequency id control */
-		"vid",  /* voltage id control */
-		"ttp",  /* thermal trip */
-		"tm",
-		"stc",
-		"100mhzsteps",
-		"hwpstate",
-		"",	/* constant_tsc - moved to flags */
-		/* nothing */
-	};
 	struct cpuinfo_x86 *c = v;
 	int i, n = 0;
 	int fpu_exception;

+ 23 - 29
arch/x86/kernel/cpuid.c

@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------- *
- *   
- *   Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ *   Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
@@ -17,6 +17,10 @@
  * and then read in chunks of 16 bytes.  A larger size means multiple
  * reads of consecutive levels.
  *
+ * The lower 32 bits of the file position is used as the incoming %eax,
+ * and the upper 32 bits of the file position as the incoming %ecx,
+ * the latter intended for "counting" eax levels like eax=4.
+ *
  * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
  * an SMP box will direct the access to CPU %d.
  */
@@ -43,35 +47,24 @@
 
 static struct class *cpuid_class;
 
-struct cpuid_command {
-	u32 reg;
-	u32 *data;
+struct cpuid_regs {
+	u32 eax, ebx, ecx, edx;
 };
 
 static void cpuid_smp_cpuid(void *cmd_block)
 {
-	struct cpuid_command *cmd = cmd_block;
-
-	cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
-		      &cmd->data[3]);
-}
-
-static inline void do_cpuid(int cpu, u32 reg, u32 * data)
-{
-	struct cpuid_command cmd;
-
-	cmd.reg = reg;
-	cmd.data = data;
+	struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
 
-	smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
+	cpuid_count(cmd->eax, cmd->ecx,
+		    &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
 }
 
 static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
 {
 	loff_t ret;
+	struct inode *inode = file->f_mapping->host;
 
-	lock_kernel();
-
+	mutex_lock(&inode->i_mutex);
 	switch (orig) {
 	case 0:
 		file->f_pos = offset;
@@ -84,8 +77,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
 	default:
 		ret = -EINVAL;
 	}
-
-	unlock_kernel();
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
@@ -93,19 +85,21 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
 			  size_t count, loff_t * ppos)
 {
 	char __user *tmp = buf;
-	u32 data[4];
-	u32 reg = *ppos;
+	struct cpuid_regs cmd;
 	int cpu = iminor(file->f_path.dentry->d_inode);
+	u64 pos = *ppos;
 
 	if (count % 16)
 		return -EINVAL;	/* Invalid chunk size */
 
 	for (; count; count -= 16) {
-		do_cpuid(cpu, reg, data);
-		if (copy_to_user(tmp, &data, 16))
+		cmd.eax = pos;
+		cmd.ecx = pos >> 32;
+		smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
+		if (copy_to_user(tmp, &cmd, 16))
 			return -EFAULT;
 		tmp += 16;
-		*ppos = reg++;
+		*ppos = ++pos;
 	}
 
 	return tmp - buf;
@@ -193,7 +187,7 @@ static int __init cpuid_init(void)
 	}
 	for_each_online_cpu(i) {
 		err = cpuid_device_create(i);
-		if (err != 0) 
+		if (err != 0)
 			goto out_class;
 	}
 	register_hotcpu_notifier(&cpuid_class_cpu_notifier);
@@ -208,7 +202,7 @@ out_class:
 	}
 	class_destroy(cpuid_class);
 out_chrdev:
-	unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");	
+	unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
 out:
 	return err;
 }
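
[Editor's note, not part of the patch: the reworked /dev/cpu/N/cpuid read interface described in the comment above takes %eax from the low 32 bits of the file position and %ecx from the high 32 bits, and returns 16 bytes (eax, ebx, ecx, edx) per read. A userspace sketch, assuming the cpuid driver is loaded and /dev/cpu/0/cpuid exists:]

	#define _GNU_SOURCE
	#define _FILE_OFFSET_BITS 64
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint32_t regs[4];		/* eax, ebx, ecx, edx, in that order */
		uint32_t leaf = 4, subleaf = 0;
		/* low 32 bits of the offset: incoming %eax; high 32 bits: incoming %ecx */
		off_t pos = ((off_t)subleaf << 32) | leaf;
		int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

		if (fd < 0)
			return 1;
		/* the driver only accepts reads in multiples of 16 bytes */
		if (pread(fd, regs, sizeof(regs), pos) != (ssize_t)sizeof(regs))
			return 1;
		printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
		       regs[0], regs[1], regs[2], regs[3]);
		close(fd);
		return 0;
	}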

+ 30 - 27
arch/x86/kernel/efi.c

@@ -379,11 +379,9 @@ void __init efi_init(void)
 #endif
 }
 
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static void __init runtime_code_page_mkexec(void)
 {
 	efi_memory_desc_t *md;
-	unsigned long end;
 	void *p;
 
 	if (!(__supported_pte_mask & _PAGE_NX))
@@ -392,18 +390,13 @@ static void __init runtime_code_page_mkexec(void)
 	/* Make EFI runtime service code area executable */
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		md = p;
-		end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
-		if (md->type == EFI_RUNTIME_SERVICES_CODE &&
-		    (end >> PAGE_SHIFT) <= max_pfn_mapped) {
-			set_memory_x(md->virt_addr, md->num_pages);
-			set_memory_uc(md->virt_addr, md->num_pages);
-		}
+
+		if (md->type != EFI_RUNTIME_SERVICES_CODE)
+			continue;
+
+		set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT);
 	}
-	__flush_tlb_all();
 }
-#else
-static inline void __init runtime_code_page_mkexec(void) { }
-#endif
 
 /*
  * This function will switch the EFI runtime services to virtual mode.
@@ -417,30 +410,40 @@ void __init efi_enter_virtual_mode(void)
 {
 	efi_memory_desc_t *md;
 	efi_status_t status;
-	unsigned long end;
-	void *p;
+	unsigned long size;
+	u64 end, systab;
+	void *p, *va;
 
 	efi.systab = NULL;
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		md = p;
 		if (!(md->attribute & EFI_MEMORY_RUNTIME))
 			continue;
-		end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
-		if ((md->attribute & EFI_MEMORY_WB) &&
-		    ((end >> PAGE_SHIFT) <= max_pfn_mapped))
-			md->virt_addr = (unsigned long)__va(md->phys_addr);
+
+		size = md->num_pages << EFI_PAGE_SHIFT;
+		end = md->phys_addr + size;
+
+		if ((end >> PAGE_SHIFT) <= max_pfn_mapped)
+			va = __va(md->phys_addr);
 		else
-			md->virt_addr = (unsigned long)
-				efi_ioremap(md->phys_addr,
-					    md->num_pages << EFI_PAGE_SHIFT);
-		if (!md->virt_addr)
+			va = efi_ioremap(md->phys_addr, size);
+
+		if (md->attribute & EFI_MEMORY_WB)
+			set_memory_uc(md->virt_addr, size);
+
+		md->virt_addr = (u64) (unsigned long) va;
+
+		if (!va) {
 			printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
 			       (unsigned long long)md->phys_addr);
-		if ((md->phys_addr <= (unsigned long)efi_phys.systab) &&
-		    ((unsigned long)efi_phys.systab < end))
-			efi.systab = (efi_system_table_t *)(unsigned long)
-				(md->virt_addr - md->phys_addr +
-				 (unsigned long)efi_phys.systab);
+			continue;
+		}
+
+		systab = (u64) (unsigned long) efi_phys.systab;
+		if (md->phys_addr <= systab && systab < end) {
+			systab += md->virt_addr - md->phys_addr;
+			efi.systab = (efi_system_table_t *) (unsigned long) systab;
+		}
 	}
 
 	BUG_ON(!efi.systab);

+ 11 - 11
arch/x86/kernel/efi_64.c

@@ -54,10 +54,10 @@ static void __init early_mapping_set_exec(unsigned long start,
 		else
 			set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \
 					    __supported_pte_mask));
-		if (level == 4)
-			start = (start + PMD_SIZE) & PMD_MASK;
-		else
+		if (level == PG_LEVEL_4K)
 			start = (start + PAGE_SIZE) & PAGE_MASK;
+		else
+			start = (start + PMD_SIZE) & PMD_MASK;
 	}
 }
 
@@ -109,23 +109,23 @@ void __init efi_reserve_bootmem(void)
 				memmap.nr_map * memmap.desc_size);
 }
 
-void __iomem * __init efi_ioremap(unsigned long offset,
-				  unsigned long size)
+void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
 	static unsigned pages_mapped;
-	unsigned long last_addr;
 	unsigned i, pages;
 
-	last_addr = offset + size - 1;
-	offset &= PAGE_MASK;
-	pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT;
+	/* phys_addr and size must be page aligned */
+	if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK))
+		return NULL;
+
+	pages = size >> PAGE_SHIFT;
 	if (pages_mapped + pages > MAX_EFI_IO_PAGES)
 		return NULL;
 
 	for (i = 0; i < pages; i++) {
 		__set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
-			     offset, PAGE_KERNEL_EXEC_NOCACHE);
-		offset += PAGE_SIZE;
+			     phys_addr, PAGE_KERNEL);
+		phys_addr += PAGE_SIZE;
 		pages_mapped++;
 	}
 

+ 2 - 2
arch/x86/kernel/head_64.S

@@ -63,7 +63,7 @@ startup_64:
 
 	/* Is the address not 2M aligned? */
 	movq	%rbp, %rax
-	andl	$~LARGE_PAGE_MASK, %eax
+	andl	$~PMD_PAGE_MASK, %eax
 	testl	%eax, %eax
 	jnz	bad_address
 
@@ -88,7 +88,7 @@ startup_64:
 
 	/* Add an Identity mapping if I am above 1G */
 	leaq	_text(%rip), %rdi
-	andq	$LARGE_PAGE_MASK, %rdi
+	andq	$PMD_PAGE_MASK, %rdi
 
 	movq	%rdi, %rax
 	shrq	$PUD_SHIFT, %rax

+ 2 - 1
arch/x86/kernel/ldt.c

@@ -35,7 +35,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 	if (mincount <= pc->size)
 		return 0;
 	oldsize = pc->size;
-	mincount = (mincount + 511) & (~511);
+	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
 	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
 		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
 	else
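
[Editor's note, not part of the patch: with the usual 4 KiB pages and 8-byte LDT descriptors, PAGE_SIZE / LDT_ENTRY_SIZE = 512, so the new expression still rounds mincount up to a multiple of 512 entries (one page of LDT), exactly what the old (mincount + 511) & ~511 did, but without hard-coding the entry size.]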

+ 9 - 5
arch/x86/kernel/msr.c

@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------- *
- *   
- *   Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ *   Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
@@ -45,9 +45,10 @@ static struct class *msr_class;
 
 static loff_t msr_seek(struct file *file, loff_t offset, int orig)
 {
-	loff_t ret = -EINVAL;
+	loff_t ret;
+	struct inode *inode = file->f_mapping->host;
 
-	lock_kernel();
+	mutex_lock(&inode->i_mutex);
 	switch (orig) {
 	case 0:
 		file->f_pos = offset;
@@ -56,8 +57,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig)
 	case 1:
 		file->f_pos += offset;
 		ret = file->f_pos;
+		break;
+	default:
+		ret = -EINVAL;
 	}
-	unlock_kernel();
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 

+ 3 - 2
arch/x86/kernel/pci-gart_64.c

@@ -501,7 +501,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
 	}
 
 	a = aper + iommu_size;
-	iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
+	iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
 
 	if (iommu_size < 64*1024*1024) {
 		printk(KERN_WARNING
@@ -731,7 +731,8 @@ void __init gart_iommu_init(void)
 	 * the backing memory. The GART address is only used by PCI
 	 * devices.
 	 */
-	clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
+	set_memory_np((unsigned long)__va(iommu_bus_base),
+				iommu_size >> PAGE_SHIFT);
 
 	/*
 	 * Try to workaround a bug (thanks to BenH)

+ 1 - 1
arch/x86/kernel/process_32.c

@@ -251,7 +251,7 @@ void cpu_idle_wait(void)
 		 * because it has nothing to do.
 		 * Give all the remaining CPUS a kick.
 		 */
-		smp_call_function_mask(map, do_nothing, 0, 0);
+		smp_call_function_mask(map, do_nothing, NULL, 0);
 	} while (!cpus_empty(map));
 
 	set_cpus_allowed(current, tmp);

+ 0 - 76
arch/x86/kernel/setup_64.c

@@ -1068,82 +1068,6 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	struct cpuinfo_x86 *c = v;
 	int cpu = 0, i;
 
-	/*
-	 * These flag bits must match the definitions in <asm/cpufeature.h>.
-	 * NULL means this bit is undefined or reserved; either way it doesn't
-	 * have meaning as far as Linux is concerned.  Note that it's important
-	 * to realize there is a difference between this table and CPUID -- if
-	 * applications want to get the raw CPUID data, they should access
-	 * /dev/cpu/<cpu_nr>/cpuid instead.
-	 */
-	static const char *const x86_cap_flags[] = {
-		/* Intel-defined */
-		"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-		"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-		"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-		"fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
-		/* AMD-defined */
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
-		"3dnowext", "3dnow",
-
-		/* Transmeta-defined */
-		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Other (Linux-defined) */
-		"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
-		NULL, NULL, NULL, NULL,
-		"constant_tsc", "up", NULL, "arch_perfmon",
-		"pebs", "bts", NULL, "sync_rdtsc",
-		"rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Intel-defined (#2) */
-		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* VIA/Cyrix/Centaur-defined */
-		NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
-		"ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", "extapic",
-		"cr8_legacy", "abm", "sse4a", "misalignsse",
-		"3dnowprefetch", "osvw", "ibs", "sse5",
-		"skinit", "wdt", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Auxiliary (Linux-defined) */
-		"ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	};
-	static const char *const x86_power_flags[] = {
-		"ts",	/* temperature sensor */
-		"fid",  /* frequency id control */
-		"vid",  /* voltage id control */
-		"ttp",  /* thermal trip */
-		"tm",
-		"stc",
-		"100mhzsteps",
-		"hwpstate",
-		"",	/* tsc invariant mapped to constant_tsc */
-		/* nothing */
-	};
-
-
 #ifdef CONFIG_SMP
 	cpu = c->cpu_index;
 #endif

+ 2 - 10
arch/x86/kernel/test_nx.c

@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <asm/uaccess.h>
+#include <asm/asm.h>
 
 extern int rodata_test_data;
 
@@ -89,16 +90,7 @@ static noinline int test_address(void *address)
 		"2:	mov %[zero], %[rslt]\n"
 		"	ret\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"       .align 8\n"
-#ifdef CONFIG_X86_32
-		"	.long 0b\n"
-		"	.long 2b\n"
-#else
-		"	.quad 0b\n"
-		"	.quad 2b\n"
-#endif
-		".previous\n"
+		_ASM_EXTABLE(0b,2b)
 		: [rslt] "=r" (result)
 		: [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
 	);

+ 1 - 6
arch/x86/kernel/trampoline_32.S

@@ -11,12 +11,7 @@
  *	trampoline page to make our stack and everything else
  *	is a mystery.
  *
- *	In fact we don't actually need a stack so we don't
- *	set one up.
- *
- *	We jump into the boot/compressed/head.S code. So you'd
- *	better be running a compressed kernel image or you
- *	won't get very far.
+ *	We jump into arch/x86/kernel/head_32.S.
  *
  *	On entry to trampoline_data, the processor is in real mode
  *	with 16-bit addressing and 16-bit data.  CS has some value

+ 0 - 3
arch/x86/kernel/trampoline_64.S

@@ -10,9 +10,6 @@
  *	trampoline page to make our stack and everything else
  *	is a mystery.
  *
- *	In fact we don't actually need a stack so we don't
- *	set one up.
- *
  *	On entry to trampoline_data, the processor is in real mode
  *	with 16-bit addressing and 16-bit data.  CS has some value
  *	and IP is zero.  Thus, data addresses need to be absolute

+ 3 - 3
arch/x86/kernel/vmi_32.c

@@ -220,21 +220,21 @@ static void vmi_set_tr(void)
 static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
 {
 	u32 *idt_entry = (u32 *)g;
-	vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]);
+	vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
 }
 
 static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
 				const void *desc, int type)
 {
 	u32 *gdt_entry = (u32 *)desc;
-	vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]);
+	vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
 }
 
 static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
 				const void *desc)
 {
 	u32 *ldt_entry = (u32 *)desc;
-	vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]);
+	vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
 }
 
 static void vmi_load_sp0(struct tss_struct *tss,

+ 7 - 24
arch/x86/lib/mmx_32.c

@@ -4,6 +4,7 @@
 #include <linux/hardirq.h>
 #include <linux/module.h>
 
+#include <asm/asm.h>
 #include <asm/i387.h>
 
 
@@ -50,10 +51,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
 		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
 		"   jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
+		_ASM_EXTABLE(1b,3b)
 		: : "r" (from) );
 		
 	
@@ -81,10 +79,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
 		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
 		"   jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
+		_ASM_EXTABLE(1b,3b)
 		: : "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;
@@ -181,10 +176,7 @@ static void fast_copy_page(void *to, void *from)
 		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
 		"   jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
+		_ASM_EXTABLE(1b,3b)
 		: : "r" (from) );
 
 	for(i=0; i<(4096-320)/64; i++)
@@ -211,10 +203,7 @@ static void fast_copy_page(void *to, void *from)
 		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
 		"   jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
+		_ASM_EXTABLE(1b,3b)
 		: : "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;
@@ -311,10 +300,7 @@ static void fast_copy_page(void *to, void *from)
 		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
 		"   jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
+		_ASM_EXTABLE(1b,3b)
 		: : "r" (from) );
 
 	for(i=0; i<4096/64; i++)
@@ -341,10 +327,7 @@ static void fast_copy_page(void *to, void *from)
 		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
 		"   jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
+		_ASM_EXTABLE(1b,3b)
 		: : "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;

+ 3 - 9
arch/x86/lib/usercopy_32.c

@@ -48,10 +48,7 @@ do {									   \
 		"3:	movl %5,%0\n"					   \
 		"	jmp 2b\n"					   \
 		".previous\n"						   \
-		".section __ex_table,\"a\"\n"				   \
-		"	.align 4\n"					   \
-		"	.long 0b,3b\n"					   \
-		".previous"						   \
+		_ASM_EXTABLE(0b,3b)					   \
 		: "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),	   \
 		  "=&D" (__d2)						   \
 		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
@@ -132,11 +129,8 @@ do {									\
 		"3:	lea 0(%2,%0,4),%0\n"				\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 0b,3b\n"					\
-		"	.long 1b,2b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(0b,3b)					\
+		_ASM_EXTABLE(1b,2b)					\
 		: "=&c"(size), "=&D" (__d0)				\
 		: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0));	\
 } while (0)

+ 3 - 9
arch/x86/lib/usercopy_64.c

@@ -31,10 +31,7 @@ do {									   \
 		"3:	movq %5,%0\n"					   \
 		"	jmp 2b\n"					   \
 		".previous\n"						   \
-		".section __ex_table,\"a\"\n"				   \
-		"	.align 8\n"					   \
-		"	.quad 0b,3b\n"					   \
-		".previous"						   \
+		_ASM_EXTABLE(0b,3b)					   \
 		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),	   \
 		  "=&D" (__d2)						   \
 		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
@@ -87,11 +84,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
 		"	jmp 2b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"       .align 8\n"
-		"	.quad 0b,3b\n"
-		"	.quad 1b,2b\n"
-		".previous"
+		_ASM_EXTABLE(0b,3b)
+		_ASM_EXTABLE(1b,2b)
 		: [size8] "=c"(size), [dst] "=&D" (__d0)
 		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
 		  [zero] "r" (0UL), [eight] "r" (8UL));

+ 11 - 23
arch/x86/mm/fault.c

@@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address)
 	pud = pud_offset(pgd, address);
 	if (bad_address(pud)) goto bad;
 	printk("PUD %lx ", pud_val(*pud));
-	if (!pud_present(*pud))	goto ret;
+	if (!pud_present(*pud) || pud_large(*pud))
+		goto ret;
 
 	pmd = pmd_offset(pud, address);
 	if (bad_address(pmd)) goto bad;
@@ -508,6 +509,10 @@ static int vmalloc_fault(unsigned long address)
 	pmd_t *pmd, *pmd_ref;
 	pte_t *pte, *pte_ref;
 
+	/* Make sure we are in vmalloc area */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
 	/* Copy kernel mappings over when needed. This can also
 	   happen within a race in page table update. In the later
 	   case just flush. */
@@ -603,6 +608,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 */
 #ifdef CONFIG_X86_32
 	if (unlikely(address >= TASK_SIZE)) {
+#else
+	if (unlikely(address >= TASK_SIZE64)) {
+#endif
 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
 		    vmalloc_fault(address) >= 0)
 			return;
@@ -618,6 +626,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		goto bad_area_nosemaphore;
 	}
 
+
+#ifdef CONFIG_X86_32
 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
 	   fault has been handled. */
 	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
@@ -630,28 +640,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 #else /* CONFIG_X86_64 */
-	if (unlikely(address >= TASK_SIZE64)) {
-		/*
-		 * Don't check for the module range here: its PML4
-		 * is always initialized because it's shared with the main
-		 * kernel text. Only vmalloc may need PML4 syncups.
-		 */
-		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-
-		/* Can handle a stale RO->RW TLB */
-		if (spurious_fault(address, error_code))
-			return;
-
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock.
-		 */
-		goto bad_area_nosemaphore;
-	}
 	if (likely(regs->flags & X86_EFLAGS_IF))
 		local_irq_enable();
 

+ 2 - 4
arch/x86/mm/init_32.c

@@ -31,6 +31,7 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 
+#include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void)
 		"1:	movb %1, %0	\n"
 		"	xorl %2, %2	\n"
 		"2:			\n"
-		".section __ex_table, \"a\"\n"
-		"	.align 4	\n"
-		"	.long 1b, 2b	\n"
-		".previous		\n"
+		_ASM_EXTABLE(1b,2b)
 		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
 		 "=q" (tmp_reg),
 		 "=r" (flag)

+ 2 - 47
arch/x86/mm/init_64.c

@@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 	int i = pmd_index(address);
 
 	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
-		unsigned long entry;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 
 		if (address >= end) {
@@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 		if (pmd_val(*pmd))
 			continue;
 
-		entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
-		entry &= __supported_pte_mask;
-		set_pmd(pmd, __pmd(entry));
+		set_pte((pte_t *)pmd,
+			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 	}
 }
 
@@ -434,49 +432,6 @@ void __init paging_init(void)
 }
 #endif
 
-/*
- * Unmap a kernel mapping if it exists. This is useful to avoid
- * prefetches from the CPU leading to inconsistent cache lines.
- * address and size must be aligned to 2MB boundaries.
- * Does nothing when the mapping doesn't exist.
- */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
-{
-	unsigned long end = address + size;
-
-	BUG_ON(address & ~LARGE_PAGE_MASK);
-	BUG_ON(size & ~LARGE_PAGE_MASK);
-
-	for (; address < end; address += LARGE_PAGE_SIZE) {
-		pgd_t *pgd = pgd_offset_k(address);
-		pud_t *pud;
-		pmd_t *pmd;
-
-		if (pgd_none(*pgd))
-			continue;
-
-		pud = pud_offset(pgd, address);
-		if (pud_none(*pud))
-			continue;
-
-		pmd = pmd_offset(pud, address);
-		if (!pmd || pmd_none(*pmd))
-			continue;
-
-		if (!(pmd_val(*pmd) & _PAGE_PSE)) {
-			/*
-			 * Could handle this, but it should not happen
-			 * currently:
-			 */
-			printk(KERN_ERR "clear_kernel_mapping: "
-				"mapping has been split. will leak memory\n");
-			pmd_ERROR(*pmd);
-		}
-		set_pmd(pmd, __pmd(0));
-	}
-	__flush_tlb_all();
-}
-
 /*
  * Memory hotplug specific functions
  */

+ 12 - 29
arch/x86/mm/ioremap.c

@@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr)
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
  */
-static int ioremap_change_attr(unsigned long paddr, unsigned long size,
+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 			       enum ioremap_mode mode)
 {
-	unsigned long vaddr = (unsigned long)__va(paddr);
 	unsigned long nrpages = size >> PAGE_SHIFT;
-	unsigned int level;
 	int err;
 
-	/* No change for pages after the last mapping */
-	if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
-		return 0;
-
-	/*
-	 * If there is no identity map for this address,
-	 * change_page_attr_addr is unnecessary
-	 */
-	if (!lookup_address(vaddr, &level))
-		return 0;
-
 	switch (mode) {
 	case IOR_MODE_UNCACHED:
 	default:
@@ -114,9 +101,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size,
 static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 			       enum ioremap_mode mode)
 {
-	void __iomem *addr;
+	unsigned long pfn, offset, last_addr, vaddr;
 	struct vm_struct *area;
-	unsigned long offset, last_addr;
 	pgprot_t prot;
 
 	/* Don't allow wraparound or zero size */
@@ -133,9 +119,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
-	     (offset << PAGE_SHIFT) < last_addr; offset++) {
-		if (page_is_ram(offset))
+	for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
+	     (pfn << PAGE_SHIFT) < last_addr; pfn++) {
+		if (page_is_ram(pfn) && pfn_valid(pfn) &&
+		    !PageReserved(pfn_to_page(pfn)))
 			return NULL;
 	}
 
@@ -163,19 +150,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	if (!area)
 		return NULL;
 	area->phys_addr = phys_addr;
-	addr = (void __iomem *) area->addr;
-	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, prot)) {
-		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+	vaddr = (unsigned long) area->addr;
+	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
+		remove_vm_area((void *)(vaddr & PAGE_MASK));
 		return NULL;
 	}
 
-	if (ioremap_change_attr(phys_addr, size, mode) < 0) {
-		vunmap(addr);
+	if (ioremap_change_attr(vaddr, size, mode) < 0) {
+		vunmap(area->addr);
 		return NULL;
 	}
 
-	return (void __iomem *) (offset + (char __iomem *)addr);
+	return (void __iomem *) (vaddr + offset);
 }
 
 /**
@@ -254,9 +240,6 @@ void iounmap(volatile void __iomem *addr)
 		return;
 	}
 
-	/* Reset the direct mapping. Can block */
-	ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
-
 	/* Finally remove it */
 	o = remove_vm_area((void *)addr);
 	BUG_ON(p != o || o == NULL);

+ 6 - 1
arch/x86/mm/numa_64.c

@@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	if (node_data[nodeid] == NULL)
 		return;
 	nodedata_phys = __pa(node_data[nodeid]);
+	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
+		nodedata_phys + pgdat_size - 1);
 
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -225,12 +227,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 		return;
 	}
 	bootmap_start = __pa(bootmap);
-	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
 
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap_start >> PAGE_SHIFT,
 					 start_pfn, end_pfn);
 
+	printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
+		 bootmap_start, bootmap_start + bootmap_size - 1,
+		 bootmap_pages);
+
 	free_bootmem_with_active_regions(nodeid, end);
 
 	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);

+ 2 - 1
arch/x86/mm/pageattr-test.c

@@ -137,7 +137,8 @@ static __init int exercise_pageattr(void)
 
 		for (k = 0; k < len[i]; k++) {
 			pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
-			if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
+			if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
+			    !(pte_val(*pte) & _PAGE_PRESENT)) {
 				addr[i] = 0;
 				break;
 			}

+ 317 - 83
arch/x86/mm/pageattr.c

@@ -16,6 +16,17 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+	unsigned long	vaddr;
+	pgprot_t	mask_set;
+	pgprot_t	mask_clr;
+	int		numpages;
+	int		flushtlb;
+};
+
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -52,21 +63,23 @@ void clflush_cache_range(void *vaddr, unsigned int size)
 
 static void __cpa_flush_all(void *arg)
 {
+	unsigned long cache = (unsigned long)arg;
+
 	/*
 	 * Flush all to work around Errata in early athlons regarding
 	 * large page flushing.
 	 */
 	__flush_tlb_all();
 
-	if (boot_cpu_data.x86_model >= 4)
+	if (cache && boot_cpu_data.x86_model >= 4)
 		wbinvd();
 }
 
-static void cpa_flush_all(void)
+static void cpa_flush_all(unsigned long cache)
 {
 	BUG_ON(irqs_disabled());
 
-	on_each_cpu(__cpa_flush_all, NULL, 1, 1);
+	on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
 }
 
 static void __cpa_flush_range(void *arg)
@@ -79,7 +92,7 @@ static void __cpa_flush_range(void *arg)
 	__flush_tlb_all();
 }
 
-static void cpa_flush_range(unsigned long start, int numpages)
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
 {
 	unsigned int i, level;
 	unsigned long addr;
@@ -89,6 +102,9 @@ static void cpa_flush_range(unsigned long start, int numpages)
 
 	on_each_cpu(__cpa_flush_range, NULL, 1, 1);
 
+	if (!cache)
+		return;
+
 	/*
 	 * We only need to flush on one CPU,
 	 * clflush is a MESI-coherent instruction that
@@ -101,11 +117,27 @@ static void cpa_flush_range(unsigned long start, int numpages)
 		/*
 		 * Only flush present addresses:
 		 */
-		if (pte && pte_present(*pte))
+		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 			clflush_cache_range((void *) addr, PAGE_SIZE);
 	}
 }
 
+#define HIGH_MAP_START	__START_KERNEL_map
+#define HIGH_MAP_END	(__START_KERNEL_map + KERNEL_TEXT_SIZE)
+
+
+/*
+ * Converts a virtual address to a X86-64 highmap address
+ */
+static unsigned long virt_to_highmap(void *address)
+{
+#ifdef CONFIG_X86_64
+	return __pa((unsigned long)address) + HIGH_MAP_START - phys_base;
+#else
+	return (unsigned long)address;
+#endif
+}
+
 /*
  * Certain areas of memory on x86 require very specific protection flags,
  * for example the BIOS area or kernel text. Callers don't always get this
@@ -129,12 +161,24 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
 	 */
 	if (within(address, (unsigned long)_text, (unsigned long)_etext))
 		pgprot_val(forbidden) |= _PAGE_NX;
+	/*
+	 * Do the same for the x86-64 high kernel mapping
+	 */
+	if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
+		pgprot_val(forbidden) |= _PAGE_NX;
+
 
 #ifdef CONFIG_DEBUG_RODATA
 	/* The .rodata section needs to be read-only */
 	if (within(address, (unsigned long)__start_rodata,
 				(unsigned long)__end_rodata))
 		pgprot_val(forbidden) |= _PAGE_RW;
+	/*
+	 * Do the same for the x86-64 high kernel mapping
+	 */
+	if (within(address, virt_to_highmap(__start_rodata),
+				virt_to_highmap(__end_rodata)))
+		pgprot_val(forbidden) |= _PAGE_RW;
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
@@ -142,6 +186,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
 	return prot;
 }
 
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
 pte_t *lookup_address(unsigned long address, int *level)
 {
 	pgd_t *pgd = pgd_offset_k(address);
@@ -152,21 +204,31 @@ pte_t *lookup_address(unsigned long address, int *level)
 
 	if (pgd_none(*pgd))
 		return NULL;
+
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud))
 		return NULL;
+
+	*level = PG_LEVEL_1G;
+	if (pud_large(*pud) || !pud_present(*pud))
+		return (pte_t *)pud;
+
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		return NULL;
 
 	*level = PG_LEVEL_2M;
-	if (pmd_large(*pmd))
+	if (pmd_large(*pmd) || !pmd_present(*pmd))
 		return (pte_t *)pmd;
 
 	*level = PG_LEVEL_4K;
+
 	return pte_offset_kernel(pmd, address);
 }
 
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 {
 	/* change init_mm */
@@ -175,6 +237,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 	if (!SHARED_KERNEL_PMD) {
 		struct page *page;
 
+		address = __pa(address);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			pud_t *pud;
@@ -189,18 +252,114 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 #endif
 }
 
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+			struct cpa_data *cpa)
+{
+	unsigned long nextpage_addr, numpages, pmask, psize, flags;
+	pte_t new_pte, old_pte, *tmp;
+	pgprot_t old_prot, new_prot;
+	int level, do_split = 1;
+
+	/*
+	 * An Athlon 64 X2 showed hard hangs if we tried to preserve
+	 * largepages and changed the PSE entry from RW to RO.
+	 *
+	 * As AMD CPUs have a long series of errata in this area
+	 * (and none of the known ones seem to explain this hang),
+	 * disable this code until the hang can be debugged:
+	 */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return 1;
+
+	spin_lock_irqsave(&pgd_lock, flags);
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up already:
+	 */
+	tmp = lookup_address(address, &level);
+	if (tmp != kpte)
+		goto out_unlock;
+
+	switch (level) {
+	case PG_LEVEL_2M:
+		psize = PMD_PAGE_SIZE;
+		pmask = PMD_PAGE_MASK;
+		break;
+#ifdef CONFIG_X86_64
+	case PG_LEVEL_1G:
+		psize = PUD_PAGE_SIZE;
+		pmask = PUD_PAGE_MASK;
+		break;
+#endif
+	default:
+		do_split = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Calculate the number of pages, which fit into this large
+	 * page starting at address:
+	 */
+	nextpage_addr = (address + psize) & pmask;
+	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+	if (numpages < cpa->numpages)
+		cpa->numpages = numpages;
+
+	/*
+	 * We are safe now. Check whether the new pgprot is the same:
+	 */
+	old_pte = *kpte;
+	old_prot = new_prot = pte_pgprot(old_pte);
+
+	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+	new_prot = static_protections(new_prot, address);
+
+	/*
+	 * If there are no changes, return. cpa->numpages has been updated
+	 * above:
+	 */
+	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+		do_split = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * We need to change the attributes. Check, whether we can
+	 * change the large page in one go. We request a split when
+	 * the address is not aligned or the number of pages is
+	 * smaller than the number of pages in the large page. Note
+	 * that we limited the number of possible pages already to
+	 * the number of pages in the large page.
+	 */
+	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+		/*
+		 * The address is aligned and the number of pages
+		 * covers the full page.
+		 */
+		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+		__set_pmd_pte(kpte, address, new_pte);
+		cpa->flushtlb = 1;
+		do_split = 0;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&pgd_lock, flags);
+
+	return do_split;
+}
+
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+	unsigned long flags, pfn, pfninc = 1;
 	gfp_t gfp_flags = GFP_KERNEL;
-	unsigned long flags;
-	unsigned long addr;
+	unsigned int i, level;
 	pte_t *pbase, *tmp;
+	pgprot_t ref_prot;
 	struct page *base;
-	unsigned int i, level;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
 	gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
 #endif
 	base = alloc_pages(gfp_flags, 0);
@@ -213,30 +372,41 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * up for us already:
 	 */
 	tmp = lookup_address(address, &level);
-	if (tmp != kpte) {
-		WARN_ON_ONCE(1);
+	if (tmp != kpte)
 		goto out_unlock;
-	}
 
-	address = __pa(address);
-	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
 #ifdef CONFIG_X86_32
 	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 #endif
+	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+
+#ifdef CONFIG_X86_64
+	if (level == PG_LEVEL_1G) {
+		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+		pgprot_val(ref_prot) |= _PAGE_PSE;
+	}
+#endif
 
-	pgprot_val(ref_prot) &= ~_PAGE_NX;
-	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
-		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
+	/*
+	 * Get the target pfn from the original entry:
+	 */
+	pfn = pte_pfn(*kpte);
+	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
+		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
 	/*
-	 * Install the new, split up pagetable. Important detail here:
+	 * Install the new, split up pagetable. Important details here:
 	 *
 	 * On Intel the NX bit of all levels must be cleared to make a
 	 * page executable. See section 4.13.2 of Intel 64 and IA-32
 	 * Architectures Software Developer's Manual).
+	 *
+	 * Mark the entry present. The current mapping might be
+	 * set to not present, which we preserved above.
 	 */
 	ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+	pgprot_val(ref_prot) |= _PAGE_PRESENT;
 	__set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
 	base = NULL;
 
@@ -249,18 +419,12 @@ out_unlock:
 	return 0;
 }
 
-static int
-__change_page_attr(unsigned long address, unsigned long pfn,
-		   pgprot_t mask_set, pgprot_t mask_clr)
+static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
 {
+	int level, do_split, err;
 	struct page *kpte_page;
-	int level, err = 0;
 	pte_t *kpte;
 
-#ifdef CONFIG_X86_32
-	BUG_ON(pfn > max_low_pfn);
-#endif
-
 repeat:
 	kpte = lookup_address(address, &level);
 	if (!kpte)
@@ -271,23 +435,62 @@ repeat:
 	BUG_ON(PageCompound(kpte_page));
 
 	if (level == PG_LEVEL_4K) {
-		pgprot_t new_prot = pte_pgprot(*kpte);
 		pte_t new_pte, old_pte = *kpte;
+		pgprot_t new_prot = pte_pgprot(old_pte);
+
+		if (!pte_val(old_pte)) {
+			printk(KERN_WARNING "CPA: called for zero pte. "
+			       "vaddr = %lx cpa->vaddr = %lx\n", address,
+				cpa->vaddr);
+			WARN_ON(1);
+			return -EINVAL;
+		}
 
-		pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
-		pgprot_val(new_prot) |= pgprot_val(mask_set);
+		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
 
 		new_prot = static_protections(new_prot, address);
 
-		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
-		BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
+		/*
+		 * We need to keep the pfn from the existing PTE,
+		 * after all we're only going to change its attributes
+		 * not the memory it points to
+		 */
+		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+
+		/*
+		 * Do we really change anything?
+		 */
+		if (pte_val(old_pte) != pte_val(new_pte)) {
+			set_pte_atomic(kpte, new_pte);
+			cpa->flushtlb = 1;
+		}
+		cpa->numpages = 1;
+		return 0;
+	}
+
+	/*
+	 * Check, whether we can keep the large page intact
+	 * and just change the pte:
+	 */
+	do_split = try_preserve_large_page(kpte, address, cpa);
+	/*
+	 * When the range fits into the existing large page,
+	 * return. cpa->numpages and cpa->flushtlb have been updated in
+	 * try_preserve_large_page:
+	 */
+	if (do_split <= 0)
+		return do_split;
 
-		set_pte_atomic(kpte, new_pte);
-	} else {
-		err = split_large_page(kpte, address);
-		if (!err)
-			goto repeat;
+	/*
+	 * We have to split the large page:
+	 */
+	err = split_large_page(kpte, address);
+	if (!err) {
+		cpa->flushtlb = 1;
+		goto repeat;
 	}
+
 	return err;
 }
 
@@ -304,19 +507,14 @@ repeat:
  *
  * Modules and drivers should use the set_memory_* APIs instead.
  */
-
-#define HIGH_MAP_START	__START_KERNEL_map
-#define HIGH_MAP_END	(__START_KERNEL_map + KERNEL_TEXT_SIZE)
-
-static int
-change_page_attr_addr(unsigned long address, pgprot_t mask_set,
-		      pgprot_t mask_clr)
+static int change_page_attr_addr(struct cpa_data *cpa)
 {
-	unsigned long phys_addr = __pa(address);
-	unsigned long pfn = phys_addr >> PAGE_SHIFT;
 	int err;
+	unsigned long address = cpa->vaddr;
 
 #ifdef CONFIG_X86_64
+	unsigned long phys_addr = __pa(address);
+
 	/*
 	 * If we are inside the high mapped kernel range, then we
 	 * fixup the low mapping first. __va() returns the virtual
@@ -326,7 +524,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
 		address = (unsigned long) __va(phys_addr);
 #endif
 
-	err = __change_page_attr(address, pfn, mask_set, mask_clr);
+	err = __change_page_attr(address, cpa);
 	if (err)
 		return err;
 
@@ -339,42 +537,89 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
 		/*
 		 * Calc the high mapping address. See __phys_addr()
 		 * for the non obvious details.
+		 *
+		 * Note that NX and other required permissions are
+		 * checked in static_protections().
 		 */
 		address = phys_addr + HIGH_MAP_START - phys_base;
-		/* Make sure the kernel mappings stay executable */
-		pgprot_val(mask_clr) |= _PAGE_NX;
 
 		/*
 		 * Our high aliases are imprecise, because we check
 		 * everything between 0 and KERNEL_TEXT_SIZE, so do
 		 * not propagate lookup failures back to users:
 		 */
-		__change_page_attr(address, pfn, mask_set, mask_clr);
+		__change_page_attr(address, cpa);
 	}
 #endif
 	return err;
 }
 
-static int __change_page_attr_set_clr(unsigned long addr, int numpages,
-				      pgprot_t mask_set, pgprot_t mask_clr)
+static int __change_page_attr_set_clr(struct cpa_data *cpa)
 {
-	unsigned int i;
-	int ret;
+	int ret, numpages = cpa->numpages;
 
-	for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
-		ret = change_page_attr_addr(addr, mask_set, mask_clr);
+	while (numpages) {
+		/*
+		 * Store the remaining nr of pages for the large page
+		 * preservation check.
+		 */
+		cpa->numpages = numpages;
+		ret = change_page_attr_addr(cpa);
 		if (ret)
 			return ret;
-	}
 
+		/*
+		 * Adjust the number of pages with the result of the
+		 * CPA operation. Either a large page has been
+		 * preserved or a single page update happened.
+		 */
+		BUG_ON(cpa->numpages > numpages);
+		numpages -= cpa->numpages;
+		cpa->vaddr += cpa->numpages * PAGE_SIZE;
+	}
 	return 0;
 }
 
+static inline int cache_attr(pgprot_t attr)
+{
+	return pgprot_val(attr) &
+		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
+}
+
 static int change_page_attr_set_clr(unsigned long addr, int numpages,
 				    pgprot_t mask_set, pgprot_t mask_clr)
 {
-	int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
-					     mask_clr);
+	struct cpa_data cpa;
+	int ret, cache;
+
+	/*
+	 * Check, if we are requested to change a not supported
+	 * Check whether we are asked to change an unsupported
+	 */
+	mask_set = canon_pgprot(mask_set);
+	mask_clr = canon_pgprot(mask_clr);
+	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+		return 0;
+
+	cpa.vaddr = addr;
+	cpa.numpages = numpages;
+	cpa.mask_set = mask_set;
+	cpa.mask_clr = mask_clr;
+	cpa.flushtlb = 0;
+
+	ret = __change_page_attr_set_clr(&cpa);
+
+	/*
+	 * Check whether we really changed something:
+	 */
+	if (!cpa.flushtlb)
+		return ret;
+
+	/*
+	 * No need to flush, when we did not set any of the caching
+	 * attributes:
+	 */
+	cache = cache_attr(mask_set);
 
 	/*
 	 * On success we use clflush, when the CPU supports it to
@@ -383,9 +628,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 	 * wbindv):
 	 */
 	if (!ret && cpu_has_clflush)
-		cpa_flush_range(addr, numpages);
+		cpa_flush_range(addr, numpages, cache);
 	else
-		cpa_flush_all();
+		cpa_flush_all(cache);
 
 	return ret;
 }
@@ -489,37 +734,26 @@ int set_pages_rw(struct page *page, int numpages)
 	return set_memory_rw(addr, numpages);
 }
 
-
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
-static inline int __change_page_attr_set(unsigned long addr, int numpages,
-					 pgprot_t mask)
-{
-	return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
-}
-
-static inline int __change_page_attr_clear(unsigned long addr, int numpages,
-					   pgprot_t mask)
-{
-	return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
-}
-#endif
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 
 static int __set_pages_p(struct page *page, int numpages)
 {
-	unsigned long addr = (unsigned long)page_address(page);
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.mask_clr = __pgprot(0)};
 
-	return __change_page_attr_set(addr, numpages,
-				      __pgprot(_PAGE_PRESENT | _PAGE_RW));
+	return __change_page_attr_set_clr(&cpa);
 }
 
 static int __set_pages_np(struct page *page, int numpages)
 {
-	unsigned long addr = (unsigned long)page_address(page);
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(0),
+				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
 
-	return __change_page_attr_clear(addr, numpages,
-					__pgprot(_PAGE_PRESENT));
+	return __change_page_attr_set_clr(&cpa);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)

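The rewritten __change_page_attr_set_clr() above no longer walks the range one page at a time: each call into change_page_attr_addr() reports back through cpa->numpages how many pages it actually covered (one for a 4k pte, or the whole large page when try_preserve_large_page() could keep it intact), and the outer loop advances by that amount. The following is a small user-space model of that consumption loop, not kernel code; all names are invented for illustration and it assumes 2 MB large pages of 512 base pages:

/*
 * Stand-alone model of the chunked CPA loop. Illustrative only: the
 * struct and helper below are invented and are not kernel interfaces.
 */
#include <stdio.h>

#define PAGES_PER_LARGE	512	/* 2 MB large page / 4 kB base page */

struct cpa_model {
	unsigned long	vaddr;		/* page index into the range      */
	int		numpages;	/* in: remaining, out: consumed   */
};

/* Decide how much of the remaining range one CPA step can cover. */
static int change_one(struct cpa_model *c)
{
	unsigned long offset = c->vaddr % PAGES_PER_LARGE;

	if (offset == 0 && c->numpages >= PAGES_PER_LARGE) {
		/* Aligned and fully covered: the large page is preserved
		 * and the whole of it is reported as done in one step. */
		c->numpages = PAGES_PER_LARGE;
	} else {
		/* Unaligned or partial: the large page gets split and a
		 * single 4k entry is updated in this step. */
		c->numpages = 1;
	}
	return 0;
}

int main(void)
{
	struct cpa_model c = { .vaddr = 510, .numpages = 1030 };
	int remaining = c.numpages;

	while (remaining) {
		c.numpages = remaining;
		if (change_one(&c))
			return 1;
		printf("handled %3d page(s) at page %lu\n",
		       c.numpages, c.vaddr);
		remaining -= c.numpages;
		c.vaddr += c.numpages;
	}
	return 0;
}

The feedback value is what lets a single set_memory_*() call over a large, aligned range be satisfied by rewriting a few PMD/PUD entries instead of splitting them into thousands of 4k ptes.
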
+ 20 - 41
arch/x86/mm/pgtable_32.c

@@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd)
 	list_del(&page->lru);
 }
 
+#define UNSHARED_PTRS_PER_PGD				\
+	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 
-
-#if (PTRS_PER_PMD == 1)
-/* Non-PAE pgd constructor */
-static void pgd_ctor(void *pgd)
+static void pgd_ctor(void *p)
 {
+	pgd_t *pgd = p;
 	unsigned long flags;
 
-	/* !PAE, no pagetable sharing */
+	/* Clear usermode parts of PGD */
 	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	/* must happen under lock */
-	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			KERNEL_PGD_PTRS);
-	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-				__pa(swapper_pg_dir) >> PAGE_SHIFT,
-				USER_PTRS_PER_PGD,
-				KERNEL_PGD_PTRS);
-	pgd_list_add(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#else  /* PTRS_PER_PMD > 1 */
-/* PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-	/* PAE, kernel PMD may be shared */
-
-	if (SHARED_KERNEL_PMD) {
-		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	/* If the pgd points to a shared pagetable level (either the
+	   ptes in non-PAE, or shared PMD in PAE), then just copy the
+	   references from swapper_pg_dir. */
+	if (PAGETABLE_LEVELS == 2 ||
+	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+		clone_pgd_range(pgd + USER_PTRS_PER_PGD,
 				swapper_pg_dir + USER_PTRS_PER_PGD,
 				KERNEL_PGD_PTRS);
-	} else {
-		unsigned long flags;
+		paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+					__pa(swapper_pg_dir) >> PAGE_SHIFT,
+					USER_PTRS_PER_PGD,
+					KERNEL_PGD_PTRS);
+	}
 
-		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-		spin_lock_irqsave(&pgd_lock, flags);
+	/* list required to sync kernel mapping updates */
+	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
-#endif	/* PTRS_PER_PMD */
 
 static void pgd_dtor(void *pgd)
 {
@@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd)
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-#define UNSHARED_PTRS_PER_PGD				\
-	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
 #ifdef CONFIG_X86_PAE
 /*
  * Mop up any pmd pages which may still be attached to the pgd.
@@ -387,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 
 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	/* This is called just after the pmd has been detached from
-	   the pgd, which requires a full tlb flush to be recognized
-	   by the CPU.  Rather than incurring multiple tlb flushes
-	   while the address space is being pulled down, make the tlb
-	   gathering machinery do a full flush when we're done. */
-	tlb->fullmm = 1;
-
 	paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
 	tlb_remove_page(tlb, virt_to_page(pmd));
 }

+ 44 - 8
arch/x86/pci/numa.c

@@ -5,36 +5,62 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/nodemask.h>
+#include <mach_apic.h>
 #include "pci.h"
 
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
+
 #define BUS2QUAD(global) (mp_bus_id_to_node[global])
 #define BUS2LOCAL(global) (mp_bus_id_to_local[global])
 #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
 
+extern void *xquad_portio;    /* Where the IO area was mapped */
+#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
+
 #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
 	(0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
 
+static void write_cf8(unsigned bus, unsigned devfn, unsigned reg)
+{
+	unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg);
+	if (xquad_portio)
+		writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus)));
+	else
+		outl(val, 0xCF8);
+}
+
 static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 			     unsigned int devfn, int reg, int len, u32 *value)
 {
 	unsigned long flags;
+	void __iomem *adr = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
 
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
 	spin_lock_irqsave(&pci_config_lock, flags);
 
-	outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
+	write_cf8(bus, devfn, reg);
 
 	switch (len) {
 	case 1:
-		*value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus));
+		if (xquad_portio)
+			*value = readb(adr + (reg & 3));
+		else
+			*value = inb(0xCFC + (reg & 3));
 		break;
 	case 2:
-		*value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus));
+		if (xquad_portio)
+			*value = readw(adr + (reg & 2));
+		else
+			*value = inw(0xCFC + (reg & 2));
 		break;
 	case 4:
-		*value = inl_quad(0xCFC, BUS2QUAD(bus));
+		if (xquad_portio)
+			*value = readl(adr);
+		else
+			*value = inl(0xCFC);
 		break;
 	}
 
@@ -47,23 +73,33 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 			      unsigned int devfn, int reg, int len, u32 value)
 {
 	unsigned long flags;
+	void __iomem *adr = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
 
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
 	spin_lock_irqsave(&pci_config_lock, flags);
 
-	outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
+	write_cf8(bus, devfn, reg);
 
 	switch (len) {
 	case 1:
-		outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus));
+		if (xquad_portio)
+			writeb(value, adr + (reg & 3));
+		else
+			outb((u8)value, 0xCFC + (reg & 3));
 		break;
 	case 2:
-		outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus));
+		if (xquad_portio)
+			writew(value, adr + (reg & 2));
+		else
+			outw((u16)value, 0xCFC + (reg & 2));
 		break;
 	case 4:
-		outl_quad((u32)value, 0xCFC, BUS2QUAD(bus));
+		if (xquad_portio)
+			writel(value, adr + reg);
+		else
+			outl((u32)value, 0xCFC);
 		break;
 	}
 

+ 7 - 4
include/asm-generic/rtc.h

@@ -35,10 +35,11 @@
 static inline unsigned char rtc_is_updating(void)
 {
 	unsigned char uip;
+	unsigned long flags;
 
-	spin_lock_irq(&rtc_lock);
+	spin_lock_irqsave(&rtc_lock, flags);
 	uip = (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP);
-	spin_unlock_irq(&rtc_lock);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 	return uip;
 }
 
@@ -46,6 +47,8 @@ static inline unsigned int get_rtc_time(struct rtc_time *time)
 {
 	unsigned long uip_watchdog = jiffies;
 	unsigned char ctrl;
+	unsigned long flags;
+
 #ifdef CONFIG_MACH_DECSTATION
 	unsigned int real_year;
 #endif
@@ -72,7 +75,7 @@ static inline unsigned int get_rtc_time(struct rtc_time *time)
 	 * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated
 	 * by the RTC when initially set to a non-zero value.
 	 */
-	spin_lock_irq(&rtc_lock);
+	spin_lock_irqsave(&rtc_lock, flags);
 	time->tm_sec = CMOS_READ(RTC_SECONDS);
 	time->tm_min = CMOS_READ(RTC_MINUTES);
 	time->tm_hour = CMOS_READ(RTC_HOURS);
@@ -83,7 +86,7 @@ static inline unsigned int get_rtc_time(struct rtc_time *time)
 	real_year = CMOS_READ(RTC_DEC_YEAR);
 #endif
 	ctrl = CMOS_READ(RTC_CONTROL);
-	spin_unlock_irq(&rtc_lock);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
 	{

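The rtc.h hunk above swaps the unconditional spin_lock_irq()/spin_unlock_irq() pair for the irqsave/irqrestore variants; the unconditional form always re-enables interrupts on unlock, which is unsafe if the caller is already running with interrupts disabled (presumably the situation behind the "fix RTC lockdep warning: potential hardirq recursion" commit in this merge). A toy model of the difference in plain C, where the fake IRQ-flag helpers merely stand in for the real spinlock primitives:

/* Toy model only: the helpers below just mimic IRQ-flag semantics. */
#include <assert.h>
#include <stdbool.h>

static bool irqs_enabled = true;

static void local_irq_disable(void)	{ irqs_enabled = false; }
static void local_irq_enable(void)	{ irqs_enabled = true; }

static unsigned long local_irq_save(void)
{
	unsigned long flags = irqs_enabled;

	irqs_enabled = false;
	return flags;
}

static void local_irq_restore(unsigned long flags)
{
	irqs_enabled = flags;
}

static void read_rtc_irq_variant(void)
{
	local_irq_disable();	/* spin_lock_irq(&rtc_lock)             */
	/* ... CMOS_READ() ... */
	local_irq_enable();	/* spin_unlock_irq(): always re-enables */
}

static void read_rtc_irqsave_variant(void)
{
	unsigned long flags = local_irq_save();	/* spin_lock_irqsave()   */
	/* ... CMOS_READ() ... */
	local_irq_restore(flags);		/* restores prior state  */
}

int main(void)
{
	local_irq_disable();		/* caller already has IRQs off     */
	read_rtc_irq_variant();
	assert(irqs_enabled);		/* bug: IRQs were switched back on */

	local_irq_disable();
	read_rtc_irqsave_variant();
	assert(!irqs_enabled);		/* fixed: prior state is preserved */
	return 0;
}
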
+ 0 - 1
include/asm-generic/tlb.h

@@ -14,7 +14,6 @@
 #define _ASM_GENERIC__TLB_H
 
 #include <linux/swap.h>
-#include <linux/quicklist.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 

+ 7 - 0
include/asm-x86/asm.h

@@ -29,4 +29,11 @@
 
 #endif /* CONFIG_X86_32 */
 
+/* Exception table entry */
+# define _ASM_EXTABLE(from,to) \
+	" .section __ex_table,\"a\"\n" \
+	_ASM_ALIGN "\n" \
+	_ASM_PTR #from "," #to "\n" \
+	" .previous\n"
+
 #endif /* _ASM_X86_ASM_H */

+ 1 - 1
include/asm-x86/bugs.h

@@ -2,6 +2,6 @@
 #define _ASM_X86_BUGS_H
 
 extern void check_bugs(void);
-extern int ppro_with_ram_bug(void);
+int ppro_with_ram_bug(void);
 
 #endif /* _ASM_X86_BUGS_H */

+ 11 - 3
include/asm-x86/cpufeature.h

@@ -4,9 +4,6 @@
 #ifndef _ASM_X86_CPUFEATURE_H
 #define _ASM_X86_CPUFEATURE_H
 
-#ifndef __ASSEMBLY__
-#include <linux/bitops.h>
-#endif
 #include <asm/required-features.h>
 
 #define NCAPINTS	8	/* N 32-bit words worth of info */
@@ -49,6 +46,7 @@
 #define X86_FEATURE_MP		(1*32+19) /* MP Capable. */
 #define X86_FEATURE_NX		(1*32+20) /* Execute Disable */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_GBPAGES	(1*32+26) /* GB pages */
 #define X86_FEATURE_RDTSCP	(1*32+27) /* RDTSCP */
 #define X86_FEATURE_LM		(1*32+29) /* Long Mode (x86-64) */
 #define X86_FEATURE_3DNOWEXT	(1*32+30) /* AMD 3DNow! extensions */
@@ -115,6 +113,13 @@
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+#include <linux/bitops.h>
+
+extern const char * const x86_cap_flags[NCAPINTS*32];
+extern const char * const x86_power_flags[32];
+
 #define cpu_has(c, bit)							\
 	(__builtin_constant_p(bit) &&					\
 	 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||	\
@@ -175,6 +180,7 @@
 #define cpu_has_pebs		boot_cpu_has(X86_FEATURE_PEBS)
 #define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLSH)
 #define cpu_has_bts		boot_cpu_has(X86_FEATURE_BTS)
+#define cpu_has_gbpages		boot_cpu_has(X86_FEATURE_GBPAGES)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
@@ -204,4 +210,6 @@
 
 #endif /* CONFIG_X86_64 */
 
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+
 #endif /* _ASM_X86_CPUFEATURE_H */

+ 2 - 2
include/asm-x86/efi.h

@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)	\
 	efi_call_virt(f, a1, a2, a3, a4, a5, a6)
 
-#define efi_ioremap(addr, size)			ioremap(addr, size)
+#define efi_ioremap(addr, size)			ioremap_cache(addr, size)
 
 #else /* !CONFIG_X86_32 */
 
@@ -86,7 +86,7 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
 	efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
 		  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
 
-extern void *efi_ioremap(unsigned long offset, unsigned long size);
+extern void *efi_ioremap(unsigned long addr, unsigned long size);
 
 #endif /* CONFIG_X86_32 */
 

+ 6 - 17
include/asm-x86/futex.h

@@ -17,11 +17,8 @@
 "2:	.section .fixup,\"ax\"\n				\
 3:	mov	%3, %1\n					\
 	jmp	2b\n						\
-	.previous\n						\
-	.section __ex_table,\"a\"\n				\
-	.align	8\n"						\
-	_ASM_PTR "1b,3b\n					\
-	.previous"						\
+	.previous\n"						\
+	_ASM_EXTABLE(1b,3b)					\
 	: "=r" (oldval), "=r" (ret), "+m" (*uaddr)		\
 	: "i" (-EFAULT), "0" (oparg), "1" (0))
 
@@ -35,11 +32,9 @@
 3:	.section .fixup,\"ax\"\n				\
 4:	mov	%5, %1\n					\
 	jmp	3b\n						\
-	.previous\n						\
-	.section __ex_table,\"a\"\n				\
-	.align	8\n"						\
-	_ASM_PTR "1b,4b,2b,4b\n					\
-	.previous"						\
+	.previous\n"						\
+	_ASM_EXTABLE(1b,4b)					\
+	_ASM_EXTABLE(2b,4b)					\
 	: "=&a" (oldval), "=&r" (ret), "+m" (*uaddr),		\
 	  "=&r" (tem)						\
 	: "r" (oparg), "i" (-EFAULT), "1" (0))
@@ -111,18 +106,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 		return -EFAULT;
 
 	__asm__ __volatile__(
-
 		"1:	lock; cmpxchgl %3, %1			\n"
 		"2:	.section .fixup, \"ax\"			\n"
 		"3:	mov     %2, %0				\n"
 		"	jmp     2b				\n"
 		"	.previous				\n"
-
-		"	.section __ex_table, \"a\"		\n"
-		"	.align  8				\n"
-			_ASM_PTR " 1b,3b			\n"
-		"	.previous				\n"
-
+		_ASM_EXTABLE(1b,3b)
 		: "=a" (oldval), "+m" (*uaddr)
 		: "i" (-EFAULT), "r" (newval), "0" (oldval)
 		: "memory"

+ 2 - 2
include/asm-x86/highmem.h

@@ -63,8 +63,8 @@ extern pte_t *pkmap_page_table;
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
 
-extern void * FASTCALL(kmap_high(struct page *page));
-extern void FASTCALL(kunmap_high(struct page *page));
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
 
 void *kmap(struct page *page);
 void kunmap(struct page *page);

+ 1 - 1
include/asm-x86/hw_irq_32.h

@@ -47,7 +47,7 @@ void enable_8259A_irq(unsigned int irq);
 int i8259A_irq_pending(unsigned int irq);
 void make_8259A_irq(unsigned int irq);
 void init_8259A(int aeoi);
-void FASTCALL(send_IPI_self(int vector));
+void send_IPI_self(int vector);
 void init_VISWS_APIC_irqs(void);
 void setup_IO_APIC(void);
 void disable_IO_APIC(void);

+ 4 - 12
include/asm-x86/i387.h

@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
+#include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
 #include <asm/user.h>
@@ -41,10 +42,7 @@ static inline void tolerant_fwait(void)
 {
 	asm volatile("1: fwait\n"
 		     "2:\n"
-		     "   .section __ex_table,\"a\"\n"
-		     "	.align 8\n"
-		     "	.quad 1b,2b\n"
-		     "	.previous\n");
+		     _ASM_EXTABLE(1b,2b));
 }
 
 static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
@@ -57,10 +55,7 @@ static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
 		     "3:  movl $-1,%[err]\n"
 		     "    jmp  2b\n"
 		     ".previous\n"
-		     ".section __ex_table,\"a\"\n"
-		     "   .align 8\n"
-		     "   .quad  1b,3b\n"
-		     ".previous"
+		     _ASM_EXTABLE(1b,3b)
 		     : [err] "=r" (err)
 #if 0 /* See comment in __save_init_fpu() below. */
 		     : [fx] "r" (fx), "m" (*fx), "0" (0));
@@ -99,10 +94,7 @@ static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
 		     "3:  movl $-1,%[err]\n"
 		     "    jmp  2b\n"
 		     ".previous\n"
-		     ".section __ex_table,\"a\"\n"
-		     "   .align 8\n"
-		     "   .quad  1b,3b\n"
-		     ".previous"
+		     _ASM_EXTABLE(1b,3b)
 		     : [err] "=r" (err), "=m" (*fx)
 #if 0 /* See comment in __fxsave_clear() below. */
 		     : [fx] "r" (fx), "0" (0));

+ 0 - 25
include/asm-x86/io_32.h

@@ -275,29 +275,6 @@ static inline void slow_down_io(void) {
 
 #endif
 
-#ifdef CONFIG_X86_NUMAQ
-extern void *xquad_portio;    /* Where the IO area was mapped */
-#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
-#define __BUILDIO(bwl,bw,type) \
-static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
-	if (xquad_portio) \
-		write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
-	else \
-		out##bwl##_local(value, port); \
-} \
-static inline void out##bwl(unsigned type value, int port) { \
-	out##bwl##_quad(value, port, 0); \
-} \
-static inline unsigned type in##bwl##_quad(int port, int quad) { \
-	if (xquad_portio) \
-		return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
-	else \
-		return in##bwl##_local(port); \
-} \
-static inline unsigned type in##bwl(int port) { \
-	return in##bwl##_quad(port, 0); \
-}
-#else
 #define __BUILDIO(bwl,bw,type) \
 static inline void out##bwl(unsigned type value, int port) { \
 	out##bwl##_local(value, port); \
@@ -305,8 +282,6 @@ static inline void out##bwl(unsigned type value, int port) { \
 static inline unsigned type in##bwl(int port) { \
 	return in##bwl##_local(port); \
 }
-#endif
-
 
 #define BUILDIO(bwl,bw,type) \
 static inline void out##bwl##_local(unsigned type value, int port) { \

+ 2 - 0
include/asm-x86/mach-numaq/mach_apic.h

@@ -109,6 +109,8 @@ static inline int mpc_apic_id(struct mpc_config_processor *m,
 	return logical_apicid;
 }
 
+extern void *xquad_portio;
+
 static inline void setup_portio_remap(void)
 {
 	int num_quads = num_online_nodes();

+ 2 - 8
include/asm-x86/msr.h

@@ -57,10 +57,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
 		     ".section .fixup,\"ax\"\n\t"
 		     "3:  mov %3,%0 ; jmp 1b\n\t"
 		     ".previous\n\t"
-		     ".section __ex_table,\"a\"\n"
-		     _ASM_ALIGN "\n\t"
-		     _ASM_PTR " 2b,3b\n\t"
-		     ".previous"
+		     _ASM_EXTABLE(2b,3b)
 		     : "=r" (*err), EAX_EDX_RET(val, low, high)
 		     : "c" (msr), "i" (-EFAULT));
 	return EAX_EDX_VAL(val, low, high);
@@ -81,10 +78,7 @@ static inline int native_write_msr_safe(unsigned int msr,
 		     ".section .fixup,\"ax\"\n\t"
 		     "3:  mov %4,%0 ; jmp 1b\n\t"
 		     ".previous\n\t"
-		     ".section __ex_table,\"a\"\n"
-		     _ASM_ALIGN "\n\t"
-		     _ASM_PTR " 2b,3b\n\t"
-		     ".previous"
+		     _ASM_EXTABLE(2b,3b)
 		     : "=a" (err)
 		     : "c" (msr), "0" (low), "d" (high),
 		       "i" (-EFAULT));

+ 2 - 2
include/asm-x86/page.h

@@ -13,8 +13,8 @@
 #define PHYSICAL_PAGE_MASK	(PAGE_MASK & __PHYSICAL_MASK)
 #define PTE_MASK		(_AT(long, PHYSICAL_PAGE_MASK))
 
-#define LARGE_PAGE_SIZE		(_AC(1,UL) << PMD_SHIFT)
-#define LARGE_PAGE_MASK		(~(LARGE_PAGE_SIZE-1))
+#define PMD_PAGE_SIZE		(_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK		(~(PMD_PAGE_SIZE-1))
 
 #define HPAGE_SHIFT		PMD_SHIFT
 #define HPAGE_SIZE		(_AC(1,UL) << HPAGE_SHIFT)

+ 3 - 0
include/asm-x86/page_64.h

@@ -23,6 +23,9 @@
 #define MCE_STACK 5
 #define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
 
+#define PUD_PAGE_SIZE		(_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK		(~(PUD_PAGE_SIZE-1))
+
 #define __PAGE_OFFSET           _AC(0xffff810000000000, UL)
 
 #define __PHYSICAL_START	CONFIG_PHYSICAL_START

+ 4 - 2
include/asm-x86/pgalloc_32.h

@@ -80,8 +80,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
 
 	/*
-	 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
-	 * the TLB via cr3 if the top-level pgd is changed...
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
 	 */
 	if (mm == current->active_mm)
 		write_cr3(read_cr3());

+ 11 - 15
include/asm-x86/pgtable-3level.h

@@ -93,26 +93,22 @@ static inline void native_pmd_clear(pmd_t *pmd)
 
 static inline void pud_clear(pud_t *pudp)
 {
+	unsigned long pgd;
+
 	set_pud(pudp, __pud(0));
 
 	/*
-	 * In principle we need to do a cr3 reload here to make sure
-	 * the processor recognizes the changed pgd.  In practice, all
-	 * the places where pud_clear() gets called are followed by
-	 * full tlb flushes anyway, so we can defer the cost here.
-	 *
-	 * Specifically:
-	 *
-	 * mm/memory.c:free_pmd_range() - immediately after the
-	 * pud_clear() it does a pmd_free_tlb().  We change the
-	 * mmu_gather structure to do a full tlb flush (which has the
-	 * effect of reloading cr3) when the pagetable free is
-	 * complete.
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
 	 *
-	 * arch/x86/mm/hugetlbpage.c:huge_pmd_unshare() - the call to
-	 * this is followed by a flush_tlb_range, which on x86 does a
-	 * full tlb flush.
+	 * Make sure the pud entry we're updating is within the
+	 * current pgd to avoid unnecessary TLB flushes.
 	 */
+	pgd = read_cr3();
+	if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+		write_cr3(pgd);
 }
 
 #define pud_page(pud) \

+ 4 - 0
include/asm-x86/pgtable.h

@@ -13,10 +13,12 @@
 #define _PAGE_BIT_DIRTY		6
 #define _PAGE_BIT_FILE		6
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT		7	/* on 4KB pages */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
 #define _PAGE_BIT_UNUSED1	9	/* available for programmer */
 #define _PAGE_BIT_UNUSED2	10
 #define _PAGE_BIT_UNUSED3	11
+#define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
 /*
@@ -36,6 +38,8 @@
 #define _PAGE_UNUSED1	(_AC(1, L)<<_PAGE_BIT_UNUSED1)
 #define _PAGE_UNUSED2	(_AC(1, L)<<_PAGE_BIT_UNUSED2)
 #define _PAGE_UNUSED3	(_AC(1, L)<<_PAGE_BIT_UNUSED3)
+#define _PAGE_PAT	(_AC(1, L)<<_PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AC(1, ULL) << _PAGE_BIT_NX)

+ 2 - 0
include/asm-x86/pgtable_32.h

@@ -148,6 +148,8 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
  */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
+static inline int pud_large(pud_t pud) { return 0; }
+
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
  *

+ 6 - 1
include/asm-x86/pgtable_64.h

@@ -21,7 +21,6 @@ extern pgd_t init_level4_pgt[];
 #define swapper_pg_dir init_level4_pgt
 
 extern void paging_init(void);
-extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
 
 #endif /* !__ASSEMBLY__ */
 
@@ -199,6 +198,12 @@ static inline unsigned long pmd_bad(pmd_t pmd)
 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
 #define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
 
+static inline int pud_large(pud_t pte)
+{
+	return (pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
+		(_PAGE_PSE|_PAGE_PRESENT);
+}
+
 /* PMD  - Level 2 access */
 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))

+ 4 - 4
include/asm-x86/string_32.h

@@ -213,14 +213,14 @@ static __always_inline void * __constant_c_and_count_memset(void * s, unsigned l
 		case 0:
 			return s;
 		case 1:
-			*(unsigned char *)s = pattern;
+			*(unsigned char *)s = pattern & 0xff;
 			return s;
 		case 2:
-			*(unsigned short *)s = pattern;
+			*(unsigned short *)s = pattern & 0xffff;
 			return s;
 		case 3:
-			*(unsigned short *)s = pattern;
-			*(2+(unsigned char *)s) = pattern;
+			*(unsigned short *)s = pattern & 0xffff;
+			*(2+(unsigned char *)s) = pattern & 0xff;
 			return s;
 		case 4:
 			*(unsigned long *)s = pattern;

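For context on the string_32.h hunk above: the fill value reaches __constant_c_and_count_memset() already replicated across an unsigned long (memset() passes 0x01010101UL * (unsigned char)(c)), and the 1, 2 and 3 byte cases store only the low bytes of that pattern. The added & 0xff and & 0xffff masks make that narrowing explicit; the stored bytes are unchanged. A small stand-alone check of the equivalence (illustrative only, not the kernel helper itself):

#include <assert.h>

int main(void)
{
	/* The fill byte 0xAB replicated into every byte of the pattern. */
	unsigned long pattern = 0x01010101UL * 0xAB;
	union { unsigned short hw[2]; unsigned char b[4]; } buf = { .hw = { 0, 0 } };

	buf.hw[0] = pattern & 0xffff;	/* count == 3: low two bytes ... */
	buf.b[2]  = pattern & 0xff;	/* ... plus one trailing byte    */

	assert(buf.b[0] == 0xAB && buf.b[1] == 0xAB && buf.b[2] == 0xAB);
	assert(buf.b[3] == 0);

	/* The mask only documents the truncation; the implicit
	 * conversion would have stored the same value. */
	assert((unsigned char)pattern == (unsigned char)(pattern & 0xff));
	return 0;
}
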
+ 9 - 14
include/asm-x86/system.h

@@ -20,8 +20,8 @@
 #ifdef CONFIG_X86_32
 
 struct task_struct; /* one of the stranger aspects of C forward declarations */
-extern struct task_struct *FASTCALL(__switch_to(struct task_struct *prev,
-						struct task_struct *next));
+struct task_struct *__switch_to(struct task_struct *prev,
+				struct task_struct *next);
 
 /*
  * Saving eflags is important. It switches not only IOPL between tasks,
@@ -130,10 +130,7 @@ extern void load_gs_index(unsigned);
 		"movl %k1, %%" #seg "\n\t"	\
 		"jmp 2b\n"			\
 		".previous\n"			\
-		".section __ex_table,\"a\"\n\t"	\
-		_ASM_ALIGN "\n\t"		\
-		_ASM_PTR " 1b,3b\n"		\
-		".previous"			\
+		_ASM_EXTABLE(1b,3b)		\
 		: :"r" (value), "r" (0))
 
 
@@ -214,12 +211,10 @@ static inline unsigned long native_read_cr4_safe(void)
 	/* This could fault if %cr4 does not exist. In x86_64, a cr4 always
 	 * exists, so it will never fail. */
 #ifdef CONFIG_X86_32
-	asm volatile("1: mov %%cr4, %0		\n"
-		"2:				\n"
-		".section __ex_table,\"a\"	\n"
-		".long 1b,2b			\n"
-		".previous			\n"
-		: "=r" (val), "=m" (__force_order) : "0" (0));
+	asm volatile("1: mov %%cr4, %0\n"
+		     "2:\n"
+		     _ASM_EXTABLE(1b,2b)
+		     : "=r" (val), "=m" (__force_order) : "0" (0));
 #else
 	val = native_read_cr4();
 #endif
@@ -276,9 +271,9 @@ static inline void native_wbinvd(void)
 
 #endif /* __KERNEL__ */
 
-static inline void clflush(void *__p)
+static inline void clflush(volatile void *__p)
 {
-	asm volatile("clflush %0" : "+m" (*(char __force *)__p));
+	asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
 }
 
 #define nop() __asm__ __volatile__ ("nop")

+ 5 - 13
include/asm-x86/uaccess_32.h

@@ -8,6 +8,7 @@
 #include <linux/thread_info.h>
 #include <linux/prefetch.h>
 #include <linux/string.h>
+#include <asm/asm.h>
 #include <asm/page.h>
 
 #define VERIFY_READ 0
@@ -287,11 +288,8 @@ extern void __put_user_8(void);
 		"4:	movl %3,%0\n"				\
 		"	jmp 3b\n"				\
 		".previous\n"					\
-		".section __ex_table,\"a\"\n"			\
-		"	.align 4\n"				\
-		"	.long 1b,4b\n"				\
-		"	.long 2b,4b\n"				\
-		".previous"					\
+		_ASM_EXTABLE(1b,4b)				\
+		_ASM_EXTABLE(2b,4b)				\
 		: "=r"(err)					\
 		: "A" (x), "r" (addr), "i"(-EFAULT), "0"(err))
 
@@ -338,10 +336,7 @@ struct __large_struct { unsigned long buf[100]; };
 		"3:	movl %3,%0\n"					\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 1b,3b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(1b,3b)					\
 		: "=r"(err)						\
 		: ltype (x), "m"(__m(addr)), "i"(errret), "0"(err))
 
@@ -378,10 +373,7 @@ do {									\
 		"	xor"itype" %"rtype"1,%"rtype"1\n"		\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 1b,3b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(1b,3b)					\
 		: "=r"(err), ltype (x)					\
 		: "m"(__m(addr)), "i"(errret), "0"(err))
 

+ 2 - 8
include/asm-x86/uaccess_64.h

@@ -181,10 +181,7 @@ struct __large_struct { unsigned long buf[100]; };
 		"3:	mov %3,%0\n"				\
 		"	jmp 2b\n"				\
 		".previous\n"					\
-		".section __ex_table,\"a\"\n"			\
-		"	.align 8\n"				\
-		"	.quad 1b,3b\n"				\
-		".previous"					\
+		_ASM_EXTABLE(1b,3b)				\
 		: "=r"(err)					\
 		: ltype (x), "m"(__m(addr)), "i"(errno), "0"(err))
 
@@ -226,10 +223,7 @@ do {									\
 		"	xor"itype" %"rtype"1,%"rtype"1\n"	\
 		"	jmp 2b\n"				\
 		".previous\n"					\
-		".section __ex_table,\"a\"\n"			\
-		"	.align 8\n"				\
-		"	.quad 1b,3b\n"				\
-		".previous"					\
+		_ASM_EXTABLE(1b,3b)				\
 		: "=r"(err), ltype (x)				\
 		: "m"(__m(addr)), "i"(errno), "0"(err))
 

+ 1 - 0
include/asm-x86/vm86.h

@@ -195,6 +195,7 @@ struct kernel_vm86_struct {
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
 int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
+struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
 
 struct task_struct;
 void release_vm86_irqs(struct task_struct *);