Explorar o código

Merge branch 'x86/mpparse' into x86/devel

Conflicts:

	arch/x86/Kconfig
	arch/x86/kernel/io_apic_32.c
	arch/x86/kernel/setup_64.c
	arch/x86/mm/init_32.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Ingo Molnar hai 17 anos
pai
achega
3de352bbd8
Modificáronse 90 ficheiros con 3873 adicións e 2949 borrados
  1. 23 0
      Documentation/kernel-parameters.txt
  2. 94 41
      arch/x86/Kconfig
  3. 0 9
      arch/x86/Kconfig.debug
  4. 1 18
      arch/x86/Makefile
  5. 0 4
      arch/x86/boot/compressed/misc.c
  6. 2 1
      arch/x86/boot/memory.c
  7. 2 2
      arch/x86/kernel/Makefile
  8. 405 2
      arch/x86/kernel/acpi/boot.c
  9. 1 1
      arch/x86/kernel/aperture_64.c
  10. 10 2
      arch/x86/kernel/apic_32.c
  11. 6 0
      arch/x86/kernel/apic_64.c
  12. 26 12
      arch/x86/kernel/cpu/mtrr/generic.c
  13. 881 20
      arch/x86/kernel/cpu/mtrr/main.c
  14. 3 0
      arch/x86/kernel/cpu/mtrr/mtrr.h
  15. 687 516
      arch/x86/kernel/e820.c
  16. 0 775
      arch/x86/kernel/e820_32.c
  17. 46 13
      arch/x86/kernel/efi.c
  18. 1 7
      arch/x86/kernel/efi_64.c
  19. 1 1
      arch/x86/kernel/genapic_64.c
  20. 73 0
      arch/x86/kernel/head.c
  21. 27 0
      arch/x86/kernel/head32.c
  22. 0 68
      arch/x86/kernel/head64.c
  23. 6 0
      arch/x86/kernel/head_32.S
  24. 67 60
      arch/x86/kernel/io_apic_32.c
  25. 42 40
      arch/x86/kernel/io_apic_64.c
  26. 485 342
      arch/x86/kernel/mpparse.c
  27. 20 5
      arch/x86/kernel/numaq_32.c
  28. 26 0
      arch/x86/kernel/setup.c
  29. 328 228
      arch/x86/kernel/setup_32.c
  30. 16 36
      arch/x86/kernel/setup_64.c
  31. 0 17
      arch/x86/kernel/smpboot.c
  32. 53 138
      arch/x86/kernel/srat_32.c
  33. 2 0
      arch/x86/kernel/summit_32.c
  34. 1 1
      arch/x86/kernel/trampoline.c
  35. 4 3
      arch/x86/lguest/boot.c
  36. 0 42
      arch/x86/mach-default/setup.c
  37. 0 1
      arch/x86/mach-es7000/Makefile
  38. 2 47
      arch/x86/mach-es7000/es7000plat.c
  39. 7 3
      arch/x86/mach-generic/Makefile
  40. 1 3
      arch/x86/mach-generic/bigsmp.c
  41. 41 0
      arch/x86/mach-generic/numaq.c
  42. 14 1
      arch/x86/mach-generic/probe.c
  43. 2 5
      arch/x86/mach-visws/mpparse.c
  44. 3 3
      arch/x86/mach-visws/setup.c
  45. 6 31
      arch/x86/mach-voyager/setup.c
  46. 0 14
      arch/x86/mach-voyager/voyager_smp.c
  47. 95 91
      arch/x86/mm/discontig_32.c
  48. 57 29
      arch/x86/mm/init_32.c
  49. 25 5
      arch/x86/mm/init_64.c
  50. 4 0
      arch/x86/mm/k8topology_64.c
  51. 1 1
      arch/x86/mm/numa_64.c
  52. 3 2
      arch/x86/pci/Makefile_32
  53. 2 2
      arch/x86/pci/amd_bus.c
  54. 4 25
      arch/x86/pci/numa.c
  55. 2 1
      arch/x86/xen/enlighten.c
  56. 2 2
      arch/x86/xen/setup.c
  57. 0 1
      drivers/acpi/Kconfig
  58. 5 0
      drivers/firmware/dmi_scan.c
  59. 2 3
      include/asm-x86/acpi.h
  60. 2 0
      include/asm-x86/bios_ebda.h
  61. 4 0
      include/asm-x86/bootparam.h
  62. 0 8
      include/asm-x86/dmi.h
  63. 90 6
      include/asm-x86/e820.h
  64. 0 50
      include/asm-x86/e820_32.h
  65. 0 56
      include/asm-x86/e820_64.h
  66. 1 1
      include/asm-x86/efi.h
  67. 3 0
      include/asm-x86/highmem.h
  68. 18 7
      include/asm-x86/io_apic.h
  69. 1 0
      include/asm-x86/ipi.h
  70. 0 8
      include/asm-x86/mach-bigsmp/mach_mpspec.h
  71. 0 8
      include/asm-x86/mach-es7000/mach_mpspec.h
  72. 5 2
      include/asm-x86/mach-generic/mach_mpparse.h
  73. 8 31
      include/asm-x86/mach-numaq/mach_apic.h
  74. 2 9
      include/asm-x86/mach-numaq/mach_mpparse.h
  75. 0 8
      include/asm-x86/mach-numaq/mach_mpspec.h
  76. 0 9
      include/asm-x86/mach-summit/mach_mpspec.h
  77. 6 20
      include/asm-x86/mmzone_32.h
  78. 25 4
      include/asm-x86/mpspec.h
  79. 1 1
      include/asm-x86/numa_32.h
  80. 4 2
      include/asm-x86/numaq.h
  81. 0 2
      include/asm-x86/proto.h
  82. 3 12
      include/asm-x86/setup.h
  83. 0 1
      include/asm-x86/smp.h
  84. 7 5
      include/asm-x86/srat.h
  85. 0 1
      include/asm-x86/system.h
  86. 2 0
      include/linux/bootmem.h
  87. 0 4
      include/linux/efi.h
  88. 4 2
      include/linux/mm.h
  89. 3 5
      include/linux/pageblock-flags.h
  90. 67 13
      mm/page_alloc.c

+ 23 - 0
Documentation/kernel-parameters.txt

@@ -610,6 +610,29 @@ and is between 256 and 4096 characters. It is defined in the file
 			See drivers/char/README.epca and
 			Documentation/digiepca.txt.
 
+	disable_mtrr_cleanup [X86]
+	enable_mtrr_cleanup [X86]
+			The kernel tries to adjust MTRR layout from continuous
+			to discrete, so that the X server driver is able to add
+			WB entries later. This parameter enables/disables that.
+
+	mtrr_chunk_size=nn[KMG] [X86]
+			Used for mtrr cleanup. It is the largest continuous
+			chunk that could hold holes, aka UC entries.
+
+	mtrr_gran_size=nn[KMG] [X86]
+			Used for mtrr cleanup. It is granularity of mtrr block.
+			Default is 1.
+			Large value could prevent small alignment from
+			using up MTRRs.
+
+	mtrr_spare_reg_nr=n [X86]
+			Format: <integer>
+			Range: 0,7 : spare reg number
+			Default : 1
+			Used for mtrr cleanup. It is the number of spare mtrr
+			entries. Set to 2 or more if your graphics card needs more.
+
 	disable_mtrr_trim [X86, Intel and AMD only]
 			By default the kernel will trim any uncacheable
 			memory out of your available memory pool based on

+ 94 - 41
arch/x86/Kconfig

@@ -230,6 +230,27 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+config X86_FIND_SMP_CONFIG
+	def_bool y
+	depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS
+	depends on X86_32
+
+if ACPI
+config X86_MPPARSE
+	def_bool y
+	bool "Enable MPS table"
+	depends on X86_LOCAL_APIC && !X86_VISWS
+	help
+	  For old SMP systems that do not have proper ACPI support. On newer
+	  systems (esp. with 64-bit CPUs) with ACPI support, MADT and DSDT override it.
+endif
+
+if !ACPI
+config X86_MPPARSE
+	def_bool y
+	depends on X86_LOCAL_APIC && !X86_VISWS
+endif
+
 choice
 	prompt "Subarchitecture Type"
 	default X86_PC
@@ -261,36 +282,6 @@ config X86_VOYAGER
 	  If you do not specifically know you have a Voyager based machine,
 	  say N here, otherwise the kernel you build will not be bootable.
 
-config X86_NUMAQ
-	bool "NUMAQ (IBM/Sequent)"
-	depends on SMP && X86_32 && PCI
-	select NUMA
-	help
-	  This option is used for getting Linux to run on a (IBM/Sequent) NUMA
-	  multiquad box. This changes the way that processors are bootstrapped,
-	  and uses Clustered Logical APIC addressing mode instead of Flat Logical.
-	  You will need a new lynxer.elf file to flash your firmware with - send
-	  email to <Martin.Bligh@us.ibm.com>.
-
-config X86_SUMMIT
-	bool "Summit/EXA (IBM x440)"
-	depends on X86_32 && SMP
-	help
-	  This option is needed for IBM systems that use the Summit/EXA chipset.
-	  In particular, it is needed for the x440.
-
-	  If you don't have one of these computers, you should say N here.
-	  If you want to build a NUMA kernel, you must select ACPI.
-
-config X86_BIGSMP
-	bool "Support for other sub-arch SMP systems with more than 8 CPUs"
-	depends on X86_32 && SMP
-	help
-	  This option is needed for the systems that have more than 8 CPUs
-	  and if the system is not of any sub-arch type above.
-
-	  If you don't have such a system, you should say N here.
-
 config X86_VISWS
 	bool "SGI 320/540 (Visual Workstation)"
 	depends on X86_32 && !PCI
@@ -304,12 +295,33 @@ config X86_VISWS
 	  and vice versa. See <file:Documentation/sgi-visws.txt> for details.
 
 config X86_GENERICARCH
-       bool "Generic architecture (Summit, bigsmp, ES7000, default)"
+       bool "Generic architecture"
 	depends on X86_32
        help
-          This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
-	  It is intended for a generic binary kernel.
-	  If you want a NUMA kernel, select ACPI.   We need SRAT for NUMA.
+          This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
+	  subarchitectures.  It is intended for a generic binary kernel.
+	  If you select them all, the kernel will probe them one by one and
+	  will fall back to the default.
+
+if X86_GENERICARCH
+
+config X86_NUMAQ
+	bool "NUMAQ (IBM/Sequent)"
+	depends on SMP && X86_32 && PCI && X86_MPPARSE
+	select NUMA
+	help
+	  This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
+	  NUMA multiquad box. This changes the way that processors are
+	  bootstrapped, and uses Clustered Logical APIC addressing mode instead
+	  of Flat Logical.  You will need a new lynxer.elf file to flash your
+	  firmware with - send email to <Martin.Bligh@us.ibm.com>.
+
+config X86_SUMMIT
+	bool "Summit/EXA (IBM x440)"
+	depends on X86_32 && SMP
+	help
+	  This option is needed for IBM systems that use the Summit/EXA chipset.
+	  In particular, it is needed for the x440.
 
 config X86_ES7000
 	bool "Support for Unisys ES7000 IA32 series"
@@ -317,8 +329,15 @@ config X86_ES7000
 	help
 	  Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
 	  supposed to run on an IA32-based Unisys ES7000 system.
-	  Only choose this option if you have such a system, otherwise you
-	  should say N here.
+
+config X86_BIGSMP
+	bool "Support for big SMP systems with more than 8 CPUs"
+	depends on X86_32 && SMP
+	help
+	  This option is needed for the systems that have more than 8 CPUs
+	  and if the system is not of any sub-arch type above.
+
+endif
 
 config X86_RDC321X
 	bool "RDC R-321x SoC"
@@ -432,7 +451,7 @@ config MEMTEST
 
 config ACPI_SRAT
 	def_bool y
-	depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
+	depends on X86_32 && ACPI && NUMA && X86_GENERICARCH
 	select ACPI_NUMA
 
 config HAVE_ARCH_PARSE_SRAT
@@ -441,11 +460,11 @@ config HAVE_ARCH_PARSE_SRAT
 
 config X86_SUMMIT_NUMA
 	def_bool y
-	depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH)
+	depends on X86_32 && NUMA && X86_GENERICARCH
 
 config X86_CYCLONE_TIMER
 	def_bool y
-	depends on X86_32 && X86_SUMMIT || X86_GENERICARCH
+	depends on X86_GENERICARCH
 
 config ES7000_CLUSTERED_APIC
 	def_bool y
@@ -910,9 +929,9 @@ config X86_PAE
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
 	depends on SMP
-	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL)
+	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
 	default n if X86_PC
-	default y if (X86_NUMAQ || X86_SUMMIT)
+	default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
 	help
 	  Enable NUMA (Non Uniform Memory Access) support.
 	  The kernel will try to allocate memory used by a CPU on the
@@ -1089,6 +1108,40 @@ config MTRR
 
 	  See <file:Documentation/mtrr.txt> for more information.
 
+config MTRR_SANITIZER
+	def_bool y
+	prompt "MTRR cleanup support"
+	depends on MTRR
+	help
+	  Convert MTRR layout from continuous to discrete, so some X driver
+	  could add WB entries.
+
+	  Say N here if you see bootup problems (boot crash, boot hang,
+	  spontaneous reboots).
+
+	  Can be disabled with disable_mtrr_cleanup. mtrr_chunk_size can
+	  also be used to set the largest mtrr entry size for a continuous
+	  block that holds holes (aka UC entries).
+
+	  If unsure, say Y.
+
+config MTRR_SANITIZER_ENABLE_DEFAULT
+	int "MTRR cleanup enable value (0-1)"
+	range 0 1
+	default "0"
+	depends on MTRR_SANITIZER
+	help
+	  Enable mtrr cleanup default value
+
+config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
+	int "MTRR cleanup spare reg num (0-7)"
+	range 0 7
+	default "1"
+	depends on MTRR_SANITIZER
+	help
+	  mtrr cleanup spare entries default, it can be changed via
+	  mtrr_spare_reg_nr=
+
 config X86_PAT
 	bool
 	prompt "x86 PAT support"

+ 0 - 9
arch/x86/Kconfig.debug

@@ -137,15 +137,6 @@ config 4KSTACKS
 	  on the VM subsystem for higher order allocations. This option
 	  will also use IRQ stacks to compensate for the reduced stackspace.
 
-config X86_FIND_SMP_CONFIG
-	def_bool y
-	depends on X86_LOCAL_APIC || X86_VOYAGER
-	depends on X86_32
-
-config X86_MPPARSE
-	def_bool y
-	depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
-
 config DOUBLEFAULT
 	default y
 	bool "Enable doublefault exception handler" if EMBEDDED

+ 1 - 18
arch/x86/Makefile

@@ -117,29 +117,11 @@ mcore-$(CONFIG_X86_VOYAGER)	:= arch/x86/mach-voyager/
 mflags-$(CONFIG_X86_VISWS)	:= -Iinclude/asm-x86/mach-visws
 mcore-$(CONFIG_X86_VISWS)	:= arch/x86/mach-visws/
 
-# NUMAQ subarch support
-mflags-$(CONFIG_X86_NUMAQ)	:= -Iinclude/asm-x86/mach-numaq
-mcore-$(CONFIG_X86_NUMAQ)	:= arch/x86/mach-default/
-
-# BIGSMP subarch support
-mflags-$(CONFIG_X86_BIGSMP)	:= -Iinclude/asm-x86/mach-bigsmp
-mcore-$(CONFIG_X86_BIGSMP)	:= arch/x86/mach-default/
-
-#Summit subarch support
-mflags-$(CONFIG_X86_SUMMIT)	:= -Iinclude/asm-x86/mach-summit
-mcore-$(CONFIG_X86_SUMMIT)	:= arch/x86/mach-default/
-
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
 fcore-$(CONFIG_X86_GENERICARCH)	+= arch/x86/mach-generic/
 mcore-$(CONFIG_X86_GENERICARCH)	:= arch/x86/mach-default/
 
-
-# ES7000 subarch support
-mflags-$(CONFIG_X86_ES7000)	:= -Iinclude/asm-x86/mach-es7000
-fcore-$(CONFIG_X86_ES7000)	:= arch/x86/mach-es7000/
-mcore-$(CONFIG_X86_ES7000)	:= arch/x86/mach-default/
-
 # RDC R-321x subarch support
 mflags-$(CONFIG_X86_RDC321X)	:= -Iinclude/asm-x86/mach-rdc321x
 mcore-$(CONFIG_X86_RDC321X)	:= arch/x86/mach-default/
@@ -160,6 +142,7 @@ KBUILD_AFLAGS += $(mflags-y)
 
 head-y := arch/x86/kernel/head_$(BITS).o
 head-y += arch/x86/kernel/head$(BITS).o
+head-y += arch/x86/kernel/head.o
 head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/

+ 0 - 4
arch/x86/boot/compressed/misc.c

@@ -218,10 +218,6 @@ static char *vidmem;
 static int vidport;
 static int lines, cols;
 
-#ifdef CONFIG_X86_NUMAQ
-void *xquad_portio;
-#endif
-
 #include "../../../../lib/inflate.c"
 
 static void *malloc(int size)

+ 2 - 1
arch/x86/boot/memory.c

@@ -13,6 +13,7 @@
  */
 
 #include "boot.h"
+#include <linux/kernel.h>
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
@@ -53,7 +54,7 @@ static int detect_memory_e820(void)
 
 		count++;
 		desc++;
-	} while (next && count < E820MAX);
+	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
 }

+ 2 - 2
arch/x86/kernel/Makefile

@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y                := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
@@ -22,7 +22,7 @@ obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
-obj-y			+= bootflag.o e820_$(BITS).o
+obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
 obj-y			+= tsc_$(BITS).o io_delay.o rtc.o

+ 405 - 2
arch/x86/kernel/acpi/boot.c

@@ -83,6 +83,8 @@ int acpi_lapic;
 int acpi_ioapic;
 int acpi_strict;
 
+static int disable_irq0_through_ioapic __initdata;
+
 u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_skip_timer_override __initdata;
@@ -338,8 +340,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 
 #ifdef CONFIG_X86_IO_APIC
 
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
@@ -858,6 +858,372 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
 #ifdef	CONFIG_X86_IO_APIC
+#define MP_ISA_BUS		0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+
+static struct {
+	int apic_id;
+	int gsi_base;
+	int gsi_end;
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+	int i = 0;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for (i = 0; i < nr_ioapics; i++) {
+		if ((gsi >= mp_ioapic_routing[i].gsi_base)
+		    && (gsi <= mp_ioapic_routing[i].gsi_end))
+			return i;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+#else
+	int i;
+	DECLARE_BITMAP(used, 256);
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mp_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mp_apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+#endif
+}
+
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+		       " found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+	int idx = 0;
+
+	if (bad_ioapic(address))
+		return;
+
+	idx = nr_ioapics;
+
+	mp_ioapics[idx].mp_type = MP_IOAPIC;
+	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mp_apicaddr = address;
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+	mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+#else
+	mp_ioapics[idx].mp_apicver = 0;
+#endif
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+	 */
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+	mp_ioapic_routing[idx].gsi_base = gsi_base;
+	mp_ioapic_routing[idx].gsi_end = gsi_base +
+	    io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+	nr_ioapics++;
+}
+
+static void assign_to_mp_irq(struct mp_config_intsrc *m,
+				    struct mp_config_intsrc *mp_irq)
+{
+	memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
+				struct mp_config_intsrc *m)
+{
+	return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static void save_mp_irq(struct mp_config_intsrc *m)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!!\n");
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+	int ioapic;
+	int pin;
+	struct mp_config_intsrc mp_irq;
+
+	/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
+	if (bus_irq == 0 && disable_irq0_through_ioapic)
+		return;
+
+	/*
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (trigger << 2) | polarity;
+	mp_irq.mp_srcbus = MP_ISA_BUS;
+	mp_irq.mp_srcbusirq = bus_irq;	/* IRQ */
+	mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
+	mp_irq.mp_dstirq = pin;	/* INTIN# */
+
+	save_mp_irq(&mp_irq);
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+	int i;
+	int ioapic;
+	unsigned int dstapic;
+	struct mp_config_intsrc mp_irq;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+	/*
+	 * Fabricate the legacy ISA bus (bus #31).
+	 */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
+	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+	/*
+	 * Older generations of ES7000 have no legacy identity mappings
+	 */
+	if (es7000_plat == 1)
+		return;
+#endif
+
+	/*
+	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
+	 */
+	ioapic = mp_find_ioapic(0);
+	if (ioapic < 0)
+		return;
+	dstapic = mp_ioapics[ioapic].mp_apicid;
+
+	/*
+	 * Use the default configuration for the IRQs 0-15.  Unless
+	 * overridden by (MADT) interrupt source override entries.
+	 */
+	for (i = 0; i < 16; i++) {
+		int idx;
+
+		/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
+		if (i == 0 && disable_irq0_through_ioapic)
+			continue;
+
+		for (idx = 0; idx < mp_irq_entries; idx++) {
+			struct mp_config_intsrc *irq = mp_irqs + idx;
+
+			/* Do we already have a mapping for this ISA IRQ? */
+			if (irq->mp_srcbus == MP_ISA_BUS
+			    && irq->mp_srcbusirq == i)
+				break;
+
+			/* Do we already have a mapping for this IOAPIC pin */
+			if (irq->mp_dstapic == dstapic &&
+			    irq->mp_dstirq == i)
+				break;
+		}
+
+		if (idx != mp_irq_entries) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+			continue;	/* IRQ already used */
+		}
+
+		mp_irq.mp_type = MP_INTSRC;
+		mp_irq.mp_irqflag = 0;	/* Conforming */
+		mp_irq.mp_srcbus = MP_ISA_BUS;
+		mp_irq.mp_dstapic = dstapic;
+		mp_irq.mp_irqtype = mp_INT;
+		mp_irq.mp_srcbusirq = i; /* Identity mapped */
+		mp_irq.mp_dstirq = i;
+
+		save_mp_irq(&mp_irq);
+	}
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	int ioapic;
+	int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM	4096
+#define IRQ_COMPRESSION_START	64
+
+	static int pci_irq = IRQ_COMPRESSION_START;
+	/*
+	 * Mapping between Global System Interrupts, which
+	 * represent all possible interrupts, and IRQs
+	 * assigned to actual devices.
+	 */
+	static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+#endif
+
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return gsi;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0) {
+		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+		return gsi;
+	}
+
+	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+	if (ioapic_renumber_irq)
+		gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+		printk(KERN_ERR "Invalid reference to IOAPIC pin "
+		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       ioapic_pin);
+		return gsi;
+	}
+	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+		return gsi;
+#endif
+	}
+
+	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+	/*
+	 * For GSI >= 64, use IRQ compression
+	 */
+	if ((gsi >= IRQ_COMPRESSION_START)
+	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
+		/*
+		 * For PCI devices assign IRQs in order, avoiding gaps
+		 * due to unused I/O APIC pins.
+		 */
+		int irq = gsi;
+		if (gsi < MAX_GSI_NUM) {
+			/*
+			 * Retain the VIA chipset work-around (gsi > 15), but
+			 * avoid a problem where the 8254 timer (IRQ0) is setup
+			 * via an override (so it's not on pin 0 of the ioapic),
+			 * and at the same time, the pin 0 interrupt is a PCI
+			 * type.  The gsi > 15 test could cause these two pins
+			 * to be shared as IRQ0, and they are not shareable.
+			 * So test for this condition, and if necessary, avoid
+			 * the pin collision.
+			 */
+			gsi = pci_irq++;
+			/*
+			 * Don't assign IRQ used by ACPI SCI
+			 */
+			if (gsi == acpi_gbl_FADT.sci_interrupt)
+				gsi = pci_irq++;
+			gsi_to_irq[irq] = gsi;
+		} else {
+			printk(KERN_ERR "GSI %u is too high\n", gsi);
+			return gsi;
+		}
+	}
+#endif
+	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	return gsi;
+}
+
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+			u32 gsi, int triggering, int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mp_config_intsrc mp_irq;
+	int ioapic;
+
+	if (!acpi_ioapic)
+		return 0;
+
+	/* print the entry should happen on mptable identically */
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.mp_srcbus = number;
+	mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	save_mp_irq(&mp_irq);
+#endif
+	return 0;
+}
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
@@ -1058,6 +1424,17 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
 	return 0;
 }
 
+/*
+ * Don't register any I/O APIC entries for the 8254 timer IRQ.
+ */
+static int __init
+dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
+{
+	pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident);
+	disable_irq0_through_ioapic = 1;
+	return 0;
+}
+
 /*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact acpi-devel@sourceforge.net
@@ -1225,6 +1602,32 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
+	/*
+	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
+	 * which includes some code which overrides all temperature
+	 * trip points to 16C if the INTIN2 input of the I/O APIC
+	 * is enabled.  This input is incorrectly designated the
+	 * ISA IRQ 0 via an interrupt source override even though
+	 * it is wired to the output of the master 8259A and INTIN0
+	 * is not connected at all.  Abandon any attempts to route
+	 * IRQ 0 through the I/O APIC therefore.
+	 */
+	{
+	 .callback = dmi_disable_irq0_through_ioapic,
+	 .ident = "HP NX6125 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+		     },
+	 },
+	{
+	 .callback = dmi_disable_irq0_through_ioapic,
+	 .ident = "HP NX6325 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+		     },
+	 },
 	{}
 };
 

+ 1 - 1
arch/x86/kernel/aperture_64.c

@@ -328,7 +328,7 @@ void __init early_gart_iommu_check(void)
 				    E820_RAM)) {
 			/* reserve it, so we can reuse it in second kernel */
 			printk(KERN_INFO "update e820 for GART\n");
-			add_memory_region(aper_base, aper_size, E820_RESERVED);
+			e820_add_region(aper_base, aper_size, E820_RESERVED);
 			update_e820();
 		}
 	}

+ 10 - 2
arch/x86/kernel/apic_32.c

@@ -79,6 +79,11 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
  */
 int apic_verbosity;
 
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
 static unsigned int calibration_result;
 
 static int lapic_next_event(unsigned long delta,
@@ -1202,7 +1207,7 @@ void __init init_apic_mappings(void)
 
 		for (i = 0; i < nr_ioapics; i++) {
 			if (smp_found_config) {
-				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+				ioapic_phys = mp_ioapics[i].mp_apicaddr;
 				if (!ioapic_phys) {
 					printk(KERN_ERR
 					       "WARNING: bogus zero IO-APIC "
@@ -1517,6 +1522,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/*
 	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
 	 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1524,7 +1532,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
 	 *       - Ashok Raj <ashok.raj@intel.com>
 	 */
-	if (num_processors > 8) {
+	if (max_physical_apicid >= 8) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			if (!APIC_XAPIC(version)) {

+ 6 - 0
arch/x86/kernel/apic_64.c

@@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  */
 int apic_verbosity;
 
+/* Have we found an MP table */
+int smp_found_config;
+
 static struct resource lapic_resource = {
 	.name = "Local APIC",
 	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -1068,6 +1071,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 	}
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/* are we being called early in kernel startup? */
 	if (x86_cpu_to_apicid_early_ptr) {
 		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;

+ 26 - 12
arch/x86/kernel/cpu/mtrr/generic.c

@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
 static unsigned long smp_changes_mask;
 static struct mtrr_state mtrr_state = {};
 static int mtrr_state_set;
-static u64 tom2;
+u64 mtrr_tom2;
 
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 		}
 	}
 
-	if (tom2) {
-		if (start >= (1ULL<<32) && (end < tom2))
+	if (mtrr_tom2) {
+		if (start >= (1ULL<<32) && (end < mtrr_tom2))
 			return MTRR_TYPE_WRBACK;
 	}
 
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
 	rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
 }
 
+/*  fill the MSR pair relating to a var range  */
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+	struct mtrr_var_range *vr;
+
+	vr = mtrr_state.var_ranges;
+
+	vr[index].base_lo = base_lo;
+	vr[index].base_hi = base_hi;
+	vr[index].mask_lo = mask_lo;
+	vr[index].mask_hi = mask_hi;
+}
+
 static void
 get_fixed_ranges(mtrr_type * frs)
 {
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
 
 	if (amd_special_default_mtrr()) {
-		unsigned lo, hi;
+		unsigned low, high;
 		/* TOP_MEM2 */
-		rdmsr(MSR_K8_TOP_MEM2, lo, hi);
-		tom2 = hi;
-		tom2 <<= 32;
-		tom2 |= lo;
-		tom2 &= 0xffffff8000000ULL;
+		rdmsr(MSR_K8_TOP_MEM2, low, high);
+		mtrr_tom2 = high;
+		mtrr_tom2 <<= 32;
+		mtrr_tom2 |= low;
+		mtrr_tom2 &= 0xffffff800000ULL;
 	}
 	if (mtrr_show) {
 		int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
 			else
 				printk(KERN_INFO "MTRR %u disabled\n", i);
 		}
-		if (tom2) {
+		if (mtrr_tom2) {
 			printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-					  tom2, tom2>>20);
+					  mtrr_tom2, mtrr_tom2>>20);
 		}
 	}
 	mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 
 	if (lo != msrwords[0] || hi != msrwords[1]) {
 		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		    boot_cpu_data.x86 == 15 &&
+		    (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
 		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
 			k8_enable_fixed_iorrs();
 		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);

+ 881 - 20
arch/x86/kernel/cpu/mtrr/main.c

@@ -37,6 +37,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/mtrr.h>
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
+/*
+ * Number of slots in every res_range / range_state table in this file.
+ * should be related to MTRR_VAR_RANGES nums
+ */
+#define RANGE_NUM 256
+
+/*
+ * One inclusive interval [start, end].  The helpers below treat an entry
+ * whose end is 0 as an unused slot.  Callers in this file fill it with
+ * page frame numbers.
+ */
+struct res_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+/*
+ * Store [start, end] in slot @nr_range of @range.
+ * Returns the new entry count, or @nr_range unchanged when the table
+ * already holds RANGE_NUM entries.
+ */
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	struct res_range *slot;
+
+	/* table is full: the request is dropped */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
+
+	slot = &range[nr_range];
+	slot->start = start;
+	slot->end = end;
+
+	return nr_range + 1;
+}
+
+/*
+ * Add [start, end] to @range.  If it overlaps or directly touches an
+ * existing non-empty entry, that entry is widened to the union of both and
+ * the count is returned unchanged; otherwise the interval is appended with
+ * add_range().  Only the first matching entry is merged.
+ */
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_range; idx++) {
+		struct res_range *cur = &range[idx];
+		unsigned long overlap_lo, overlap_hi;
+
+		/* unused slot, nothing to merge with */
+		if (cur->end == 0)
+			continue;
+
+		overlap_lo = max(cur->start, start);
+		overlap_hi = min(cur->end, end);
+
+		/* overlapping or adjacent: grow the existing entry */
+		if (overlap_lo <= overlap_hi + 1) {
+			cur->start = min(cur->start, start);
+			cur->end = max(cur->end, end);
+			return nr_range;
+		}
+	}
+
+	/* no neighbour found, take a new slot */
+	return add_range(range, nr_range, start, end);
+}
+
+/*
+ * Remove the inclusive interval [start, end] from every non-empty entry of
+ * @range (all RANGE_NUM slots are scanned).
+ *
+ * Entries fully covered are zeroed, entries overlapped at one side are
+ * shortened, and an entry that strictly contains the interval is split in
+ * two, the upper part going into the first free slot.  Emptied slots are
+ * left in place, so the table may contain gaps afterwards.
+ */
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+	int i, j;
+
+	for (j = 0; j < RANGE_NUM; j++) {
+		/* unused slot */
+		if (!range[j].end)
+			continue;
+
+		/* entry lies completely inside [start, end]: drop it */
+		if (start <= range[j].start && end >= range[j].end) {
+			range[j].start = 0;
+			range[j].end = 0;
+			continue;
+		}
+
+		/* interval covers the low part of the entry: trim the head */
+		if (start <= range[j].start && end < range[j].end &&
+		    range[j].start < end + 1) {
+			range[j].start = end + 1;
+			continue;
+		}
+
+
+		/* interval covers the high part of the entry: trim the tail */
+		if (start > range[j].start && end >= range[j].end &&
+		    range[j].end > start - 1) {
+			range[j].end = start - 1;
+			continue;
+		}
+
+		/* interval is strictly inside the entry: split it */
+		if (start > range[j].start && end < range[j].end) {
+			/* find a spare slot for the upper piece */
+			for (i = 0; i < RANGE_NUM; i++) {
+				if (range[i].end == 0)
+					break;
+			}
+			if (i < RANGE_NUM) {
+				range[i].end = range[j].end;
+				range[i].start = end + 1;
+			} else {
+				/* no free slot: the upper piece is lost */
+				printk(KERN_ERR "run of slot in ranges\n");
+			}
+			/* the lower piece stays in the original slot */
+			range[j].end = start - 1;
+			continue;
+		}
+	}
+}
+
+/*
+ * sort() comparison callback: orders res_range entries by ascending start.
+ *
+ * The starts are compared explicitly rather than subtracted: a difference
+ * of two unsigned long values does not fit in the int return value and
+ * could come out with the wrong sign, or as 0, for starts far apart.
+ */
+static int __init cmp_range(const void *x1, const void *x2)
+{
+	const struct res_range *r1 = x1;
+	const struct res_range *r2 = x2;
+
+	if (r1->start < r2->start)
+		return -1;
+	if (r1->start > r2->start)
+		return 1;
+	return 0;
+}
+
+/*
+ * Snapshot of one variable MTRR: base and size in page frames plus the
+ * memory type.  A size_pfn of 0 is counted as an unused register by the
+ * callers in this file.
+ */
+struct var_mtrr_range_state {
+	unsigned long base_pfn;
+	unsigned long size_pfn;
+	mtrr_type type;
+};
+
+/* Working copy of the variable MTRR layout, indexed by register number. */
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+/* Non-zero enables the KERN_DEBUG dumps in the helpers below. */
+static int __initdata debug_print;
+
+/*
+ * Build in @range the sorted list of pfn intervals that range_state[]
+ * describes as write-back.
+ *
+ * All WB entries are merged in first, then every UC entry and the optional
+ * extra interval [extra_remove_base, extra_remove_base + extra_remove_size)
+ * are subtracted.  @nr_range is the number of entries already present in
+ * @range on entry.  Returns the number of entries left; they are packed at
+ * the front of @range, sorted by start, and the remaining slots are zeroed.
+ */
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
+{
+	unsigned long i, base, size;
+	mtrr_type type;
+
+	/* collect the WB ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		nr_range = add_range_with_merge(range, nr_range, base,
+						base + size - 1);
+	}
+	if (debug_print) {
+		printk(KERN_DEBUG "After WB checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_UNCACHABLE)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			continue;
+		base = range_state[i].base_pfn;
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,
+				 extra_remove_base + extra_remove_size  - 1);
+
+	/*
+	 * subtract_range() may leave emptied slots between live ones and may
+	 * park a split-off piece in any free slot.  Pack the live entries at
+	 * the front while counting them; otherwise sorting only the first
+	 * nr_range slots and clearing the rest would drop live ranges.
+	 */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		if (i != (unsigned long)nr_range)
+			range[nr_range] = range[i];
+		nr_range++;
+	}
+	if  (debug_print) {
+		printk(KERN_DEBUG "After UC checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	if  (debug_print) {
+		printk(KERN_DEBUG "After sorting\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* clear the slots that are not used (including stale copies) */
+	for (i = nr_range; i < RANGE_NUM; i++)
+		memset(&range[i], 0, sizeof(range[i]));
+
+	return nr_range;
+}
+
+/*
+ * Range table that mtrr_cleanup() and mtrr_trim_uncached_memory() pass to
+ * x86_get_mtrr_mem_range().
+ */
+static struct res_range __initdata range[RANGE_NUM];
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+/* Add up the lengths (end + 1 - start) of the first @nr_range entries. */
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+	unsigned long total = 0;
+	int left = nr_range;
+
+	while (left-- > 0) {
+		total += range->end + 1 - range->start;
+		range++;
+	}
+
+	return total;
+}
+
+/*
+ * mtrr_cleanup() only runs when this is >= 1.  The value -1 is never
+ * overridden by the command-line handlers below.
+ */
+static int enable_mtrr_cleanup __initdata =
+	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+/* "disable_mtrr_cleanup": turn the cleanup off unless the value is -1. */
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+/*
+ * "enable_mtrr_cleanup": turn the cleanup on unless the value is -1.
+ * @str is not used.
+ */
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 1;
+	return 0;
+}
+/* was registered as "enble_mtrr_cleanup", so the real option was ignored */
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+/*
+ * Cursor used while converting a range list into variable MTRRs.
+ * All sizes are in KiB (the code in this file converts pfns with
+ * << (PAGE_SHIFT - 10) and byte sizes with >> 10).
+ */
+struct var_mtrr_state {
+	unsigned long	range_startk;	/* start of the pending range */
+	unsigned long	range_sizek;	/* size of the pending range, 0 if none */
+	unsigned long	chunk_sizek;	/* chunk size used for hole handling */
+	unsigned long	gran_sizek;	/* alignment granularity */
+	unsigned int	reg;		/* next variable MTRR index to fill */
+};
+
+/*
+ * Encode one variable range (base and size in KiB, memory @type) into
+ * base/mask register halves and hand them to fill_mtrr_var_range().
+ * A zero @sizek stores an all-zero entry for @reg.
+ */
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type, unsigned int address_bits)
+{
+	u64 size_bytes, base, mask;
+
+	if (sizek == 0) {
+		fill_mtrr_var_range(reg, 0, 0, 0, 0);
+		return;
+	}
+
+	size_bytes = (u64)sizek << 10;
+
+	/* keep the address bits above the size, limited to address_bits */
+	mask = ((1ULL << address_bits) - 1) & ~(size_bytes - 1);
+	/* bit 11 -- presumably the MTRR "valid" bit; confirm against the SDM */
+	mask |= 0x800;
+
+	/* the memory type goes into the low bits of the base value */
+	base = ((u64)basek << 10) | type;
+
+	fill_mtrr_var_range(reg, (u32)base, (u32)(base >> 32),
+			    (u32)mask, (u32)(mask >> 32));
+}
+
+/* Record a range (base and size in KiB) in range_state[@reg] as pfns. */
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type)
+{
+	struct var_mtrr_range_state *st = &range_state[reg];
+
+	st->base_pfn = basek >> (PAGE_SHIFT - 10);
+	st->size_pfn = sizek >> (PAGE_SHIFT - 10);
+	st->type = type;
+}
+
+/*
+ * Pass every range_state[] entry below num_var_ranges to set_var_mtrr(),
+ * converting the pfn values back to KiB.
+ */
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		const struct var_mtrr_range_state *st = &range_state[reg];
+		unsigned char type = st->type;
+
+		set_var_mtrr(reg, st->base_pfn << (PAGE_SHIFT - 10),
+			     st->size_pfn << (PAGE_SHIFT - 10), type,
+			     address_bits);
+	}
+}
+
+/*
+ * Cut [range_startk, range_startk + range_sizek) (values in KiB) into
+ * power-of-two sized pieces whose size does not exceed the alignment of
+ * their start, and record each piece with save_var_mtrr(), beginning at
+ * register @reg.
+ *
+ * Returns the index of the next unused register.  Conversion stops early
+ * once num_var_ranges registers are used, which may leave part of the
+ * range uncovered.
+ */
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+	      unsigned long range_sizek, unsigned char type)
+{
+	if (!range_sizek || (reg >= num_var_ranges))
+		return reg;
+
+	while (range_sizek) {
+		unsigned long max_align, align;
+		unsigned long sizek;
+
+		/*
+		 * Compute the maximum size I can make a range.
+		 * NOTE(review): ffs()/fls() presumably take an int, so bits
+		 * above bit 31 of a 64-bit value would be ignored -- confirm.
+		 */
+		if (range_startk)
+			max_align = ffs(range_startk) - 1;
+		else
+			max_align = 32;
+		align = fls(range_sizek) - 1;
+		if (align > max_align)
+			align = max_align;
+
+		/*
+		 * Shift an unsigned long, not an int: "1 << 31" overflows int
+		 * and would be sign-extended when stored in a 64-bit sizek.
+		 */
+		sizek = 1UL << align;
+		if (debug_print)
+			printk(KERN_DEBUG "Setting variable MTRR %d, "
+				"base: %ldMB, range: %ldMB, type %s\n",
+				reg, range_startk >> 10, sizek >> 10,
+				(type == MTRR_TYPE_UNCACHABLE)?"UC":
+				    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
+				);
+		save_var_mtrr(reg++, range_startk, sizek, type);
+		range_startk += sizek;
+		range_sizek -= sizek;
+		/* out of registers */
+		if (reg >= num_var_ranges)
+			break;
+	}
+	return reg;
+}
+
+/*
+ * Emit the pending range held in @state (range_startk/range_sizek) as
+ * variable MTRRs via range_to_mtrr(), advancing state->reg.
+ *
+ * @basek/@sizek describe the next range that will follow the pending one
+ * (both 0 when the pending range is the last one).  When the pending range
+ * is not a multiple of the chunk size, the tail may be rounded up to a
+ * whole chunk of WB with a UC "hole" entry on top of it.
+ *
+ * Returns second_sizek: the amount (KiB) at the start of the next range
+ * that the rounded-up chunk already covers, or 0.  The caller subtracts it
+ * from the next range.
+ */
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+			unsigned long sizek)
+{
+	unsigned long hole_basek, hole_sizek;
+	unsigned long second_basek, second_sizek;
+	unsigned long range0_basek, range0_sizek;
+	unsigned long range_basek, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+
+	hole_basek = 0;
+	hole_sizek = 0;
+	second_basek = 0;
+	second_sizek = 0;
+	chunk_sizek = state->chunk_sizek;
+	gran_sizek = state->gran_sizek;
+
+	/* align with gran size, prevent small block used up MTRRs */
+	range_basek = ALIGN(state->range_startk, gran_sizek);
+	/* aligned start is already past the start of the next range */
+	if ((range_basek > basek) && basek)
+		return second_sizek;
+	state->range_sizek -= (range_basek - state->range_startk);
+	range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+	/* round the size down to a gran_sizek multiple; give up if nothing is left */
+	while (range_sizek > state->range_sizek) {
+		range_sizek -= gran_sizek;
+		if (!range_sizek)
+			return 0;
+	}
+	state->range_sizek = range_sizek;
+
+	/* try to append some small hole */
+	range0_basek = state->range_startk;
+	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+	/* already a multiple of the chunk size: no hole handling needed */
+	if (range0_sizek == state->range_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + state->range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK);
+		return 0;
+	}
+
+	/* range0 is the part made of whole chunks, below the last partial one */
+	range0_sizek -= chunk_sizek;
+	if (range0_sizek && sizek) {
+	    /* shrink range0 until it no longer reaches past the next range */
+	    while (range0_basek + range0_sizek > (basek + sizek)) {
+		range0_sizek -= chunk_sizek;
+		if (!range0_sizek)
+			break;
+	    }
+	}
+
+	if (range0_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + range0_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				range0_sizek, MTRR_TYPE_WRBACK);
+
+	}
+
+	/* candidate: one full chunk right after range0 */
+	range_basek = range0_basek + range0_sizek;
+	range_sizek = chunk_sizek;
+
+	/* the chunk ends inside the next range: part of that range is covered */
+	if (range_basek + range_sizek > basek &&
+	    range_basek + range_sizek <= (basek + sizek)) {
+		/* one hole */
+		second_basek = basek;
+		second_sizek = range_basek + range_sizek - basek;
+	}
+
+	/* if last piece, only could one hole near end */
+	if ((second_basek || !basek) &&
+	    range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+	    (chunk_sizek >> 1)) {
+		/*
+		 * one hole in middle or at end.
+		 * NOTE(review): the original comment said "second_sizek is 0"
+		 * for both cases; from the code the middle case is the one
+		 * with second_basek != 0 (second_sizek set above) and the
+		 * end case is !basek with second_sizek == 0 -- confirm.
+		 */
+		hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+				 - second_sizek;
+		hole_basek = range_basek + range_sizek - hole_sizek
+				 - second_sizek;
+	} else {
+		/* fallback for big hole, or several holes */
+		range_sizek = state->range_sizek - range0_sizek;
+		second_basek = 0;
+		second_sizek = 0;
+	}
+
+	if (debug_print)
+		printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+			 (range_basek + range_sizek)<<10);
+	state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+					 MTRR_TYPE_WRBACK);
+	/* carve the hole out of the chunk with a UC entry */
+	if (hole_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+				 hole_basek<<10, (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+						 MTRR_TYPE_UNCACHABLE);
+
+	}
+
+	return second_sizek;
+}
+
+/*
+ * Feed one pfn range into @state.  A range that starts at or below 1 MiB,
+ * or that directly continues the pending range, only extends the pending
+ * range.  Otherwise the pending range is written out with
+ * range_to_mtrr_with_hole() and the new range becomes the pending one.
+ */
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+		   unsigned long size_pfn)
+{
+	unsigned long basek = base_pfn << (PAGE_SHIFT - 10);
+	unsigned long sizek = size_pfn << (PAGE_SHIFT - 10);
+	unsigned long second_sizek = 0;
+
+	/* every variable register is already in use */
+	if (state->reg >= num_var_ranges)
+		return;
+
+	/* mergeable with the pending range: just stretch its end */
+	if (basek <= 1024 ||
+	    state->range_startk + state->range_sizek == basek) {
+		state->range_sizek = basek + sizek - state->range_startk;
+		return;
+	}
+
+	/* flush the pending range into MTRR entries */
+	if (state->range_sizek)
+		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+	/* start a new pending range, minus what was already covered */
+	state->range_startk = basek + second_sizek;
+	state->range_sizek  = sizek - second_sizek;
+}
+
+/* minimum size of an mtrr block that can take a hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+/* "mtrr_chunk_size=": store the memparse()d value; -EINVAL if no argument. */
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+	char *rest = p;
+
+	if (p == NULL)
+		return -EINVAL;
+
+	mtrr_chunk_size = memparse(p, &rest);
+	return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granularity of an mtrr block */
+static u64 mtrr_gran_size __initdata;
+
+/* "mtrr_gran_size=": store the memparse()d value; -EINVAL if no argument. */
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+	char *rest = p;
+
+	if (p == NULL)
+		return -EINVAL;
+
+	mtrr_gran_size = memparse(p, &rest);
+	return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+/* Number of variable MTRRs mtrr_cleanup() tries to leave unused. */
+static int nr_mtrr_spare_reg __initdata =
+				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+/* "mtrr_spare_reg_nr=": a missing argument keeps the default. */
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+	if (!arg)
+		return 0;
+
+	nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+/*
+ * Rebuild range_state[] from the first @nr_range entries of @range using
+ * the given chunk and granularity sizes (bytes).
+ * Returns the number of variable registers the new layout uses; the
+ * remaining entries up to num_var_ranges are cleared.
+ */
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+		    u64 chunk_size, u64 gran_size)
+{
+	struct var_mtrr_state var_state = {
+		.range_startk	= 0,
+		.range_sizek	= 0,
+		.chunk_sizek	= chunk_size >> 10,
+		.gran_sizek	= gran_size >> 10,
+		.reg		= 0,
+	};
+	unsigned int reg;
+	int idx, used;
+
+	memset(range_state, 0, sizeof(range_state));
+
+	/* feed every range through the merge/flush state machine */
+	for (idx = 0; idx < nr_range; idx++) {
+		const struct res_range *cur = &range[idx];
+
+		set_var_mtrr_range(&var_state, cur->start,
+				   cur->end - cur->start + 1);
+	}
+
+	/* flush whatever is still pending */
+	if (var_state.range_sizek)
+		range_to_mtrr_with_hole(&var_state, 0, 0);
+
+	used = var_state.reg;
+
+	/* wipe the registers the new layout does not need */
+	for (reg = var_state.reg; reg < num_var_ranges; reg++)
+		save_var_mtrr(reg, 0, 0, 0);
+
+	return used;
+}
+
+/* Outcome of trying one (gran_size, chunk_size) combination. */
+struct mtrr_cleanup_result {
+	unsigned long gran_sizek;	/* granularity tried, in KiB */
+	unsigned long chunk_sizek;	/* chunk size tried, in KiB */
+	unsigned long lose_cover_sizek;	/* coverage difference, in KiB */
+	unsigned int num_reg;		/* variable registers the layout uses */
+	int bad;			/* set when the layout must not be used */
+};
+
+/*
+ * gran_size: 1M, 2M, ..., 2G (12 values)
+ * chunk size: gran_size, ..., 4G (13 values for 1M down to 2 for 2G)
+ * so we need 13 + 12 + ... + 2 = (2+13)*6 = 90 slots
+ */
+#define NUM_RESULT	90
+/* shift that converts a pfn count into KiB */
+#define PSHIFT		(PAGE_SHIFT - 10)
+
+/* one entry per combination tried by mtrr_cleanup() */
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+/* ranges recomputed from a candidate layout, compared against range[] */
+static struct res_range __initdata range_new[RANGE_NUM];
+/* smallest coverage loss (pfns) seen per register count; starts all-ones */
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+/*
+ * Try to replace the current variable MTRR layout with an equivalent one
+ * that needs fewer registers.
+ *
+ * Only runs on Intel-type MTRRs with the cleanup enabled, a default type
+ * of UC, at least one UC entry and nothing but WB/UC entries.  If both
+ * mtrr_chunk_size and mtrr_gran_size are set they are tried first;
+ * otherwise (or if they give a bad result) every gran/chunk combination is
+ * evaluated and one that loses no coverage within the register budget is
+ * picked.
+ *
+ * @address_bits: physical address width used to build the mask registers.
+ *
+ * Returns 1 when a new layout was stored through set_var_mtrr_all(), in
+ * which case the caller is expected to program it; 0 when nothing changed.
+ */
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	unsigned long extra_remove_base, extra_remove_size;
+	unsigned long i, base, size, def, dummy;
+	mtrr_type type;
+	int nr_range, nr_range_new;
+	u64 chunk_size, gran_size;
+	unsigned long range_sums, range_sums_new;
+	int index_good;
+	int num_reg_good;
+
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+
+	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		size = range_state[i].size_pfn;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		/* empty registers are counted in the extra slot */
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* check if we got UC entries */
+	if (!num[MTRR_TYPE_UNCACHABLE])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	/*
+	 * Always give extra_remove_base a value: it is passed by value to
+	 * x86_get_mtrr_mem_range() even when mtrr_tom2 is 0, and reading an
+	 * uninitialized variable must be avoided.
+	 */
+	extra_remove_base = 1 << (32 - PAGE_SHIFT);
+	extra_remove_size = 0;
+	if (mtrr_tom2)
+		extra_remove_size =
+			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+					  extra_remove_size);
+	range_sums = sum_ranges(range, nr_range);
+	printk(KERN_INFO "total RAM coverred: %ldM\n",
+	       range_sums >> (20 - PAGE_SHIFT));
+
+	/* user supplied both sizes: try exactly that combination first */
+	if (mtrr_chunk_size && mtrr_gran_size) {
+		int num_reg;
+
+		debug_print = 1;
+		/* convert ranges to var ranges state */
+		num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
+					      mtrr_gran_size);
+
+		/* we got new setting in range_state, check it */
+		memset(range_new, 0, sizeof(range_new));
+		nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+						      extra_remove_base,
+						      extra_remove_size);
+		range_sums_new = sum_ranges(range_new, nr_range_new);
+
+		i = 0;
+		result[i].chunk_sizek = mtrr_chunk_size >> 10;
+		result[i].gran_sizek = mtrr_gran_size >> 10;
+		result[i].num_reg = num_reg;
+		/* covering more than before is flagged as bad */
+		if (range_sums < range_sums_new) {
+			result[i].lose_cover_sizek =
+				(range_sums_new - range_sums) << PSHIFT;
+			result[i].bad = 1;
+		} else
+			result[i].lose_cover_sizek =
+				(range_sums - range_sums_new) << PSHIFT;
+
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+			 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
+			 result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
+			 result[i].num_reg, result[i].bad?"-":"",
+			 result[i].lose_cover_sizek >> 10);
+		if (!result[i].bad) {
+			set_var_mtrr_all(address_bits);
+			return 1;
+		}
+		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+		       "will find optimal one\n");
+		debug_print = 0;
+		memset(result, 0, sizeof(result[0]));
+	}
+
+	/* evaluate every gran/chunk combination */
+	i = 0;
+	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+	memset(result, 0, sizeof(result));
+	for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+		for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+		     chunk_size <<= 1) {
+			int num_reg;
+
+			if (debug_print)
+				printk(KERN_INFO
+			       "\ngran_size: %lldM   chunk_size_size: %lldM\n",
+				       gran_size >> 20, chunk_size >> 20);
+			if (i >= NUM_RESULT)
+				continue;
+
+			/* convert ranges to var ranges state */
+			num_reg = x86_setup_var_mtrrs(range, nr_range,
+							 chunk_size, gran_size);
+
+			/* we got new setting in range_state, check it */
+			memset(range_new, 0, sizeof(range_new));
+			nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+					 extra_remove_base, extra_remove_size);
+			range_sums_new = sum_ranges(range_new, nr_range_new);
+
+			result[i].chunk_sizek = chunk_size >> 10;
+			result[i].gran_sizek = gran_size >> 10;
+			result[i].num_reg = num_reg;
+			if (range_sums < range_sums_new) {
+				result[i].lose_cover_sizek =
+					(range_sums_new - range_sums) << PSHIFT;
+				result[i].bad = 1;
+			} else
+				result[i].lose_cover_sizek =
+					(range_sums - range_sums_new) << PSHIFT;
+
+			/* double check it: same total must mean same ranges */
+			if (!result[i].bad && !result[i].lose_cover_sizek) {
+				if (nr_range_new != nr_range ||
+					memcmp(range, range_new, sizeof(range)))
+						result[i].bad = 1;
+			}
+
+			/* remember the best loss seen for this register count */
+			if (!result[i].bad && (range_sums - range_sums_new <
+					       min_loss_pfn[num_reg])) {
+				min_loss_pfn[num_reg] =
+					range_sums - range_sums_new;
+			}
+			i++;
+		}
+	}
+
+	/* print out all */
+	for (i = 0; i < NUM_RESULT; i++) {
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+		       result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
+		       result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
+		       result[i].num_reg, result[i].bad?"-":"",
+		       result[i].lose_cover_sizek >> 10);
+	}
+
+	/*
+	 * try to find the optimal index: the largest register count within
+	 * the budget (num_var_ranges minus the spare registers) for which a
+	 * lossless layout was found
+	 */
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+	num_reg_good = -1;
+	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+		if (!min_loss_pfn[i]) {
+			num_reg_good = i;
+			break;
+		}
+	}
+
+	/* first lossless, non-bad result using that many registers */
+	index_good = -1;
+	if (num_reg_good != -1) {
+		for (i = 0; i < NUM_RESULT; i++) {
+			if (!result[i].bad &&
+			    result[i].num_reg == num_reg_good &&
+			    !result[i].lose_cover_sizek) {
+				index_good = i;
+				break;
+			}
+		}
+	}
+
+	if (index_good != -1) {
+		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+		i = index_good;
+		printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
+				result[i].gran_sizek >> 10,
+				result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
+				result[i].num_reg,
+				result[i].lose_cover_sizek >> 10);
+		/* convert ranges to var ranges state */
+		chunk_size = result[i].chunk_sizek;
+		chunk_size <<= 10;
+		gran_size = result[i].gran_sizek;
+		gran_size <<= 10;
+		debug_print = 1;
+		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		set_var_mtrr_all(address_bits);
+		return 1;
+	}
+
+	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+	return 0;
+}
+#else
+/* Stub used without CONFIG_MTRR_SANITIZER: never changes anything. */
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Set to 1 by mtrr_bp_init() when mtrr_cleanup() returned non-zero; when
+ * set, the WARN_ON() in mtrr_trim_uncached_memory() and the
+ * mtrr_state_warn() call in mtrr_init_finialize() are skipped.
+ */
+static int __initdata changed_by_mtrr_cleanup;
+
 static int disable_mtrr_trim;
 
 static int __init disable_mtrr_trim_setup(char *str)
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void)
 	return 0;
 }
 
+/*
+ * Convert the pfn interval [start_pfn, limit_pfn) into a byte start and
+ * size and pass it to e820_update_range() with E820_RAM and E820_RESERVED
+ * as the type arguments.  Returns e820_update_range()'s result.
+ */
+static u64 __init real_trim_memory(unsigned long start_pfn,
+				   unsigned long limit_pfn)
+{
+	u64 first_byte = (u64)start_pfn << PAGE_SHIFT;
+	u64 limit_byte = (u64)limit_pfn << PAGE_SHIFT;
+
+	return e820_update_range(first_byte, limit_byte - first_byte,
+				 E820_RAM, E820_RESERVED);
+}
 /**
  * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
  * @end_pfn: ending page frame number
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 {
 	unsigned long i, base, size, highest_pfn = 0, def, dummy;
 	mtrr_type type;
-	u64 trim_start, trim_size;
+	int nr_range;
+	u64 total_trim_size;
 
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
 	/*
 	 * Make sure we only trim uncachable memory on machines that
 	 * support the Intel MTRR architecture:
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
 
-	if (amd_special_default_mtrr())
-		return 0;
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
 
 	/* Find highest cached pfn */
 	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
+		type = range_state[i].type;
 		if (type != MTRR_TYPE_WRBACK)
 			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
 		if (highest_pfn < base + size)
 			highest_pfn = base + size;
 	}
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 		return 0;
 	}
 
-	if (highest_pfn < end_pfn) {
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* no entry for WB? */
+	if (!num[MTRR_TYPE_WRBACK])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	nr_range = 0;
+	if (mtrr_tom2) {
+		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+		if (highest_pfn < range[nr_range].end + 1)
+			highest_pfn = range[nr_range].end + 1;
+		nr_range++;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+	total_trim_size = 0;
+	/* check the head */
+	if (range[0].start)
+		total_trim_size += real_trim_memory(0, range[0].start);
+	/* check the holes */
+	for (i = 0; i < nr_range - 1; i++) {
+		if (range[i].end + 1 < range[i+1].start)
+			total_trim_size += real_trim_memory(range[i].end + 1,
+							    range[i+1].start);
+	}
+	/* check the top */
+	i = nr_range - 1;
+	if (range[i].end + 1 < end_pfn)
+		total_trim_size += real_trim_memory(range[i].end + 1,
+							 end_pfn);
+
+	if (total_trim_size) {
 		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-			" all of memory, losing %luMB of RAM.\n",
-			(end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
+			" all of memory, losing %lluMB of RAM.\n",
+			total_trim_size >> 20);
 
-		WARN_ON(1);
+		if (!changed_by_mtrr_cleanup)
+			WARN_ON(1);
 
 		printk(KERN_INFO "update e820 for mtrr\n");
-		trim_start = highest_pfn;
-		trim_start <<= PAGE_SHIFT;
-		trim_size = end_pfn;
-		trim_size <<= PAGE_SHIFT;
-		trim_size -= trim_start;
-		update_memory_range(trim_start, trim_size, E820_RAM,
-					E820_RESERVED);
 		update_e820();
+
 		return 1;
 	}
 
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
  */
 void __init mtrr_bp_init(void)
 {
+	u32 phys_addr;
 	init_ifs();
 
+	phys_addr = 32;
+
 	if (cpu_has_mtrr) {
 		mtrr_if = &generic_mtrr_ops;
 		size_or_mask = 0xff000000;	/* 36 bits */
 		size_and_mask = 0x00f00000;
+		phys_addr = 36;
 
 		/* This is an AMD specific MSR, but we assume(hope?) that
 		   Intel will implement it to when they extend the address
 		   bus of the Xeon. */
 		if (cpuid_eax(0x80000000) >= 0x80000008) {
-			u32 phys_addr;
 			phys_addr = cpuid_eax(0x80000008) & 0xff;
 			/* CPUID workaround for Intel 0F33/0F34 CPU */
 			if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void)
 			   don't support PAE */
 			size_or_mask = 0xfff00000;	/* 32 bits */
 			size_and_mask = 0;
+			phys_addr = 32;
 		}
 	} else {
 		switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void)
 	if (mtrr_if) {
 		set_num_var_ranges();
 		init_table();
-		if (use_intel())
+		if (use_intel()) {
 			get_mtrr_state();
+
+			if (mtrr_cleanup(phys_addr)) {
+				changed_by_mtrr_cleanup = 1;
+				mtrr_if->set_all();
+			}
+
+		}
 	}
 }
 
@@ -829,9 +1689,10 @@ static int __init mtrr_init_finialize(void)
 {
 	if (!mtrr_if)
 		return 0;
-	if (use_intel())
-		mtrr_state_warn();
-	else {
+	if (use_intel()) {
+		if (!changed_by_mtrr_cleanup)
+			mtrr_state_warn();
+	} else {
 		/* The CPUs haven't MTRR and seem to not support SMP. They have
 		 * specific drivers, we use a tricky method to support
 		 * suspend/resume for them.

+ 3 - 0
arch/x86/kernel/cpu/mtrr/mtrr.h

@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
 void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
 void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
 
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
 void get_mtrr_state(void);
 
 extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
 #define use_intel()	(mtrr_if && mtrr_if->use_intel_if == 1)
 
 extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
 
 void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 687 - 516
arch/x86/kernel/e820.c


+ 0 - 775
arch/x86/kernel/e820_32.c

@@ -1,775 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-struct e820map e820;
-struct change_member {
-	struct e820entry *pbios; /* pointer to original bios entry */
-	unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-
-static struct resource system_rom_resource = {
-	.name	= "System ROM",
-	.start	= 0xf0000,
-	.end	= 0xfffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
-	.name	= "Extension ROM",
-	.start	= 0xe0000,
-	.end	= 0xeffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
-	.name 	= "Adapter ROM",
-	.start	= 0xc8000,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
-	.name 	= "Video ROM",
-	.start	= 0xc0000,
-	.end	= 0xc7fff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
-	const unsigned short * const ptr = (const unsigned short *)rom;
-	unsigned short sig;
-
-	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
-	unsigned char sum, c;
-
-	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
-		sum += c;
-	return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
-	const unsigned char *rom;
-	unsigned long start, length, upper;
-	unsigned char c;
-	int i;
-
-	/* video rom */
-	upper = adapter_rom_resources[0].start;
-	for (start = video_rom_resource.start; start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		video_rom_resource.start = start;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* if checksum okay, trust length byte */
-		if (length && romchecksum(rom, length))
-			video_rom_resource.end = start + length - 1;
-
-		request_resource(&iomem_resource, &video_rom_resource);
-		break;
-	}
-
-	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-	if (start < upper)
-		start = upper;
-
-	/* system rom */
-	request_resource(&iomem_resource, &system_rom_resource);
-	upper = system_rom_resource.start;
-
-	/* check for extension rom (ignore length byte!) */
-	rom = isa_bus_to_virt(extension_rom_resource.start);
-	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
-		if (romchecksum(rom, length)) {
-			request_resource(&iomem_resource, &extension_rom_resource);
-			upper = extension_rom_resource.start;
-		}
-	}
-
-	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* but accept any length that fits if checksum okay */
-		if (!length || start + length > upper || !romchecksum(rom, length))
-			continue;
-
-		adapter_rom_resources[i].start = start;
-		adapter_rom_resources[i].end = start + length - 1;
-		request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-		start = adapter_rom_resources[i++].end & ~2047UL;
-	}
-}
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
-		struct resource *data_resource,
-		struct resource *bss_resource)
-{
-	int i;
-
-	probe_roms();
-	for (i = 0; i < e820.nr_map; i++) {
-		struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
-		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
-			continue;
-#endif
-		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-		switch (e820.map[i].type) {
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		if (request_resource(&iomem_resource, res)) {
-			kfree(res);
-			continue;
-		}
-		if (e820.map[i].type == E820_RAM) {
-			/*
-			 *  We don't know which RAM region contains kernel data,
-			 *  so we try it repeatedly and let the resource manager
-			 *  test it.
-			 */
-			request_resource(res, code_resource);
-			request_resource(res, data_resource);
-			request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
-			if (crashk_res.start != crashk_res.end)
-				request_resource(res, &crashk_res);
-#endif
-		}
-	}
-}
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-	int i;
-	unsigned long pfn;
-
-	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
-	for (i = 1; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (pfn < PFN_UP(ei->addr))
-			register_nosave_region(pfn, PFN_UP(ei->addr));
-
-		pfn = PFN_DOWN(ei->addr + ei->size);
-		if (ei->type != E820_RAM)
-			register_nosave_region(PFN_UP(ei->addr), pfn);
-
-		if (pfn >= max_low_pfn)
-			break;
-	}
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
-			      unsigned long long size, int type)
-{
-	int x;
-
-	x = e820.nr_map;
-
-	if (x == E820MAX) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-		return;
-	}
-
-	e820.map[x].addr = start;
-	e820.map[x].size = size;
-	e820.map[x].type = type;
-	e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries.  The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
-	struct change_member *change_tmp;
-	unsigned long current_type, last_type;
-	unsigned long long last_addr;
-	int chgidx, still_changing;
-	int overlap_entries;
-	int new_bios_entry;
-	int old_nr, new_nr, chg_nr;
-	int i;
-
-	/*
-		Visually we're performing the following (1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-	/* if there's only one memory region, don't bother */
-	if (*pnr_map < 2) {
-		return -1;
-	}
-
-	old_nr = *pnr_map;
-
-	/* bail out if we find any unreasonable addresses in bios map */
-	for (i=0; i<old_nr; i++)
-		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
-			return -1;
-		}
-
-	/* create pointers for initial change-point information (for sorting) */
-	for (i=0; i < 2*old_nr; i++)
-		change_point[i] = &change_point_list[i];
-
-	/* record all known change-points (starting and ending addresses),
-	   omitting those that are for empty memory regions */
-	chgidx = 0;
-	for (i=0; i < old_nr; i++)	{
-		if (biosmap[i].size != 0) {
-			change_point[chgidx]->addr = biosmap[i].addr;
-			change_point[chgidx++]->pbios = &biosmap[i];
-			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
-			change_point[chgidx++]->pbios = &biosmap[i];
-		}
-	}
-	chg_nr = chgidx;    	/* true number of change-points */
-
-	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i=1; i < chg_nr; i++)  {
-			/* if <current_addr> > <last_addr>, swap */
-			/* or, if current=<start_addr> & last=<end_addr>, swap */
-			if ((change_point[i]->addr < change_point[i-1]->addr) ||
-				((change_point[i]->addr == change_point[i-1]->addr) &&
-				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
-				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-			   )
-			{
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing=1;
-			}
-		}
-	}
-
-	/* create a new bios memory map, removing overlaps */
-	overlap_entries=0;	 /* number of entries in the overlap table */
-	new_bios_entry=0;	 /* index for creating new bios map entries */
-	last_type = 0;		 /* start with undefined memory type */
-	last_addr = 0;		 /* start with 0 as last starting address */
-	/* loop through change-points, determining affect on the new bios map */
-	for (chgidx=0; chgidx < chg_nr; chgidx++)
-	{
-		/* keep track of all overlapping bios entries */
-		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-		{
-			/* add map entry to overlap list (> 1 entry implies an overlap) */
-			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-		}
-		else
-		{
-			/* remove entry from list (order independent, so swap with last) */
-			for (i=0; i<overlap_entries; i++)
-			{
-				if (overlap_list[i] == change_point[chgidx]->pbios)
-					overlap_list[i] = overlap_list[overlap_entries-1];
-			}
-			overlap_entries--;
-		}
-		/* if there are overlapping entries, decide which "type" to use */
-		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
-		current_type = 0;
-		for (i=0; i<overlap_entries; i++)
-			if (overlap_list[i]->type > current_type)
-				current_type = overlap_list[i]->type;
-		/* continue building up new bios map based on this information */
-		if (current_type != last_type)	{
-			if (last_type != 0)	 {
-				new_bios[new_bios_entry].size =
-					change_point[chgidx]->addr - last_addr;
-				/* move forward only if the new size was non-zero */
-				if (new_bios[new_bios_entry].size != 0)
-					if (++new_bios_entry >= E820MAX)
-						break; 	/* no more space left for new bios entries */
-			}
-			if (current_type != 0)	{
-				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
-				new_bios[new_bios_entry].type = current_type;
-				last_addr=change_point[chgidx]->addr;
-			}
-			last_type = current_type;
-		}
-	}
-	new_nr = new_bios_entry;   /* retain count for new bios entries */
-
-	/* copy new bios mapping into original location */
-	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
-	*pnr_map = new_nr;
-
-	return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-	/* Only one memory region (or negative)? Ignore it */
-	if (nr_map < 2)
-		return -1;
-
-	do {
-		u64 start = biosmap->addr;
-		u64 size = biosmap->size;
-		u64 end = start + size;
-		u32 type = biosmap->type;
-
-		/* Overflow in 64 bits? Ignore the memory map. */
-		if (start > end)
-			return -1;
-
-		add_memory_region(start, size, type);
-	} while (biosmap++, --nr_map);
-
-	return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init propagate_e820_map(void)
-{
-	int i;
-
-	max_pfn = 0;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long start, end;
-		/* RAM? */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		start = PFN_UP(e820.map[i].addr);
-		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-		if (start >= end)
-			continue;
-		if (end > max_pfn)
-			max_pfn = end;
-		memory_present(0, start, end);
-	}
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long curr_pfn, last_pfn, size;
-		/*
-		 * Reserve usable low memory
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		/*
-		 * We are rounding up the start address of usable memory:
-		 */
-		curr_pfn = PFN_UP(e820.map[i].addr);
-		if (curr_pfn >= max_low_pfn)
-			continue;
-		/*
-		 * ... and at the end of the usable range downwards:
-		 */
-		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-		if (last_pfn > max_low_pfn)
-			last_pfn = max_low_pfn;
-
-		/*
-		 * .. finally, did all the rounding and playing
-		 * around just make the area go away?
-		 */
-		if (last_pfn <= curr_pfn)
-			continue;
-
-		size = last_pfn - curr_pfn;
-		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
-	}
-}
-
-void __init e820_register_memory(void)
-{
-	unsigned long gapstart, gapsize, round;
-	unsigned long long last;
-	int i;
-
-	/*
-	 * Search for the biggest gap in the low 32 bits of the e820
-	 * memory space.
-	 */
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
-	while (--i >= 0) {
-		unsigned long long start = e820.map[i].addr;
-		unsigned long long end = start + e820.map[i].size;
-
-		/*
-		 * Since "last" is at most 4GB, we know we'll
-		 * fit in 32 bits if this condition is true
-		 */
-		if (last > end) {
-			unsigned long gap = last - end;
-
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
-			}
-		}
-		if (start < last)
-			last = start;
-	}
-
-	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
-	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
-
-	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
-		pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		printk(" %s: %016Lx - %016Lx ", who,
-			e820.map[i].addr,
-			e820.map[i].addr + e820.map[i].size);
-		switch (e820.map[i].type) {
-		case E820_RAM:	printk("(usable)\n");
-				break;
-		case E820_RESERVED:
-				printk("(reserved)\n");
-				break;
-		case E820_ACPI:
-				printk("(ACPI data)\n");
-				break;
-		case E820_NVS:
-				printk("(ACPI NVS)\n");
-				break;
-		default:	printk("type %u\n", e820.map[i].type);
-				break;
-		}
-	}
-}
-
-void __init limit_regions(unsigned long long size)
-{
-	unsigned long long current_addr;
-	int i;
-
-	print_memory_map("limit_regions start");
-	for (i = 0; i < e820.nr_map; i++) {
-		current_addr = e820.map[i].addr + e820.map[i].size;
-		if (current_addr < size)
-			continue;
-
-		if (e820.map[i].type != E820_RAM)
-			continue;
-
-		if (e820.map[i].addr >= size) {
-			/*
-			 * This region starts past the end of the
-			 * requested size, skip it completely.
-			 */
-			e820.nr_map = i;
-		} else {
-			e820.nr_map = i + 1;
-			e820.map[i].size -= current_addr - size;
-		}
-		print_memory_map("limit_regions endfor");
-		return;
-	}
-	print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		const struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
-  * This function checks if the entire range <start,end> is mapped with type.
-  *
-  * Note: this function only works correct if the e820 table is sorted and
-  * not-overlapping, which is the case
-  */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
-	u64 start = s;
-	u64 end = e;
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full
-		 * coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
-		/* If we are doing a crash dump, we
-		 * still need to know the real mem
-		 * size before original memory map is
-		 * reset.
-		 */
-		propagate_e820_map();
-		saved_max_pfn = max_pfn;
-#endif
-		e820.nr_map = 0;
-		user_defined_memmap = 1;
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long start_at, mem_size;
-
-		mem_size = memparse(arg, &arg);
-		if (*arg == '@') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RAM);
-		} else if (*arg == '#') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_ACPI);
-		} else if (*arg == '$') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RESERVED);
-		} else {
-			limit_regions(mem_size);
-			user_defined_memmap = 1;
-		}
-	}
-	return 0;
-}
-early_param("memmap", parse_memmap);
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type)
-{
-	int i;
-
-	BUG_ON(old_type == new_type);
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		u64 final_start, final_end;
-		if (ei->type != old_type)
-			continue;
-		/* totally covered? */
-		if (ei->addr >= start && ei->size <= size) {
-			ei->type = new_type;
-			continue;
-		}
-		/* partially covered */
-		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
-		if (final_start >= final_end)
-			continue;
-		add_memory_region(final_start, final_end - final_start,
-					 new_type);
-	}
-}
-void __init update_e820(void)
-{
-	u8 nr_map;
-
-	nr_map = e820.nr_map;
-	if (sanitize_e820_map(e820.map, &nr_map))
-		return;
-	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
-	print_memory_map("modified");
-}

+ 46 - 13
arch/x86/kernel/efi.c

@@ -213,6 +213,48 @@ unsigned long efi_get_time(void)
 		      eft.minute, eft.second);
 }
 
+/*
+ * Tell the kernel about the EFI memory map.  This might include
+ * more than the max 128 entries that can fit in the e820 legacy
+ * (zeropage) memory map.
+ */
+
+static void __init add_efi_memmap(void)
+{
+	void *p;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		efi_memory_desc_t *md = p;
+		unsigned long long start = md->phys_addr;
+		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+		int e820_type;
+
+		if (md->attribute & EFI_MEMORY_WB)
+			e820_type = E820_RAM;
+		else
+			e820_type = E820_RESERVED;
+		e820_add_region(start, size, e820_type);
+	}
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
+void __init efi_reserve_early(void)
+{
+	unsigned long pmap;
+
+	pmap = boot_params.efi_info.efi_memmap;
+#ifdef CONFIG_X86_64
+	pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32;
+#endif
+	memmap.phys_map = (void *)pmap;
+	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
+		boot_params.efi_info.efi_memdesc_size;
+	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
+	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
+	reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
+		      "EFI memmap");
+}
+
 #if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
@@ -242,21 +284,11 @@ void __init efi_init(void)
 	int i = 0;
 	void *tmp;
 
-#ifdef CONFIG_X86_32
 	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-	memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
-#else
-	efi_phys.systab = (efi_system_table_t *)
-		(boot_params.efi_info.efi_systab |
-		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-	memmap.phys_map = (void *)
-		(boot_params.efi_info.efi_memmap |
-		 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
+#ifdef CONFIG_X86_64
+	efi_phys.systab = (void *)efi_phys.systab +
+		((__u64)boot_params.efi_info.efi_systab_hi<<32);
 #endif
-	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
-		boot_params.efi_info.efi_memdesc_size;
-	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
-	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
 
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
 				   sizeof(efi_system_table_t));
@@ -370,6 +402,7 @@ void __init efi_init(void)
 	if (memmap.desc_size != sizeof(efi_memory_desc_t))
 		printk(KERN_WARNING "Kernel-defined memdesc"
 		       "doesn't match the one from EFI!\n");
+	add_efi_memmap();
 
 	/* Setup for EFI runtime service */
 	reboot_type = BOOT_EFI;

+ 1 - 7
arch/x86/kernel/efi_64.c

@@ -97,13 +97,7 @@ void __init efi_call_phys_epilog(void)
 	early_runtime_code_mapping_set_exec(0);
 }
 
-void __init efi_reserve_bootmem(void)
-{
-	reserve_bootmem_generic((unsigned long)memmap.phys_map,
-				memmap.nr_map * memmap.desc_size);
-}
-
-void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
 	static unsigned pages_mapped __initdata;
 	unsigned i, pages;

+ 1 - 1
arch/x86/kernel/genapic_64.c

@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
 	else
 #endif
 
-	if (num_possible_cpus() <= 8)
+	if (max_physical_apicid < 8)
 		genapic = &apic_flat;
 	else
 		genapic = &apic_physflat;

+ 73 - 0
arch/x86/kernel/head.c

@@ -0,0 +1,73 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+	unsigned int lowmem, ebda_addr;
+
+	/* To determine the position of the EBDA and the */
+	/* end of conventional memory, we need to look at */
+	/* the BIOS data area. In a paravirtual environment */
+	/* that area is absent. We'll just have to assume */
+	/* that the paravirt case can handle memory setup */
+	/* correctly, without our help. */
+	if (paravirt_enabled())
+		return;
+
+	/* end of low (conventional) memory */
+	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+	lowmem <<= 10;
+
+	/* start of EBDA area */
+	ebda_addr = get_bios_ebda();
+
+	/* Fixup: bios puts an EBDA in the top 64K segment */
+	/* of conventional memory, but does not adjust lowmem. */
+	if ((lowmem - ebda_addr) <= 0x10000)
+		lowmem = ebda_addr;
+
+	/* Fixup: bios does not report an EBDA at all. */
+	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+		lowmem = 0x9f000;
+
+	/* Paranoia: should never happen, but... */
+	if ((lowmem == 0) || (lowmem >= 0x100000))
+		lowmem = 0x9f000;
+
+	/* reserve all memory between lowmem and the 1MB mark */
+	reserve_early(lowmem, 0x100000, "BIOS reserved");
+}
+
+void __init reserve_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+	char buf[32];
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, sizeof(*data));
+		sprintf(buf, "setup data %x", data->type);
+		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+		pa_data = data->next;
+		early_iounmap(data, sizeof(*data));
+	}
+}

+ 27 - 0
arch/x86/kernel/head32.c

@@ -8,7 +8,34 @@
 #include <linux/init.h>
 #include <linux/start_kernel.h>
 
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
 void __init i386_start_kernel(void)
 {
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	/* Reserve INITRD */
+	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+		u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+	}
+#endif
+	reserve_early(init_pg_tables_start, init_pg_tables_end,
+			"INIT_PG_TABLE");
+
+	reserve_ebda_region();
+
+	/*
+	 * At this point everything still needed from the boot loader
+	 * or BIOS or kernel text should be early reserved or marked not
+	 * RAM in e820. All other memory is free game.
+	 */
+
 	start_kernel();
 }

+ 0 - 68
arch/x86/kernel/head64.c

@@ -51,74 +51,6 @@ static void __init copy_bootdata(char *real_mode_data)
 	}
 }
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_early(lowmem, 0x100000, "BIOS reserved");
-}
-
-static void __init reserve_setup_data(void)
-{
-	struct setup_data *data;
-	unsigned long pa_data;
-	char buf[32];
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, sizeof(*data));
-		sprintf(buf, "setup data %x", data->type);
-		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
-		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
-	}
-}
-
 void __init x86_64_start_kernel(char * real_mode_data)
 {
 	int i;

+ 6 - 0
arch/x86/kernel/head_32.S

@@ -194,6 +194,7 @@ default_entry:
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
 	movl $pa(pg0), %edi
+	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_ATTR, %eax
 10:
@@ -219,6 +220,8 @@ default_entry:
 	jb 10b
 1:
 	movl %edi,pa(init_pg_tables_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -228,6 +231,7 @@ default_entry:
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
 	movl $pa(pg0), %edi
+	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_ATTR, %eax
 10:
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	cmpl %ebp,%eax
 	jb 10b
 	movl %edi,pa(init_pg_tables_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax

+ 67 - 60
arch/x86/kernel/io_apic_32.c

@@ -72,15 +72,21 @@ int sis_apic_bug = -1;
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
 static int disable_timer_pin_1 __initdata;
 
 /*
@@ -110,7 +116,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -802,10 +808,10 @@ static int find_irq_entry(int apic, int pin, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mpc_dstirq == pin)
+		if (mp_irqs[i].mp_irqtype == type &&
+		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mp_dstirq == pin)
 			return i;
 
 	return -1;
@@ -819,13 +825,13 @@ static int __init find_isa_irq_pin(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 
-			return mp_irqs[i].mpc_dstirq;
+			return mp_irqs[i].mp_dstirq;
 	}
 	return -1;
 }
@@ -835,17 +841,17 @@ static int __init find_isa_irq_apic(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 			break;
 	}
 	if (i < mp_irq_entries) {
 		int apic;
 		for (apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
 				return apic;
 		}
 	}
@@ -865,28 +871,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
 		"slot:%d, pin:%d.\n", bus, slot, pin);
-	if (mp_bus_id_to_pci_bus[bus] == -1) {
+	if (test_bit(bus, mp_bus_not_pci)) {
 		printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
 				break;
 
 		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mpc_irqtype &&
+		    !mp_irqs[i].mp_irqtype &&
 		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i, apic, mp_irqs[i].mpc_dstirq);
+		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
 
-			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
 				return irq;
 			/*
 			 * Use the first all-but-pin matching entry as a
@@ -953,7 +959,7 @@ static int EISA_ELCR(unsigned int irq)
  * EISA conforming in the MP table, that means its trigger type must
  * be read in from the ELCR */
 
-#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
 /* PCI interrupts are always polarity one level triggered,
@@ -970,13 +976,13 @@ static int EISA_ELCR(unsigned int irq)
 
 static int MPBIOS_polarity(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3) {
+	switch (mp_irqs[idx].mp_irqflag & 3) {
 	case 0: /* conforms, ie. bus-type dependent polarity */
 	{
 		polarity = test_bit(bus, mp_bus_not_pci)?
@@ -1012,13 +1018,13 @@ static int MPBIOS_polarity(int idx)
 
 static int MPBIOS_trigger(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int trigger;
 
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) {
+	switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
 	case 0: /* conforms, ie. bus-type dependent */
 	{
 		trigger = test_bit(bus, mp_bus_not_pci)?
@@ -1095,16 +1101,16 @@ static inline int irq_trigger(int idx)
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
-	if (mp_irqs[idx].mpc_dstirq != pin)
+	if (mp_irqs[idx].mp_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci))
-		irq = mp_irqs[idx].mpc_srcbusirq;
+		irq = mp_irqs[idx].mp_srcbusirq;
 	else {
 		/*
 		 * PCI IRQs are mapped in order
@@ -1248,12 +1254,12 @@ static void __init setup_IO_APIC_irqs(void)
 			if (first_notcon) {
 				apic_printk(APIC_VERBOSE, KERN_DEBUG
 						" IO-APIC (apicid-pin) %d-%d",
-						mp_ioapics[apic].mpc_apicid,
+						mp_ioapics[apic].mp_apicid,
 						pin);
 				first_notcon = 0;
 			} else
 				apic_printk(APIC_VERBOSE, ", %d-%d",
-					mp_ioapics[apic].mpc_apicid, pin);
+					mp_ioapics[apic].mp_apicid, pin);
 			continue;
 		}
 
@@ -1348,7 +1354,7 @@ void __init print_IO_APIC(void)
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -1367,7 +1373,7 @@ void __init print_IO_APIC(void)
 		reg_03.raw = io_apic_read(apic, 3);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1708,7 +1714,6 @@ void disable_IO_APIC(void)
  * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
 
-#ifndef CONFIG_X86_NUMAQ
 static void __init setup_ioapic_ids_from_mpc(void)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1718,6 +1723,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
 	unsigned char old_id;
 	unsigned long flags;
 
+#ifdef CONFIG_X86_NUMAQ
+	if (found_numaq)
+		return;
+#endif
+
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems.  They have
 	 * no meaning without the serial APIC bus.
@@ -1741,14 +1751,14 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 
-		old_id = mp_ioapics[apic].mpc_apicid;
+		old_id = mp_ioapics[apic].mp_apicid;
 
-		if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+		if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-				apic, mp_ioapics[apic].mpc_apicid);
+				apic, mp_ioapics[apic].mp_apicid);
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				reg_00.bits.ID);
-			mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+			mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
 		}
 
 		/*
@@ -1757,9 +1767,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
 		 */
 		if (check_apicid_used(phys_id_present_map,
-					mp_ioapics[apic].mpc_apicid)) {
+					mp_ioapics[apic].mp_apicid)) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-				apic, mp_ioapics[apic].mpc_apicid);
+				apic, mp_ioapics[apic].mp_apicid);
 			for (i = 0; i < get_physical_broadcast(); i++)
 				if (!physid_isset(i, phys_id_present_map))
 					break;
@@ -1768,13 +1778,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
 			physid_set(i, phys_id_present_map);
-			mp_ioapics[apic].mpc_apicid = i;
+			mp_ioapics[apic].mp_apicid = i;
 		} else {
 			physid_mask_t tmp;
-			tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+			tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
 			apic_printk(APIC_VERBOSE, "Setting %d in the "
 					"phys_id_present_map\n",
-					mp_ioapics[apic].mpc_apicid);
+					mp_ioapics[apic].mp_apicid);
 			physids_or(phys_id_present_map, phys_id_present_map, tmp);
 		}
 
@@ -1783,11 +1793,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 * We need to adjust the IRQ routing table
 		 * if the ID changed.
 		 */
-		if (old_id != mp_ioapics[apic].mpc_apicid)
+		if (old_id != mp_ioapics[apic].mp_apicid)
 			for (i = 0; i < mp_irq_entries; i++)
-				if (mp_irqs[i].mpc_dstapic == old_id)
-					mp_irqs[i].mpc_dstapic
-						= mp_ioapics[apic].mpc_apicid;
+				if (mp_irqs[i].mp_dstapic == old_id)
+					mp_irqs[i].mp_dstapic
+						= mp_ioapics[apic].mp_apicid;
 
 		/*
 		 * Read the right value from the MPC table and
@@ -1795,9 +1805,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 */
 		apic_printk(APIC_VERBOSE, KERN_INFO
 			"...changing IO-APIC physical APIC ID to %d ...",
-			mp_ioapics[apic].mpc_apicid);
+			mp_ioapics[apic].mp_apicid);
 
-		reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+		reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0, reg_00.raw);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1808,15 +1818,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+		if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
 			printk("could not set ID!\n");
 		else
 			apic_printk(APIC_VERBOSE, " ok.\n");
 	}
 }
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
 
 int no_timer_check __initdata;
 
@@ -2352,8 +2359,8 @@ static int ioapic_resume(struct sys_device *dev)
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2785,7 +2792,7 @@ int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int ac
 
 	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
 		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
-		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+		mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
 		edge_level, active_high_low);
 
 	ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2806,8 +2813,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 		return -1;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == mp_INT &&
-		    mp_irqs[i].mpc_srcbusirq == bus_irq)
+		if (mp_irqs[i].mp_irqtype == mp_INT &&
+		    mp_irqs[i].mp_srcbusirq == bus_irq)
 			break;
 	if (i >= mp_irq_entries)
 		return -1;

+ 42 - 40
arch/x86/kernel/io_apic_64.c

@@ -108,15 +108,17 @@ DEFINE_SPINLOCK(vector_lock);
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
 /*
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
@@ -144,7 +146,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -464,10 +466,10 @@ static int find_irq_entry(int apic, int pin, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mpc_dstirq == pin)
+		if (mp_irqs[i].mp_irqtype == type &&
+		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mp_dstirq == pin)
 			return i;
 
 	return -1;
@@ -481,13 +483,13 @@ static int __init find_isa_irq_pin(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 
-			return mp_irqs[i].mpc_dstirq;
+			return mp_irqs[i].mp_dstirq;
 	}
 	return -1;
 }
@@ -497,17 +499,17 @@ static int __init find_isa_irq_apic(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 			break;
 	}
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
 				return apic;
 		}
 	}
@@ -527,28 +529,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
 		bus, slot, pin);
-	if (mp_bus_id_to_pci_bus[bus] == -1) {
+	if (test_bit(bus, mp_bus_not_pci)) {
 		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
 				break;
 
 		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mpc_irqtype &&
+		    !mp_irqs[i].mp_irqtype &&
 		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
 
-			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
 				return irq;
 			/*
 			 * Use the first all-but-pin matching entry as a
@@ -576,13 +578,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 static int MPBIOS_polarity(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3)
+	switch (mp_irqs[idx].mp_irqflag & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent polarity */
 			if (test_bit(bus, mp_bus_not_pci))
@@ -618,13 +620,13 @@ static int MPBIOS_polarity(int idx)
 
 static int MPBIOS_trigger(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int trigger;
 
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent */
 			if (test_bit(bus, mp_bus_not_pci))
@@ -671,16 +673,16 @@ static inline int irq_trigger(int idx)
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
-	if (mp_irqs[idx].mpc_dstirq != pin)
+	if (mp_irqs[idx].mp_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci)) {
-		irq = mp_irqs[idx].mpc_srcbusirq;
+		irq = mp_irqs[idx].mp_srcbusirq;
 	} else {
 		/*
 		 * PCI IRQs are mapped in order
@@ -857,7 +859,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
 		    "IRQ %d Mode:%i Active:%i)\n",
-		    apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+		    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
 		    irq, trigger, polarity);
 
 	/*
@@ -898,10 +900,10 @@ static void __init setup_IO_APIC_irqs(void)
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
 			if (first_notcon) {
-				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
 				first_notcon = 0;
 			} else
-				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
 			continue;
 		}
 		if (!first_notcon) {
@@ -969,7 +971,7 @@ void __apicdebuginit print_IO_APIC(void)
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -987,7 +989,7 @@ void __apicdebuginit print_IO_APIC(void)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 
@@ -1873,8 +1875,8 @@ static int ioapic_resume(struct sys_device *dev)
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2274,8 +2276,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 		return -1;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == mp_INT &&
-		    mp_irqs[i].mpc_srcbusirq == bus_irq)
+		if (mp_irqs[i].mp_irqtype == mp_INT &&
+		    mp_irqs[i].mp_srcbusirq == bus_irq)
 			break;
 	if (i >= mp_irq_entries)
 		return -1;
@@ -2368,7 +2370,7 @@ void __init ioapic_init_mappings(void)
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
-			ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+			ioapic_phys = mp_ioapics[i].mp_apicaddr;
 		} else {
 			ioapic_phys = (unsigned long)
 				alloc_bootmem_pages(PAGE_SIZE);

+ 485 - 342
arch/x86/kernel/mpparse.c

@@ -25,6 +25,8 @@
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_X86_32
@@ -32,28 +34,6 @@
 #include <mach_mpparse.h>
 #endif
 
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-int pic_mode;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
 /*
  * Checksum an MP configuration block.
  */
@@ -69,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 }
 
 #ifdef CONFIG_X86_NUMAQ
+int found_numaq;
 /*
  * Have to match translation table entries to main table entries by counter
  * hence the mpc_record variable .... can't see a less disgusting way of
  * doing this ....
  */
+struct mpc_config_translation {
+	unsigned char mpc_type;
+	unsigned char trans_len;
+	unsigned char trans_type;
+	unsigned char trans_quad;
+	unsigned char trans_global;
+	unsigned char trans_local;
+	unsigned short trans_reserved;
+};
+
 
 static int mpc_record;
 static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
     __cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+			struct mpc_config_translation *translation_record)
+{
+	int quad = translation_record->trans_quad;
+	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver, quad, logical_apicid);
+	return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
+	struct mpc_config_translation *translation)
+{
+	int quad = translation->trans_quad;
+	int local = translation->trans_local;
+
+	mp_bus_id_to_node[m->mpc_busid] = quad;
+	mp_bus_id_to_local[m->mpc_busid] = local;
+	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+	       m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+static void mpc_oem_pci_bus(struct mpc_config_bus *m,
+	struct mpc_config_translation *translation)
+{
+	int quad = translation->trans_quad;
+	int local = translation->trans_local;
+
+	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
 #endif
 
 static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
@@ -90,7 +128,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 		return;
 	}
 #ifdef CONFIG_X86_NUMAQ
-	apicid = mpc_apic_id(m, translation_table[mpc_record]);
+	if (found_numaq)
+		apicid = mpc_apic_id(m, translation_table[mpc_record]);
+	else
+		apicid = m->mpc_apicid;
 #else
 	apicid = m->mpc_apicid;
 #endif
@@ -103,17 +144,18 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 	generic_processor_info(apicid, m->mpc_apicver);
 }
 
+#ifdef CONFIG_X86_IO_APIC
 static void __init MP_bus_info(struct mpc_config_bus *m)
 {
 	char str[7];
-
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 
 #ifdef CONFIG_X86_NUMAQ
-	mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+	if (found_numaq)
+		mpc_oem_bus_info(m, str, translation_table[mpc_record]);
 #else
-	Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+	printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
 #endif
 
 #if MAX_MP_BUSSES < 256
@@ -132,11 +174,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
 #ifdef CONFIG_X86_NUMAQ
-		mpc_oem_pci_bus(m, translation_table[mpc_record]);
+		if (found_numaq)
+			mpc_oem_pci_bus(m, translation_table[mpc_record]);
 #endif
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
-		mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-		mp_current_pci_id++;
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
 	} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +188,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 	} else
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
+#endif
 
 #ifdef CONFIG_X86_IO_APIC
 
@@ -176,18 +218,89 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 	if (bad_ioapic(m->mpc_apicaddr))
 		return;
 
-	mp_ioapics[nr_ioapics] = *m;
+	mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+	mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+	mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+	mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+	mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
 	nr_ioapics++;
 }
 
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
 {
-	mp_irqs[mp_irq_entries] = *m;
-	Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 		m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+}
+
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+{
+	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
+		mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+		(mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+		mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+}
+
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+				    struct mp_config_intsrc *mp_irq)
+{
+	mp_irq->mp_dstapic = m->mpc_dstapic;
+	mp_irq->mp_type = m->mpc_type;
+	mp_irq->mp_irqtype = m->mpc_irqtype;
+	mp_irq->mp_irqflag = m->mpc_irqflag;
+	mp_irq->mp_srcbus = m->mpc_srcbus;
+	mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+	mp_irq->mp_dstirq = m->mpc_dstirq;
+}
+
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+					struct mpc_config_intsrc *m)
+{
+	m->mpc_dstapic = mp_irq->mp_dstapic;
+	m->mpc_type = mp_irq->mp_type;
+	m->mpc_irqtype = mp_irq->mp_irqtype;
+	m->mpc_irqflag = mp_irq->mp_irqflag;
+	m->mpc_srcbus = mp_irq->mp_srcbus;
+	m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+	m->mpc_dstirq = mp_irq->mp_dstirq;
+}
+
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+					struct mpc_config_intsrc *m)
+{
+	if (mp_irq->mp_dstapic != m->mpc_dstapic)
+		return 1;
+	if (mp_irq->mp_type != m->mpc_type)
+		return 2;
+	if (mp_irq->mp_irqtype != m->mpc_irqtype)
+		return 3;
+	if (mp_irq->mp_irqflag != m->mpc_irqflag)
+		return 4;
+	if (mp_irq->mp_srcbus != m->mpc_srcbus)
+		return 5;
+	if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+		return 6;
+	if (mp_irq->mp_dstirq != m->mpc_dstirq)
+		return 7;
+
+	return 0;
+}
+
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+	int i;
+
+	print_MP_intsrc_info(m);
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
@@ -196,7 +309,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 
 static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
 {
-	Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+	printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -266,11 +379,14 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
 	}
 }
 
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
 				 char *productid)
 {
 	if (strncmp(oem, "IBM NUMA", 8))
-		printk("Warning!  May not be a NUMA-Q system!\n");
+		printk("Warning!  Not a NUMA-Q system!\n");
+	else
+		found_numaq = 1;
+
 	if (mpc->mpc_oemptr)
 		smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
 				 mpc->mpc_oemsize);
@@ -281,12 +397,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
  * Read/parse the MPC
  */
 
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+				char *str)
 {
-	char str[16];
-	char oem[10];
-	int count = sizeof(*mpc);
-	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
 	if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
 		printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +422,42 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	}
 	memcpy(oem, mpc->mpc_oem, 8);
 	oem[8] = 0;
-	printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+	printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
 
 	memcpy(str, mpc->mpc_productid, 12);
 	str[12] = 0;
-	printk("Product ID: %s ", str);
 
-#ifdef CONFIG_X86_32
-	mps_oem_check(mpc, oem, str);
-#endif
-	printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+	printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
 
 	printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
 
+	return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+	char str[16];
+	char oem[10];
+
+	int count = sizeof(*mpc);
+	unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
+
+#ifdef CONFIG_X86_32
+	/*
+	 * need to make sure summit and es7000's mps_oem_check is safe to be
+	 * called early via genericarch 's mps_oem_check
+	 */
+	if (early) {
+#ifdef CONFIG_X86_NUMAQ
+		numaq_mps_oem_check(mpc, oem, str);
+#endif
+	} else
+		mps_oem_check(mpc, oem, str);
+#endif
+
 	/* save the local APIC address, it might be non-default */
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
@@ -352,7 +488,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 			{
 				struct mpc_config_bus *m =
 				    (struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
 				MP_bus_info(m);
+#endif
 				mpt += sizeof(*m);
 				count += sizeof(*m);
 				break;
@@ -402,6 +540,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 		++mpc_record;
 #endif
 	}
+
+#ifdef CONFIG_X86_GENERICARCH
+       generic_bigsmp_probe();
+#endif
+
 	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -427,7 +570,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	intsrc.mpc_type = MP_INTSRC;
 	intsrc.mpc_irqflag = 0;	/* conforming */
 	intsrc.mpc_srcbus = 0;
-	intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+	intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
 
 	intsrc.mpc_irqtype = mp_INT;
 
@@ -488,40 +631,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	MP_intsrc_info(&intsrc);
 }
 
-#endif
 
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void construct_ioapic_table(int mpc_default_type)
 {
-	struct mpc_config_processor processor;
-	struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
 	struct mpc_config_ioapic ioapic;
-#endif
-	struct mpc_config_lintsrc lintsrc;
-	int linttypes[2] = { mp_ExtINT, mp_NMI };
-	int i;
-
-	/*
-	 * local APIC has default address
-	 */
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	/*
-	 * 2 CPUs, numbered 0 & 1.
-	 */
-	processor.mpc_type = MP_PROCESSOR;
-	/* Either an integrated APIC or a discrete 82489DX. */
-	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-	processor.mpc_cpuflag = CPU_ENABLED;
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-	processor.mpc_reserved[0] = 0;
-	processor.mpc_reserved[1] = 0;
-	for (i = 0; i < 2; i++) {
-		processor.mpc_apicid = i;
-		MP_processor_info(&processor);
-	}
+	struct mpc_config_bus bus;
 
 	bus.mpc_type = MP_BUS;
 	bus.mpc_busid = 0;
@@ -550,7 +664,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 		MP_bus_info(&bus);
 	}
 
-#ifdef CONFIG_X86_IO_APIC
 	ioapic.mpc_type = MP_IOAPIC;
 	ioapic.mpc_apicid = 2;
 	ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +675,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 	 * We set up most of the low 16 IO-APIC pins according to MPS rules.
 	 */
 	construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void construct_ioapic_table(int mpc_default_type) { }
 #endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+	struct mpc_config_processor processor;
+	struct mpc_config_lintsrc lintsrc;
+	int linttypes[2] = { mp_ExtINT, mp_NMI };
+	int i;
+
+	/*
+	 * local APIC has default address
+	 */
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	/*
+	 * 2 CPUs, numbered 0 & 1.
+	 */
+	processor.mpc_type = MP_PROCESSOR;
+	/* Either an integrated APIC or a discrete 82489DX. */
+	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	processor.mpc_cpuflag = CPU_ENABLED;
+	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_reserved[0] = 0;
+	processor.mpc_reserved[1] = 0;
+	for (i = 0; i < 2; i++) {
+		processor.mpc_apicid = i;
+		MP_processor_info(&processor);
+	}
+
+	construct_ioapic_table(mpc_default_type);
+
 	lintsrc.mpc_type = MP_LINTSRC;
 	lintsrc.mpc_irqflag = 0;	/* conforming */
 	lintsrc.mpc_srcbusid = 0;
@@ -600,7 +748,7 @@ static void __init __get_smp_config(unsigned early)
 
 	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
 	       mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
 	if (mpf->mpf_feature2 & (1 << 7)) {
 		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
 		pic_mode = 1;
@@ -632,7 +780,9 @@ static void __init __get_smp_config(unsigned early)
 		 * override the defaults.
 		 */
 		if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 0;
+#endif
 			printk(KERN_ERR
 			       "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +839,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	unsigned int *bp = phys_to_virt(base);
 	struct intel_mp_floating *mpf;
 
-	Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+	printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
@@ -699,15 +849,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 		    !mpf_checksum((unsigned char *)bp, 16) &&
 		    ((mpf->mpf_specification == 1)
 		     || (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 1;
+#endif
 			mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
 			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
 			       mpf, virt_to_phys(mpf));
-			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+
+			if (!reserve)
+				return 1;
+			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
 					BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr) {
+				unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
 				/*
 				 * We cannot access to MPC table to compute
 				 * table size yet, as only few megabytes from
@@ -717,24 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 				 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
 				 * in reserve_bootmem.
 				 */
-				unsigned long size = PAGE_SIZE;
 				unsigned long end = max_low_pfn * PAGE_SIZE;
 				if (mpf->mpf_physptr + size > end)
 					size = end - mpf->mpf_physptr;
-				reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+				reserve_bootmem_generic(mpf->mpf_physptr, size,
 						BOOTMEM_DEFAULT);
 			}
 
-#else
-			if (!reserve)
-				return 1;
-
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
-			if (mpf->mpf_physptr)
-				reserve_bootmem_generic(mpf->mpf_physptr,
-							PAGE_SIZE);
-#endif
-		return 1;
+			return 1;
 		}
 		bp += 4;
 		length -= 16;
@@ -790,298 +937,294 @@ void __init find_smp_config(void)
 	__find_smp_config(1);
 }
 
-/* --------------------------------------------------------------------------
-                            ACPI-based MP Configuration
-   -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
 
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
+static int  __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+	int i;
 
-#ifdef CONFIG_ACPI
+	if (m->mpc_irqtype != mp_INT)
+		return 0;
 
-#ifdef	CONFIG_X86_IO_APIC
+	if (m->mpc_irqflag != 0x0f)
+		return 0;
 
-#define MP_ISA_BUS		0
+	/* not legacy */
 
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (mp_irqs[i].mp_irqtype != mp_INT)
+			continue;
 
-static int mp_find_ioapic(int gsi)
-{
-	int i = 0;
+		if (mp_irqs[i].mp_irqflag != 0x0f)
+			continue;
 
-	/* Find the IOAPIC that manages this GSI. */
-	for (i = 0; i < nr_ioapics; i++) {
-		if ((gsi >= mp_ioapic_routing[i].gsi_base)
-		    && (gsi <= mp_ioapic_routing[i].gsi_end))
-			return i;
+		if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+			continue;
+		if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+			continue;
+		if (irq_used[i]) {
+			/* already claimed */
+			return -2;
+		}
+		irq_used[i] = 1;
+		return i;
 	}
 
-	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	/* not found */
 	return -1;
 }
 
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return io_apic_get_unique_id(nr_ioapics, id);
-	else
-		return id;
-#else
-	int i;
-	DECLARE_BITMAP(used, 256);
-	bitmap_zero(used, 256);
-	for (i = 0; i < nr_ioapics; i++) {
-		struct mpc_config_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->mpc_apicid, used);
-	}
-	if (!test_bit(id, used))
-		return id;
-	return find_first_zero_bit(used, 256);
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
 #endif
-}
 
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int  __init replace_intsrc_all(struct mp_config_table *mpc,
+					unsigned long mpc_new_phys,
+					unsigned long mpc_new_length)
 {
-	int idx = 0;
-
-	if (bad_ioapic(address))
-		return;
+#ifdef CONFIG_X86_IO_APIC
+	int i;
+	int nr_m_spare = 0;
+#endif
 
-	idx = nr_ioapics;
+	int count = sizeof(*mpc);
+	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
-	mp_ioapics[idx].mpc_type = MP_IOAPIC;
-	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].mpc_apicaddr = address;
+	printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+	while (count < mpc->mpc_length) {
+		switch (*mpt) {
+		case MP_PROCESSOR:
+			{
+				struct mpc_config_processor *m =
+				    (struct mpc_config_processor *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		case MP_BUS:
+			{
+				struct mpc_config_bus *m =
+				    (struct mpc_config_bus *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		case MP_IOAPIC:
+			{
+				mpt += sizeof(struct mpc_config_ioapic);
+				count += sizeof(struct mpc_config_ioapic);
+				break;
+			}
+		case MP_INTSRC:
+			{
+#ifdef CONFIG_X86_IO_APIC
+				struct mpc_config_intsrc *m =
+				    (struct mpc_config_intsrc *)mpt;
 
-	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
-	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
-	mp_ioapics[idx].mpc_apicver = 0;
+				printk(KERN_INFO "OLD ");
+				print_MP_intsrc_info(m);
+				i = get_MP_intsrc_index(m);
+				if (i > 0) {
+					assign_to_mpc_intsrc(&mp_irqs[i], m);
+					printk(KERN_INFO "NEW ");
+					print_mp_irq_info(&mp_irqs[i]);
+				} else if (!i) {
+					/* legacy, do nothing */
+				} else if (nr_m_spare < SPARE_SLOT_NUM) {
+					/*
+					 * not found (-1), or duplicated (-2)
+					 * are invalid entries,
+					 * we need to use the slot  later
+					 */
+					m_spare[nr_m_spare] = m;
+					nr_m_spare++;
+				}
 #endif
-	/*
-	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-	mp_ioapic_routing[idx].gsi_base = gsi_base;
-	mp_ioapic_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
-
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-	       mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-
-	nr_ioapics++;
-}
+				mpt += sizeof(struct mpc_config_intsrc);
+				count += sizeof(struct mpc_config_intsrc);
+				break;
+			}
+		case MP_LINTSRC:
+			{
+				struct mpc_config_lintsrc *m =
+				    (struct mpc_config_lintsrc *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		default:
+			/* wrong mptable */
+			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+			printk(KERN_ERR "type %x\n", *mpt);
+			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+					1, mpc, mpc->mpc_length, 1);
+			goto out;
+		}
+	}
 
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
-	struct mpc_config_intsrc intsrc;
-	int ioapic = -1;
-	int pin = -1;
+#ifdef CONFIG_X86_IO_APIC
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (irq_used[i])
+			continue;
 
-	/*
-	 * Convert 'gsi' to 'ioapic.pin'.
-	 */
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return;
-	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+		if (mp_irqs[i].mp_irqtype != mp_INT)
+			continue;
 
-	/*
-	 * TBD: This check is for faulty timer entries, where the override
-	 *      erroneously sets the trigger to level, resulting in a HUGE
-	 *      increase of timer interrupts!
-	 */
-	if ((bus_irq == 0) && (trigger == 3))
-		trigger = 1;
+		if (mp_irqs[i].mp_irqflag != 0x0f)
+			continue;
 
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqtype = mp_INT;
-	intsrc.mpc_irqflag = (trigger << 2) | polarity;
-	intsrc.mpc_srcbus = MP_ISA_BUS;
-	intsrc.mpc_srcbusirq = bus_irq;	/* IRQ */
-	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;	/* APIC ID */
-	intsrc.mpc_dstirq = pin;	/* INTIN# */
+		if (nr_m_spare > 0) {
+			printk(KERN_INFO "*NEW* found ");
+			nr_m_spare--;
+			assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+			m_spare[nr_m_spare] = NULL;
+		} else {
+			struct mpc_config_intsrc *m =
+			    (struct mpc_config_intsrc *)mpt;
+			count += sizeof(struct mpc_config_intsrc);
+			if (!mpc_new_phys) {
+				printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+			} else {
+				if (count <= mpc_new_length)
+					printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+				else {
+					printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+					goto out;
+				}
+			}
+			assign_to_mpc_intsrc(&mp_irqs[i], m);
+			mpc->mpc_length = count;
+			mpt += sizeof(struct mpc_config_intsrc);
+		}
+		print_mp_irq_info(&mp_irqs[i]);
+	}
+#endif
+out:
+	/* update checksum */
+	mpc->mpc_checksum = 0;
+	mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+					   mpc->mpc_length);
 
-	MP_intsrc_info(&intsrc);
+	return 0;
 }
 
-void __init mp_config_acpi_legacy_irqs(void)
-{
-	struct mpc_config_intsrc intsrc;
-	int i = 0;
-	int ioapic = -1;
+static int __initdata enable_update_mptable;
 
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-	/*
-	 * Fabricate the legacy ISA bus (bus #31).
-	 */
-	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
-	set_bit(MP_ISA_BUS, mp_bus_not_pci);
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+static int __init update_mptable_setup(char *str)
+{
+	enable_update_mptable = 1;
+	return 0;
+}
+early_param("update_mptable", update_mptable_setup);
 
-	/*
-	 * Older generations of ES7000 have no legacy identity mappings
-	 */
-	if (es7000_plat == 1)
-		return;
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
 
-	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+	enable_update_mptable = 1;
+	alloc_mptable = 1;
+	if (!p)
+		return 0;
+	mpc_new_length = memparse(p, &p);
+	return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
 
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqflag = 0;	/* Conforming */
-	intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
-	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+void __init early_reserve_e820_mpc_new(void)
+{
+	if (enable_update_mptable && alloc_mptable) {
+		u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+		startt = TRAMPOLINE_BASE;
 #endif
-	/*
-	 * Use the default configuration for the IRQs 0-15.  Unless
-	 * overridden by (MADT) interrupt source override entries.
-	 */
-	for (i = 0; i < 16; i++) {
-		int idx;
-
-		for (idx = 0; idx < mp_irq_entries; idx++) {
-			struct mpc_config_intsrc *irq = mp_irqs + idx;
-
-			/* Do we already have a mapping for this ISA IRQ? */
-			if (irq->mpc_srcbus == MP_ISA_BUS
-			    && irq->mpc_srcbusirq == i)
-				break;
-
-			/* Do we already have a mapping for this IOAPIC pin */
-			if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-			    (irq->mpc_dstirq == i))
-				break;
-		}
-
-		if (idx != mp_irq_entries) {
-			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-			continue;	/* IRQ already used */
-		}
-
-		intsrc.mpc_irqtype = mp_INT;
-		intsrc.mpc_srcbusirq = i;	/* Identity mapped */
-		intsrc.mpc_dstirq = i;
-
-		MP_intsrc_info(&intsrc);
+		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
 	}
 }
 
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int __init update_mp_table(void)
 {
-	int ioapic;
-	int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM	4096
-#define IRQ_COMPRESSION_START	64
+	char str[16];
+	char oem[10];
+	struct intel_mp_floating *mpf;
+	struct mp_config_table *mpc;
+	struct mp_config_table *mpc_new;
+
+	if (!enable_update_mptable)
+		return 0;
+
+	mpf = mpf_found;
+	if (!mpf)
+		return 0;
 
-	static int pci_irq = IRQ_COMPRESSION_START;
 	/*
-	 * Mapping between Global System Interrupts, which
-	 * represent all possible interrupts, and IRQs
-	 * assigned to actual devices.
+	 * Now see if we need to go further.
 	 */
-	static int gsi_to_irq[MAX_GSI_NUM];
-#else
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-#endif
+	if (mpf->mpf_feature1 != 0)
+		return 0;
 
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (acpi_gbl_FADT.sci_interrupt == gsi)
-		return gsi;
+	if (!mpf->mpf_physptr)
+		return 0;
 
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0) {
-		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-		return gsi;
-	}
+	mpc = phys_to_virt(mpf->mpf_physptr);
 
-	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
+	printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
+	printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
 
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
-		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-		       ioapic_pin);
-		return gsi;
+	if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+		mpc_new_phys = 0;
+		printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+			 mpc_new_length);
 	}
-	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
-		return gsi;
-#endif
+
+	if (!mpc_new_phys) {
+		unsigned char old, new;
+		/* check if we can change the postion */
+		mpc->mpc_checksum = 0;
+		old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+		mpc->mpc_checksum = 0xff;
+		new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+		if (old == new) {
+			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+			return 0;
+		}
+		printk(KERN_INFO "use in-positon replacing\n");
+	} else {
+		mpf->mpf_physptr = mpc_new_phys;
+		mpc_new = phys_to_virt(mpc_new_phys);
+		memcpy(mpc_new, mpc, mpc->mpc_length);
+		mpc = mpc_new;
+		/* check if we can modify that */
+		if (mpc_new_phys - mpf->mpf_physptr) {
+			struct intel_mp_floating *mpf_new;
+			/* steal 16 bytes from [0, 1k) */
+			printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+			mpf_new = phys_to_virt(0x400 - 16);
+			memcpy(mpf_new, mpf, 16);
+			mpf = mpf_new;
+			mpf->mpf_physptr = mpc_new_phys;
+		}
+		mpf->mpf_checksum = 0;
+		mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+		printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
 	}
 
-	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
 	/*
-	 * For GSI >= 64, use IRQ compression
+	 * only replace the one with mp_INT and
+	 *	 MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+	 * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+	 * may need pci=routeirq for all coverage
 	 */
-	if ((gsi >= IRQ_COMPRESSION_START)
-	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
-		/*
-		 * For PCI devices assign IRQs in order, avoiding gaps
-		 * due to unused I/O APIC pins.
-		 */
-		int irq = gsi;
-		if (gsi < MAX_GSI_NUM) {
-			/*
-			 * Retain the VIA chipset work-around (gsi > 15), but
-			 * avoid a problem where the 8254 timer (IRQ0) is setup
-			 * via an override (so it's not on pin 0 of the ioapic),
-			 * and at the same time, the pin 0 interrupt is a PCI
-			 * type.  The gsi > 15 test could cause these two pins
-			 * to be shared as IRQ0, and they are not shareable.
-			 * So test for this condition, and if necessary, avoid
-			 * the pin collision.
-			 */
-			gsi = pci_irq++;
-			/*
-			 * Don't assign IRQ used by ACPI SCI
-			 */
-			if (gsi == acpi_gbl_FADT.sci_interrupt)
-				gsi = pci_irq++;
-			gsi_to_irq[irq] = gsi;
-		} else {
-			printk(KERN_ERR "GSI %u is too high\n", gsi);
-			return gsi;
-		}
-	}
-#endif
-	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	return gsi;
+	replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+	return 0;
 }
 
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+late_initcall(update_mp_table);

+ 20 - 5
arch/x86/kernel/numaq_32.c

@@ -31,6 +31,8 @@
 #include <asm/numaq.h>
 #include <asm/topology.h>
 #include <asm/processor.h>
+#include <asm/mpspec.h>
+#include <asm/e820.h>
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
@@ -58,6 +60,8 @@ static void __init smp_dump_qct(void)
 			node_end_pfn[node] = MB_TO_PAGES(
 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
 
+			e820_register_active_regions(node, node_start_pfn[node],
+							node_end_pfn[node]);
 			memory_present(node,
 				node_start_pfn[node], node_end_pfn[node]);
 			node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,13 +71,24 @@ static void __init smp_dump_qct(void)
 	}
 }
 
-/*
- * Unlike Summit, we don't really care to let the NUMA-Q
- * fall back to flat mode.  Don't compile for NUMA-Q
- * unless you really need it!
- */
+static __init void early_check_numaq(void)
+{
+	/*
+	 * Find possible boot-time SMP configuration:
+	 */
+	early_find_smp_config();
+	/*
+	 * get boot-time SMP configuration:
+	 */
+	if (smp_found_config)
+		early_get_smp_config();
+}
+
 int __init get_memcfg_numaq(void)
 {
+	early_check_numaq();
+	if (!found_numaq)
+		return 0;
 	smp_dump_qct();
 	return 1;
 }

+ 26 - 0
arch/x86/kernel/setup.c

@@ -17,6 +17,7 @@ unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int max_physical_apicid;
 EXPORT_SYMBOL(boot_cpu_physical_apicid);
 
 DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
@@ -137,3 +138,28 @@ void __init setup_per_cpu_areas(void)
 }
 
 #endif
+
+void __init parse_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, PAGE_SIZE);
+		switch (data->type) {
+		case SETUP_E820_EXT:
+			parse_e820_ext(data, pa_data);
+			break;
+		default:
+			break;
+		}
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+		free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+		pa_data = data->next;
+		early_iounmap(data, PAGE_SIZE);
+	}
+}

+ 328 - 228
arch/x86/kernel/setup_32.c

@@ -59,6 +59,7 @@
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
+#include <asm/dmi.h>
 #include <asm/io_apic.h>
 #include <asm/ist.h>
 #include <asm/io.h>
@@ -67,10 +68,13 @@
 #include <asm/bios_ebda.h>
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
+#include <asm/efi.h>
+#include <asm/bugs.h>
 
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
 unsigned long init_pg_tables_end __initdata = ~0UL;
 
 /*
@@ -181,6 +185,12 @@ int bootloader_type;
 /* user-defined highmem size */
 static unsigned int highmem_pages = -1;
 
+/*
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
 /*
  * Setup options
  */
@@ -237,42 +247,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem=  [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "nopentium") == 0) {
-		setup_clear_cpu_cap(X86_FEATURE_PSE);
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long mem_size;
-
-		mem_size = memparse(arg, &arg);
-		limit_regions(mem_size);
-		user_defined_memmap = 1;
-	}
-	return 0;
-}
-early_param("mem", parse_mem);
-
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel.
@@ -395,56 +369,6 @@ unsigned long __init find_max_low_pfn(void)
 	return max_low_pfn;
 }
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static void __init setup_bootmem_allocator(void);
 static unsigned long __init setup_memory(void)
@@ -462,11 +386,13 @@ static unsigned long __init setup_memory(void)
 	if (max_pfn > max_low_pfn) {
 		highstart_pfn = max_low_pfn;
 	}
+	memory_present(0, 0, highend_pfn);
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
+	memory_present(0, 0, max_low_pfn);
 	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
@@ -488,11 +414,12 @@ static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA] =
 		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+	remove_all_active_ranges();
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-	add_active_range(0, 0, highend_pfn);
+	e820_register_active_regions(0, 0, highend_pfn);
 #else
-	add_active_range(0, 0, max_low_pfn);
+	e820_register_active_regions(0, 0, max_low_pfn);
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
@@ -526,25 +453,28 @@ static void __init reserve_crashkernel(void)
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size > 0) {
-		if (crash_base > 0) {
-			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-					"for crashkernel (System RAM: %ldMB)\n",
-					(unsigned long)(crash_size >> 20),
-					(unsigned long)(crash_base >> 20),
-					(unsigned long)(total_mem >> 20));
-
-			if (reserve_bootmem(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-				printk(KERN_INFO "crashkernel reservation "
-					"failed - memory is in use\n");
-				return;
-			}
-
-			crashk_res.start = crash_base;
-			crashk_res.end   = crash_base + crash_size - 1;
-		} else
+		if (crash_base <= 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
+		insert_resource(&iomem_resource, &crashk_res);
 	}
 }
 #else
@@ -558,44 +488,57 @@ static bool do_relocate_initrd = false;
 
 static void __init reserve_initrd(void)
 {
-	unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-	unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-	unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
-	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	unsigned long ramdisk_here;
-
-	initrd_start = 0;
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
 
 	if (!boot_params.hdr.type_of_loader ||
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 
-	if (ramdisk_end < ramdisk_image) {
-		printk(KERN_ERR "initrd wraps around end of memory, "
-		       "disabling initrd\n");
-		return;
-	}
+	initrd_start = 0;
+
 	if (ramdisk_size >= end_of_lowmem/2) {
+		free_early(ramdisk_image, ramdisk_end);
 		printk(KERN_ERR "initrd too large to handle, "
 		       "disabling initrd\n");
 		return;
 	}
+
+	printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image,
+			ramdisk_end);
+
+
 	if (ramdisk_end <= end_of_lowmem) {
 		/* All in lowmem, easy case */
-		reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
+		/*
+		 * don't need to reserve again, already reserved early
+		 * in i386_start_kernel
+		 */
 		initrd_start = ramdisk_image + PAGE_OFFSET;
 		initrd_end = initrd_start+ramdisk_size;
 		return;
 	}
 
 	/* We need to move the initrd down into lowmem */
-	ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+	ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 end_of_lowmem, ramdisk_size,
+				 PAGE_SIZE);
+
+	if (ramdisk_here == -1ULL)
+		panic("Cannot find place for new RAMDISK of size %lld\n",
+			 ramdisk_size);
 
 	/* Note: this includes all the lowmem currently occupied by
 	   the initrd, we rely on that fact to keep the data intact. */
-	reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
+	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+			 "NEW RAMDISK");
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
+	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size);
 
 	do_relocate_initrd = true;
 }
@@ -604,10 +547,10 @@ static void __init reserve_initrd(void)
 
 static void __init relocate_initrd(void)
 {
-	unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-	unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	unsigned long ramdisk_here;
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
 	unsigned long slop, clen, mapaddr;
 	char *p, *q;
 
@@ -624,6 +567,10 @@ static void __init relocate_initrd(void)
 		p = (char *)__va(ramdisk_image);
 		memcpy(q, p, clen);
 		q += clen;
+		/* need to free these low pages...*/
+		printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n",
+			 ramdisk_image, ramdisk_image + clen - 1);
+		free_bootmem(ramdisk_image, clen);
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
@@ -642,66 +589,47 @@ static void __init relocate_initrd(void)
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
+	/* high pages are not converted by early_res_to_bootmem */
+	ramdisk_image = boot_params.hdr.ramdisk_image;
+	ramdisk_size  = boot_params.hdr.ramdisk_size;
+	printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
+		ramdisk_image, ramdisk_image + ramdisk_size - 1,
+		ramdisk_here, ramdisk_here + ramdisk_size - 1);
+
+	/* need to free that, otherwise init highmem will reserve it again */
+	free_early(ramdisk_image, ramdisk_image+ramdisk_size);
 }
 
 #endif /* CONFIG_BLK_DEV_INITRD */
 
 void __init setup_bootmem_allocator(void)
 {
-	unsigned long bootmap_size;
+	int i;
+	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
-	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
-	register_bootmem_low_pages(max_low_pfn);
-
-	/*
-	 * Reserve the bootmem bitmap itself as well. We do this in two
-	 * steps (first step was init_bootmem()) because this catches
-	 * the (very unlikely) case of us accidentally initializing the
-	 * bootmem allocator with an invalid RAM area.
-	 */
-	reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
-			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
-			 BOOTMEM_DEFAULT);
-
-	/*
-	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-	 * enabling clean reboots, SMP operation, laptop functions.
-	 */
-	reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
-	/* reserve EBDA region */
-	reserve_ebda_region();
-
-#ifdef CONFIG_SMP
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
-#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
-#endif
+	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+				 PAGE_SIZE);
+	if (bootmap == -1L)
+		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 #ifdef CONFIG_BLK_DEV_INITRD
 	reserve_initrd();
 #endif
-	numa_kva_reserve();
-	reserve_crashkernel();
+	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
+	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+		 max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
+		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
+		 bootmap, bootmap + bootmap_size);
+	for_each_online_node(i)
+		free_bootmem_with_active_regions(i, max_low_pfn);
+	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
 
-	reserve_ibft_region();
 }
 
 /*
@@ -731,12 +659,6 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
-	return machine_specific_memory_setup();
-}
-
 #ifdef CONFIG_NUMA
 /*
  * In the golden day, when everything among i386 and x86_64 will be
@@ -749,6 +671,8 @@ int x86_cpu_to_node_map_init[NR_CPUS] = {
 DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
 #endif
 
+static void probe_roms(void);
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -758,17 +682,21 @@ DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
  */
 void __init setup_arch(char **cmdline_p)
 {
+	int i;
 	unsigned long max_low_pfn;
 
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
 	early_ioremap_init();
+	reserve_setup_data();
 
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL32", 4))
+		     "EL32", 4)) {
 		efi_enabled = 1;
+		efi_reserve_early();
+	}
 #endif
 
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -792,8 +720,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	ARCH_SETUP
 
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	print_memory_map(memory_setup());
+	setup_memory_map();
 
 	copy_edd();
 
@@ -811,12 +738,18 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+	parse_setup_data();
+
 	parse_early_param();
 
-	if (user_defined_memmap) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		print_memory_map("user");
-	}
+	finish_e820_parsing();
+
+	probe_roms();
+
+	/* done after parse_early_param(), so that it can be debugged */
+	insert_resource(&iomem_resource, &code_resource);
+	insert_resource(&iomem_resource, &data_resource);
+	insert_resource(&iomem_resource, &bss_resource);
 
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
@@ -824,14 +757,67 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_init();
 
+	if (ppro_with_ram_bug()) {
+		e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+				  E820_RESERVED);
+		sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+		printk(KERN_INFO "fixed physical RAM map:\n");
+		e820_print_map("bad_ppro");
+	}
+
+	e820_register_active_regions(0, 0, -1UL);
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	max_pfn = e820_end_of_ram();
+
+	/* preallocate 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
-	propagate_e820_map();
 	mtrr_bp_init();
-	if (mtrr_trim_uncached_memory(max_pfn))
-		propagate_e820_map();
+	if (mtrr_trim_uncached_memory(max_pfn)) {
+		remove_all_active_ranges();
+		e820_register_active_regions(0, 0, -1UL);
+		max_pfn = e820_end_of_ram();
+	}
+
+	dmi_scan_machine();
+
+	io_delay_init();
+
+#ifdef CONFIG_ACPI
+	/*
+	 * Parse the ACPI tables for possible boot-time SMP configuration.
+	 */
+	acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+        /*
+         * Parse SRAT to discover nodes.
+         */
+        acpi_numa_init();
+#endif
 
 	max_low_pfn = setup_memory();
 
+#ifdef CONFIG_ACPI_SLEEP
+	/*
+	 * Reserve low memory region for sleep support.
+	 */
+	acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+#endif
+	reserve_crashkernel();
+
+	reserve_ibft_region();
+
 #ifdef CONFIG_KVM_CLOCK
 	kvmclock_init();
 #endif
@@ -855,9 +841,6 @@ void __init setup_arch(char **cmdline_p)
 	 * not to exceed the 8Mb limit.
 	 */
 
-#ifdef CONFIG_SMP
-	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
 	paging_init();
 
 	/*
@@ -869,10 +852,6 @@ void __init setup_arch(char **cmdline_p)
 		init_ohci1394_dma_on_all_controllers();
 #endif
 
-	remapped_pgdat_init();
-	sparse_init();
-	zone_sizes_init();
-
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
@@ -881,11 +860,11 @@ void __init setup_arch(char **cmdline_p)
 	relocate_initrd();
 #endif
 
-	paravirt_post_allocator_init();
-
-	dmi_scan_machine();
+	remapped_pgdat_init();
+	sparse_init();
+	zone_sizes_init();
 
-	io_delay_init();
+	paravirt_post_allocator_init();
 
 #ifdef CONFIG_X86_SMP
 	/*
@@ -903,32 +882,31 @@ void __init setup_arch(char **cmdline_p)
 	generic_apic_probe();
 #endif
 
-#ifdef CONFIG_ACPI
-	/*
-	 * Parse the ACPI tables for possible boot-time SMP configuration.
-	 */
-	acpi_boot_table_init();
-#endif
-
 	early_quirks();
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
-
+#endif
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+	if (smp_found_config)
+		get_smp_config();
+#endif
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
 	if (def_to_bigsmp)
 		printk(KERN_WARNING "More than 8 CPUs detected and "
 			"CONFIG_X86_PC cannot handle it.\nUse "
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
 #endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (smp_found_config)
-		get_smp_config();
-#endif
 
-	e820_register_memory();
-	e820_mark_nosave_regions();
+	e820_reserve_resources();
+	e820_mark_nosave_regions(max_low_pfn);
+
+	request_resource(&iomem_resource, &video_ram_resource);
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+	e820_setup_gap();
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
@@ -940,25 +918,147 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
+static struct resource system_rom_resource = {
+	.name	= "System ROM",
+	.start	= 0xf0000,
+	.end	= 0xfffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+	.name	= "Extension ROM",
+	.start	= 0xe0000,
+	.end	= 0xeffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+	.name 	= "Adapter ROM",
+	.start	= 0xc8000,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+	.name 	= "Video ROM",
+	.start	= 0xc0000,
+	.end	= 0xc7fff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
 {
+	const unsigned short * const ptr = (const unsigned short *)rom;
+	unsigned short sig;
+
+	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char sum, c;
+
+	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+		sum += c;
+	return !length && !sum;
+}
+
+static void __init probe_roms(void)
+{
+	const unsigned char *rom;
+	unsigned long start, length, upper;
+	unsigned char c;
 	int i;
 
-	printk(KERN_INFO "Setting up standard PCI resources\n");
-	init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+	/* video rom */
+	upper = adapter_rom_resources[0].start;
+	for (start = video_rom_resource.start; start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
 
-	request_resource(&iomem_resource, &video_ram_resource);
+		video_rom_resource.start = start;
 
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-	return 0;
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* if checksum okay, trust length byte */
+		if (length && romchecksum(rom, length))
+			video_rom_resource.end = start + length - 1;
+
+		request_resource(&iomem_resource, &video_rom_resource);
+		break;
+	}
+
+	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+	if (start < upper)
+		start = upper;
+
+	/* system rom */
+	request_resource(&iomem_resource, &system_rom_resource);
+	upper = system_rom_resource.start;
+
+	/* check for extension rom (ignore length byte!) */
+	rom = isa_bus_to_virt(extension_rom_resource.start);
+	if (romsignature(rom)) {
+		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		if (romchecksum(rom, length)) {
+			request_resource(&iomem_resource, &extension_rom_resource);
+			upper = extension_rom_resource.start;
+		}
+	}
+
+	/* check for adapter roms on 2k boundaries */
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* but accept any length that fits if checksum okay */
+		if (!length || start + length > upper || !romchecksum(rom, length))
+			continue;
+
+		adapter_rom_resources[i].start = start;
+		adapter_rom_resources[i].end = start + length - 1;
+		request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+		start = adapter_rom_resources[i++].end & ~2047UL;
+	}
 }
 
-subsys_initcall(request_standard_resources);

+ 16 - 36
arch/x86/kernel/setup_64.c

@@ -56,6 +56,7 @@
 #include <asm/desc.h>
 #include <video/edid.h>
 #include <asm/e820.h>
+#include <asm/mpspec.h>
 #include <asm/dma.h>
 #include <asm/gart.h>
 #include <asm/mpspec.h>
@@ -245,7 +246,7 @@ static void __init reserve_crashkernel(void)
 			return;
 		}
 
-		if (reserve_bootmem(crash_base, crash_size,
+		if (reserve_bootmem_generic(crash_base, crash_size,
 					BOOTMEM_EXCLUSIVE) < 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"memory is in use\n");
@@ -267,34 +268,6 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
-       machine_specific_memory_setup();
-}
-
-static void __init parse_setup_data(void)
-{
-	struct setup_data *data;
-	unsigned long pa_data;
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, PAGE_SIZE);
-		switch (data->type) {
-		default:
-			break;
-		}
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-		free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
-		pa_data = data->next;
-		early_iounmap(data, PAGE_SIZE);
-	}
-}
-
 /*
  * setup_arch - architecture-specific boot-time initializations
  *
@@ -319,13 +292,15 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL64", 4))
+		     "EL64", 4)) {
 		efi_enabled = 1;
+		efi_reserve_early();
+	}
 #endif
 
 	ARCH_SETUP
 
-	memory_setup();
+	setup_memory_map();
 	copy_edd();
 
 	if (!boot_params.hdr.root_flags)
@@ -372,9 +347,13 @@ void __init setup_arch(char **cmdline_p)
 	 * we are rounding upwards:
 	 */
 	end_pfn = e820_end_of_ram();
+
+	/* preallocate 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(end_pfn)) {
+		remove_all_active_ranges();
 		e820_register_active_regions(0, 0, -1UL);
 		end_pfn = e820_end_of_ram();
 	}
@@ -383,7 +362,7 @@ void __init setup_arch(char **cmdline_p)
 
 	check_efer();
 
-	max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
+	max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT));
 	if (efi_enabled)
 		efi_init();
 
@@ -444,13 +423,12 @@ void __init setup_arch(char **cmdline_p)
        acpi_reserve_bootmem();
 #endif
 
-	if (efi_enabled)
-		efi_reserve_bootmem();
-
+#ifdef CONFIG_X86_MPPARSE
        /*
 	* Find and reserve possible boot-time SMP configuration:
 	*/
 	find_smp_config();
+#endif
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
 		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -493,11 +471,13 @@ void __init setup_arch(char **cmdline_p)
 
 	init_cpu_to_node();
 
+#ifdef CONFIG_X86_MPPARSE
 	/*
 	 * get boot-time SMP configuration:
 	 */
 	if (smp_found_config)
 		get_smp_config();
+#endif
 	init_apic_mappings();
 	ioapic_init_mappings();
 
@@ -507,7 +487,7 @@ void __init setup_arch(char **cmdline_p)
 	 * We trust e820 completely. No explicit ROM probing in memory.
 	 */
 	e820_reserve_resources();
-	e820_mark_nosave_regions();
+	e820_mark_nosave_regions(end_pfn);
 
 	/* request I/O space for devices used on all i[345]86 PCs */
 	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)

+ 0 - 17
arch/x86/kernel/smpboot.c

@@ -554,23 +554,6 @@ cpumask_t cpu_coregroup_map(int cpu)
 		return c->llc_shared_map;
 }
 
-#ifdef CONFIG_X86_32
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
-	trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-	/*
-	 * Has to be in very low memory so we can execute
-	 * real-mode AP code.
-	 */
-	if (__pa(trampoline_base) >= 0x9F000)
-		BUG();
-}
-#endif
-
 static void impress_friends(void)
 {
 	int cpu;

+ 53 - 138
arch/x86/kernel/srat_32.c

@@ -31,6 +31,7 @@
 #include <asm/srat.h>
 #include <asm/topology.h>
 #include <asm/smp.h>
+#include <asm/e820.h>
 
 /*
  * proximity macros and definitions
@@ -41,7 +42,7 @@
 #define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
 /* bitmap length; _PXM is at most 255 */
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
-static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
+static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
 
 #define MAX_CHUNKS_PER_NODE	3
 #define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
@@ -52,16 +53,37 @@ struct node_memory_chunk_s {
 	u8	nid;		// which cnode contains this chunk?
 	u8	bank;		// which mem bank on this node
 };
-static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 
-static int num_memory_chunks;		/* total number of memory chunks */
+static int __initdata num_memory_chunks; /* total number of memory chunks */
 static u8 __initdata apicid_to_pxm[MAX_APICID];
 
+int numa_off __initdata;
+int acpi_numa __initdata;
+
+static __init void bad_srat(void)
+{
+        printk(KERN_ERR "SRAT: SRAT not used.\n");
+        acpi_numa = -1;
+	num_memory_chunks = 0;
+}
+
+static __init inline int srat_disabled(void)
+{
+	return numa_off || acpi_numa < 0;
+}
+
 /* Identify CPU proximity domains */
-static void __init parse_cpu_affinity_structure(char *p)
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
 {
-	struct acpi_srat_cpu_affinity *cpu_affinity =
-				(struct acpi_srat_cpu_affinity *) p;
+	if (srat_disabled())
+		return;
+	if (cpu_affinity->header.length !=
+	     sizeof(struct acpi_srat_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
 
 	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;		/* empty entry */
@@ -79,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p)
  * Identify memory proximity domains and hot-remove capabilities.
  * Fill node memory chunk list structure.
  */
-static void __init parse_memory_affinity_structure (char *sratp)
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
 {
 	unsigned long long paddr, size;
 	unsigned long start_pfn, end_pfn;
 	u8 pxm;
 	struct node_memory_chunk_s *p, *q, *pend;
-	struct acpi_srat_mem_affinity *memory_affinity =
-			(struct acpi_srat_mem_affinity *) sratp;
+
+	if (srat_disabled())
+		return;
+	if (memory_affinity->header.length !=
+	     sizeof(struct acpi_srat_mem_affinity)) {
+		bad_srat();
+		return;
+	}
 
 	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 		return;		/* empty entry */
@@ -134,6 +163,14 @@ static void __init parse_memory_affinity_structure (char *sratp)
 		 "enabled and removable" : "enabled" ) );
 }
 
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
 /*
  * The SRAT table always lists ascending addresses, so can always
  * assume that the first "start" address that you see is the real
@@ -166,39 +203,13 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 		node_end_pfn[nid] = memory_chunk->end_pfn;
 }
 
-/* Parse the ACPI Static Resource Affinity Table */
-static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+int __init get_memcfg_from_srat(void)
 {
-	u8 *start, *end, *p;
 	int i, j, nid;
 
-	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
-	p = start;
-	end = (u8 *)sratp + sratp->header.length;
-
-	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
-	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
 
-	num_memory_chunks = 0;
-	while (p < end) {
-		switch (*p) {
-		case ACPI_SRAT_TYPE_CPU_AFFINITY:
-			parse_cpu_affinity_structure(p);
-			break;
-		case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
-			parse_memory_affinity_structure(p);
-			break;
-		default:
-			printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
-			break;
-		}
-		p += p[1];
-		if (p[1] == 0) {
-			printk("acpi20_parse_srat: Entry length value is zero;"
-				" can't parse any further!\n");
-			break;
-		}
-	}
+	if (srat_disabled())
+		goto out_fail;
 
 	if (num_memory_chunks == 0) {
 		printk("could not finy any ACPI SRAT memory areas.\n");
@@ -244,115 +255,19 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
 		node_read_chunk(chunk->nid, chunk);
-		add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
+		e820_register_active_regions(chunk->nid, chunk->start_pfn,
+					     min(chunk->end_pfn, max_pfn));
 	}
- 
+
 	for_each_online_node(nid) {
 		unsigned long start = node_start_pfn[nid];
-		unsigned long end = node_end_pfn[nid];
+		unsigned long end = min(node_end_pfn[nid], max_pfn);
 
 		memory_present(nid, start, end);
 		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
 	}
 	return 1;
 out_fail:
-	return 0;
-}
-
-struct acpi_static_rsdt {
-	struct acpi_table_rsdt table;
-	u32 padding[7]; /* Allow for 7 more table entries */
-};
-
-int __init get_memcfg_from_srat(void)
-{
-	struct acpi_table_header *header = NULL;
-	struct acpi_table_rsdp *rsdp = NULL;
-	struct acpi_table_rsdt *rsdt = NULL;
-	acpi_native_uint rsdp_address = 0;
-	struct acpi_static_rsdt saved_rsdt;
-	int tables = 0;
-	int i = 0;
-
-	rsdp_address = acpi_os_get_root_pointer();
-	if (!rsdp_address) {
-		printk("%s: System description tables not found\n",
-		       __func__);
-		goto out_err;
-	}
-
-	printk("%s: assigning address to rsdp\n", __func__);
-	rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
-	if (!rsdp) {
-		printk("%s: Didn't find ACPI root!\n", __func__);
-		goto out_err;
-	}
-
-	printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
-		rsdp->oem_id);
-
-	if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
-		printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
-		goto out_err;
-	}
-
-	rsdt = (struct acpi_table_rsdt *)
-	    early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
-
-	if (!rsdt) {
-		printk(KERN_WARNING
-		       "%s: ACPI: Invalid root system description tables (RSDT)\n",
-		       __func__);
-		goto out_err;
-	}
-
-	header = &rsdt->header;
-
-	if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
-		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
-		goto out_err;
-	}
-
-	/* 
-	 * The number of tables is computed by taking the 
-	 * size of all entries (header size minus total 
-	 * size of RSDT) divided by the size of each entry
-	 * (4-byte table pointers).
-	 */
-	tables = (header->length - sizeof(struct acpi_table_header)) / 4;
-
-	if (!tables)
-		goto out_err;
-
-	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-
-	if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
-		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
-		       saved_rsdt.table.header.length);
-		goto out_err;
-	}
-
-	printk("Begin SRAT table scan....\n");
-
-	for (i = 0; i < tables; i++) {
-		/* Map in header, then map in full table length. */
-		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
-		if (!header)
-			break;
-		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
-		if (!header)
-			break;
-
-		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
-			continue;
-
-		/* we've found the srat table. don't need to look at any more tables */
-		return acpi20_parse_srat((struct acpi_table_srat *)header);
-	}
-out_err:
-	remove_all_active_ranges();
 	printk("failed to get NUMA memory information from SRAT table\n");
 	return 0;
 }

+ 2 - 0
arch/x86/kernel/summit_32.c

@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
 static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
 static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
 
+#ifndef CONFIG_X86_NUMAQ
 static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+#endif
 
 static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 {

+ 1 - 1
arch/x86/kernel/trampoline.c

@@ -2,7 +2,7 @@
 
 #include <asm/trampoline.h>
 
-/* ready for x86_64, no harm for x86, since it will overwrite after alloc */
+/* ready for x86_64 and x86 */
 unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
 
 /*

+ 4 - 3
arch/x86/lguest/boot.c

@@ -835,7 +835,7 @@ static __init char *lguest_memory_setup(void)
 
 	/* The Linux bootloader header contains an "e820" memory map: the
 	 * Launcher populated the first entry with our memory limit. */
-	add_memory_region(boot_params.e820_map[0].addr,
+	e820_add_region(boot_params.e820_map[0].addr,
 			  boot_params.e820_map[0].size,
 			  boot_params.e820_map[0].type);
 
@@ -1012,6 +1012,7 @@ __init void lguest_init(void)
 	 * clobbered.  The Launcher places our initial pagetables somewhere at
 	 * the top of our physical memory, so we don't need extra space: set
 	 * init_pg_tables_end to the end of the kernel. */
+	init_pg_tables_start = __pa(pg0);
 	init_pg_tables_end = __pa(pg0);
 
 	/* Load the %fs segment register (the per-cpu segment register) with
@@ -1065,9 +1066,9 @@ __init void lguest_init(void)
 	pm_power_off = lguest_power_off;
 	machine_ops.restart = lguest_restart;
 
-	/* Now we're set up, call start_kernel() in init/main.c and we proceed
+	/* Now we're set up, call i386_start_kernel() in head32.c and we proceed
 	 * to boot as normal.  It never returns. */
-	start_kernel();
+	i386_start_kernel();
 }
 /*
  * This marks the end of stage II of our journey, The Guest.

+ 0 - 42
arch/x86/mach-default/setup.c

@@ -142,45 +142,3 @@ static int __init print_ipi_mode(void)
 
 late_initcall(print_ipi_mode);
 
-/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- *
- * Description:
- *	This is included late in kernel/setup.c so that it can make
- *	use of all of the static functions.
- **/
-
-char * __init machine_specific_memory_setup(void)
-{
-	char *who;
-
-
-	who = "BIOS-e820";
-
-	/*
-	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
-	    < 0) {
-		unsigned long mem_size;
-
-		/* compare results from other methods and take the greater */
-		if (boot_params.alt_mem_k
-		    < boot_params.screen_info.ext_mem_k) {
-			mem_size = boot_params.screen_info.ext_mem_k;
-			who = "BIOS-88";
-		} else {
-			mem_size = boot_params.alt_mem_k;
-			who = "BIOS-e801";
-		}
-
-		e820.nr_map = 0;
-		add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-		add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-  	}
-	return who;
-}

+ 0 - 1
arch/x86/mach-es7000/Makefile

@@ -3,4 +3,3 @@
 #
 
 obj-$(CONFIG_X86_ES7000)	:= es7000plat.o
-obj-$(CONFIG_X86_GENERICARCH)	:= es7000plat.o

+ 2 - 47
arch/x86/mach-es7000/es7000plat.c

@@ -52,6 +52,8 @@ static struct mip_reg		*host_reg;
 static int 			mip_port;
 static unsigned long		mip_addr, host_addr;
 
+int es7000_plat;
+
 /*
  * GSI override for ES7000 platforms.
  */
@@ -175,53 +177,6 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr)
 }
 #endif
 
-/*
- * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
- * arch already has got following function definitions (asm-generic/es7000.c)
- * hence no need to define these for that case.
- */
-#ifndef CONFIG_X86_GENERICARCH
-void es7000_sw_apic(void);
-void __init enable_apic_mode(void)
-{
-	es7000_sw_apic();
-	return;
-}
-
-__init int mps_oem_check(struct mp_config_table *mpc, char *oem,
-		char *productid)
-{
-	if (mpc->mpc_oemptr) {
-		struct mp_config_oemtable *oem_table =
-			(struct mp_config_oemtable *)mpc->mpc_oemptr;
-		if (!strncmp(oem, "UNISYS", 6))
-			return parse_unisys_oem((char *)oem_table);
-	}
-	return 0;
-}
-#ifdef CONFIG_ACPI
-/* Hook from generic ACPI tables.c */
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-	unsigned long oem_addr;
-	if (!find_unisys_acpi_oem_table(&oem_addr)) {
-		if (es7000_check_dsdt())
-			return parse_unisys_oem((char *)oem_addr);
-		else {
-			setup_unisys();
-			return 1;
-		}
-	}
-	return 0;
-}
-#else
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-	return 0;
-}
-#endif
-#endif /* COFIG_X86_GENERICARCH */
-
 static void
 es7000_spin(int n)
 {

+ 7 - 3
arch/x86/mach-generic/Makefile

@@ -2,7 +2,11 @@
 # Makefile for the generic architecture
 #
 
-EXTRA_CFLAGS	:= -Iarch/x86/kernel
+EXTRA_CFLAGS			:= -Iarch/x86/kernel
 
-obj-y		:= probe.o summit.o bigsmp.o es7000.o default.o 
-obj-y		+= ../../x86/mach-es7000/
+obj-y				:= probe.o default.o
+obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
+obj-$(CONFIG_X86_SUMMIT)	+= summit.o
+obj-$(CONFIG_X86_BIGSMP)	+= bigsmp.o
+obj-$(CONFIG_X86_ES7000)	+= es7000.o
+obj-$(CONFIG_X86_ES7000)	+= ../../x86/mach-es7000/

+ 1 - 3
arch/x86/mach-generic/bigsmp.c

@@ -23,10 +23,8 @@ static int dmi_bigsmp; /* can be set by dmi scanners */
 
 static int hp_ht_bigsmp(const struct dmi_system_id *d)
 {
-#ifdef CONFIG_X86_GENERICARCH
 	printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
 	dmi_bigsmp = 1;
-#endif
 	return 0;
 }
 
@@ -48,7 +46,7 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
 static int probe_bigsmp(void)
 {
 	if (def_to_bigsmp)
-	dmi_bigsmp = 1;
+		dmi_bigsmp = 1;
 	else
 		dmi_check_system(bigsmp_dmi_table);
 	return dmi_bigsmp;

+ 41 - 0
arch/x86/mach-generic/numaq.c

@@ -0,0 +1,41 @@
+/*
+ * APIC driver for the IBM NUMAQ chipset.
+ */
+#define APIC_DEFINITION 1
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <asm/mpspec.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <asm/mach-numaq/mach_apic.h>
+#include <asm/mach-numaq/mach_apicdef.h>
+#include <asm/mach-numaq/mach_ipi.h>
+#include <asm/mach-numaq/mach_mpparse.h>
+#include <asm/mach-numaq/mach_wakecpu.h>
+#include <asm/numaq.h>
+
+static int mps_oem_check(struct mp_config_table *mpc, char *oem,
+		char *productid)
+{
+	numaq_mps_oem_check(mpc, oem, productid);
+	return found_numaq;
+}
+
+static int probe_numaq(void)
+{
+	/* already know from get_memcfg_numaq() */
+	return found_numaq;
+}
+
+/* Hook from generic ACPI tables.c */
+static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return 0;
+}
+
+struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);

+ 14 - 1
arch/x86/mach-generic/probe.c

@@ -16,6 +16,7 @@
 #include <asm/apicdef.h>
 #include <asm/genapic.h>
 
+extern struct genapic apic_numaq;
 extern struct genapic apic_summit;
 extern struct genapic apic_bigsmp;
 extern struct genapic apic_es7000;
@@ -24,9 +25,18 @@ extern struct genapic apic_default;
 struct genapic *genapic = &apic_default;
 
 static struct genapic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_NUMAQ
+	&apic_numaq,
+#endif
+#ifdef CONFIG_X86_SUMMIT
 	&apic_summit,
+#endif
+#ifdef CONFIG_X86_BIGSMP
 	&apic_bigsmp,
+#endif
+#ifdef CONFIG_X86_ES7000
 	&apic_es7000,
+#endif
 	&apic_default,	/* must be last */
 	NULL,
 };
@@ -54,6 +64,7 @@ early_param("apic", parse_apic);
 
 void __init generic_bigsmp_probe(void)
 {
+#ifdef CONFIG_X86_BIGSMP
 	/*
 	 * This routine is used to switch to bigsmp mode when
 	 * - There is no apic= option specified by the user
@@ -67,6 +78,7 @@ void __init generic_bigsmp_probe(void)
 			printk(KERN_INFO "Overriding APIC driver with %s\n",
 			       genapic->name);
 		}
+#endif
 }
 
 void __init generic_apic_probe(void)
@@ -88,7 +100,8 @@ void __init generic_apic_probe(void)
 
 /* These functions can switch the APIC even after the initial ->probe() */
 
-int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid)
+int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
+				 char *productid)
 {
 	int i;
 	for (i = 0; apic_probe[i]; ++i) {

+ 2 - 5
arch/x86/mach-visws/mpparse.c

@@ -8,11 +8,6 @@
 #include "cobalt.h"
 #include "mach_apic.h"
 
-/* Have we found an MP table */
-int smp_found_config;
-
-int pic_mode;
-
 extern unsigned int __cpuinitdata maxcpus;
 
 /*
@@ -76,7 +71,9 @@ void __init find_smp_config(void)
 	if (ncpus > maxcpus)
 		ncpus = maxcpus;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	smp_found_config = 1;
+#endif
 	while (ncpus--)
 		MP_processor_info(mp++);
 

+ 3 - 3
arch/x86/mach-visws/setup.c

@@ -175,9 +175,9 @@ char * __init machine_specific_memory_setup(void)
 	sgivwfb_mem_size &= ~((1 << 20) - 1);
 	sgivwfb_mem_phys = mem_size - gfx_mem_size;
 
-	add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-	add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
-	add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
+	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
+	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
 
 	return "PROM";
 }

+ 6 - 31
arch/x86/mach-voyager/setup.c

@@ -62,6 +62,7 @@ void __init time_init_hook(void)
 char *__init machine_specific_memory_setup(void)
 {
 	char *who;
+	int new_nr;
 
 	who = "NOT VOYAGER";
 
@@ -73,7 +74,7 @@ char *__init machine_specific_memory_setup(void)
 
 		e820.nr_map = 0;
 		for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
-			add_memory_region(addr, length, E820_RAM);
+			e820_add_region(addr, length, E820_RAM);
 		}
 		return who;
 	} else if (voyager_level == 4) {
@@ -91,43 +92,17 @@ char *__init machine_specific_memory_setup(void)
 			tom = (boot_params.screen_info.ext_mem_k) << 10;
 		}
 		who = "Voyager-TOM";
-		add_memory_region(0, 0x9f000, E820_RAM);
+		e820_add_region(0, 0x9f000, E820_RAM);
 		/* map from 1M to top of memory */
-		add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
+		e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
 				  E820_RAM);
 		/* FIXME: Should check the ASICs to see if I need to
 		 * take out the 8M window.  Just do it at the moment
 		 * */
-		add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024,
+		e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024,
 				  E820_RESERVED);
 		return who;
 	}
 
-	who = "BIOS-e820";
-
-	/*
-	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
-	    < 0) {
-		unsigned long mem_size;
-
-		/* compare results from other methods and take the greater */
-		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
-			mem_size = boot_params.screen_info.ext_mem_k;
-			who = "BIOS-88";
-		} else {
-			mem_size = boot_params.alt_mem_k;
-			who = "BIOS-e801";
-		}
-
-		e820.nr_map = 0;
-		add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-		add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-	}
-	return who;
+	return default_machine_specific_memory_setup();
 }

+ 0 - 14
arch/x86/mach-voyager/voyager_smp.c

@@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0;
  * activity count.  Finally exported by i386_ksyms.c */
 static int voyager_extended_cpus = 1;
 
-/* Have we found an SMP box - used by time.c to do the profiling
-   interrupt for timeslicing; do not set to 1 until the per CPU timer
-   interrupt is active */
-int smp_found_config = 0;
-
 /* Used for the invalidate map that's also checked in the spinlock */
 static volatile unsigned long smp_invalidate_needed;
 
@@ -1137,15 +1132,6 @@ void flush_tlb_all(void)
 	on_each_cpu(do_flush_tlb_all, 0, 1, 1);
 }
 
-/* used to set up the trampoline for other CPUs when the memory manager
- * is sorted out */
-void __init smp_alloc_memory(void)
-{
-	trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-	if (__pa(trampoline_base) >= 0x93000)
-		BUG();
-}
-
 /* send a reschedule CPI to one CPU by physical CPU number*/
 static void voyager_smp_send_reschedule(int cpu)
 {

+ 95 - 91
arch/x86/mm/discontig_32.c

@@ -38,6 +38,7 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -81,9 +82,9 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	printk(KERN_DEBUG "  ");
 	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
 		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-		printk("%ld ", pfn);
+		printk(KERN_CONT "%ld ", pfn);
 	}
-	printk("\n");
+	printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -119,11 +119,11 @@ int __init get_memcfg_numa_flat(void)
 {
 	printk("NUMA - single node, flat memory mode\n");
 
-	/* Run the memory configuration and find the top of memory. */
-	propagate_e820_map();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
+	e820_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
+	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
         /* Indicate there is one node available. */
 	nodes_clear(node_online_map);
@@ -159,9 +159,17 @@ static void __init allocate_pgdat(int nid)
 	if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-		min_low_pfn += PFN_UP(sizeof(pg_data_t));
+		unsigned long pgdat_phys;
+		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
+				 sizeof(pg_data_t),
+				 PAGE_SIZE);
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
+			      "NODE_DATA");
 	}
+	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+		nid, (unsigned long)NODE_DATA(nid));
 }
 
 /*
@@ -199,8 +207,12 @@ void __init remap_numa_kva(void)
 	int node;
 
 	for_each_online_node(node) {
+		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+				(unsigned long)vaddr,
+				node_remap_start_pfn[node] + pfn);
 			set_pmd_pfn((ulong) vaddr, 
 				node_remap_start_pfn[node] + pfn, 
 				PAGE_KERNEL_LARGE);
@@ -212,17 +224,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-	unsigned long pfn;
 
 	for_each_online_node(nid) {
-		unsigned old_end_pfn = node_end_pfn[nid];
+		u64 node_kva_target;
+		u64 node_kva_final;
 
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
+		printk("node %d pfn: [%lx - %lx]\n",
+			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
+		if (!node_end_pfn[nid])
+			continue;
 		if (node_end_pfn[nid] > max_pfn)
 			node_end_pfn[nid] = max_pfn;
 
@@ -234,39 +250,45 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
 
-		/*
-		 * Validate the region we are allocating only contains valid
-		 * pages.
-		 */
-		for (pfn = node_end_pfn[nid] - size;
-		     pfn < node_end_pfn[nid]; pfn++)
-			if (!page_is_ram(pfn))
-				break;
+		node_kva_target = round_down(node_end_pfn[nid] - size,
+						 PTRS_PER_PTE);
+		node_kva_target <<= PAGE_SHIFT;
+		do {
+			node_kva_final = find_e820_area(node_kva_target,
+					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+						((u64)size)<<PAGE_SHIFT,
+						LARGE_PAGE_BYTES);
+			node_kva_target -= LARGE_PAGE_BYTES;
+		} while (node_kva_final == -1ULL &&
+			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+		if (node_kva_final == -1ULL)
+			panic("Can not get kva ram\n");
 
-		if (pfn != node_end_pfn[nid])
-			size = 0;
-
-		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %ld pages\n",
-			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-			/*
-			 * Align node_end_pfn[] and node_remap_start_pfn[] to
-			 * pmd boundary. remap_numa_kva will barf otherwise.
-			 */
-			printk("Shrinking node %d further by %ld pages for proper alignment\n",
-				nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-			size +=  node_end_pfn[nid] & (PTRS_PER_PTE-1);
-		}
+		printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+				size, nid, node_kva_final>>PAGE_SHIFT);
+
+		/*
+		 *  prevent kva address below max_low_pfn want it on system
+		 *  with less memory later.
+		 *  layout will be: KVA address , KVA RAM
+		 *
+		 *  we are supposed to only record the one less than max_low_pfn
+		 *  but we could have some hole in high memory, and it will only
+		 *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+		 *  to use it as free.
+		 *  So reserve_early here, hope we don't run out of that array
+		 */
+		reserve_early(node_kva_final,
+			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
+			      "KVA RAM");
 
-		node_end_pfn[nid] -= size;
-		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+		remove_active_range(nid, node_remap_start_pfn[nid],
+					 node_remap_start_pfn[nid] + size);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
@@ -284,8 +306,7 @@ static void init_remap_allocator(int nid)
 
 	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
 		(ulong) node_remap_start_vaddr[nid],
-		(ulong) pfn_to_kaddr(highstart_pfn
-		   + node_remap_offset[nid] + node_remap_size[nid]));
+		(ulong) node_remap_end_vaddr[nid]);
 }
 
 extern void setup_bootmem_allocator(void);
@@ -293,7 +314,7 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
-	unsigned long wasted_pages;
+	long kva_target_pfn;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -302,34 +323,38 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
+
+	/* call find_max_low_pfn first, as it could update max_pfn */
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+
+	remove_all_active_ranges();
 	get_memcfg_numa();
 
-	kva_pages = calculate_numa_remap_pages();
+	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Numa kva area is below the initrd */
-	if (initrd_start)
-		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-			- kva_pages;
-#endif
+	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	do {
+		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+					max_low_pfn<<PAGE_SHIFT,
+					kva_pages<<PAGE_SHIFT,
+					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+		kva_target_pfn -= PTRS_PER_PTE;
+	} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
 
-	/*
-	 * We waste pages past at the end of the KVA for no good reason other
-	 * than how it is located. This is bad.
-	 */
-	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-	kva_start_pfn -= wasted_pages;
-	kva_pages += wasted_pages;
+	if (kva_start_pfn == -1UL)
+		panic("Can not get kva space\n");
 
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+		     "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > system_max_low_pfn)
@@ -365,16 +390,8 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }
 
-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
-	int nid;
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] =
@@ -384,27 +401,18 @@ void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
 #endif
 
-	/* If SRAT has not registered memory, register it now */
-	if (find_max_pfn_with_active_regions() == 0) {
-		for_each_online_node(nid) {
-			if (node_has_online_mem(nid))
-				add_active_range(nid, node_start_pfn[nid],
-							node_end_pfn[nid]);
-		}
-	}
-
 	free_area_init_nodes(max_zone_pfns);
 	return;
 }
 
-void __init set_highmem_pages_init(int bad_ppro) 
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
-	struct page *page;
+	int nid;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+		unsigned long zone_start_pfn, zone_end_pfn;
 
 		if (!is_highmem(zone))
 			continue;
@@ -412,16 +420,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_start_pfn = zone->zone_start_pfn;
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
+		nid = zone_to_nid(zone);
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone_to_nid(zone),
-				zone_start_pfn, zone_end_pfn);
-
-		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-			if (!pfn_valid(node_pfn))
-				continue;
-			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
-		}
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn);
 	}
 	totalram_pages += totalhigh_pages;
 #endif

+ 57 - 29
arch/x86/mm/init_32.c

@@ -225,13 +225,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	update_page_count(PG_LEVEL_4K, pages_4k);
 }
 
-static inline int page_kills_ppro(unsigned long pagenr)
-{
-	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
-		return 1;
-	return 0;
-}
-
 /*
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
  * is valid. The argument is a physical page number.
@@ -292,29 +285,60 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;
 }
 
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init add_one_highpage_init(struct page *page, int pfn)
 {
-	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
-		ClearPageReserved(page);
-		init_page_count(page);
-		__free_page(page);
-		totalhigh_pages++;
-	} else
-		SetPageReserved(page);
+	ClearPageReserved(page);
+	init_page_count(page);
+	__free_page(page);
+	totalhigh_pages++;
 }
 
-#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(int bad_ppro)
+struct add_highpages_data {
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+};
+
+static void __init add_highpages_work_fn(unsigned long start_pfn,
+					 unsigned long end_pfn, void *datax)
 {
-	int pfn;
+	int node_pfn;
+	struct page *page;
+	unsigned long final_start_pfn, final_end_pfn;
+	struct add_highpages_data *data;
 
-	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
-		/*
-		 * Holes under sparsemem might not have no mem_map[]:
-		 */
-		if (pfn_valid(pfn))
-			add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+	data = (struct add_highpages_data *)datax;
+
+	final_start_pfn = max(start_pfn, data->start_pfn);
+	final_end_pfn = min(end_pfn, data->end_pfn);
+	if (final_start_pfn >= final_end_pfn)
+		return;
+
+	for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+	     node_pfn++) {
+		if (!pfn_valid(node_pfn))
+			continue;
+		page = pfn_to_page(node_pfn);
+		add_one_highpage_init(page, node_pfn);
 	}
+
+}
+
+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+					      unsigned long end_pfn)
+{
+	struct add_highpages_data data;
+
+	data.start_pfn = start_pfn;
+	data.end_pfn = end_pfn;
+
+	work_with_active_regions(nid, add_highpages_work_fn, &data);
+}
+
+#ifndef CONFIG_NUMA
+static void __init set_highmem_pages_init(void)
+{
+	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
+
 	totalram_pages += totalhigh_pages;
 }
 #endif /* !CONFIG_NUMA */
@@ -322,7 +346,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
 #else
 # define kmap_init()				do { } while (0)
 # define permanent_kmaps_init(pgd_base)		do { } while (0)
-# define set_highmem_pages_init(bad_ppro)	do { } while (0)
+# define set_highmem_pages_init()	do { } while (0)
 #endif /* CONFIG_HIGHMEM */
 
 pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
@@ -569,13 +593,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
 void __init mem_init(void)
 {
 	int codesize, reservedpages, datasize, initsize;
-	int tmp, bad_ppro;
+	int tmp;
 
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
-	bad_ppro = ppro_with_ram_bug();
-
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
@@ -587,7 +609,7 @@ void __init mem_init(void)
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
 
-	set_highmem_pages_init(bad_ppro);
+	set_highmem_pages_init();
 
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
@@ -776,3 +798,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	free_init_pages("initrd memory", start, end);
 }
 #endif
+
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+				   int flags)
+{
+	return reserve_bootmem(phys, len, flags);
+}

+ 25 - 5
arch/x86/mm/init_64.c

@@ -48,6 +48,18 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 
+/*
+ * PFN of last memory page.
+ */
+unsigned long end_pfn;
+
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_pfn_mapped;
+
 static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -808,12 +820,14 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+				   int flags)
 {
 #ifdef CONFIG_NUMA
 	int nid, next_nid;
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
+	int ret;
 
 	if (pfn >= end_pfn) {
 		/*
@@ -821,11 +835,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 		 * firmware tables:
 		 */
 		if (pfn < max_pfn_mapped)
-			return;
+			return -EFAULT;
 
 		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
 				phys, len);
-		return;
+		return -EFAULT;
 	}
 
 	/* Should check here against the e820 map to avoid double free */
@@ -833,9 +847,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 	nid = phys_to_nid(phys);
 	next_nid = phys_to_nid(phys + len - 1);
 	if (nid == next_nid)
-		reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
 	else
-		reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+		ret = reserve_bootmem(phys, len, flags);
+
+	if (ret != 0)
+		return ret;
+
 #else
 	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
 #endif
@@ -844,6 +862,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 		dma_reserve += len / PAGE_SIZE;
 		set_dma_reserve(dma_reserve);
 	}
+
+	return 0;
 }
 
 int kern_addr_valid(unsigned long addr)

+ 4 - 0
arch/x86/mm/k8topology_64.c

@@ -57,18 +57,22 @@ static __init void early_get_boot_cpu_id(void)
 	/*
 	 * Find possible boot-time SMP configuration:
 	 */
+#ifdef CONFIG_X86_MPPARSE
 	early_find_smp_config();
+#endif
 #ifdef CONFIG_ACPI
 	/*
 	 * Read APIC information from ACPI tables.
 	 */
 	early_acpi_boot_init();
 #endif
+#ifdef CONFIG_X86_MPPARSE
 	/*
 	 * get boot-time SMP configuration:
 	 */
 	if (smp_found_config)
 		early_get_smp_config();
+#endif
 	early_init_lapic_mapping();
 }
 

+ 1 - 1
arch/x86/mm/numa_64.c

@@ -233,7 +233,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	else
 		bootmap_start = round_up(start, PAGE_SIZE);
 	/*
-	 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
+	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
 	 * to use that to align to PAGE_SIZE
 	 */
 	bootmap = early_node_mem(nodeid, bootmap_start, end,

+ 3 - 2
arch/x86/pci/Makefile_32

@@ -13,10 +13,11 @@ pci-y				:= fixup.o
 pci-$(CONFIG_ACPI)		+= acpi.o
 pci-y				+= legacy.o irq.o
 
+# Careful: VISWS overrules the pci-y above. The colons are
+# Careful: VISWS overrule the pci-y above. The colons are
 # therefor correct. This needs a proper fix by distangling the code.
 pci-$(CONFIG_X86_VISWS)		:= visws.o fixup.o
-pci-$(CONFIG_X86_NUMAQ)		:= numa.o irq.o
+
+pci-$(CONFIG_X86_NUMAQ)		+= numa.o
 
 # Necessary for NUMAQ as well
 pci-$(CONFIG_NUMA)		+= mp_bus_to_node.o

+ 2 - 2
arch/x86/pci/amd_bus.c

@@ -388,7 +388,7 @@ static int __init early_fill_mp_bus_info(void)
 	/* need to take out [0, TOM) for RAM*/
 	address = MSR_K8_TOP_MEM1;
 	rdmsrl(address, val);
-	end = (val & 0xffffff8000000ULL);
+	end = (val & 0xffffff800000ULL);
 	printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
 	if (end < (1ULL<<32))
 		update_range(range, 0, end - 1);
@@ -482,7 +482,7 @@ static int __init early_fill_mp_bus_info(void)
 		/* TOP_MEM2 */
 		address = MSR_K8_TOP_MEM2;
 		rdmsrl(address, val);
-		end = (val & 0xffffff8000000ULL);
+		end = (val & 0xffffff800000ULL);
 		printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
 		update_range(range, 1ULL<<32, end - 1);
 	}

+ 4 - 25
arch/x86/pci/numa.c

@@ -6,45 +6,21 @@
 #include <linux/init.h>
 #include <linux/nodemask.h>
 #include <mach_apic.h>
+#include <asm/mpspec.h>
 #include "pci.h"
 
 #define XQUAD_PORTIO_BASE 0xfe400000
 #define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
 
-int mp_bus_id_to_node[MAX_MP_BUSSES];
 #define BUS2QUAD(global) (mp_bus_id_to_node[global])
 
-int mp_bus_id_to_local[MAX_MP_BUSSES];
 #define BUS2LOCAL(global) (mp_bus_id_to_local[global])
 
-void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	mp_bus_id_to_node[m->mpc_busid] = quad;
-	mp_bus_id_to_local[m->mpc_busid] = local;
-	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
-	       m->mpc_busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
 #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
-void mpc_oem_pci_bus(struct mpc_config_bus *m,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
-}
 
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
-#ifdef CONFIG_X86_NUMAQ
 EXPORT_SYMBOL(xquad_portio);
-#endif
 
 #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
 
@@ -179,6 +155,9 @@ static int __init pci_numa_init(void)
 {
 	int quad;
 
+	if (!found_numaq)
+		return 0;
+
 	raw_pci_ops = &pci_direct_conf1_mq;
 
 	if (pcibios_scanned++)

+ 2 - 1
arch/x86/xen/enlighten.c

@@ -1273,6 +1273,7 @@ asmlinkage void __init xen_start_kernel(void)
 
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
+	init_pg_tables_start = __pa(pgd);
 	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
 
 	init_mm.pgd = pgd; /* use the Xen pagetables to start */
@@ -1316,5 +1317,5 @@ asmlinkage void __init xen_start_kernel(void)
 	}
 
 	/* Start the world */
-	start_kernel();
+	i386_start_kernel();
 }

+ 2 - 2
arch/x86/xen/setup.c

@@ -40,8 +40,8 @@ char * __init xen_memory_setup(void)
 	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
 
 	e820.nr_map = 0;
-	add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-	add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
+	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+	e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
 
 	return "Xen";
 }

+ 0 - 1
drivers/acpi/Kconfig

@@ -4,7 +4,6 @@
 
 menuconfig ACPI
 	bool "ACPI (Advanced Configuration and Power Interface) Support"
-	depends on !X86_NUMAQ
 	depends on !X86_VISWS
 	depends on !IA64_HP_SIM
 	depends on IA64 || X86

+ 5 - 0
drivers/firmware/dmi_scan.c

@@ -8,6 +8,11 @@
 #include <linux/slab.h>
 #include <asm/dmi.h>
 
+/*
+ * DMI stands for "Desktop Management Interface".  It is part
+ * of, and an antecedent to, SMBIOS, which stands for System
+ * Management BIOS.  See further: http://www.dmtf.org/standards
+ */
 static char dmi_empty_string[] = "        ";
 
 static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s)

+ 2 - 3
include/asm-x86/acpi.h

@@ -28,6 +28,7 @@
 #include <asm/numa.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
+#include <asm/mpspec.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -160,9 +161,7 @@ struct bootnode;
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
-#ifdef CONFIG_X86_64
-# define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-#endif
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
 				   int num_nodes);
 #else

+ 2 - 0
include/asm-x86/bios_ebda.h

@@ -14,4 +14,6 @@ static inline unsigned int get_bios_ebda(void)
 	return address;	/* 0 means none */
 }
 
+void reserve_ebda_region(void);
+
 #endif /* _MACH_BIOS_EBDA_H */

+ 4 - 0
include/asm-x86/bootparam.h

@@ -11,6 +11,7 @@
 
 /* setup data types */
 #define SETUP_NONE			0
+#define SETUP_E820_EXT			1
 
 /* extensible setup data list node */
 struct setup_data {
@@ -107,4 +108,7 @@ struct boot_params {
 	__u8  _pad9[276];				/* 0xeec */
 } __attribute__((packed));
 
+void reserve_setup_data(void);
+void parse_setup_data(void);
+
 #endif /* _ASM_BOOTPARAM_H */

+ 0 - 8
include/asm-x86/dmi.h

@@ -3,12 +3,6 @@
 
 #include <asm/io.h>
 
-#ifdef CONFIG_X86_32
-
-#define dmi_alloc alloc_bootmem
-
-#else /* CONFIG_X86_32 */
-
 #define DMI_MAX_DATA 2048
 
 extern int dmi_alloc_index;
@@ -25,8 +19,6 @@ static inline void *dmi_alloc(unsigned len)
 	return dmi_alloc_data + idx;
 }
 
-#endif
-
 /* Use early IO mappings for DMI because it's initialized early */
 #define dmi_ioremap early_ioremap
 #define dmi_iounmap early_iounmap

+ 90 - 6
include/asm-x86/e820.h

@@ -2,6 +2,41 @@
 #define __ASM_E820_H
 #define E820MAP	0x2d0		/* our map */
 #define E820MAX	128		/* number of entries in E820MAP */
+
+/*
+ * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
+ * constrained space in the zeropage.  If we have more nodes than
+ * that, and if we've booted off EFI firmware, then the EFI tables
+ * passed us from the EFI firmware can list more nodes.  Size our
+ * internal memory map tables to have room for these additional
+ * nodes, based on up to three entries per node for which the
+ * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
+ * plus E820MAX, allowing space for the possible duplicate E820
+ * entries that might need room in the same arrays, prior to the
+ * call to sanitize_e820_map() to remove duplicates.  The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries.  Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+/*
+ * Odd: 'make headers_check' complains about numa.h if I try
+ * to collapse the next two #ifdef lines to a single line:
+ *	#if defined(__KERNEL__) && defined(CONFIG_EFI)
+ */
+#ifdef __KERNEL__
+#ifdef CONFIG_EFI
+#include <linux/numa.h>
+#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
+#else	/* ! CONFIG_EFI */
+#define E820_X_MAX E820MAX
+#endif
+#else	/* ! __KERNEL__ */
+#define E820_X_MAX E820MAX
+#endif
+
 #define E820NR	0x1e8		/* # entries in E820MAP */
 
 #define E820_RAM	1
@@ -18,8 +53,59 @@ struct e820entry {
 
 struct e820map {
 	__u32 nr_map;
-	struct e820entry map[E820MAX];
+	struct e820entry map[E820_X_MAX];
 };
+
+extern struct e820map e820;
+
+extern int e820_any_mapped(u64 start, u64 end, unsigned type);
+extern int e820_all_mapped(u64 start, u64 end, unsigned type);
+extern void e820_add_region(u64 start, u64 size, int type);
+extern void e820_print_map(char *who);
+extern int
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
+extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
+extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
+			       unsigned new_type);
+extern void update_e820(void);
+extern void e820_setup_gap(void);
+struct setup_data;
+extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
+
+#if defined(CONFIG_X86_64) || \
+	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+extern void e820_mark_nosave_regions(unsigned long limit_pfn);
+#else
+static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+}
+#endif
+
+extern unsigned long end_user_pfn;
+
+extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
+extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
+extern void reserve_early(u64 start, u64 end, char *name);
+extern void free_early(u64 start, u64 end);
+extern void early_res_to_bootmem(u64 start, u64 end);
+extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+
+extern unsigned long e820_end_of_ram(void);
+extern int e820_find_active_region(const struct e820entry *ei,
+				  unsigned long start_pfn,
+				  unsigned long last_pfn,
+				  unsigned long *ei_startpfn,
+				  unsigned long *ei_endpfn);
+extern void e820_register_active_regions(int nid, unsigned long start_pfn,
+					 unsigned long end_pfn);
+extern u64 e820_hole_size(u64 start, u64 end);
+extern void finish_e820_parsing(void);
+extern void e820_reserve_resources(void);
+extern void setup_memory_map(void);
+extern char *default_machine_specific_memory_setup(void);
+extern char *machine_specific_memory_setup(void);
+extern char *memory_setup(void);
+
 #endif /* __ASSEMBLY__ */
 
 #define ISA_START_ADDRESS	0xa0000
@@ -30,11 +116,9 @@ struct e820map {
 #define BIOS_END		0x00100000
 
 #ifdef __KERNEL__
-#ifdef CONFIG_X86_32
-# include "e820_32.h"
-#else
-# include "e820_64.h"
-#endif
+#include <linux/ioport.h>
+
+#define HIGH_MEMORY	(1024*1024)
 #endif /* __KERNEL__ */
 
 #endif  /* __ASM_E820_H */

+ 0 - 50
include/asm-x86/e820_32.h

@@ -1,50 +0,0 @@
-/*
- * structures and definitions for the int 15, ax=e820 memory map
- * scheme.
- *
- * In a nutshell, arch/i386/boot/setup.S populates a scratch table
- * in the empty_zero_block that contains a list of usable address/size
- * duples.   In arch/i386/kernel/setup.c, this information is
- * transferred into the e820map, and in arch/i386/mm/init.c, that
- * new information is used to mark pages reserved or not.
- *
- */
-#ifndef __E820_HEADER
-#define __E820_HEADER
-
-#include <linux/ioport.h>
-
-#define HIGH_MEMORY	(1024*1024)
-
-#ifndef __ASSEMBLY__
-
-extern struct e820map e820;
-extern void update_e820(void);
-
-extern int e820_all_mapped(unsigned long start, unsigned long end,
-			   unsigned type);
-extern int e820_any_mapped(u64 start, u64 end, unsigned type);
-extern void propagate_e820_map(void);
-extern void register_bootmem_low_pages(unsigned long max_low_pfn);
-extern void add_memory_region(unsigned long long start,
-			      unsigned long long size, int type);
-extern void update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type);
-extern void e820_register_memory(void);
-extern void limit_regions(unsigned long long size);
-extern void print_memory_map(char *who);
-extern void init_iomem_resources(struct resource *code_resource,
-				 struct resource *data_resource,
-				 struct resource *bss_resource);
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-extern void e820_mark_nosave_regions(void);
-#else
-static inline void e820_mark_nosave_regions(void)
-{
-}
-#endif
-
-
-#endif/*!__ASSEMBLY__*/
-#endif/*__E820_HEADER*/

+ 0 - 56
include/asm-x86/e820_64.h

@@ -1,56 +0,0 @@
-/*
- * structures and definitions for the int 15, ax=e820 memory map
- * scheme.
- *
- * In a nutshell, setup.S populates a scratch table in the
- * empty_zero_block that contains a list of usable address/size
- * duples.  setup.c, this information is transferred into the e820map,
- * and in init.c/numa.c, that new information is used to mark pages
- * reserved or not.
- */
-#ifndef __E820_HEADER
-#define __E820_HEADER
-
-#include <linux/ioport.h>
-
-#ifndef __ASSEMBLY__
-extern unsigned long find_e820_area(unsigned long start, unsigned long end,
-				    unsigned long size, unsigned long align);
-extern unsigned long find_e820_area_size(unsigned long start,
-					 unsigned long *sizep,
-					 unsigned long align);
-extern void add_memory_region(unsigned long start, unsigned long size,
-			      int type);
-extern void update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type);
-extern void setup_memory_region(void);
-extern void contig_e820_setup(void);
-extern unsigned long e820_end_of_ram(void);
-extern void e820_reserve_resources(void);
-extern void e820_mark_nosave_regions(void);
-extern int e820_any_mapped(unsigned long start, unsigned long end,
-			   unsigned type);
-extern int e820_all_mapped(unsigned long start, unsigned long end,
-			   unsigned type);
-extern int e820_any_non_reserved(unsigned long start, unsigned long end);
-extern int is_memory_any_valid(unsigned long start, unsigned long end);
-extern int e820_all_non_reserved(unsigned long start, unsigned long end);
-extern int is_memory_all_valid(unsigned long start, unsigned long end);
-extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
-
-extern void e820_setup_gap(void);
-extern void e820_register_active_regions(int nid, unsigned long start_pfn,
-					 unsigned long end_pfn);
-
-extern void finish_e820_parsing(void);
-
-extern struct e820map e820;
-extern void update_e820(void);
-
-extern void reserve_early(unsigned long start, unsigned long end, char *name);
-extern void free_early(unsigned long start, unsigned long end);
-extern void early_res_to_bootmem(unsigned long start, unsigned long end);
-
-#endif/*!__ASSEMBLY__*/
-
-#endif/*__E820_HEADER*/

+ 1 - 1
include/asm-x86/efi.h

@@ -90,7 +90,7 @@ extern void *efi_ioremap(unsigned long addr, unsigned long size);
 
 #endif /* CONFIG_X86_32 */
 
-extern void efi_reserve_bootmem(void);
+extern void efi_reserve_early(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
 

+ 3 - 0
include/asm-x86/highmem.h

@@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
 
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */

+ 18 - 7
include/asm-x86/io_apic.h

@@ -121,21 +121,32 @@ extern int nr_ioapic_registers[MAX_IO_APICS];
 
 #define MP_MAX_IOAPIC_PIN 127
 
-struct mp_ioapic_routing {
-	int apic_id;
-	int gsi_base;
-	int gsi_end;
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+struct mp_config_ioapic {
+	unsigned long mp_apicaddr;
+	unsigned int mp_apicid;
+	unsigned char mp_type;
+	unsigned char mp_apicver;
+	unsigned char mp_flags;
+};
+
+struct mp_config_intsrc {
+	unsigned int mp_dstapic;
+	unsigned char mp_type;
+	unsigned char mp_irqtype;
+	unsigned short mp_irqflag;
+	unsigned char mp_srcbus;
+	unsigned char mp_srcbusirq;
+	unsigned char mp_dstirq;
 };
 
 /* I/O APIC entries */
-extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 
 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
 
 /* MP IRQ source entries */
-extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* non-0 if default (table-less) MP configuration */
 extern int mpc_default_type;

+ 1 - 0
include/asm-x86/ipi.h

@@ -20,6 +20,7 @@
 
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
+#include <asm/smp.h>
 
 /*
  * the following functions deal with sending IPIs between CPUs.

+ 0 - 8
include/asm-x86/mach-bigsmp/mach_mpspec.h

@@ -1,8 +0,0 @@
-#ifndef __ASM_MACH_MPSPEC_H
-#define __ASM_MACH_MPSPEC_H
-
-#define MAX_IRQ_SOURCES 256
-
-#define MAX_MP_BUSSES 32
-
-#endif /* __ASM_MACH_MPSPEC_H */

+ 0 - 8
include/asm-x86/mach-es7000/mach_mpspec.h

@@ -1,8 +0,0 @@
-#ifndef __ASM_MACH_MPSPEC_H
-#define __ASM_MACH_MPSPEC_H
-
-#define MAX_IRQ_SOURCES 256
-
-#define MAX_MP_BUSSES 256
-
-#endif /* __ASM_MACH_MPSPEC_H */

+ 5 - 2
include/asm-x86/mach-generic/mach_mpparse.h

@@ -1,7 +1,10 @@
 #ifndef _MACH_MPPARSE_H
 #define _MACH_MPPARSE_H 1
 
-int mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid); 
-int acpi_madt_oem_check(char *oem_id, char *oem_table_id); 
+
+extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
+			 char *productid);
+
+extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
 
 #endif

+ 8 - 31
include/asm-x86/mach-numaq/mach_apic.h

@@ -20,8 +20,14 @@ static inline cpumask_t target_cpus(void)
 #define INT_DELIVERY_MODE dest_LowestPrio
 #define INT_DEST_MODE 0     /* physical delivery on LOCAL quad */
  
-#define check_apicid_used(bitmap, apicid) physid_isset(apicid, bitmap)
-#define check_apicid_present(bit) physid_isset(bit, phys_cpu_present_map)
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+	return physid_isset(apicid, bitmap);
+}
+static inline unsigned long check_apicid_present(int bit)
+{
+	return physid_isset(bit, phys_cpu_present_map);
+}
 #define apicid_cluster(apicid) (apicid & 0xF0)
 
 static inline int apic_id_registered(void)
@@ -77,11 +83,6 @@ static inline int cpu_present_to_apicid(int mps_cpu)
 		return BAD_APICID;
 }
 
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
-	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
 static inline int apicid_to_node(int logical_apicid) 
 {
 	return logical_apicid >> 4;
@@ -95,30 +96,6 @@ static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
 	return physid_mask_of_physid(cpu + 4*node);
 }
 
-struct mpc_config_translation {
-	unsigned char mpc_type;
-	unsigned char trans_len;
-	unsigned char trans_type;
-	unsigned char trans_quad;
-	unsigned char trans_global;
-	unsigned char trans_local;
-	unsigned short trans_reserved;
-};
-
-static inline int mpc_apic_id(struct mpc_config_processor *m, 
-			struct mpc_config_translation *translation_record)
-{
-	int quad = translation_record->trans_quad;
-	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
-
-	printk("Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
-	       m->mpc_apicid,
-	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-	       m->mpc_apicver, quad, logical_apicid);
-	return logical_apicid;
-}
-
 extern void *xquad_portio;
 
 static inline void setup_portio_remap(void)

+ 2 - 9
include/asm-x86/mach-numaq/mach_mpparse.h

@@ -1,14 +1,7 @@
 #ifndef __ASM_MACH_MPPARSE_H
 #define __ASM_MACH_MPPARSE_H
 
-extern void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-			     struct mpc_config_translation *translation);
-extern void mpc_oem_pci_bus(struct mpc_config_bus *m,
-	struct mpc_config_translation *translation);
-
-/* Hook from generic ACPI tables.c */
-static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-}
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+				char *productid);
 
 #endif /* __ASM_MACH_MPPARSE_H */

+ 0 - 8
include/asm-x86/mach-numaq/mach_mpspec.h

@@ -1,8 +0,0 @@
-#ifndef __ASM_MACH_MPSPEC_H
-#define __ASM_MACH_MPSPEC_H
-
-#define MAX_IRQ_SOURCES 512
-
-#define MAX_MP_BUSSES 32
-
-#endif /* __ASM_MACH_MPSPEC_H */

+ 0 - 9
include/asm-x86/mach-summit/mach_mpspec.h

@@ -1,9 +0,0 @@
-#ifndef __ASM_MACH_MPSPEC_H
-#define __ASM_MACH_MPSPEC_H
-
-#define MAX_IRQ_SOURCES 256
-
-/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
-#define MAX_MP_BUSSES 260
-
-#endif /* __ASM_MACH_MPSPEC_H */

+ 6 - 20
include/asm-x86/mmzone_32.h

@@ -12,11 +12,9 @@
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)	(node_data[nid])
 
-#ifdef CONFIG_X86_NUMAQ
-	#include <asm/numaq.h>
-#elif defined(CONFIG_ACPI_SRAT)/* summit or generic arch */
-	#include <asm/srat.h>
-#endif
+#include <asm/numaq.h>
+/* summit or generic arch */
+#include <asm/srat.h>
 
 extern int get_memcfg_numa_flat(void);
 /*
@@ -26,28 +24,20 @@ extern int get_memcfg_numa_flat(void);
  */
 static inline void get_memcfg_numa(void)
 {
-#ifdef CONFIG_X86_NUMAQ
+
 	if (get_memcfg_numaq())
 		return;
-#elif defined(CONFIG_ACPI_SRAT)
 	if (get_memcfg_from_srat())
 		return;
-#endif
-
 	get_memcfg_numa_flat();
 }
 
 extern int early_pfn_to_nid(unsigned long pfn);
-extern void numa_kva_reserve(void);
 
 #else /* !CONFIG_NUMA */
 
 #define get_memcfg_numa get_memcfg_numa_flat
-#define get_zholes_size(n) (0)
 
-static inline void numa_kva_reserve(void)
-{
-}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DISCONTIGMEM
@@ -55,14 +45,14 @@ static inline void numa_kva_reserve(void)
 /*
  * generic node memory support, the following assumptions apply:
  *
- * 1) memory comes in 256Mb contigious chunks which are either present or not
+ * 1) memory comes in 64Mb contiguous chunks which are either present or not
  * 2) we will not have more than 64Gb in total
  *
  * for now assume that 64Gb is max amount of RAM for whole system
  *    64Gb / 4096bytes/page = 16777216 pages
  */
 #define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 256
+#define MAX_ELEMENTS 1024
 #define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
 
 extern s8 physnode_map[];
@@ -87,9 +77,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
 })
 
-#ifdef CONFIG_X86_NUMAQ            /* we have contiguous memory on NUMA-Q */
-#define pfn_valid(pfn)          ((pfn) < num_physpages)
-#else
 static inline int pfn_valid(int pfn)
 {
 	int nid = pfn_to_nid(pfn);
@@ -98,7 +85,6 @@ static inline int pfn_valid(int pfn)
 		return (pfn < node_end_pfn(nid));
 	return 0;
 }
-#endif /* CONFIG_X86_NUMAQ */
 
 #endif /* CONFIG_DISCONTIGMEM */
 

+ 25 - 4
include/asm-x86/mpspec.h

@@ -13,6 +13,12 @@ extern int apic_version[MAX_APICS];
 extern u8 apicid_2_node[];
 extern int pic_mode;
 
+#ifdef CONFIG_X86_NUMAQ
+extern int mp_bus_id_to_node[MAX_MP_BUSSES];
+extern int mp_bus_id_to_local[MAX_MP_BUSSES];
+extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+#endif
+
 #define MAX_APICID 256
 
 #else
@@ -21,26 +27,30 @@ extern int pic_mode;
 /* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
 #define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 
+#endif
+
 extern void early_find_smp_config(void);
 extern void early_get_smp_config(void);
 
-#endif
-
 #if defined(CONFIG_MCA) || defined(CONFIG_EISA)
 extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
 
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
-extern int mp_bus_id_to_pci_bus[MAX_MP_BUSSES];
-
 extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int max_physical_apicid;
 extern int smp_found_config;
 extern int mpc_default_type;
 extern unsigned long mp_lapic_addr;
 
 extern void find_smp_config(void);
 extern void get_smp_config(void);
+#ifdef CONFIG_X86_MPPARSE
+extern void early_reserve_e820_mpc_new(void);
+#else
+static inline void early_reserve_e820_mpc_new(void) { }
+#endif
 
 void __cpuinit generic_processor_info(int apicid, int version);
 #ifdef CONFIG_ACPI
@@ -49,6 +59,17 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 				   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
 extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+#ifdef CONFIG_X86_IO_APIC
+extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+				u32 gsi, int triggering, int polarity);
+#else
+static inline int
+mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+		   u32 gsi, int triggering, int polarity)
+{
+	return 0;
+}
+#endif
 #endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)

+ 1 - 1
include/asm-x86/numa_32.h

@@ -5,7 +5,7 @@ extern int pxm_to_nid(int pxm);
 
 #ifdef CONFIG_NUMA
 extern void __init remap_numa_kva(void);
-extern void set_highmem_pages_init(int);
+extern void set_highmem_pages_init(void);
 #else
 static inline void remap_numa_kva(void)
 {

+ 4 - 2
include/asm-x86/numaq.h

@@ -28,6 +28,7 @@
 
 #ifdef CONFIG_X86_NUMAQ
 
+extern int found_numaq;
 extern int get_memcfg_numaq(void);
 
 /*
@@ -156,9 +157,10 @@ struct sys_cfg_data {
 	struct		eachquadmem eq[MAX_NUMNODES];	/* indexed by quad id */
 };
 
-static inline unsigned long *get_zholes_size(int nid)
+#else
+static inline int get_memcfg_numaq(void)
 {
-	return NULL;
+	return 0;
 }
 #endif /* CONFIG_X86_NUMAQ */
 #endif /* NUMAQ_H */

+ 0 - 2
include/asm-x86/proto.h

@@ -14,8 +14,6 @@ extern void ia32_syscall(void);
 extern void ia32_cstar_target(void);
 extern void ia32_sysenter_target(void);
 
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
-
 extern void syscall32_cpu_init(void);
 
 extern void check_efer(void);

+ 3 - 12
include/asm-x86/setup.h

@@ -8,7 +8,6 @@
 /* Interrupt control for vSMPowered x86_64 systems */
 void vsmp_init(void);
 
-char *machine_specific_memory_setup(void);
 #ifndef CONFIG_PARAVIRT
 #define paravirt_post_allocator_init()	do {} while (0)
 #endif
@@ -43,27 +42,19 @@ char *machine_specific_memory_setup(void);
  */
 extern struct boot_params boot_params;
 
-#ifdef __i386__
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
  */
 #define LOWMEMSIZE()	(0x9f000)
 
-struct e820entry;
-
-char * __init machine_specific_memory_setup(void);
-char *memory_setup(void);
+#ifdef __i386__
 
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map);
-int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map);
-void __init add_memory_region(unsigned long long start,
-			      unsigned long long size, int type);
+void __init i386_start_kernel(void);
 
+extern unsigned long init_pg_tables_start;
 extern unsigned long init_pg_tables_end;
 
-
-
 #endif /* __i386__ */
 #endif /* _SETUP */
 #endif /* __ASSEMBLY__ */

+ 0 - 1
include/asm-x86/smp.h

@@ -201,7 +201,6 @@ extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
 #endif
 
-extern void smp_alloc_memory(void);
 extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
 #endif /* __ASSEMBLY__ */

+ 7 - 5
include/asm-x86/srat.h

@@ -27,11 +27,13 @@
 #ifndef _ASM_SRAT_H_
 #define _ASM_SRAT_H_
 
-#ifndef CONFIG_ACPI_SRAT
-#error CONFIG_ACPI_SRAT not defined, and srat.h header has been included
-#endif
-
+#ifdef CONFIG_ACPI_SRAT
 extern int get_memcfg_from_srat(void);
-extern unsigned long *get_zholes_size(int);
+#else
+static inline int get_memcfg_from_srat(void)
+{
+	return 0;
+}
+#endif
 
 #endif /* _ASM_SRAT_H_ */

+ 0 - 1
include/asm-x86/system.h

@@ -303,7 +303,6 @@ static inline void clflush(volatile void *__p)
 void disable_hlt(void);
 void enable_hlt(void);
 
-extern int es7000_plat;
 void cpu_idle_wait(void);
 
 extern unsigned long arch_align_stack(unsigned long sp);

+ 2 - 0
include/linux/bootmem.h

@@ -84,6 +84,8 @@ extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
 	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
+				   int flags);
 extern unsigned long free_all_bootmem(void);
 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,

+ 0 - 4
include/linux/efi.h

@@ -287,7 +287,6 @@ efi_guid_unparse(efi_guid_t *guid, char *out)
 extern void efi_init (void);
 extern void *efi_get_pal_addr (void);
 extern void efi_map_pal_code (void);
-extern void efi_map_memmap(void);
 extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
 extern void efi_gettimeofday (struct timespec *ts);
 extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if possible */
@@ -295,14 +294,11 @@ extern u64 efi_get_iobase (void);
 extern u32 efi_mem_type (unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
-extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size,
-				    u64 attr);
 extern int __init efi_uart_console_only (void);
 extern void efi_initialize_iomem_resources(struct resource *code_resource,
 		struct resource *data_resource, struct resource *bss_resource);
 extern unsigned long efi_get_time(void);
 extern int efi_set_rtc_mmss(unsigned long nowtime);
-extern int is_available_memory(efi_memory_desc_t * md);
 extern struct efi_memory_map memmap;
 
 /**

+ 4 - 2
include/linux/mm.h

@@ -998,8 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat,
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
 extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
-extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-						unsigned long new_end_pfn);
+extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
 extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
@@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void);
 extern unsigned long find_max_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
+typedef void (*work_fn_t)(unsigned long, unsigned long, void *);
+extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 extern int early_pfn_to_nid(unsigned long pfn);

+ 3 - 5
include/linux/pageblock-flags.h

@@ -25,13 +25,11 @@
 
 #include <linux/types.h>
 
-/* Macro to aid the definition of ranges of bits */
-#define PB_range(name, required_bits) \
-	name, name ## _end = (name + required_bits) - 1
-
 /* Bit indices that affect a whole block of pages */
 enum pageblock_bits {
-	PB_range(PB_migrate, 3), /* 3 bits required for migrate types */
+	PB_migrate,
+	PB_migrate_end = PB_migrate + 3 - 1,
+			/* 3 bits required for migrate types */
 	NR_PAGEBLOCK_BITS
 };
 

+ 67 - 13
mm/page_alloc.c

@@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid,
 	}
 }
 
+void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid)
+		work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn,
+			data);
+}
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -3461,6 +3469,11 @@ void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
+		nid, (unsigned long)pgdat,
+		(unsigned long)pgdat->node_mem_map);
+#endif
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
@@ -3547,27 +3560,68 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 }
 
 /**
- * shrink_active_range - Shrink an existing registered range of PFNs
+ * remove_active_range - Shrink an existing registered range of PFNs
  * @nid: The node id the range is on that should be shrunk
- * @old_end_pfn: The old end PFN of the range
- * @new_end_pfn: The new PFN of the range
+ * @start_pfn: The new PFN of the range
+ * @end_pfn: The new PFN of the range
  *
  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept at the end physical page range that has already been
- * registered with add_active_range(). This function allows an arch to shrink
- * an existing registered range.
+ * The map is kept near the end physical page range that has already been
+ * registered. This function allows an arch to shrink an existing registered
+ * range.
  */
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-						unsigned long new_end_pfn)
+void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
+				unsigned long end_pfn)
 {
-	int i;
+	int i, j;
+	int removed = 0;
+
+	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
+			  nid, start_pfn, end_pfn);
 
 	/* Find the old active region end and shrink */
-	for_each_active_range_index_in_nid(i, nid)
-		if (early_node_map[i].end_pfn == old_end_pfn) {
-			early_node_map[i].end_pfn = new_end_pfn;
-			break;
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn >= start_pfn &&
+		    early_node_map[i].end_pfn <= end_pfn) {
+			/* clear it */
+			early_node_map[i].start_pfn = 0;
+			early_node_map[i].end_pfn = 0;
+			removed = 1;
+			continue;
 		}
+		if (early_node_map[i].start_pfn < start_pfn &&
+		    early_node_map[i].end_pfn > start_pfn) {
+			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
+			early_node_map[i].end_pfn = start_pfn;
+			if (temp_end_pfn > end_pfn)
+				add_active_range(nid, end_pfn, temp_end_pfn);
+			continue;
+		}
+		if (early_node_map[i].start_pfn >= start_pfn &&
+		    early_node_map[i].end_pfn > end_pfn &&
+		    early_node_map[i].start_pfn < end_pfn) {
+			early_node_map[i].start_pfn = end_pfn;
+			continue;
+		}
+	}
+
+	if (!removed)
+		return;
+
+	/* remove the blank ones */
+	for (i = nr_nodemap_entries - 1; i > 0; i--) {
+		if (early_node_map[i].nid != nid)
+			continue;
+		if (early_node_map[i].end_pfn)
+			continue;
+		/* we found it, get rid of it */
+		for (j = i; j < nr_nodemap_entries - 1; j++)
+			memcpy(&early_node_map[j], &early_node_map[j+1],
+				sizeof(early_node_map[j]));
+		j = nr_nodemap_entries - 1;
+		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
+		nr_nodemap_entries--;
+	}
 }
 
 /**

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio