
define new percpu interface for shared data

The per cpu data section contains two types of data: one set that is exclusively
accessed by the local cpu, and another set that is per cpu but is also shared
with remote cpus.  In the current kernel, these two sets are not clearly
separated.  As a result, the same cacheline can end up shared between the two
sets of data, which causes unnecessary bouncing of that cacheline between cpus.

One way to fix the problem is to cacheline-align the remotely accessed per cpu
data at both the beginning and the end.  However, the padding at both ends
wastes memory, and the interface needed to achieve this is not clean.

This patch:

Move the remotely accessed per cpu data (currently marked
____cacheline_aligned_in_smp) into a separate section,
.data.percpu.shared_aligned, in which every data element is cacheline aligned.
This cleanly separates the local-only data from the remotely accessed data.
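
For illustration only, a hedged usage sketch of the new macro (the struct, its
fields, and the variable name below are hypothetical, not part of this patch);
accessors such as per_cpu() and __get_cpu_var() are unchanged, only the
placement of the definition moves:

  #include <linux/percpu.h>

  /* Hypothetical per cpu data that remote cpus also touch. */
  struct shared_counters {
  	unsigned long local_events;	/* updated by the owning cpu */
  	unsigned long remote_reads;	/* read by other cpus */
  };

  /*
   * Old pattern: the definition lands in .data.percpu next to local-only data:
   *   DEFINE_PER_CPU(struct shared_counters, counters) ____cacheline_aligned_in_smp;
   * New pattern: the definition lands in .data.percpu.shared_aligned, with each
   * element cacheline aligned:
   */
  DEFINE_PER_CPU_SHARED_ALIGNED(struct shared_counters, counters);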

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: <linux-arch@vger.kernel.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Fenghua Yu, 18 years ago
commit 5fb7dc37dc

+ 1 - 4
arch/alpha/kernel/vmlinux.lds.S

@@ -69,10 +69,7 @@ SECTIONS
   . = ALIGN(8);
   SECURITY_INIT

-  . = ALIGN(8192);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(8192)

   . = ALIGN(2*8192);
   __init_end = .;

+ 1 - 0
arch/arm/kernel/vmlinux.lds.S

@@ -66,6 +66,7 @@ SECTIONS
 		. = ALIGN(4096);
 		__per_cpu_start = .;
 			*(.data.percpu)
+			*(.data.percpu.shared_aligned)
 		__per_cpu_end = .;
 #ifndef CONFIG_XIP_KERNEL
 		__init_begin = _stext;

+ 1 - 4
arch/cris/arch-v32/vmlinux.lds.S

@@ -91,10 +91,7 @@ SECTIONS
 	}
 	SECURITY_INIT

-	. =  ALIGN (8192);
-	__per_cpu_start = .;
-	.data.percpu  : { *(.data.percpu) }
-	__per_cpu_end = .;
+	PERCPU(8192)

 #ifdef CONFIG_BLK_DEV_INITRD
 	.init.ramfs : {

+ 1 - 4
arch/frv/kernel/vmlinux.lds.S

@@ -57,10 +57,7 @@ SECTIONS
   __alt_instructions_end = .;
  .altinstr_replacement : { *(.altinstr_replacement) }

-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)

 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);

+ 1 - 0
arch/i386/kernel/vmlinux.lds.S

@@ -181,6 +181,7 @@ SECTIONS
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
 	__per_cpu_start = .;
 	*(.data.percpu)
+	*(.data.percpu.shared_aligned)
 	__per_cpu_end = .;
   }
   . = ALIGN(4096);

+ 1 - 0
arch/ia64/kernel/vmlinux.lds.S

@@ -206,6 +206,7 @@ SECTIONS
 	{
 		__per_cpu_start = .;
 		*(.data.percpu)
+		*(.data.percpu.shared_aligned)
 		__per_cpu_end = .;
 	}
   . = __phys_per_cpu_start + PERCPU_PAGE_SIZE;	/* ensure percpu data fits

+ 1 - 4
arch/m32r/kernel/vmlinux.lds.S

@@ -110,10 +110,7 @@ SECTIONS
   __initramfs_end = .;
 #endif

-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */

+ 1 - 4
arch/mips/kernel/vmlinux.lds.S

@@ -119,10 +119,7 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(_PAGE_SIZE)
   . = ALIGN(_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */

+ 3 - 4
arch/parisc/kernel/vmlinux.lds.S

@@ -181,10 +181,9 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(ASM_PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+
+  PERCPU(ASM_PAGE_SIZE)
+
   . = ALIGN(ASM_PAGE_SIZE);
   __init_end = .;
   /* freed after init ends here */

+ 1 - 0
arch/powerpc/kernel/vmlinux.lds.S

@@ -144,6 +144,7 @@ SECTIONS
 	.data.percpu : {
 		__per_cpu_start = .;
 		*(.data.percpu)
+		*(.data.percpu.shared_aligned)
 		__per_cpu_end = .;
 	}


+ 1 - 4
arch/ppc/kernel/vmlinux.lds.S

@@ -130,10 +130,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;

-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)

 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(4096);

+ 1 - 4
arch/s390/kernel/vmlinux.lds.S

@@ -107,10 +107,7 @@ SECTIONS
   . = ALIGN(2);
   __initramfs_end = .;
 #endif
-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   /* freed after init ends here */

+ 1 - 4
arch/sh/kernel/vmlinux.lds.S

@@ -60,10 +60,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;

-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
   .data.cacheline_aligned : { *(.data.cacheline_aligned) }

   _edata = .;			/* End of data section */

+ 4 - 1
arch/sh64/kernel/vmlinux.lds.S

@@ -87,7 +87,10 @@ SECTIONS

   . = ALIGN(PAGE_SIZE);
   __per_cpu_start = .;
-  .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
+  .data.percpu : C_PHYS(.data.percpu) {
+	*(.data.percpu)
+	*(.data.percpu.shared_aligned)
+  }
   __per_cpu_end = . ;
   .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) }


+ 1 - 4
arch/sparc/kernel/vmlinux.lds.S

@@ -65,10 +65,7 @@ SECTIONS
   __initramfs_end = .;
 #endif

-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
   . = ALIGN(4096);
   __init_end = .;
   . = ALIGN(32);

+ 2 - 4
arch/sparc64/kernel/vmlinux.lds.S

@@ -90,10 +90,8 @@ SECTIONS
   __initramfs_end = .;
 #endif

-  . = ALIGN(PAGE_SIZE);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(PAGE_SIZE)
+
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
   __bss_start = .;

+ 2 - 4
arch/x86_64/kernel/vmlinux.lds.S

@@ -194,10 +194,8 @@ SECTIONS
   __initramfs_end = .;
 #endif

-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)
+
   . = ALIGN(4096);
   __init_end = .;


+ 1 - 4
arch/xtensa/kernel/vmlinux.lds.S

@@ -190,10 +190,7 @@ SECTIONS
   __initramfs_end = .;
 #endif

-  . = ALIGN(4096);
-  __per_cpu_start = .;
-  .data.percpu  : { *(.data.percpu) }
-  __per_cpu_end = .;
+  PERCPU(4096)


   /* We need this dummy segment here */

+ 8 - 0
include/asm-generic/percpu.h

@@ -14,6 +14,11 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -34,6 +39,9 @@ do {								\
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)
+
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 #define __raw_get_cpu_var(var)			per_cpu__##var

+ 8 - 0
include/asm-generic/vmlinux.lds.h

@@ -245,3 +245,11 @@
   	*(.initcall7.init)						\
   	*(.initcall7s.init)

+#define PERCPU(align)							\
+	. = ALIGN(align);						\
+	__per_cpu_start = .;						\
+	.data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {		\
+		*(.data.percpu)						\
+		*(.data.percpu.shared_aligned)				\
+	}								\
+	__per_cpu_end = .;

+ 5 - 0
include/asm-i386/percpu.h

@@ -54,6 +54,11 @@ extern unsigned long __per_cpu_offset[];
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);


+ 10 - 0
include/asm-ia64/percpu.h

@@ -29,6 +29,16 @@
 	__attribute__((__section__(".data.percpu")))		\
 	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name

+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)			\
+	__attribute__((__section__(".data.percpu.shared_aligned")))	\
+	__SMALL_ADDR_AREA __typeof__(type) per_cpu__##name		\
+	____cacheline_aligned_in_smp
+#else
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+	DEFINE_PER_CPU(type, name)
+#endif
+
 /*
  * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
  * external routine, to avoid include-hell.

+ 7 - 0
include/asm-powerpc/percpu.h

@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -40,6 +45,8 @@ extern void setup_per_cpu_areas(void);

 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)

 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var

+ 7 - 0
include/asm-s390/percpu.h

@@ -41,6 +41,11 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
     __attribute__((__section__(".data.percpu"))) \
     __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -59,6 +64,8 @@ do {								\

 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)

 #define __get_cpu_var(var) __reloc_hide(var,0)
 #define __raw_get_cpu_var(var) __reloc_hide(var,0)

+ 7 - 0
include/asm-sparc64/percpu.h

@@ -18,6 +18,11 @@ extern unsigned long __per_cpu_shift;
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_aligned_in_smp
+
 register unsigned long __local_per_cpu_offset asm("g5");

 /* var is in discarded region: offset to particular copy we want */
@@ -38,6 +43,8 @@ do {								\
 #define real_setup_per_cpu_areas()		do { } while (0)
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)

 #define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var

+ 7 - 0
include/asm-x86_64/percpu.h

@@ -20,6 +20,11 @@
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		\
+    __attribute__((__section__(".data.percpu.shared_aligned"))) \
+    __typeof__(type) per_cpu__##name				\
+    ____cacheline_internodealigned_in_smp
+
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*({				\
 	extern int simple_identifier_##var(void);	\
@@ -46,6 +51,8 @@ extern void setup_per_cpu_areas(void);

 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)	\
+    DEFINE_PER_CPU(type, name)

 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var