
Merge branch 'tracing/ftrace' into auto-ftrace-next

Ingo Molnar, 17 years ago
commit bac0c9103b
89 changed files with 8834 additions and 115 deletions
  1. Makefile (+4 -0)
  2. arch/arm/Kconfig (+2 -0)
  3. arch/arm/boot/compressed/Makefile (+6 -0)
  4. arch/arm/kernel/Makefile (+5 -0)
  5. arch/arm/kernel/armksyms.c (+5 -0)
  6. arch/arm/kernel/entry-common.S (+51 -0)
  7. arch/arm/kernel/ftrace.c (+116 -0)
  8. arch/arm/kernel/kprobes.c (+1 -1)
  9. arch/powerpc/Kconfig (+3 -1)
  10. arch/powerpc/kernel/Makefile (+14 -0)
  11. arch/powerpc/kernel/entry_32.S (+127 -0)
  12. arch/powerpc/kernel/entry_64.S (+65 -0)
  13. arch/powerpc/kernel/ftrace.c (+154 -0)
  14. arch/powerpc/kernel/io.c (+2 -1)
  15. arch/powerpc/kernel/irq.c (+3 -3)
  16. arch/powerpc/kernel/ppc_ksyms.c (+5 -0)
  17. arch/powerpc/kernel/setup_32.c (+3 -3)
  18. arch/powerpc/platforms/powermac/Makefile (+5 -0)
  19. arch/sparc64/Kconfig (+2 -0)
  20. arch/sparc64/Kconfig.debug (+1 -1)
  21. arch/sparc64/kernel/Makefile (+1 -0)
  22. arch/sparc64/kernel/ftrace.c (+94 -0)
  23. arch/sparc64/kernel/sparc64_ksyms.c (+2 -2)
  24. arch/sparc64/lib/mcount.S (+54 -4)
  25. arch/x86/Kconfig (+2 -0)
  26. arch/x86/Kconfig.debug (+8 -0)
  27. arch/x86/kernel/Makefile (+8 -0)
  28. arch/x86/kernel/alternative.c (+11 -11)
  29. arch/x86/kernel/entry_32.S (+72 -0)
  30. arch/x86/kernel/entry_64.S (+106 -0)
  31. arch/x86/kernel/ftrace.c (+141 -0)
  32. arch/x86/kernel/i386_ksyms_32.c (+8 -1)
  33. arch/x86/kernel/machine_kexec_32.c (+4 -0)
  34. arch/x86/kernel/machine_kexec_64.c (+4 -0)
  35. arch/x86/kernel/process_32.c (+3 -0)
  36. arch/x86/kernel/process_64.c (+3 -0)
  37. arch/x86/kernel/vsyscall_64.c (+2 -1)
  38. arch/x86/kernel/x8664_ksyms_64.c (+9 -2)
  39. arch/x86/lib/Makefile (+1 -0)
  40. arch/x86/lib/thunk_32.S (+47 -0)
  41. arch/x86/lib/thunk_64.S (+17 -2)
  42. arch/x86/mm/fault.c (+56 -0)
  43. arch/x86/mm/init_32.c (+4 -0)
  44. arch/x86/mm/init_64.c (+8 -2)
  45. arch/x86/vdso/vclock_gettime.c (+8 -7)
  46. arch/x86/vdso/vgetcpu.c (+2 -1)
  47. include/asm-arm/ftrace.h (+14 -0)
  48. include/asm-arm/kprobes.h (+1 -0)
  49. include/asm-powerpc/ftrace.h (+14 -0)
  50. include/asm-powerpc/hw_irq.h (+10 -0)
  51. include/asm-sparc64/ftrace.h (+14 -0)
  52. include/asm-x86/alternative.h (+2 -0)
  53. include/asm-x86/ftrace.h (+14 -0)
  54. include/asm-x86/irqflags.h (+2 -22)
  55. include/asm-x86/kdebug.h (+9 -0)
  56. include/asm-x86/vsyscall.h (+2 -1)
  57. include/linux/ftrace.h (+143 -0)
  58. include/linux/irqflags.h (+11 -2)
  59. include/linux/kprobes.h (+4 -0)
  60. include/linux/linkage.h (+2 -0)
  61. include/linux/marker.h (+29 -11)
  62. include/linux/preempt.h (+33 -1)
  63. include/linux/sched.h (+16 -0)
  64. include/linux/writeback.h (+2 -0)
  65. kernel/Makefile (+14 -0)
  66. kernel/fork.c (+1 -1)
  67. kernel/lockdep.c (+26 -7)
  68. kernel/marker.c (+14 -16)
  69. kernel/printk.c (+2 -0)
  70. kernel/sched.c (+54 -3)
  71. kernel/semaphore.c (+1 -0)
  72. kernel/spinlock.c (+1 -1)
  73. kernel/sysctl.c (+11 -0)
  74. kernel/trace/Kconfig (+127 -0)
  75. kernel/trace/Makefile (+22 -0)
  76. kernel/trace/ftrace.c (+1710 -0)
  77. kernel/trace/trace.c (+3100 -0)
  78. kernel/trace/trace.h (+313 -0)
  79. kernel/trace/trace_functions.c (+78 -0)
  80. kernel/trace/trace_irqsoff.c (+486 -0)
  81. kernel/trace/trace_sched_switch.c (+286 -0)
  82. kernel/trace/trace_sched_wakeup.c (+447 -0)
  83. kernel/trace/trace_selftest.c (+540 -0)
  84. kernel/trace/trace_selftest_dynamic.c (+7 -0)
  85. lib/Kconfig.debug (+2 -0)
  86. lib/Makefile (+9 -0)
  87. lib/smp_processor_id.c (+3 -3)
  88. mm/page-writeback.c (+7 -3)
  89. scripts/Makefile.lib (+2 -1)

+ 4 - 0
Makefile

@@ -528,6 +528,10 @@ KBUILD_CFLAGS	+= -g
 KBUILD_AFLAGS	+= -gdwarf-2
 endif
 
+ifdef CONFIG_FTRACE
+KBUILD_CFLAGS	+= -pg
+endif
+
 # We trigger additional mismatches with less inlining
 ifdef CONFIG_DEBUG_SECTION_MISMATCH
 KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)

+ 2 - 0
arch/arm/Kconfig

@@ -14,6 +14,8 @@ config ARM
 	select HAVE_OPROFILE
 	select HAVE_KPROBES if (!XIP_KERNEL)
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
+	select HAVE_FTRACE if (!XIP_KERNEL)
+	select HAVE_DYNAMIC_FTRACE if (HAVE_FTRACE)
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and

+ 6 - 0
arch/arm/boot/compressed/Makefile

@@ -69,6 +69,12 @@ SEDFLAGS	= s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/
 
 targets       := vmlinux vmlinux.lds piggy.gz piggy.o font.o font.c \
 		 head.o misc.o $(OBJS)
+
+ifeq ($(CONFIG_FTRACE),y)
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
+endif
+
 EXTRA_CFLAGS  := -fpic -fno-builtin
 EXTRA_AFLAGS  :=
 

+ 5 - 0
arch/arm/kernel/Makefile

@@ -4,6 +4,10 @@
 
 AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
 
+ifdef CONFIG_DYNAMIC_FTRACE
+CFLAGS_REMOVE_ftrace.o = -pg
+endif
+
 # Object file lists.
 
 obj-y		:= compat.o entry-armv.o entry-common.o irq.o \
@@ -18,6 +22,7 @@ obj-$(CONFIG_ARTHUR)		+= arthur.o
 obj-$(CONFIG_ISA_DMA)		+= dma-isa.o
 obj-$(CONFIG_PCI)		+= bios32.o isa.o
 obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o kprobes-decode.o
 obj-$(CONFIG_ATAGS_PROC)	+= atags.o

+ 5 - 0
arch/arm/kernel/armksyms.c

@@ -18,6 +18,7 @@
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
+#include <asm/ftrace.h>
 
 /*
  * libgcc functions - functions that are used internally by the
@@ -181,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be);
 #endif
 
 EXPORT_SYMBOL(copy_page);
+
+#ifdef CONFIG_FTRACE
+EXPORT_SYMBOL(mcount);
+#endif

+ 51 - 0
arch/arm/kernel/entry-common.S

@@ -9,6 +9,7 @@
  */
 
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 #include <asm/arch/entry-macro.S>
 
 #include "entry-header.S"
@@ -99,6 +100,56 @@ ENTRY(ret_from_fork)
 #undef CALL
 #define CALL(x) .long x
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+	stmdb sp!, {r0-r3, lr}
+	mov r0, lr
+	sub r0, r0, #MCOUNT_INSN_SIZE
+
+	.globl mcount_call
+mcount_call:
+	bl ftrace_stub
+	ldmia sp!, {r0-r3, pc}
+
+ENTRY(ftrace_caller)
+	stmdb sp!, {r0-r3, lr}
+	ldr r1, [fp, #-4]
+	mov r0, lr
+	sub r0, r0, #MCOUNT_INSN_SIZE
+
+	.globl ftrace_call
+ftrace_call:
+	bl ftrace_stub
+	ldmia sp!, {r0-r3, pc}
+
+#else
+
+ENTRY(mcount)
+	stmdb sp!, {r0-r3, lr}
+	ldr r0, =ftrace_trace_function
+	ldr r2, [r0]
+	adr r0, ftrace_stub
+	cmp r0, r2
+	bne trace
+	ldmia sp!, {r0-r3, pc}
+
+trace:
+	ldr r1, [fp, #-4]
+	mov r0, lr
+	sub r0, r0, #MCOUNT_INSN_SIZE
+	mov lr, pc
+	mov pc, r2
+	ldmia sp!, {r0-r3, pc}
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+	.globl ftrace_stub
+ftrace_stub:
+	mov pc, lr
+
+#endif /* CONFIG_FTRACE */
+
 /*=============================================================================
  * SWI handler
  *-----------------------------------------------------------------------------

+ 116 - 0
arch/arm/kernel/ftrace.c

@@ -0,0 +1,116 @@
+/*
+ * Dynamic function tracing support.
+ *
+ * Copyright (C) 2008 Abhishek Sagar <sagar.abhishek@gmail.com>
+ *
+ * For licencing details, see COPYING.
+ *
+ * Defines low-level handling of mcount calls when the kernel
+ * is compiled with the -pg flag. When using dynamic ftrace, the
+ * mcount call-sites get patched lazily with NOP till they are
+ * enabled. All code mutation routines here take effect atomically.
+ */
+
+#include <linux/ftrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+
+#define PC_OFFSET      8
+#define BL_OPCODE      0xeb000000
+#define BL_OFFSET_MASK 0x00ffffff
+
+static unsigned long bl_insn;
+static const unsigned long NOP = 0xe1a00000; /* mov r0, r0 */
+
+unsigned char *ftrace_nop_replace(void)
+{
+	return (char *)&NOP;
+}
+
+/* construct a branch (BL) instruction to addr */
+unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr)
+{
+	long offset;
+
+	offset = (long)addr - (long)(pc + PC_OFFSET);
+	if (unlikely(offset < -33554432 || offset > 33554428)) {
+		/* Can't generate branches that far (from ARM ARM). Ftrace
+		 * doesn't generate branches outside of kernel text.
+		 */
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+	offset = (offset >> 2) & BL_OFFSET_MASK;
+	bl_insn = BL_OPCODE | offset;
+	return (unsigned char *)&bl_insn;
+}
+
+int ftrace_modify_code(unsigned long pc, unsigned char *old_code,
+		       unsigned char *new_code)
+{
+	unsigned long err = 0, replaced = 0, old, new;
+
+	old = *(unsigned long *)old_code;
+	new = *(unsigned long *)new_code;
+
+	__asm__ __volatile__ (
+		"1:  ldr    %1, [%2]  \n"
+		"    cmp    %1, %4    \n"
+		"2:  streq  %3, [%2]  \n"
+		"    cmpne  %1, %3    \n"
+		"    movne  %0, #2    \n"
+		"3:\n"
+
+		".section .fixup, \"ax\"\n"
+		"4:  mov  %0, #1  \n"
+		"    b    3b      \n"
+		".previous\n"
+
+		".section __ex_table, \"a\"\n"
+		"    .long 1b, 4b \n"
+		"    .long 2b, 4b \n"
+		".previous\n"
+
+		: "=r"(err), "=r"(replaced)
+		: "r"(pc), "r"(new), "r"(old), "0"(err), "1"(replaced)
+		: "memory");
+
+	if (!err && (replaced == old))
+		flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+
+	return err;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	int ret;
+	unsigned long pc, old;
+	unsigned char *new;
+
+	pc = (unsigned long)&ftrace_call;
+	memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(pc, (unsigned long)func);
+	ret = ftrace_modify_code(pc, (unsigned char *)&old, new);
+	return ret;
+}
+
+int ftrace_mcount_set(unsigned long *data)
+{
+	unsigned long pc, old;
+	unsigned long *addr = data;
+	unsigned char *new;
+
+	pc = (unsigned long)&mcount_call;
+	memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(pc, *addr);
+	*addr = ftrace_modify_code(pc, (unsigned char *)&old, new);
+	return 0;
+}
+
+/* run from kstop_machine */
+int __init ftrace_dyn_arch_init(void *data)
+{
+	ftrace_mcount_set(data);
+	return 0;
+}

+ 1 - 1
arch/arm/kernel/kprobes.c

@@ -274,7 +274,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
  * for kretprobe handlers which should normally be interested in r0 only
  * anyway.
  */
-static void __attribute__((naked)) __kprobes kretprobe_trampoline(void)
+void __naked __kprobes kretprobe_trampoline(void)
 {
 	__asm__ __volatile__ (
 		"stmdb	sp!, {r0 - r11}		\n\t"

+ 3 - 1
arch/powerpc/Kconfig

@@ -105,11 +105,13 @@ config ARCH_NO_VIRT_TO_BUS
 config PPC
 	bool
 	default y
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE
 	select HAVE_IDE
-	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
 	select HAVE_LMB
+	select HAVE_OPROFILE
 
 config EARLY_PRINTK
 	bool

+ 14 - 0
arch/powerpc/kernel/Makefile

@@ -12,6 +12,18 @@ CFLAGS_prom_init.o      += -fPIC
 CFLAGS_btext.o		+= -fPIC
 endif
 
+ifdef CONFIG_FTRACE
+# Do not trace early boot code
+CFLAGS_REMOVE_cputable.o = -pg
+CFLAGS_REMOVE_prom_init.o = -pg
+
+ifdef CONFIG_DYNAMIC_FTRACE
+# dynamic ftrace setup.
+CFLAGS_REMOVE_ftrace.o = -pg
+endif
+
+endif
+
 obj-y				:= cputable.o ptrace.o syscalls.o \
 				   irq.o align.o signal_32.o pmc.o vdso.o \
 				   init_task.o process.o systbl.o idle.o \
@@ -78,6 +90,8 @@ obj-$(CONFIG_KEXEC)		+= machine_kexec.o crash.o \
 obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
 ifneq ($(CONFIG_PPC_INDIRECT_IO),y)

+ 127 - 0
arch/powerpc/kernel/entry_32.S

@@ -30,6 +30,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 
 #undef SHOW_SYSCALLS
 #undef SHOW_SYSCALLS_TASK
@@ -1035,3 +1036,129 @@ machine_check_in_rtas:
 	/* XXX load up BATs and panic */
 
 #endif /* CONFIG_PPC_RTAS */
+
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	stwu	r1,-48(r1)
+	stw	r3, 12(r1)
+	stw	r4, 16(r1)
+	stw	r5, 20(r1)
+	stw	r6, 24(r1)
+	mflr	r3
+	stw	r7, 28(r1)
+	mfcr	r5
+	stw	r8, 32(r1)
+	stw	r9, 36(r1)
+	stw	r10,40(r1)
+	stw	r3, 44(r1)
+	stw	r5, 8(r1)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	.globl mcount_call
+mcount_call:
+	bl	ftrace_stub
+	nop
+	lwz	r6, 8(r1)
+	lwz	r0, 44(r1)
+	lwz	r3, 12(r1)
+	mtctr	r0
+	lwz	r4, 16(r1)
+	mtcr	r6
+	lwz	r5, 20(r1)
+	lwz	r6, 24(r1)
+	lwz	r0, 52(r1)
+	lwz	r7, 28(r1)
+	lwz	r8, 32(r1)
+	mtlr	r0
+	lwz	r9, 36(r1)
+	lwz	r10,40(r1)
+	addi	r1, r1, 48
+	bctr
+
+_GLOBAL(ftrace_caller)
+	/* Based off of objdump output from glibc */
+	stwu	r1,-48(r1)
+	stw	r3, 12(r1)
+	stw	r4, 16(r1)
+	stw	r5, 20(r1)
+	stw	r6, 24(r1)
+	mflr	r3
+	lwz	r4, 52(r1)
+	mfcr	r5
+	stw	r7, 28(r1)
+	stw	r8, 32(r1)
+	stw	r9, 36(r1)
+	stw	r10,40(r1)
+	stw	r3, 44(r1)
+	stw	r5, 8(r1)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+	lwz	r6, 8(r1)
+	lwz	r0, 44(r1)
+	lwz	r3, 12(r1)
+	mtctr	r0
+	lwz	r4, 16(r1)
+	mtcr	r6
+	lwz	r5, 20(r1)
+	lwz	r6, 24(r1)
+	lwz	r0, 52(r1)
+	lwz	r7, 28(r1)
+	lwz	r8, 32(r1)
+	mtlr	r0
+	lwz	r9, 36(r1)
+	lwz	r10,40(r1)
+	addi	r1, r1, 48
+	bctr
+#else
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	stwu	r1,-48(r1)
+	stw	r3, 12(r1)
+	stw	r4, 16(r1)
+	stw	r5, 20(r1)
+	stw	r6, 24(r1)
+	mflr	r3
+	lwz	r4, 52(r1)
+	mfcr	r5
+	stw	r7, 28(r1)
+	stw	r8, 32(r1)
+	stw	r9, 36(r1)
+	stw	r10,40(r1)
+	stw	r3, 44(r1)
+	stw	r5, 8(r1)
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5, ftrace_trace_function)
+	lwz	r5,0(r5)
+
+	mtctr	r5
+	bctrl
+
+	nop
+
+	lwz	r6, 8(r1)
+	lwz	r0, 44(r1)
+	lwz	r3, 12(r1)
+	mtctr	r0
+	lwz	r4, 16(r1)
+	mtcr	r6
+	lwz	r5, 20(r1)
+	lwz	r6, 24(r1)
+	lwz	r0, 52(r1)
+	lwz	r7, 28(r1)
+	lwz	r8, 32(r1)
+	mtlr	r0
+	lwz	r9, 36(r1)
+	lwz	r10,40(r1)
+	addi	r1, r1, 48
+	bctr
+#endif
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#endif /* CONFIG_FTRACE */

+ 65 - 0
arch/powerpc/kernel/entry_64.S

@@ -31,6 +31,7 @@
 #include <asm/bug.h>
 #include <asm/ptrace.h>
 #include <asm/irqflags.h>
+#include <asm/ftrace.h>
 
 /*
  * System calls.
@@ -870,3 +871,67 @@ _GLOBAL(enter_prom)
 	ld	r0,16(r1)
 	mtlr    r0
         blr
+
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	.globl mcount_call
+mcount_call:
+	bl	ftrace_stub
+	nop
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+	blr
+
+_GLOBAL(ftrace_caller)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+_GLOBAL(ftrace_stub)
+	blr
+#else
+_GLOBAL(mcount)
+	blr
+
+_GLOBAL(_mcount)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5,ftrace_trace_function)
+	ld	r5,0(r5)
+	ld	r5,0(r5)
+	mtctr	r5
+	bctrl
+
+	nop
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+_GLOBAL(ftrace_stub)
+	blr
+
+#endif
+#endif

+ 154 - 0
arch/powerpc/kernel/ftrace.c

@@ -0,0 +1,154 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+
+
+static unsigned int ftrace_nop = 0x60000000;
+
+#ifdef CONFIG_PPC32
+# define GET_ADDR(addr) addr
+#else
+/* PowerPC64's functions are data that points to the functions */
+# define GET_ADDR(addr) *(unsigned long *)addr
+#endif
+
+
+static unsigned int notrace ftrace_calc_offset(long ip, long addr)
+{
+	return (int)(addr - ip);
+}
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+	return (char *)&ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+	static unsigned int op;
+
+	/*
+	 * It would be nice to just use create_function_call, but that will
+	 * update the code itself. Here we need to just return the
+	 * instruction that is going to be modified, without modifying the
+	 * code.
+	 */
+	addr = GET_ADDR(addr);
+
+	/* Set to "bl addr" */
+	op = 0x48000001 | (ftrace_calc_offset(ip, addr) & 0x03fffffc);
+
+	/*
+	 * No locking needed, this must be called via kstop_machine
+	 * which in essence is like running on a uniprocessor machine.
+	 */
+	return (unsigned char *)&op;
+}
+
+#ifdef CONFIG_PPC64
+# define _ASM_ALIGN	" .align 3 "
+# define _ASM_PTR	" .llong "
+#else
+# define _ASM_ALIGN	" .align 2 "
+# define _ASM_PTR	" .long "
+#endif
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+		   unsigned char *new_code)
+{
+	unsigned replaced;
+	unsigned old = *(unsigned *)old_code;
+	unsigned new = *(unsigned *)new_code;
+	int faulted = 0;
+
+	/*
+	 * Note: Due to modules and __init, code can
+	 *  disappear and change, we need to protect against faulting
+	 *  as well as code changing.
+	 *
+	 * No real locking needed, this code is run through
+	 * kstop_machine.
+	 */
+	asm volatile (
+		"1: lwz		%1, 0(%2)\n"
+		"   cmpw	%1, %5\n"
+		"   bne		2f\n"
+		"   stwu	%3, 0(%2)\n"
+		"2:\n"
+		".section .fixup, \"ax\"\n"
+		"3:	li %0, 1\n"
+		"	b 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		_ASM_ALIGN "\n"
+		_ASM_PTR "1b, 3b\n"
+		".previous"
+		: "=r"(faulted), "=r"(replaced)
+		: "r"(ip), "r"(new),
+		  "0"(faulted), "r"(old)
+		: "memory");
+
+	if (replaced != old && replaced != new)
+		faulted = 2;
+
+	if (!faulted)
+		flush_icache_range(ip, ip + 8);
+
+	return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+	int ret;
+
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+	unsigned long ip = (long)(&mcount_call);
+	unsigned long *addr = data;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+	/*
+	 * Replace the mcount stub with a pointer to the
+	 * ip recorder function.
+	 */
+	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, *addr);
+	*addr = ftrace_modify_code(ip, old, new);
+
+	return 0;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	/* This is running in kstop_machine */
+
+	ftrace_mcount_set(data);
+
+	return 0;
+}
+

+ 2 - 1
arch/powerpc/kernel/io.c

@@ -120,7 +120,8 @@ EXPORT_SYMBOL(_outsl_ns);
 
 #define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
 
-void _memset_io(volatile void __iomem *addr, int c, unsigned long n)
+notrace void
+_memset_io(volatile void __iomem *addr, int c, unsigned long n)
 {
 	void *p = (void __force *)addr;
 	u32 lc = c;

+ 3 - 3
arch/powerpc/kernel/irq.c

@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_desc);
 
 int distribute_irqs = 1;
 
-static inline unsigned long get_hard_enabled(void)
+static inline notrace unsigned long get_hard_enabled(void)
 {
 	unsigned long enabled;
 
@@ -108,13 +108,13 @@ static inline unsigned long get_hard_enabled(void)
 	return enabled;
 }
 
-static inline void set_soft_enabled(unsigned long enable)
+static inline notrace void set_soft_enabled(unsigned long enable)
 {
 	__asm__ __volatile__("stb %0,%1(13)"
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-void raw_local_irq_restore(unsigned long en)
+notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
 	 * get_paca()->soft_enabled = en;

+ 5 - 0
arch/powerpc/kernel/ppc_ksyms.c

@@ -42,6 +42,7 @@
 #include <asm/div64.h>
 #include <asm/signal.h>
 #include <asm/dcr.h>
+#include <asm/ftrace.h>
 
 #ifdef CONFIG_PPC32
 extern void transfer_to_handler(void);
@@ -67,6 +68,10 @@ EXPORT_SYMBOL(single_step_exception);
 EXPORT_SYMBOL(sys_sigreturn);
 #endif
 
+#ifdef CONFIG_FTRACE
+EXPORT_SYMBOL(_mcount);
+#endif
+
 EXPORT_SYMBOL(strcpy);
 EXPORT_SYMBOL(strncpy);
 EXPORT_SYMBOL(strcat);

+ 3 - 3
arch/powerpc/kernel/setup_32.c

@@ -81,7 +81,7 @@ int ucache_bsize;
  * from the address that it was linked at, so we must use RELOC/PTRRELOC
  * to access static data (including strings).  -- paulus
  */
-unsigned long __init early_init(unsigned long dt_ptr)
+notrace unsigned long __init early_init(unsigned long dt_ptr)
 {
 	unsigned long offset = reloc_offset();
 	struct cpu_spec *spec;
@@ -111,7 +111,7 @@ unsigned long __init early_init(unsigned long dt_ptr)
  * This is called very early on the boot process, after a minimal
  * MMU environment has been set up but before MMU_init is called.
  */
-void __init machine_init(unsigned long dt_ptr, unsigned long phys)
+notrace void __init machine_init(unsigned long dt_ptr, unsigned long phys)
 {
 	/* Enable early debugging if any specified (see udbg.h) */
 	udbg_early_init();
@@ -133,7 +133,7 @@ void __init machine_init(unsigned long dt_ptr, unsigned long phys)
 
 #ifdef CONFIG_BOOKE_WDT
 /* Checks wdt=x and wdt_period=xx command-line option */
-int __init early_parse_wdt(char *p)
+notrace int __init early_parse_wdt(char *p)
 {
 	if (p && strncmp(p, "0", 1) != 0)
 	       booke_wdt_enabled = 1;

+ 5 - 0
arch/powerpc/platforms/powermac/Makefile

@@ -1,5 +1,10 @@
 CFLAGS_bootx_init.o  		+= -fPIC
 
+ifdef CONFIG_FTRACE
+# Do not trace early boot code
+CFLAGS_REMOVE_bootx_init.o = -pg
+endif
+
 obj-y				+= pic.o setup.o time.o feature.o pci.o \
 				   sleep.o low_i2c.o cache.o pfunc_core.o \
 				   pfunc_base.o

+ 2 - 0
arch/sparc64/Kconfig

@@ -11,6 +11,8 @@ config SPARC
 config SPARC64
 	bool
 	default y
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE
 	select HAVE_IDE
 	select HAVE_LMB
 	select HAVE_ARCH_KGDB

+ 1 - 1
arch/sparc64/Kconfig.debug

@@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC
 
 config MCOUNT
 	bool
-	depends on STACK_DEBUG
+	depends on STACK_DEBUG || FTRACE
 	default y
 
 config FRAME_POINTER

+ 1 - 0
arch/sparc64/kernel/Makefile

@@ -14,6 +14,7 @@ obj-y		:= process.o setup.o cpu.o idprom.o \
 		   power.o sbus.o sparc64_ksyms.o chmc.o \
 		   visemul.o prom.o of_device.o hvapi.o sstate.o mdesc.o
 
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-$(CONFIG_PCI)	 += ebus.o pci_common.o \
 			    pci_psycho.o pci_sabre.o pci_schizo.o \

+ 94 - 0
arch/sparc64/kernel/ftrace.c

@@ -0,0 +1,94 @@
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/ftrace.h>
+
+static const u32 ftrace_nop = 0x01000000;
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+	return (char *)&ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+	static u32 call;
+	s32 off;
+
+	off = ((s32)addr - (s32)ip);
+	call = 0x40000000 | ((u32)off >> 2);
+
+	return (unsigned char *) &call;
+}
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+		   unsigned char *new_code)
+{
+	u32 old = *(u32 *)old_code;
+	u32 new = *(u32 *)new_code;
+	u32 replaced;
+	int faulted;
+
+	__asm__ __volatile__(
+	"1:	cas	[%[ip]], %[old], %[new]\n"
+	"	flush	%[ip]\n"
+	"	mov	0, %[faulted]\n"
+	"2:\n"
+	"	.section .fixup,#alloc,#execinstr\n"
+	"	.align	4\n"
+	"3:	sethi	%%hi(2b), %[faulted]\n"
+	"	jmpl	%[faulted] + %%lo(2b), %%g0\n"
+	"	 mov	1, %[faulted]\n"
+	"	.previous\n"
+	"	.section __ex_table,\"a\"\n"
+	"	.align	4\n"
+	"	.word	1b, 3b\n"
+	"	.previous\n"
+	: "=r" (replaced), [faulted] "=r" (faulted)
+	: [new] "0" (new), [old] "r" (old), [ip] "r" (ip)
+	: "memory");
+
+	if (replaced != old && replaced != new)
+		faulted = 2;
+
+	return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	return ftrace_modify_code(ip, old, new);
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+	unsigned long ip = (long)(&mcount_call);
+	unsigned long *addr = data;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+	/*
+	 * Replace the mcount stub with a pointer to the
+	 * ip recorder function.
+	 */
+	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, *addr);
+	*addr = ftrace_modify_code(ip, old, new);
+
+	return 0;
+}
+
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	ftrace_mcount_set(data);
+	return 0;
+}

+ 2 - 2
arch/sparc64/kernel/sparc64_ksyms.c

@@ -53,6 +53,7 @@
 #include <asm/ns87303.h>
 #include <asm/timer.h>
 #include <asm/cpudata.h>
+#include <asm/ftrace.h>
 
 struct poll {
 	int fd;
@@ -111,8 +112,7 @@ EXPORT_SYMBOL(__write_trylock);
 EXPORT_SYMBOL(smp_call_function);
 #endif /* CONFIG_SMP */
 
-#if defined(CONFIG_MCOUNT)
-extern void _mcount(void);
+#ifdef CONFIG_MCOUNT
 EXPORT_SYMBOL(_mcount);
 #endif
 

+ 54 - 4
arch/sparc64/lib/mcount.S

@@ -28,10 +28,13 @@ ovstack:
 	.skip		OVSTACKSIZE
 #endif
 	.text
-	.align 32
-	.globl mcount, _mcount
-mcount:
+	.align		32
+	.globl		_mcount
+	.type		_mcount,#function
+	.globl		mcount
+	.type		mcount,#function
 _mcount:
+mcount:
 #ifdef CONFIG_STACK_DEBUG
 	/*
 	 * Check whether %sp is dangerously low.
@@ -55,6 +58,53 @@ _mcount:
 	 or		%g3, %lo(panicstring), %o0
 	call		prom_halt
 	 nop
+1:
+#endif
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+	mov		%o7, %o0
+	.globl		mcount_call
+mcount_call:
+	call		ftrace_stub
+	 mov		%o0, %o7
+#else
+	sethi		%hi(ftrace_trace_function), %g1
+	sethi		%hi(ftrace_stub), %g2
+	ldx		[%g1 + %lo(ftrace_trace_function)], %g1
+	or		%g2, %lo(ftrace_stub), %g2
+	cmp		%g1, %g2
+	be,pn		%icc, 1f
+	 mov		%i7, %o1
+	jmpl		%g1, %g0
+	 mov		%o7, %o0
+	/* not reached */
+1:
 #endif
-1:	retl
+#endif
+	retl
 	 nop
+	.size		_mcount,.-_mcount
+	.size		mcount,.-mcount
+
+#ifdef CONFIG_FTRACE
+	.globl		ftrace_stub
+	.type		ftrace_stub,#function
+ftrace_stub:
+	retl
+	 nop
+	.size		ftrace_stub,.-ftrace_stub
+#ifdef CONFIG_DYNAMIC_FTRACE
+	.globl		ftrace_caller
+	.type		ftrace_caller,#function
+ftrace_caller:
+	mov		%i7, %o1
+	mov		%o7, %o0
+	.globl		ftrace_call
+ftrace_call:
+	call		ftrace_stub
+	 mov		%o0, %o7
+	retl
+	 nop
+	.size		ftrace_caller,.-ftrace_caller
+#endif
+#endif

+ 2 - 0
arch/x86/Kconfig

@@ -23,6 +23,8 @@ config X86
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 

+ 8 - 0
arch/x86/Kconfig.debug

@@ -172,6 +172,14 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
+config PAGE_FAULT_HANDLERS
+	bool "Custom page fault handlers"
+	depends on DEBUG_KERNEL
+	help
+	  Allow the use of custom page fault handlers. A kernel module may
+	  register a function that is called on every page fault. Custom
+	  handlers are used by some debugging and reverse engineering tools.
+
 #
 # IO delay types:
 #

+ 8 - 0
arch/x86/kernel/Makefile

@@ -6,6 +6,13 @@ extra-y                := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
+ifdef CONFIG_FTRACE
+# Do not profile debug utilities
+CFLAGS_REMOVE_tsc_64.o = -pg
+CFLAGS_REMOVE_tsc_32.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+endif
+
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
@@ -56,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic_$(BITS).o nmi_$(BITS).o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic_$(BITS).o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o

+ 11 - 11
arch/x86/kernel/alternative.c

@@ -1,6 +1,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/kprobes.h>
 #include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 #ifdef CONFIG_X86_64
 
 extern char __vsyscall_0;
-static inline const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
 	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
 	{ -1, NULL }
 };
 
-static const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	const unsigned char *const *noptable = intel_nops;
 	int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
 	struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static DEFINE_SPINLOCK(smp_alt);
+static DEFINE_MUTEX(smp_alt);
 static int smp_mode = 1;	/* protected by smp_alt */
 
 void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
 		__func__, smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 	list_add_tail(&smp->next, &smp_alt_modules);
 	if (boot_cpu_has(X86_FEATURE_UP))
 		alternatives_smp_unlock(smp->locks, smp->locks_end,
 					smp->text, smp->text_end);
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
 	if (smp_alt_once || noreplace_smp)
 		return;
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 	list_for_each_entry(item, &smp_alt_modules, next) {
 		if (mod != item->mod)
 			continue;
 		list_del(&item->next);
-		spin_unlock(&smp_alt);
+		mutex_unlock(&smp_alt);
 		DPRINTK("%s: %s\n", __func__, item->name);
 		kfree(item);
 		return;
 	}
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 
 	/*
 	 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
 						mod->text, mod->text_end);
 	}
 	smp_mode = smp;
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 #endif

+ 72 - 0
arch/x86/kernel/entry_32.S

@@ -51,6 +51,7 @@
 #include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
+#include <asm/ftrace.h>
 #include "irq_vectors.h"
 
 /*
@@ -1110,6 +1111,77 @@ ENDPROC(xen_failsafe_callback)
 
 #endif	/* CONFIG_XEN */
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	subl $MCOUNT_INSN_SIZE, %eax
+
+.globl mcount_call
+mcount_call:
+	call ftrace_stub
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+	subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+	cmpl $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	ret
+
+	/* taken from glibc */
+trace:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+	subl $MCOUNT_INSN_SIZE, %eax
+
+	call *ftrace_trace_function
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 

+ 106 - 0
arch/x86/kernel/entry_64.S

@@ -51,9 +51,115 @@
 #include <asm/page.h>
 #include <asm/irqflags.h>
 #include <asm/paravirt.h>
+#include <asm/ftrace.h>
 
 	.code64
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl mcount_call
+mcount_call:
+	call ftrace_stub
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+	retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+	cmpq $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	retq
+
+trace:
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rdi
+
+	call   *ftrace_trace_function
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif

+ 141 - 0
arch/x86/kernel/ftrace.c

@@ -0,0 +1,141 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/alternative.h>
+#include <asm/ftrace.h>
+
+
+/* Long is fine, even if it is only 4 bytes ;-) */
+static long *ftrace_nop;
+
+union ftrace_code_union {
+	char code[MCOUNT_INSN_SIZE];
+	struct {
+		char e8;
+		int offset;
+	} __attribute__((packed));
+};
+
+
+static int notrace ftrace_calc_offset(long ip, long addr)
+{
+	return (int)(addr - ip);
+}
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+	return (char *)ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+	static union ftrace_code_union calc;
+
+	calc.e8		= 0xe8;
+	calc.offset	= ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+
+	/*
+	 * No locking needed, this must be called via kstop_machine
+	 * which in essence is like running on a uniprocessor machine.
+	 */
+	return calc.code;
+}
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+		   unsigned char *new_code)
+{
+	unsigned replaced;
+	unsigned old = *(unsigned *)old_code; /* 4 bytes */
+	unsigned new = *(unsigned *)new_code; /* 4 bytes */
+	unsigned char newch = new_code[4];
+	int faulted = 0;
+
+	/*
+	 * Note: Due to modules and __init, code can
+	 *  disappear and change, we need to protect against faulting
+	 *  as well as code changing.
+	 *
+	 * No real locking needed, this code is run through
+	 * kstop_machine.
+	 */
+	asm volatile (
+		"1: lock\n"
+		"   cmpxchg %3, (%2)\n"
+		"   jnz 2f\n"
+		"   movb %b4, 4(%2)\n"
+		"2:\n"
+		".section .fixup, \"ax\"\n"
+		"3:	movl $1, %0\n"
+		"	jmp 2b\n"
+		".previous\n"
+		_ASM_EXTABLE(1b, 3b)
+		: "=r"(faulted), "=a"(replaced)
+		: "r"(ip), "r"(new), "c"(newch),
+		  "0"(faulted), "a"(old)
+		: "memory");
+	sync_core();
+
+	if (replaced != old && replaced != new)
+		faulted = 2;
+
+	return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+	int ret;
+
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+	unsigned long ip = (long)(&mcount_call);
+	unsigned long *addr = data;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+	/*
+	 * Replace the mcount stub with a pointer to the
+	 * ip recorder function.
+	 */
+	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, *addr);
+	*addr = ftrace_modify_code(ip, old, new);
+
+	return 0;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	const unsigned char *const *noptable = find_nop_table();
+
+	/* This is running in kstop_machine */
+
+	ftrace_mcount_set(data);
+
+	ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+
+	return 0;
+}

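Note (added for clarity, not part of the merge): ftrace_call_replace() above builds the 5-byte x86 call instruction — opcode 0xe8 followed by a 32-bit displacement relative to the *next* instruction, which is why the offset is computed against ip + MCOUNT_INSN_SIZE. A standalone, userspace sketch with made-up addresses:

/* Illustrative only: byte layout of the call that ftrace_call_replace()
 * constructs.  The addresses are hypothetical. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned long ip = 0xc0100000UL;   /* patched call site (hypothetical) */
	unsigned long addr = 0xc0200000UL; /* trace handler (hypothetical) */
	unsigned char code[5];
	int offset = (int)(addr - (ip + 5)); /* rel32 is relative to the next insn */

	code[0] = 0xe8;                      /* opcode: call rel32 */
	memcpy(&code[1], &offset, 4);        /* little-endian displacement */

	for (int i = 0; i < 5; i++)
		printf("%02x ", code[i]);
	printf("\n");                        /* prints: e8 fb ff 0f 00 */
	return 0;
}

These five bytes are what ftrace_modify_code() later writes over the mcount call site (or swaps back for the NOP from find_nop_table()).
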
+ 8 - 1
arch/x86/kernel/i386_ksyms_32.c

@@ -1,7 +1,14 @@
 #include <linux/module.h>
+
 #include <asm/checksum.h>
-#include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
 
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);

+ 4 - 0
arch/x86/kernel/machine_kexec_32.c

@@ -11,6 +11,8 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/numa.h>
+#include <linux/ftrace.h>
+
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
@@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
 
+	tracer_disable();
+
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 

+ 4 - 0
arch/x86/kernel/machine_kexec_64.c

@@ -11,6 +11,8 @@
 #include <linux/string.h>
 #include <linux/reboot.h>
 #include <linux/numa.h>
+#include <linux/ftrace.h>
+
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
 
+	tracer_disable();
+
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 

+ 3 - 0
arch/x86/kernel/process_32.c

@@ -185,7 +185,10 @@ void cpu_idle(void)
 
 			local_irq_disable();
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 		}
 		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();

+ 3 - 0
arch/x86/kernel/process_64.c

@@ -165,7 +165,10 @@ void cpu_idle(void)
 			 */
 			local_irq_disable();
 			enter_idle();
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */

+ 2 - 1
arch/x86/kernel/vsyscall_64.c

@@ -42,7 +42,8 @@
 #include <asm/topology.h>
 #include <asm/vgtod.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
 #define __syscall_clobber "r11","cx","memory"
 
 /*

+ 9 - 2
arch/x86/kernel/x8664_ksyms_64.c

@@ -2,13 +2,20 @@
    All C exports should go in the respective C files. */
 
 #include <linux/module.h>
-#include <net/checksum.h>
 #include <linux/smp.h>
 
+#include <net/checksum.h>
+
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
 
 EXPORT_SYMBOL(kernel_thread);
 

+ 1 - 0
arch/x86/lib/Makefile

@@ -5,6 +5,7 @@
 obj-$(CONFIG_SMP) := msr-on-cpu.o
 
 lib-y := delay_$(BITS).o
+lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
 lib-y += memcpy_$(BITS).o
 

+ 47 - 0
arch/x86/lib/thunk_32.S

@@ -0,0 +1,47 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ *  (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+	#include <linux/linkage.h>
+
+#define ARCH_TRACE_IRQS_ON			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_on;			\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_off;		\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* put return address in eax (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	/* Place EIP in the arg1 */
+	movl 3*4(%esp), %eax
+	call \func
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#endif

+ 17 - 2
arch/x86/lib/thunk_64.S

@@ -2,6 +2,7 @@
  * Save registers before calling assembly functions. This avoids
  * disturbance of register allocation in some inline assembly constructs.
  * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
 
@@ -42,8 +43,22 @@
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-	thunk trace_hardirqs_on_thunk,trace_hardirqs_on
-	thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+	/* put return address in rdi (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	CFI_STARTPROC
+	SAVE_ARGS
+	/* SAVE_ARGS pushes 9 elements */
+	/* the next element would be the rip */
+	movq 9*8(%rsp), %rdi
+	call \func
+	jmp  restore
+	CFI_ENDPROC
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC

+ 56 - 0
arch/x86/mm/fault.c

@@ -49,6 +49,60 @@
 #define PF_RSVD		(1<<3)
 #define PF_INSTR	(1<<4)
 
+#ifdef CONFIG_PAGE_FAULT_HANDLERS
+static HLIST_HEAD(pf_handlers); /* protected by RCU */
+static DEFINE_SPINLOCK(pf_handlers_writer);
+
+void register_page_fault_handler(struct pf_handler *new_pfh)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pf_handlers_writer, flags);
+	hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers);
+	spin_unlock_irqrestore(&pf_handlers_writer, flags);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_handler);
+
+/**
+ * unregister_page_fault_handler:
+ * The caller must ensure @old_pfh is not in use anymore before freeing it.
+ * This function does not guarantee it. The list of handlers is protected by
+ * RCU, so you can do this by e.g. calling synchronize_rcu().
+ */
+void unregister_page_fault_handler(struct pf_handler *old_pfh)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pf_handlers_writer, flags);
+	hlist_del_rcu(&old_pfh->hlist);
+	spin_unlock_irqrestore(&pf_handlers_writer, flags);
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_handler);
+#endif
+
+/* returns non-zero if do_page_fault() should return */
+static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code,
+							unsigned long address)
+{
+#ifdef CONFIG_PAGE_FAULT_HANDLERS
+	int ret = 0;
+	struct pf_handler *cur;
+	struct hlist_node *ncur;
+
+	if (hlist_empty(&pf_handlers))
+		return 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) {
+		ret = cur->handler(regs, error_code, address);
+		if (ret)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+#else
+	return 0;
+#endif
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 #ifdef CONFIG_KPROBES
@@ -606,6 +660,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (notify_page_fault(regs))
 		return;
+	if (handle_custom_pf(regs, error_code, address))
+		return;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The

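Note (added for clarity, not part of the merge): the fault.c hunks above introduce an RCU-protected list of custom page-fault handlers behind CONFIG_PAGE_FAULT_HANDLERS. A minimal sketch of how a module might use the new interface follows; the struct pf_handler layout (an hlist node plus a handler callback) is inferred from the hlist_for_each_entry_rcu() loop above and assumed to be declared in <asm/kdebug.h>, so treat the details as illustrative rather than authoritative.

/* Illustrative only.  Assumes struct pf_handler has .hlist and .handler
 * members and is declared in <asm/kdebug.h>, as the hunks above suggest. */
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/rcupdate.h>
#include <asm/kdebug.h>

/* Called on every page fault; returning non-zero makes do_page_fault()
 * bail out early, exactly as handle_custom_pf() does above. */
static int demo_pf_handler(struct pt_regs *regs, unsigned long error_code,
			   unsigned long address)
{
	return 0;	/* let the normal fault path run */
}

static struct pf_handler demo_pfh = {
	.handler = demo_pf_handler,
};

static int __init demo_init(void)
{
	register_page_fault_handler(&demo_pfh);
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_page_fault_handler(&demo_pfh);
	/* unregister only unlinks the node; wait for RCU readers before the
	 * handler text and demo_pfh can go away (see the comment above). */
	synchronize_rcu();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
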
+ 4 - 0
arch/x86/mm/init_32.c

@@ -710,6 +710,8 @@ void mark_rodata_ro(void)
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long size = PFN_ALIGN(_etext) - start;
 
+#ifndef CONFIG_DYNAMIC_FTRACE
+	/* Dynamic tracing modifies the kernel text section */
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
 		size >> 10);
@@ -722,6 +724,8 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
 #endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 	start += size;
 	size = (unsigned long)__end_rodata - start;
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);

+ 8 - 2
arch/x86/mm/init_64.c

@@ -767,6 +767,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+	unsigned long rodata_start =
+		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	/* Dynamic tracing modifies the kernel text section */
+	start = rodata_start;
+#endif
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
@@ -776,8 +783,7 @@ void mark_rodata_ro(void)
 	 * The rodata section (but not the kernel text!) should also be
 	 * not-executable.
 	 */
-	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
-	set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
 
 	rodata_test();
 

+ 8 - 7
arch/x86/vdso/vclock_gettime.c

@@ -23,7 +23,7 @@
 
 #define gtod vdso_vsyscall_gtod_data
 
-static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
 	asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
 	return ret;
 }
 
-static inline long vgetns(void)
+notrace static inline long vgetns(void)
 {
 	long v;
 	cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
 	return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
 
-static noinline int do_realtime(struct timespec *ts)
+notrace static noinline int do_realtime(struct timespec *ts)
 {
 	unsigned long seq, ns;
 	do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
 }
 
 /* Copy of the version in kernel/time.c which we cannot directly access */
-static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+notrace static void
+vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 {
 	while (nsec >= NSEC_PER_SEC) {
 		nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 	ts->tv_nsec = nsec;
 }
 
-static noinline int do_monotonic(struct timespec *ts)
+notrace static noinline int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq, ns, secs;
 	do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
 	return 0;
 }
 
-int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread))
 		switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 int clock_gettime(clockid_t, struct timespec *)
 	__attribute__((weak, alias("__vdso_clock_gettime")));
 
-int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {

+ 2 - 1
arch/x86/vdso/vgetcpu.c

@@ -13,7 +13,8 @@
 #include <asm/vgtod.h>
 #include "vextern.h"
 
-long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 

+ 14 - 0
include/asm-arm/ftrace.h

@@ -0,0 +1,14 @@
+#ifndef _ASM_ARM_FTRACE
+#define _ASM_ARM_FTRACE
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(mcount))
+#define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+#endif
+
+#endif
+
+#endif /* _ASM_ARM_FTRACE */

+ 1 - 0
include/asm-arm/kprobes.h

@@ -59,6 +59,7 @@ struct kprobe_ctlblk {
 };
 
 void arch_remove_kprobe(struct kprobe *);
+void kretprobe_trampoline(void);
 
 int kprobe_trap_handler(struct pt_regs *regs, unsigned int instr);
 int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);

+ 14 - 0
include/asm-powerpc/ftrace.h

@@ -0,0 +1,14 @@
+#ifndef _ASM_POWERPC_FTRACE
+#define _ASM_POWERPC_FTRACE
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(_mcount))
+#define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void _mcount(void);
+#endif
+
+#endif
+
+#endif /* _ASM_POWERPC_FTRACE */

+ 10 - 0
include/asm-powerpc/hw_irq.h

@@ -59,6 +59,11 @@ extern void iseries_handle_interrupts(void);
 		get_paca()->hard_enabled = 0;	\
 	} while(0)
 
+static inline int irqs_disabled_flags(unsigned long flags)
+{
+	return flags == 0;
+}
+
 #else
 
 #if defined(CONFIG_BOOKE)
@@ -113,6 +118,11 @@ static inline void local_irq_save_ptr(unsigned long *flags)
 #define hard_irq_enable()	local_irq_enable()
 #define hard_irq_disable()	local_irq_disable()
 
+static inline int irqs_disabled_flags(unsigned long flags)
+{
+	return (flags & MSR_EE) == 0;
+}
+
 #endif /* CONFIG_PPC64 */
 
 /*
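The new irqs_disabled_flags() helpers answer "were interrupts off in this saved flags word?" without re-reading live state: the 64-bit side compares the soft-enable word against 0, the other side tests MSR_EE. A minimal sketch of the intended use (the caller code below is illustrative, not part of this patch):

	unsigned long flags;

	local_irq_save(flags);
	/* ... */
	if (irqs_disabled_flags(flags)) {
		/* interrupts were already off before local_irq_save() */
	}
	local_irq_restore(flags);

This is the kind of check the irqsoff latency tracer needs when deciding whether a section began with interrupts already disabled.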

+ 14 - 0
include/asm-sparc64/ftrace.h

@@ -0,0 +1,14 @@
+#ifndef _ASM_SPARC64_FTRACE
+#define _ASM_SPARC64_FTRACE
+
+#ifdef CONFIG_MCOUNT
+#define MCOUNT_ADDR		((long)(_mcount))
+#define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void _mcount(void);
+#endif
+
+#endif
+
+#endif /* _ASM_SPARC64_FTRACE */

+ 2 - 0
include/asm-x86/alternative.h

@@ -72,6 +72,8 @@ static inline void alternatives_smp_module_del(struct module *mod) {}
 static inline void alternatives_smp_switch(int smp) {}
 #endif	/* CONFIG_SMP */
 
+const unsigned char *const *find_nop_table(void);
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *

+ 14 - 0
include/asm-x86/ftrace.h

@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_FTRACE
+#define _ASM_X86_FTRACE
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(mcount))
+#define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+#endif
+
+#endif /* CONFIG_FTRACE */
+
+#endif /* _ASM_X86_FTRACE */
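MCOUNT_INSN_SIZE is 5 on x86 because the mcount call emitted by gcc -pg is a near call: one 0xe8 opcode byte followed by a 32-bit displacement relative to the end of the instruction. A rough sketch of how such a call site can be rebuilt from an instruction address and a target (illustrative helper only; the real work is done by the arch-provided ftrace_call_replace()/ftrace_nop_replace() declared in include/linux/ftrace.h below):

	/* illustrative: assemble a 5-byte "call rel32" located at ip */
	static void build_call_insn(unsigned char insn[MCOUNT_INSN_SIZE],
				    unsigned long ip, unsigned long target)
	{
		int rel = (int)(target - (ip + MCOUNT_INSN_SIZE));

		insn[0] = 0xe8;		/* opcode: call near, relative */
		memcpy(&insn[1], &rel, sizeof(rel));
	}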

+ 2 - 22
include/asm-x86/irqflags.h

@@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void)
  * have a reliable stack. x86_64 only.
  */
 #define SWAPGS_UNSAFE_STACK	swapgs
-#define ARCH_TRACE_IRQS_ON		call trace_hardirqs_on_thunk
-#define ARCH_TRACE_IRQS_OFF		call trace_hardirqs_off_thunk
 #define ARCH_LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
 	TRACE_IRQS_ON; \
@@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void)
 	TRACE_IRQS_OFF;
 
 #else
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
 #define ARCH_LOCKDEP_SYS_EXIT			\
 	pushl %eax;				\
 	pushl %ecx;				\
@@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void)
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON		ARCH_TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF	ARCH_TRACE_IRQS_OFF
+#  define TRACE_IRQS_ON		call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF	call trace_hardirqs_off_thunk;
 #else
 #  define TRACE_IRQS_ON
 #  define TRACE_IRQS_OFF

+ 9 - 0
include/asm-x86/kdebug.h

@@ -35,4 +35,13 @@ extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
+struct pf_handler {
+	struct hlist_node hlist;
+	int (*handler)(struct pt_regs *regs, unsigned long error_code,
+						unsigned long address);
+};
+
+extern void register_page_fault_handler(struct pf_handler *new_pfh);
+extern void unregister_page_fault_handler(struct pf_handler *old_pfh);
+
 #endif
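These declarations pair with handle_custom_pf() in arch/x86/mm/fault.c above: handlers sit on an RCU-protected hlist and get a chance to claim a fault before normal handling runs. A minimal sketch of a client (names below are made up for illustration); returning non-zero tells do_page_fault() the fault has been dealt with:

	static int my_fault_notify(struct pt_regs *regs, unsigned long error_code,
				   unsigned long address)
	{
		/* inspect the faulting address; 0 means "not mine, continue" */
		return 0;
	}

	static struct pf_handler my_pf = {
		.handler = my_fault_notify,
	};

	/* typically from module init */
	register_page_fault_handler(&my_pf);
	/* and from module exit */
	unregister_page_fault_handler(&my_pf);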

+ 2 - 1
include/asm-x86/vsyscall.h

@@ -24,7 +24,8 @@ enum vsyscall_num {
 	((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
 #define __section_vsyscall_clock __attribute__ \
 	((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
+#define __vsyscall_fn \
+	__attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
 
 #define VGETCPU_RDTSCP	1
 #define VGETCPU_LSL	2

+ 143 - 0
include/linux/ftrace.h

@@ -0,0 +1,143 @@
+#ifndef _LINUX_FTRACE_H
+#define _LINUX_FTRACE_H
+
+#ifdef CONFIG_FTRACE
+
+#include <linux/linkage.h>
+#include <linux/fs.h>
+
+extern int ftrace_enabled;
+extern int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+		     struct file *filp, void __user *buffer, size_t *lenp,
+		     loff_t *ppos);
+
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+
+struct ftrace_ops {
+	ftrace_func_t	  func;
+	struct ftrace_ops *next;
+};
+
+/*
+ * The ftrace_ops must be static and should also
+ * be read_mostly.  These functions do modify read_mostly variables
+ * so use them sparingly. Never free an ftrace_ops or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
+ */
+int register_ftrace_function(struct ftrace_ops *ops);
+int unregister_ftrace_function(struct ftrace_ops *ops);
+void clear_ftrace_function(void);
+
+extern void ftrace_stub(unsigned long a0, unsigned long a1);
+
+#else /* !CONFIG_FTRACE */
+# define register_ftrace_function(ops) do { } while (0)
+# define unregister_ftrace_function(ops) do { } while (0)
+# define clear_ftrace_function(ops) do { } while (0)
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+# define FTRACE_HASHBITS	10
+# define FTRACE_HASHSIZE	(1<<FTRACE_HASHBITS)
+
+enum {
+	FTRACE_FL_FREE		= (1 << 0),
+	FTRACE_FL_FAILED	= (1 << 1),
+	FTRACE_FL_FILTER	= (1 << 2),
+	FTRACE_FL_ENABLED	= (1 << 3),
+	FTRACE_FL_NOTRACE	= (1 << 4),
+	FTRACE_FL_CONVERTED	= (1 << 5),
+	FTRACE_FL_FROZEN	= (1 << 6),
+};
+
+struct dyn_ftrace {
+	struct hlist_node node;
+	unsigned long	  ip; /* address of mcount call-site */
+	unsigned long	  flags;
+};
+
+int ftrace_force_update(void);
+void ftrace_set_filter(unsigned char *buf, int len, int reset);
+
+/* defined in arch */
+extern int ftrace_ip_converted(unsigned long ip);
+extern unsigned char *ftrace_nop_replace(void);
+extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
+extern int ftrace_dyn_arch_init(void *data);
+extern int ftrace_mcount_set(unsigned long *data);
+extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+			      unsigned char *new_code);
+extern int ftrace_update_ftrace_func(ftrace_func_t func);
+extern void ftrace_caller(void);
+extern void ftrace_call(void);
+extern void mcount_call(void);
+
+extern int skip_trace(unsigned long ip);
+
+void ftrace_disable_daemon(void);
+void ftrace_enable_daemon(void);
+
+#else
+# define skip_trace(ip)				({ 0; })
+# define ftrace_force_update()			({ 0; })
+# define ftrace_set_filter(buf, len, reset)	do { } while (0)
+# define ftrace_disable_daemon()		do { } while (0)
+# define ftrace_enable_daemon()			do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/* totally disable ftrace - can not re-enable after this */
+void ftrace_kill(void);
+
+static inline void tracer_disable(void)
+{
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = 0;
+#endif
+}
+
+#ifdef CONFIG_FRAME_POINTER
+/* TODO: need to fix this for ARM */
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
+# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
+# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+# define CALLER_ADDR4 0UL
+# define CALLER_ADDR5 0UL
+# define CALLER_ADDR6 0UL
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+  extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
+  extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1)		do { } while (0)
+# define time_hardirqs_off(a0, a1)		do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+  extern void trace_preempt_on(unsigned long a0, unsigned long a1);
+  extern void trace_preempt_off(unsigned long a0, unsigned long a1);
+#else
+# define trace_preempt_on(a0, a1)		do { } while (0)
+# define trace_preempt_off(a0, a1)		do { } while (0)
+#endif
+
+#ifdef CONFIG_TRACING
+extern void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
+#else
+static inline void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
+#endif
+
+#endif /* _LINUX_FTRACE_H */
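To make the comment above register_ftrace_function() concrete, here is a minimal sketch of a client tracer (the names are illustrative, not part of this patch): the ftrace_ops must stay allocated for the lifetime of the kernel, and the callback is invoked with the traced function's address and its call site:

	static void my_trace_func(unsigned long ip, unsigned long parent_ip)
	{
		/* runs for every traced function while registered */
	}

	static struct ftrace_ops my_trace_ops __read_mostly = {
		.func	= my_trace_func,
	};

	/* start receiving callbacks (takes effect when ftrace_enabled is set) */
	register_ftrace_function(&my_trace_ops);
	/* ... */
	unregister_ftrace_function(&my_trace_ops);
	/* my_trace_ops must never be freed, even after unregistering */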

+ 11 - 2
include/linux/irqflags.h

@@ -12,10 +12,10 @@
 #define _LINUX_TRACE_IRQFLAGS_H
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-  extern void trace_hardirqs_on(void);
-  extern void trace_hardirqs_off(void);
   extern void trace_softirqs_on(unsigned long ip);
   extern void trace_softirqs_off(unsigned long ip);
+  extern void trace_hardirqs_on(void);
+  extern void trace_hardirqs_off(void);
 # define trace_hardirq_context(p)	((p)->hardirq_context)
 # define trace_softirq_context(p)	((p)->softirq_context)
 # define trace_hardirqs_enabled(p)	((p)->hardirqs_enabled)
@@ -41,6 +41,15 @@
 # define INIT_TRACE_IRQFLAGS
 #endif
 
+#if defined(CONFIG_IRQSOFF_TRACER) || \
+	defined(CONFIG_PREEMPT_TRACER)
+ extern void stop_critical_timings(void);
+ extern void start_critical_timings(void);
+#else
+# define stop_critical_timings() do { } while (0)
+# define start_critical_timings() do { } while (0)
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 
 #include <asm/irqflags.h>

+ 4 - 0
include/linux/kprobes.h

@@ -259,6 +259,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 struct jprobe;
 struct kretprobe;
 
+static inline struct kprobe *get_kprobe(void *addr)
+{
+	return NULL;
+}
 static inline struct kprobe *kprobe_running(void)
 {
 	return NULL;

+ 2 - 0
include/linux/linkage.h

@@ -3,6 +3,8 @@
 
 #include <asm/linkage.h>
 
+#define notrace __attribute__((no_instrument_function))
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else

+ 29 - 11
include/linux/marker.h

@@ -44,8 +44,8 @@ struct marker {
 				 */
 	char state;		/* Marker state. */
 	char ptype;		/* probe type : 0 : single, 1 : multi */
-	void (*call)(const struct marker *mdata,	/* Probe wrapper */
-		void *call_private, const char *fmt, ...);
+				/* Probe wrapper */
+	void (*call)(const struct marker *mdata, void *call_private, ...);
 	struct marker_probe_closure single;
 	struct marker_probe_closure *multi;
 } __attribute__((aligned(8)));
@@ -58,8 +58,12 @@ struct marker {
  * Make sure the alignment of the structure in the __markers section will
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
+ *
+ * The "generic" argument controls which marker enabling mechanism must be used.
+ * If generic is true, a variable read is used.
+ * If generic is false, immediate values are used.
  */
-#define __trace_mark(name, call_private, format, args...)		\
+#define __trace_mark(generic, name, call_private, format, args...)	\
 	do {								\
 		static const char __mstrtab_##name[]			\
 		__attribute__((section("__markers_strings")))		\
@@ -72,15 +76,14 @@ struct marker {
 		__mark_check_format(format, ## args);			\
 		if (unlikely(__mark_##name.state)) {			\
 			(*__mark_##name.call)				\
-				(&__mark_##name, call_private,		\
-				format, ## args);			\
+				(&__mark_##name, call_private, ## args);\
 		}							\
 	} while (0)
 
 extern void marker_update_probe_range(struct marker *begin,
 	struct marker *end);
 #else /* !CONFIG_MARKERS */
-#define __trace_mark(name, call_private, format, args...) \
+#define __trace_mark(generic, name, call_private, format, args...) \
 		__mark_check_format(format, ## args)
 static inline void marker_update_probe_range(struct marker *begin,
 	struct marker *end)
@@ -88,15 +91,30 @@ static inline void marker_update_probe_range(struct marker *begin,
 #endif /* CONFIG_MARKERS */
 
 /**
- * trace_mark - Marker
+ * trace_mark - Marker using code patching
  * @name: marker name, not quoted.
 * @format: format string
 * @args...: variable argument list
 *
- * Places a marker.
+ * Places a marker using optimized code patching technique (imv_read())
+ * to be enabled when immediate values are present.
 */
 #define trace_mark(name, format, args...) \
-	__trace_mark(name, NULL, format, ## args)
+	__trace_mark(0, name, NULL, format, ## args)
+
+/**
+ * _trace_mark - Marker using variable read
+ * @name: marker name, not quoted.
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker using a standard memory read (_imv_read()) to be
+ * enabled. Should be used for markers in code paths where instruction
+ * modification based enabling is not welcome. (__init and __exit functions,
+ * lockdep, some traps, printk).
+ */
+#define _trace_mark(name, format, args...) \
+	__trace_mark(1, name, NULL, format, ## args)
 
 /**
  * MARK_NOARGS - Format string for a marker with no argument.
@@ -117,9 +135,9 @@ static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...)
 extern marker_probe_func __mark_empty_function;
 
 extern void marker_probe_cb(const struct marker *mdata,
-	void *call_private, const char *fmt, ...);
+	void *call_private, ...);
 extern void marker_probe_cb_noarg(const struct marker *mdata,
-	void *call_private, const char *fmt, ...);
+	void *call_private, ...);
 
 /*
  * Connect a probe to a marker.
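A short illustration of the two flavours introduced here (marker names and values are made up): trace_mark() for ordinary code paths, where instruction-patching based enabling is acceptable, and _trace_mark() for the code paths listed in the comment above (__init/__exit, lockdep, printk and similar):

	/* ordinary code path */
	trace_mark(subsys_my_event, "pid %d count %d", current->pid, count);

	/* __init code, lockdep, printk, ...: plain variable read */
	_trace_mark(subsys_early_event, "state %ld", state);

	/* marker that carries no arguments */
	trace_mark(subsys_ping, MARK_NOARGS);

The scheduler markers added to kernel/sched.c below (kernel_sched_wakeup and friends) follow the same pattern.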

+ 33 - 1
include/linux/preempt.h

@@ -10,7 +10,7 @@
 #include <linux/linkage.h>
 #include <linux/list.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
   extern void add_preempt_count(int val);
   extern void sub_preempt_count(int val);
 #else
@@ -52,6 +52,34 @@ do { \
 	preempt_check_resched(); \
 } while (0)
 
+/* For debugging and tracer internals only! */
+#define add_preempt_count_notrace(val)			\
+	do { preempt_count() += (val); } while (0)
+#define sub_preempt_count_notrace(val)			\
+	do { preempt_count() -= (val); } while (0)
+#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
+#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
+
+#define preempt_disable_notrace() \
+do { \
+	inc_preempt_count_notrace(); \
+	barrier(); \
+} while (0)
+
+#define preempt_enable_no_resched_notrace() \
+do { \
+	barrier(); \
+	dec_preempt_count_notrace(); \
+} while (0)
+
+/* preempt_check_resched is OK to trace */
+#define preempt_enable_notrace() \
+do { \
+	preempt_enable_no_resched_notrace(); \
+	barrier(); \
+	preempt_check_resched(); \
+} while (0)
+
 #else
 
 #define preempt_disable()		do { } while (0)
@@ -59,6 +87,10 @@ do { \
 #define preempt_enable()		do { } while (0)
 #define preempt_check_resched()		do { } while (0)
 
+#define preempt_disable_notrace()		do { } while (0)
+#define preempt_enable_no_resched_notrace()	do { } while (0)
+#define preempt_enable_notrace()		do { } while (0)
+
 #endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
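The _notrace variants exist so that tracing code itself can disable preemption without recursing into the preemption tracer, since add_preempt_count()/sub_preempt_count() now call trace_preempt_off()/trace_preempt_on() (see kernel/sched.c below). The pattern used inside the tracer, as in ftrace_record_ip() in kernel/trace/ftrace.c further down, looks roughly like this:

	int resched = need_resched();

	preempt_disable_notrace();
	/* record the event; nothing in here may itself be traced */
	if (resched)
		preempt_enable_no_resched_notrace();
	else
		preempt_enable_notrace();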

+ 16 - 0
include/linux/sched.h

@@ -246,6 +246,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
+extern int runqueue_is_locked(void);
+
 extern cpumask_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
@@ -2131,6 +2133,18 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_TRACING
+extern void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3);
+#else
+static inline void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+}
+#endif
+
 extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
@@ -2225,6 +2239,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
 #endif /* __KERNEL__ */
 
 #endif

+ 2 - 0
include/linux/writeback.h

@@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
+extern unsigned long determine_dirtyable_memory(void);
+
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
 		loff_t *ppos);

+ 14 - 0
kernel/Makefile

@@ -11,6 +11,18 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+CFLAGS_REMOVE_sched.o = -pg -mno-spe
+
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+endif
+
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -69,6 +81,8 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

+ 1 - 1
kernel/fork.c

@@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif

+ 26 - 7
kernel/lockdep.c

@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 
@@ -81,6 +82,8 @@ static int graph_lock(void)
 		__raw_spin_unlock(&lockdep_lock);
 		return 0;
 	}
+	/* prevent any recursions within lockdep from causing deadlocks */
+	current->lockdep_recursion++;
 	return 1;
 }
 
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
 	if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
+	current->lockdep_recursion--;
 	__raw_spin_unlock(&lockdep_lock);
 	return 0;
 }
@@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	return 1;
 }
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
  * proving that two subgraphs can be connected by a new dependency
@@ -1680,7 +1684,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		     enum lock_usage_bit new_bit);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 
 /*
  * print irq inversion bug:
@@ -2013,11 +2017,13 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void trace_hardirqs_on_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
 
+	time_hardirqs_on(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2055,16 +2061,23 @@ void trace_hardirqs_on(void)
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
+void trace_hardirqs_on(void)
+{
+	trace_hardirqs_on_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 /*
  * Hardirqs were disabled:
 */
-void trace_hardirqs_off(void)
+void trace_hardirqs_off_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 
+	time_hardirqs_off(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2082,7 +2095,12 @@ void trace_hardirqs_off(void)
 	} else
 		debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
+void trace_hardirqs_off(void)
+{
+	trace_hardirqs_off_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 /*
@@ -2246,7 +2264,7 @@ static inline int separate_irq_context(struct task_struct *curr,
  * Mark a lock with a usage bit, and validate the state transition:
  */
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
-		     enum lock_usage_bit new_bit)
+			     enum lock_usage_bit new_bit)
 {
 	unsigned int new_mask = 1 << new_bit, ret = 1;
 
@@ -2686,7 +2704,7 @@ static void check_flags(unsigned long flags)
 * and also avoid lockdep recursion:
 */
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-		  int trylock, int read, int check, unsigned long ip)
+			  int trylock, int read, int check, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2708,7 +2726,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+void lock_release(struct lockdep_map *lock, int nested,
+			  unsigned long ip)
 {
 	unsigned long flags;
 

+ 14 - 16
kernel/marker.c

@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
 struct marker_entry {
 	struct hlist_node hlist;
 	char *format;
-	void (*call)(const struct marker *mdata,	/* Probe wrapper */
-		void *call_private, const char *fmt, ...);
+			/* Probe wrapper */
+	void (*call)(const struct marker *mdata, void *call_private, ...);
 	struct marker_probe_closure single;
 	struct marker_probe_closure *multi;
 	int refcount;	/* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
 * marker_probe_cb Callback that prepares the variable argument list for probes.
 * @mdata: pointer of type struct marker
 * @call_private: caller site private data
- * @fmt: format string
 * @...:  Variable argument list.
 *
 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
 * need to put a full smp_rmb() in this branch. This is why we do not use
 * rcu_dereference() for the pointer read.
 */
-void marker_probe_cb(const struct marker *mdata, void *call_private,
-	const char *fmt, ...)
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;
 	char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		va_start(args, fmt);
-		func(mdata->single.probe_private, call_private, fmt, &args);
+		va_start(args, call_private);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
 		va_end(args);
 	} else {
 		struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		smp_read_barrier_depends();
 		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++) {
-			va_start(args, fmt);
-			multi[i].func(multi[i].probe_private, call_private, fmt,
-				&args);
+			va_start(args, call_private);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
 			va_end(args);
 		}
 	}
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
 * marker_probe_cb Callback that does not prepare the variable argument list.
 * @mdata: pointer of type struct marker
 * @call_private: caller site private data
- * @fmt: format string
 * @...:  Variable argument list.
 *
 * Should be connected to markers "MARK_NOARGS".
 */
-void marker_probe_cb_noarg(const struct marker *mdata,
-	void *call_private, const char *fmt, ...)
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;	/* not initialized */
 	char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		func(mdata->single.probe_private, call_private, fmt, &args);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
 	} else {
 		struct marker_probe_closure *multi;
 		int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		smp_read_barrier_depends();
 		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++)
-			multi[i].func(multi[i].probe_private, call_private, fmt,
-				&args);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
 	}
 	preempt_enable();
 }

+ 2 - 0
kernel/printk.c

@@ -1041,7 +1041,9 @@ void release_console_sem(void)
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
 		spin_unlock(&logbuf_lock);
+		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(_con_start, _log_end);
+		start_critical_timings();
 		local_irq_restore(flags);
 	}
 	console_locked = 0;

+ 54 - 3
kernel/sched.c

@@ -70,6 +70,7 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -607,6 +608,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+	int cpu = get_cpu();
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	ret = spin_is_locked(&rq->lock);
+	put_cpu();
+	return ret;
+}
+
 /*
  * Debugging: various feature bits
  */
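runqueue_is_locked(), added in the hunk above, exists for exactly one awkward caller: printk() may run with the current CPU's runqueue lock held, and waking klogd in that situation would deadlock. A hedged sketch of the intended check on the printk side (illustrative only; log_wait is the printk wait queue):

	/* only safe to kick klogd when the runqueue lock is not held */
	if (!runqueue_is_locked())
		wake_up_interruptible(&log_wait);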
@@ -831,7 +850,7 @@ static unsigned long long __cpu_clock(int cpu)
 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
 * clock constructed from sched_clock():
 */
-unsigned long long cpu_clock(int cpu)
+unsigned long long notrace cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
 	unsigned long flags;
@@ -2149,6 +2168,9 @@ out_activate:
 	success = 1;
 
 out_running:
+	trace_mark(kernel_sched_wakeup,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
@@ -2279,6 +2301,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(p, rq);
 	}
+	trace_mark(kernel_sched_wakeup_new,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2451,6 +2476,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+	trace_mark(kernel_sched_schedule,
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		prev->pid, next->pid, prev->state,
+		rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -4021,26 +4051,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -4052,7 +4100,10 @@ void __kprobes sub_preempt_count(int val)
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
@@ -5384,7 +5435,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {

+ 1 - 0
kernel/semaphore.c

@@ -31,6 +31,7 @@
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
+#include <linux/ftrace.h>
 
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);

+ 1 - 1
kernel/spinlock.c

@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
 
-int in_lock_functions(unsigned long addr)
+notrace int in_lock_functions(unsigned long addr)
 {
 	/* Linker adds these: start and end of __lockfunc functions */
 	extern char __lock_text_start[], __lock_text_end[];

+ 11 - 0
kernel/sysctl.c

@@ -46,6 +46,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_FTRACE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_enabled",
+		.data		= &ftrace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ftrace_enable_sysctl,
+	},
+#endif
 #ifdef CONFIG_KMOD
 #ifdef CONFIG_KMOD
 	{
 		.ctl_name	= KERN_MODPROBE,
+ 127 - 0
kernel/trace/Kconfig

@@ -0,0 +1,127 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+	bool
+
+config HAVE_DYNAMIC_FTRACE
+	bool
+
+config TRACER_MAX_TRACE
+	bool
+
+config TRACING
+	bool
+	select DEBUG_FS
+	select STACKTRACE
+
+config FTRACE
+	bool "Kernel Function Tracer"
+	depends on HAVE_FTRACE
+	select FRAME_POINTER
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	help
+	  Enable the kernel to trace every kernel function. This is done
+	  by using a compiler feature to insert a small, 5-byte No-Operation
+	  instruction to the beginning of every kernel function, which NOP
+	  sequence is then dynamically patched into a tracer call when
+	  tracing is enabled by the administrator. If it's runtime disabled
+	  (the bootup default), then the overhead of the instructions is very
+	  small and not measurable even in micro-benchmarks.
+
+config IRQSOFF_TRACER
+	bool "Interrupts-off Latency Tracer"
+	default n
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on GENERIC_TIME
+	depends on HAVE_FTRACE
+	select TRACE_IRQFLAGS
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+	bool "Preemption-off Latency Tracer"
+	default n
+	depends on GENERIC_TIME
+	depends on PREEMPT
+	depends on HAVE_FTRACE
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in preemption off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config SCHED_TRACER
+	bool "Scheduling Latency Tracer"
+	depends on HAVE_FTRACE
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	select TRACER_MAX_TRACE
+	help
+	  This tracer tracks the latency of the highest priority task
+	  to be scheduled in, starting from the point it has woken up.
+
+config CONTEXT_SWITCH_TRACER
+	bool "Trace process context switches"
+	depends on HAVE_FTRACE
+	select TRACING
+	select MARKERS
+	help
+	  This tracer gets called from the context switch and records
+	  all switching of tasks.
+
+config DYNAMIC_FTRACE
+	bool "enable/disable ftrace tracepoints dynamically"
+	depends on FTRACE
+	depends on HAVE_DYNAMIC_FTRACE
+	default y
+	help
+	  This option will modify all the calls to ftrace dynamically
+	  (will patch them out of the binary image and replace them
+	  with a No-Op instruction) as they are called. A table is
+	  created to dynamically enable them again.
+
+	  This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+	  has native performance as long as no tracing is active.
+
+	  The changes to the code are done by a kernel thread that
+	  wakes up once a second and checks to see if any ftrace calls
+	  were made. If so, it runs stop_machine (stops all CPUs)
+	  and modifies the code to jump over the call to ftrace.
+
+config FTRACE_SELFTEST
+	bool
+
+config FTRACE_STARTUP_TEST
+	bool "Perform a startup test on ftrace"
+	depends on TRACING
+	select FTRACE_SELFTEST
+	help
+	  This option performs a series of startup tests on ftrace. On bootup
+	  a series of tests are made to verify that the tracer is
+	  functioning properly. It will do tests on all the configured
+	  tracers of ftrace.

+ 22 - 0
kernel/trace/Makefile

@@ -0,0 +1,22 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FTRACE
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
+endif
+
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+
+libftrace-y := ftrace.o

+ 1710 - 0
kernel/trace/ftrace.c

@@ -0,0 +1,1710 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+
+#include <asm/ftrace.h>
+
+#include "trace.h"
+
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
+static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+	.func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_ops *op = ftrace_list;
+
+	/* in case someone actually ports this to alpha! */
+	read_barrier_depends();
+
+	while (op != &ftrace_list_end) {
+		/* silly alpha */
+		read_barrier_depends();
+		op->func(ip, parent_ip);
+		op = op->next;
+	};
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
+
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+	/* Should never be called by interrupts */
+	spin_lock(&ftrace_lock);
+
+	ops->next = ftrace_list;
+	/*
+	 * We are entering ops into the ftrace_list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the ftrace_list.
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+
+	if (ftrace_enabled) {
+		/*
+		 * For one func, simply call it directly.
+		 * For more than one func, call the chain.
+		 */
+		if (ops->next == &ftrace_list_end)
+			ftrace_trace_function = ops->func;
+		else
+			ftrace_trace_function = ftrace_list_func;
+	}
+
+	spin_unlock(&ftrace_lock);
+
+	return 0;
+}
+
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock(&ftrace_lock);
+
+	/*
+	 * If we are removing the last function, then simply point
+	 * to the ftrace_stub.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	if (ftrace_enabled) {
+		/* If we only have one func left, then call that directly */
+		if (ftrace_list == &ftrace_list_end ||
+		    ftrace_list->next == &ftrace_list_end)
+			ftrace_trace_function = ftrace_list->func;
+	}
+
+ out:
+	spin_unlock(&ftrace_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct task_struct *ftraced_task;
+
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+};
+
+static int ftrace_filtered;
+static int tracing_on;
+static int frozen_record_count;
+
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+
+static DEFINE_SPINLOCK(ftrace_shutdown_lock);
+static DEFINE_MUTEX(ftraced_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	unsigned long		index;
+	struct dyn_ftrace	records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+static int ftraced_trigger;
+static int ftraced_suspend;
+static int ftraced_stop;
+
+static int ftrace_record_suspend;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+
+#ifdef CONFIG_KPROBES
+static inline void freeze_record(struct dyn_ftrace *rec)
+{
+	if (!(rec->flags & FTRACE_FL_FROZEN)) {
+		rec->flags |= FTRACE_FL_FROZEN;
+		frozen_record_count++;
+	}
+}
+
+static inline void unfreeze_record(struct dyn_ftrace *rec)
+{
+	if (rec->flags & FTRACE_FL_FROZEN) {
+		rec->flags &= ~FTRACE_FL_FROZEN;
+		frozen_record_count--;
+	}
+}
+
+static inline int record_frozen(struct dyn_ftrace *rec)
+{
+	return rec->flags & FTRACE_FL_FROZEN;
+}
+#else
+# define freeze_record(rec)			({ 0; })
+# define unfreeze_record(rec)			({ 0; })
+# define record_frozen(rec)			({ 0; })
+#endif /* CONFIG_KPROBES */
+
+int skip_trace(unsigned long ip)
+{
+	unsigned long fl;
+	struct dyn_ftrace *rec;
+	struct hlist_node *t;
+	struct hlist_head *head;
+
+	if (frozen_record_count == 0)
+		return 0;
+
+	head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
+	hlist_for_each_entry_rcu(rec, t, head, node) {
+		if (rec->ip == ip) {
+			if (record_frozen(rec)) {
+				if (rec->flags & FTRACE_FL_FAILED)
+					return 1;
+
+				if (!(rec->flags & FTRACE_FL_CONVERTED))
+					return 1;
+
+				if (!tracing_on || !ftrace_enabled)
+					return 1;
+
+				if (ftrace_filtered) {
+					fl = rec->flags & (FTRACE_FL_FILTER |
+							   FTRACE_FL_NOTRACE);
+					if (!fl || (fl & FTRACE_FL_NOTRACE))
+						return 1;
+				}
+			}
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static inline int
+ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+	struct dyn_ftrace *p;
+	struct hlist_node *t;
+	int found = 0;
+
+	hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
+		if (p->ip == ip) {
+			found = 1;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static inline void
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+	hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
+}
+
+/* called from kstop_machine */
+static inline void ftrace_del_hash(struct dyn_ftrace *node)
+{
+	hlist_del(&node->node);
+}
+
+static void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+	/* no locking, only called from kstop_machine */
+
+	rec->ip = (unsigned long)ftrace_free_records;
+	ftrace_free_records = rec;
+	rec->flags |= FTRACE_FL_FREE;
+}
+
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+{
+	struct dyn_ftrace *rec;
+
+	/* First check for freed records */
+	if (ftrace_free_records) {
+		rec = ftrace_free_records;
+
+		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+			WARN_ON_ONCE(1);
+			ftrace_free_records = NULL;
+			ftrace_disabled = 1;
+			ftrace_enabled = 0;
+			return NULL;
+		}
+
+		ftrace_free_records = (void *)rec->ip;
+		memset(rec, 0, sizeof(*rec));
+		return rec;
+	}
+
+	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+		if (!ftrace_pages->next)
+			return NULL;
+		ftrace_pages = ftrace_pages->next;
+	}
+
+	return &ftrace_pages->records[ftrace_pages->index++];
+}
+
+static void
+ftrace_record_ip(unsigned long ip)
+{
+	struct dyn_ftrace *node;
+	unsigned long flags;
+	unsigned long key;
+	int resched;
+	int atomic;
+	int cpu;
+
+	if (!ftrace_enabled || ftrace_disabled)
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	/*
+	 * We simply need to protect against recursion.
+	 * Use the raw version of smp_processor_id and not
+	 * __get_cpu_var which can call debug hooks that can
+	 * cause a recursive crash here.
+	 */
+	cpu = raw_smp_processor_id();
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
+	if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
+		goto out;
+
+	if (unlikely(ftrace_record_suspend))
+		goto out;
+
+	key = hash_long(ip, FTRACE_HASHBITS);
+
+	WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+
+	if (ftrace_ip_in_hash(ip, key))
+		goto out;
+
+	atomic = irqs_disabled();
+
+	spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+
+	/* This ip may have hit the hash before the lock */
+	if (ftrace_ip_in_hash(ip, key))
+		goto out_unlock;
+
+	node = ftrace_alloc_dyn_node(ip);
+	if (!node)
+		goto out_unlock;
+
+	node->ip = ip;
+
+	ftrace_add_hash(node, key);
+
+	ftraced_trigger = 1;
+
+ out_unlock:
+	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+
+	/* prevent recursion with scheduler */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+#define FTRACE_ADDR ((long)(ftrace_caller))
+
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec,
+		      unsigned char *old, unsigned char *new, int enable)
+{
+	unsigned long ip, fl;
+
+	ip = rec->ip;
+
+	if (ftrace_filtered && enable) {
+		/*
+		 * If filtering is on:
+		 *
+		 * If this record is set to be filtered and
+		 * is enabled then do nothing.
+		 *
+		 * If this record is set to be filtered and
+		 * it is not enabled, enable it.
+		 *
+		 * If this record is not set to be filtered
+		 * and it is not enabled do nothing.
+		 *
+		 * If this record is set not to trace then
+		 * do nothing.
+		 *
+		 * If this record is set not to trace and
+		 * it is enabled then disable it.
+		 *
+		 * If this record is not set to be filtered and
+		 * it is enabled, disable it.
+		 */
+
+		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
+				   FTRACE_FL_ENABLED);
+
+		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+		    (fl ==  (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
+		    !fl || (fl == FTRACE_FL_NOTRACE))
+			return 0;
+
+		/*
+		 * If it is enabled disable it,
+		 * otherwise enable it!
+		 */
+		if (fl & FTRACE_FL_ENABLED) {
+			/* swap new and old */
+			new = old;
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		} else {
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags |= FTRACE_FL_ENABLED;
+		}
+	} else {
+
+		if (enable) {
+			/*
+			 * If this record is set not to trace and is
+			 * not enabled, do nothing.
+			 */
+			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+			if (fl == FTRACE_FL_NOTRACE)
+				return 0;
+
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+		} else
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+
+		if (enable) {
+			if (rec->flags & FTRACE_FL_ENABLED)
+				return 0;
+			rec->flags |= FTRACE_FL_ENABLED;
+		} else {
+			if (!(rec->flags & FTRACE_FL_ENABLED))
+				return 0;
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		}
+	}
+
+	return ftrace_modify_code(ip, old, new);
+}
+
+static void ftrace_replace_code(int enable)
+{
+	int i, failed;
+	unsigned char *new = NULL, *old = NULL;
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+
+	if (enable)
+		old = ftrace_nop_replace();
+	else
+		new = ftrace_nop_replace();
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			/* don't modify code that has already faulted */
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+
+			/* ignore updates to this record's mcount site */
+			if (get_kprobe((void *)rec->ip)) {
+				freeze_record(rec);
+				continue;
+			} else {
+				unfreeze_record(rec);
+			}
+
+			failed = __ftrace_replace_code(rec, old, new, enable);
+			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+				rec->flags |= FTRACE_FL_FAILED;
+				if ((system_state == SYSTEM_BOOTING) ||
+				    !core_kernel_text(rec->ip)) {
+					ftrace_del_hash(rec);
+					ftrace_free_rec(rec);
+				}
+			}
+		}
+	}
+}
+
+static void ftrace_shutdown_replenish(void)
+{
+	if (ftrace_pages->next)
+		return;
+
+	/* allocate another page */
+	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static int
+ftrace_code_disable(struct dyn_ftrace *rec)
+{
+	unsigned long ip;
+	unsigned char *nop, *call;
+	int failed;
+
+	ip = rec->ip;
+
+	nop = ftrace_nop_replace();
+	call = ftrace_call_replace(ip, MCOUNT_ADDR);
+
+	failed = ftrace_modify_code(ip, call, nop);
+	if (failed) {
+		rec->flags |= FTRACE_FL_FAILED;
+		return 0;
+	}
+	return 1;
+}
+
+static int __ftrace_update_code(void *ignore);
+
+static int __ftrace_modify_code(void *data)
+{
+	unsigned long addr;
+	int *command = data;
+
+	if (*command & FTRACE_ENABLE_CALLS) {
+		/*
+		 * Update any recorded ips now that we have the
+		 * machine stopped
+		 */
+		__ftrace_update_code(NULL);
+		ftrace_replace_code(1);
+		tracing_on = 1;
+	} else if (*command & FTRACE_DISABLE_CALLS) {
+		ftrace_replace_code(0);
+		tracing_on = 0;
+	}
+
+	if (*command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (*command & FTRACE_ENABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_record_ip;
+		ftrace_mcount_set(&addr);
+	} else if (*command & FTRACE_DISABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_stub;
+		ftrace_mcount_set(&addr);
+	}
+
+	return 0;
+}
+
+static void ftrace_run_update_code(int command)
+{
+	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+}
+
+void ftrace_disable_daemon(void)
+{
+	/* Stop the daemon from calling kstop_machine */
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 1;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+void ftrace_enable_daemon(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 0;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+static ftrace_func_t saved_ftrace_func;
+
+static void ftrace_startup(void)
+{
+	int command = 0;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend++;
+	if (ftraced_suspend == 1)
+		command |= FTRACE_ENABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_shutdown(void)
+{
+	int command = 0;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend--;
+	if (!ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_startup_sysctl(void)
+{
+	int command = FTRACE_ENABLE_MCOUNT;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	/* Force update next time */
+	saved_ftrace_func = NULL;
+	/* ftraced_suspend is true if we want ftrace running */
+	if (ftraced_suspend)
+		command |= FTRACE_ENABLE_CALLS;
+
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+	int command = FTRACE_DISABLE_MCOUNT;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	/* ftraced_suspend is true if ftrace is running */
+	if (ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
+
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+}
+
+static cycle_t		ftrace_update_time;
+static unsigned long	ftrace_update_cnt;
+unsigned long		ftrace_update_tot_cnt;
+
+static int __ftrace_update_code(void *ignore)
+{
+	int i, save_ftrace_enabled;
+	cycle_t start, stop;
+	struct dyn_ftrace *p;
+	struct hlist_node *t, *n;
+	struct hlist_head *head, temp_list;
+
+	/* Don't be recording funcs now */
+	ftrace_record_suspend++;
+	save_ftrace_enabled = ftrace_enabled;
+	ftrace_enabled = 0;
+
+	start = ftrace_now(raw_smp_processor_id());
+	ftrace_update_cnt = 0;
+
+	/* No locks needed, the machine is stopped! */
+	for (i = 0; i < FTRACE_HASHSIZE; i++) {
+		INIT_HLIST_HEAD(&temp_list);
+		head = &ftrace_hash[i];
+
+		/* all CPUs are stopped, we are safe to modify code */
+		hlist_for_each_entry_safe(p, t, n, head, node) {
+			/* Skip over failed records which have not been
+			 * freed. */
+			if (p->flags & FTRACE_FL_FAILED)
+				continue;
+
+			/* Unconverted records are always at the head of the
+			 * hash bucket. Once we encounter a converted record,
+			 * simply skip over to the next bucket. Saves ftraced
+			 * some processor cycles (ftrace does its bit for
+			 * global warming :-p ). */
+			if (p->flags & (FTRACE_FL_CONVERTED))
+				break;
+
+			/* Ignore updates to this record's mcount site.
+			 * Reintroduce this record at the head of this
+			 * bucket to attempt to "convert" it again if
+			 * the kprobe on it is unregistered before the
+			 * next run. */
+			if (get_kprobe((void *)p->ip)) {
+				ftrace_del_hash(p);
+				INIT_HLIST_NODE(&p->node);
+				hlist_add_head(&p->node, &temp_list);
+				freeze_record(p);
+				continue;
+			} else {
+				unfreeze_record(p);
+			}
+
+			/* convert record (i.e., patch mcount-call with NOP) */
+			if (ftrace_code_disable(p)) {
+				p->flags |= FTRACE_FL_CONVERTED;
+				ftrace_update_cnt++;
+			} else {
+				if ((system_state == SYSTEM_BOOTING) ||
+				    !core_kernel_text(p->ip)) {
+					ftrace_del_hash(p);
+					ftrace_free_rec(p);
+				}
+			}
+		}
+
+		hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
+			hlist_del(&p->node);
+			INIT_HLIST_NODE(&p->node);
+			hlist_add_head(&p->node, head);
+		}
+	}
+
+	stop = ftrace_now(raw_smp_processor_id());
+	ftrace_update_time = stop - start;
+	ftrace_update_tot_cnt += ftrace_update_cnt;
+	ftraced_trigger = 0;
+
+	ftrace_enabled = save_ftrace_enabled;
+	ftrace_record_suspend--;
+
+	return 0;
+}
+
+static int ftrace_update_code(void)
+{
+	if (unlikely(ftrace_disabled) ||
+	    !ftrace_enabled || !ftraced_trigger)
+		return 0;
+
+	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+
+	return 1;
+}
+
+static int ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	while (!kthread_should_stop()) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		if (unlikely(ftrace_disabled))
+			continue;
+
+		mutex_lock(&ftrace_sysctl_lock);
+		mutex_lock(&ftraced_lock);
+		if (!ftraced_suspend && !ftraced_stop &&
+		    ftrace_update_code()) {
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				ftrace_update_tot_cnt = 0;
+				pr_info("hm, dftrace overflow: %lu change%s"
+					" (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				ftrace_disabled = 1;
+				WARN_ON_ONCE(1);
+			}
+		}
+		mutex_unlock(&ftraced_lock);
+		mutex_unlock(&ftrace_sysctl_lock);
+
+		ftrace_shutdown_replenish();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __init ftrace_dyn_table_alloc(void)
+{
+	struct ftrace_page *pg;
+	int cnt;
+	int i;
+
+	/* allocate a few pages */
+	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ftrace_pages_start)
+		return -1;
+
+	/*
+	 * Allocate a few more pages.
+	 *
+	 * TODO: have some parser search vmlinux before
+	 *   final linking to find all calls to ftrace.
+	 *   Then we can:
+	 *    a) know how many pages to allocate.
+	 *     and/or
+	 *    b) set up the table then.
+	 *
+	 *  The dynamic code is still necessary for
+	 *  modules.
+	 */
+
+	pg = ftrace_pages = ftrace_pages_start;
+
+	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+	for (i = 0; i < cnt; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+		/* If we fail, we'll try later anyway */
+		if (!pg->next)
+			break;
+
+		pg = pg->next;
+	}
+
+	return 0;
+}
+
+enum {
+	FTRACE_ITER_FILTER	= (1 << 0),
+	FTRACE_ITER_CONT	= (1 << 1),
+	FTRACE_ITER_NOTRACE	= (1 << 2),
+	FTRACE_ITER_FAILURES	= (1 << 3),
+};
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+struct ftrace_iterator {
+	loff_t			pos;
+	struct ftrace_page	*pg;
+	unsigned		idx;
+	unsigned		flags;
+	unsigned char		buffer[FTRACE_BUFF_MAX+1];
+	unsigned		buffer_idx;
+	unsigned		filtered;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct dyn_ftrace *rec = NULL;
+
+	(*pos)++;
+
+ retry:
+	if (iter->idx >= iter->pg->index) {
+		if (iter->pg->next) {
+			iter->pg = iter->pg->next;
+			iter->idx = 0;
+			goto retry;
+		}
+	} else {
+		rec = &iter->pg->records[iter->idx++];
+		if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+		     (rec->flags & FTRACE_FL_FAILED)) ||
+
+		    ((iter->flags & FTRACE_ITER_FAILURES) &&
+		     (!(rec->flags & FTRACE_FL_FAILED) ||
+		      (rec->flags & FTRACE_FL_FREE))) ||
+
+		    ((iter->flags & FTRACE_ITER_FILTER) &&
+		     !(rec->flags & FTRACE_FL_FILTER)) ||
+
+		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
+		     !(rec->flags & FTRACE_FL_NOTRACE))) {
+			rec = NULL;
+			goto retry;
+		}
+	}
+
+	iter->pos = *pos;
+
+	return rec;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = -1;
+
+	if (*pos != iter->pos) {
+		for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+			;
+	} else {
+		l = *pos;
+		p = t_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	if (!rec)
+		return 0;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "%s\n", str);
+
+	return 0;
+}
+
+static struct seq_operations show_ftrace_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.stop = t_stop,
+	.show = t_show,
+};
+
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	iter->pg = ftrace_pages_start;
+	iter->pos = -1;
+
+	ret = seq_open(file, &show_ftrace_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = iter;
+	} else {
+		kfree(iter);
+	}
+
+	return ret;
+}
+
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter = m->private;
+
+	seq_release(inode, file);
+	kfree(iter);
+
+	return 0;
+}
+
+static int
+ftrace_failures_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct ftrace_iterator *iter;
+
+	ret = ftrace_avail_open(inode, file);
+	if (!ret) {
+		m = (struct seq_file *)file->private_data;
+		iter = (struct ftrace_iterator *)m->private;
+		iter->flags = FTRACE_ITER_FAILURES;
+	}
+
+	return ret;
+}
+
+
+static void ftrace_filter_reset(int enable)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned i;
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	if (enable)
+		ftrace_filtered = 0;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			rec->flags &= ~type;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static int
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+{
+	struct ftrace_iterator *iter;
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&ftrace_regex_lock);
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND))
+		ftrace_filter_reset(enable);
+
+	if (file->f_mode & FMODE_READ) {
+		iter->pg = ftrace_pages_start;
+		iter->pos = -1;
+		iter->flags = enable ? FTRACE_ITER_FILTER :
+			FTRACE_ITER_NOTRACE;
+
+		ret = seq_open(file, &show_ftrace_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = iter;
+		} else
+			kfree(iter);
+	} else
+		file->private_data = iter;
+	mutex_unlock(&ftrace_regex_lock);
+
+	return ret;
+}
+
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 1);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 0);
+}
+
+static ssize_t
+ftrace_regex_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_read(file, ubuf, cnt, ppos);
+	else
+		return -EPERM;
+}
+
+static loff_t
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, origin);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
+enum {
+	MATCH_FULL,
+	MATCH_FRONT_ONLY,
+	MATCH_MIDDLE_ONLY,
+	MATCH_END_ONLY,
+};
+
+static void
+ftrace_match(unsigned char *buff, int len, int enable)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *search = NULL;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type = MATCH_FULL;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned i, match = 0, search_len = 0;
+
+	for (i = 0; i < len; i++) {
+		if (buff[i] == '*') {
+			if (!i) {
+				search = buff + i + 1;
+				type = MATCH_END_ONLY;
+				search_len = len - (i + 1);
+			} else {
+				if (type == MATCH_END_ONLY) {
+					type = MATCH_MIDDLE_ONLY;
+				} else {
+					match = i;
+					type = MATCH_FRONT_ONLY;
+				}
+				buff[i] = 0;
+				break;
+			}
+		}
+	}
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	if (enable)
+		ftrace_filtered = 1;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			int matched = 0;
+			char *ptr;
+
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+			switch (type) {
+			case MATCH_FULL:
+				if (strcmp(str, buff) == 0)
+					matched = 1;
+				break;
+			case MATCH_FRONT_ONLY:
+				if (memcmp(str, buff, match) == 0)
+					matched = 1;
+				break;
+			case MATCH_MIDDLE_ONLY:
+				if (strstr(str, search))
+					matched = 1;
+				break;
+			case MATCH_END_ONLY:
+				ptr = strstr(str, search);
+				if (ptr && (ptr[search_len] == 0))
+					matched = 1;
+				break;
+			}
+			if (matched)
+				rec->flags |= flag;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos, int enable)
+{
+	struct ftrace_iterator *iter;
+	char ch;
+	size_t read = 0;
+	ssize_t ret;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	mutex_lock(&ftrace_regex_lock);
+
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		iter = m->private;
+	} else
+		iter = file->private_data;
+
+	if (!*ppos) {
+		iter->flags &= ~FTRACE_ITER_CONT;
+		iter->buffer_idx = 0;
+	}
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+	read++;
+	cnt--;
+
+	if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+		/* skip white space */
+		while (cnt && isspace(ch)) {
+			ret = get_user(ch, ubuf++);
+			if (ret)
+				goto out;
+			read++;
+			cnt--;
+		}
+
+		if (isspace(ch)) {
+			file->f_pos += read;
+			ret = read;
+			goto out;
+		}
+
+		iter->buffer_idx = 0;
+	}
+
+	while (cnt && !isspace(ch)) {
+		if (iter->buffer_idx < FTRACE_BUFF_MAX)
+			iter->buffer[iter->buffer_idx++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	if (isspace(ch)) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		iter->buffer_idx = 0;
+	} else
+		iter->flags |= FTRACE_ITER_CONT;
+
+
+	file->f_pos += read;
+
+	ret = read;
+ out:
+	mutex_unlock(&ftrace_regex_lock);
+
+	return ret;
+}
+
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (reset)
+		ftrace_filter_reset(enable);
+	if (buf)
+		ftrace_match(buf, len, enable);
+	mutex_unlock(&ftrace_regex_lock);
+}
+
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 1);
+}
+
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace filters denote which functions should not be traced when tracing
+ * is enabled. If @buf is NULL and reset is set, the notrace list is cleared,
+ * leaving all functions eligible for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 0);
+}
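As a rough illustration of how these two helpers combine with the wildcard handling in ftrace_match() above (the patterns and the calling context below are hypothetical):

	/* hypothetical tracer setup code */
	unsigned char filter[] = "sched_*";	/* trailing '*': MATCH_FRONT_ONLY */
	unsigned char skip[] = "*_lock";	/* leading '*': MATCH_END_ONLY */

	/* reset the old lists, then trace sched_* but never *_lock */
	ftrace_set_filter(filter, sizeof(filter) - 1, 1);
	ftrace_set_notrace(skip, sizeof(skip) - 1, 1);

	/* a NULL @buf together with @reset simply clears the corresponding list */
	ftrace_set_filter(NULL, 0, 1);

Note that the pattern buffers must be writable, since ftrace_match() terminates the pattern in place at the '*'.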
+
+static int
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (file->f_mode & FMODE_READ) {
+		iter = m->private;
+
+		seq_release(inode, file);
+	} else
+		iter = file->private_data;
+
+	if (iter->buffer_idx) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+	}
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+	if (iter->filtered && ftraced_suspend && ftrace_enabled)
+		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	kfree(iter);
+	mutex_unlock(&ftrace_regex_lock);
+	return 0;
+}
+
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 1);
+}
+
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 0);
+}
+
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	/* don't worry about races */
+	char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
+	int r = strlen(buf);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (strncmp(buf, "enable", 6) == 0)
+		val = 1;
+	else if (strncmp(buf, "disable", 7) == 0)
+		val = 0;
+	else {
+		buf[cnt] = 0;
+
+		ret = strict_strtoul(buf, 10, &val);
+		if (ret < 0)
+			return ret;
+
+		val = !!val;
+	}
+
+	if (val)
+		ftrace_enable_daemon();
+	else
+		ftrace_disable_daemon();
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations ftrace_avail_fops = {
+	.open = ftrace_avail_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_failures_fops = {
+	.open = ftrace_failures_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_filter_fops = {
+	.open = ftrace_filter_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_filter_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_filter_release,
+};
+
+static struct file_operations ftrace_notrace_fops = {
+	.open = ftrace_notrace_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_notrace_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_notrace_release,
+};
+
+static struct file_operations ftraced_fops = {
+	.open = tracing_open_generic,
+	.read = ftraced_read,
+	.write = ftraced_write,
+};
+
+/**
+ * ftrace_force_update - force an update to all recording ftrace functions
+ */
+int ftrace_force_update(void)
+{
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+
+	/*
+	 * If ftraced_trigger is not set, then there is nothing
+	 * to update.
+	 */
+	if (ftraced_trigger && !ftrace_update_code())
+		ret = -EBUSY;
+
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+static void ftrace_force_shutdown(void)
+{
+	struct task_struct *task;
+	int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
+
+	mutex_lock(&ftraced_lock);
+	task = ftraced_task;
+	ftraced_task = NULL;
+	ftraced_suspend = -1;
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+
+	if (task)
+		kthread_stop(task);
+}
+
+static __init int ftrace_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("available_filter_functions", 0444,
+				    d_tracer, NULL, &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_filter_functions' entry\n");
+
+	entry = debugfs_create_file("failures", 0444,
+				    d_tracer, NULL, &ftrace_failures_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'failures' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+				    NULL, &ftrace_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_filter' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+				    NULL, &ftrace_notrace_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_notrace' entry\n");
+
+	entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+				    NULL, &ftraced_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ftraced_enabled' entry\n");
+	return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
+static int __init ftrace_dynamic_init(void)
+{
+	struct task_struct *p;
+	unsigned long addr;
+	int ret;
+
+	addr = (unsigned long)ftrace_record_ip;
+
+	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+
+	/* ftrace_dyn_arch_init places the return code in addr */
+	if (addr) {
+		ret = (int)addr;
+		goto failed;
+	}
+
+	ret = ftrace_dyn_table_alloc();
+	if (ret)
+		goto failed;
+
+	p = kthread_run(ftraced, NULL, "ftraced");
+	if (IS_ERR(p)) {
+		ret = -1;
+		goto failed;
+	}
+
+	last_ftrace_enabled = ftrace_enabled = 1;
+	ftraced_task = p;
+
+	return 0;
+
+ failed:
+	ftrace_disabled = 1;
+	return ret;
+}
+
+core_initcall(ftrace_dynamic_init);
+#else
+# define ftrace_startup()		do { } while (0)
+# define ftrace_shutdown()		do { } while (0)
+# define ftrace_startup_sysctl()	do { } while (0)
+# define ftrace_shutdown_sysctl()	do { } while (0)
+# define ftrace_force_shutdown()	do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/**
+ * ftrace_kill - totally shutdown ftrace
+ *
+ * This is a safety measure. If something was detected that seems
+ * wrong, calling this function will keep ftrace from doing
+ * any more code modifications and updates. It is used when
+ * something has gone wrong.
+ */
+void ftrace_kill(void)
+{
+	mutex_lock(&ftrace_sysctl_lock);
+	ftrace_disabled = 1;
+	ftrace_enabled = 0;
+
+	clear_ftrace_function();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	/* Try to totally disable ftrace */
+	ftrace_force_shutdown();
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -1;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	ret = __register_ftrace_function(ops);
+	ftrace_startup();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+/**
+ * unregister_ftrace_function - unregister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	ret = __unregister_ftrace_function(ops);
+	ftrace_shutdown();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
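A minimal sketch of a caller, following the notrace rule spelled out above (the callback and ops names are made up; the shape mirrors the function tracer hooked up later in trace.c):

static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* must not call any traced function here, or ftrace recurses */
}

static struct ftrace_ops my_ops __read_mostly = {
	.func	= my_trace_func,
};

	/* in some init path */
	register_ftrace_function(&my_ops);
	/* ... and on teardown */
	unregister_ftrace_function(&my_ops);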
+
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+		     struct file *file, void __user *buffer, size_t *lenp,
+		     loff_t *ppos)
+{
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&ftrace_sysctl_lock);
+
+	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+	if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+		goto out;
+
+	last_ftrace_enabled = ftrace_enabled;
+
+	if (ftrace_enabled) {
+
+		ftrace_startup_sysctl();
+
+		/* we are starting ftrace again */
+		if (ftrace_list != &ftrace_list_end) {
+			if (ftrace_list->next == &ftrace_list_end)
+				ftrace_trace_function = ftrace_list->func;
+			else
+				ftrace_trace_function = ftrace_list_func;
+		}
+
+	} else {
+		/* stopping ftrace calls (just send to ftrace_stub) */
+		ftrace_trace_function = ftrace_stub;
+
+		ftrace_shutdown_sysctl();
+	}
+
+ out:
+	mutex_unlock(&ftrace_sysctl_lock);
+	return ret;
+}

+ 3100 - 0
kernel/trace/trace.c

@@ -0,0 +1,3100 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/kprobes.h>
+#include <linux/writeback.h>
+
+#include <linux/stacktrace.h>
+
+#include "trace.h"
+
+unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly	tracing_thresh;
+
+static unsigned long __read_mostly	tracing_nr_buffers;
+static cpumask_t __read_mostly		tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu)	\
+	for_each_cpu_mask(cpu, tracing_buffer_mask)
+
+static int trace_alloc_page(void);
+static int trace_free_page(void);
+
+static int tracing_disabled = 1;
+
+static unsigned long tracing_pages_allocated;
+
+long
+ns2usecs(cycle_t nsec)
+{
+	nsec += 500;
+	do_div(nsec, 1000);
+	return nsec;
+}
+
+cycle_t ftrace_now(int cpu)
+{
+	return cpu_clock(cpu);
+}
+
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a linked list of pages that will store the trace entries. The
+ * page descriptors of those pages are used to hold the list:
+ * the lru item of each page descriptor links the pages of
+ * that CPU's buffer together.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array	global_trace;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the per-CPU page lists of the max_tr are
+ * swapped with those of the global_trace, and the global_trace buffers
+ * are reset so that tracing can continue.
+ */
+static struct trace_array	max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+/* tracer_enabled is used to toggle activation of a tracer */
+static int			tracer_enabled = 1;
+
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
+static unsigned long		trace_nr_entries = 65536UL;
+
+/* trace_types holds a linked list of available tracers. */
+static struct tracer		*trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
+static struct tracer		*current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int			max_tracer_type_len;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+/* trace_flags holds iter_ctrl options */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+static notrace void no_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	if (tr->ctrl)
+		for_each_online_cpu(cpu)
+			tracing_reset(tr->data[cpu]);
+	tracer_enabled = 0;
+}
+
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly = {
+	.name		= "none",
+	.init		= no_trace_init
+};
+
+
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. This is used with trace_poll for tasks polling the trace.
+ */
+void trace_wake_up(void)
+{
+	/*
+	 * The runqueue_is_locked() can fail, but this is the best we
+	 * have for now:
+	 */
+	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+		wake_up(&trace_wait);
+}
+
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+
+static int __init set_nr_entries(char *str)
+{
+	unsigned long nr_entries;
+	int ret;
+
+	if (!str)
+		return 0;
+	ret = strict_strtoul(str, 0, &nr_entries);
+	/* nr_entries cannot be zero */
+	if (ret < 0 || nr_entries == 0)
+		return 0;
+	trace_nr_entries = nr_entries;
+	return 1;
+}
+__setup("trace_entries=", set_nr_entries);
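For example, booting with trace_entries=131072 on the kernel command line raises trace_nr_entries from its 65536 default before the buffers are allocated (the final count is still rounded in terms of ENTRIES_PER_PAGE); a value of zero, or a string strict_strtoul() cannot parse, is rejected and the default is kept.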
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+	return nsecs / 1000;
+}
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF	- interrupts were disabled
+ *  NEED_RESCHED - reschedule is requested
+ *  HARDIRQ	- inside an interrupt handler
+ *  SOFTIRQ	- inside a softirq handler
+ */
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_HARDIRQ		= 0x04,
+	TRACE_FLAG_SOFTIRQ		= 0x08,
+};
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/* These must match the bit positions in trace_iterator_flags */
+static const char *trace_options[] = {
+	"print-parent",
+	"sym-offset",
+	"sym-addr",
+	"verbose",
+	"raw",
+	"hex",
+	"bin",
+	"block",
+	"stacktrace",
+	"sched-tree",
+	NULL
+};
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static raw_spinlock_t ftrace_max_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+
+	max_tr.cpu = cpu;
+	max_tr.time_start = data->preempt_timestamp;
+
+	data = max_tr.data[cpu];
+	data->saved_latency = tracing_max_latency;
+
+	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+	data->pid = tsk->pid;
+	data->uid = tsk->uid;
+	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	data->policy = tsk->policy;
+	data->rt_priority = tsk->rt_priority;
+
+	/* record this task's comm */
+	tracing_record_cmdline(current);
+}
+
+#define CHECK_COND(cond)			\
+	if (unlikely(cond)) {			\
+		tracing_disabled = 1;		\
+		WARN_ON(1);			\
+		return -1;			\
+	}
+
+/**
+ * check_pages - integrity check of trace buffers
+ *
+ * As a safety measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+int check_pages(struct trace_array_cpu *data)
+{
+	struct page *page, *tmp;
+
+	CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+	CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
+
+	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
+		CHECK_COND(page->lru.next->prev != &page->lru);
+		CHECK_COND(page->lru.prev->next != &page->lru);
+	}
+
+	return 0;
+}
+
+/**
+ * head_page - page address of the first page in a per_cpu buffer.
+ *
+ * head_page returns the page address of the first page in
+ * a per_cpu buffer. This also performs various consistency
+ * checks to make sure the buffer has not been corrupted.
+ */
+void *head_page(struct trace_array_cpu *data)
+{
+	struct page *page;
+
+	if (list_empty(&data->trace_pages))
+		return NULL;
+
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	BUG_ON(&page->lru == &data->trace_pages);
+
+	return page_address(page);
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	va_list ap;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	va_end(ap);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
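For a sense of the intended usage, a hypothetical output helper for a TRACE_FN entry might look like this (the helper name and format string are made up; the entry fields are the ones used elsewhere in this file):

static int my_print_fn_entry(struct trace_seq *s, struct trace_entry *entry)
{
	/* trace_seq_printf() returns 0 once the page-sized buffer is full */
	return trace_seq_printf(s, "%5d [ip=%08lx <- %08lx] %s\n",
				entry->pid, entry->fn.ip, entry->fn.parent_ip,
				trace_find_cmdline(entry->pid));
}

The accumulated text is later drained by trace_print_seq() or trace_seq_to_user() below.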
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+static int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	int len = strlen(str);
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, str, len);
+	s->len += len;
+
+	return len;
+}
+
+static int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+
+	s->buffer[s->len++] = c;
+
+	return 1;
+}
+
+static int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, mem, len);
+	s->len += len;
+
+	return len;
+}
+
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+
+static int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+	unsigned char hex[HEX_CHARS];
+	unsigned char *data = mem;
+	unsigned char byte;
+	int i, j;
+
+	BUG_ON(len >= HEX_CHARS);
+
+#ifdef __BIG_ENDIAN
+	for (i = 0, j = 0; i < len; i++) {
+#else
+	for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+		byte = data[i];
+
+		hex[j++] = hex2asc[byte & 0x0f];
+		hex[j++] = hex2asc[byte >> 4];
+	}
+	hex[j++] = ' ';
+
+	return trace_seq_putmem(s, hex, j);
+}
+
+static void
+trace_seq_reset(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+	int len;
+	int ret;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	if (ret)
+		return -EFAULT;
+
+	s->readpos += len;
+	return cnt;
+}
+
+static void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_reset(s);
+}
+
+/*
+ * flip the trace buffers between two trace descriptors.
+ * This is usually done between the global_trace and the
+ * max_tr, to record a snapshot of the current trace.
+ *
+ * The ftrace_max_lock must be held.
+ */
+static void
+flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
+{
+	struct list_head flip_pages;
+
+	INIT_LIST_HEAD(&flip_pages);
+
+	memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
+		sizeof(struct trace_array_cpu) -
+		offsetof(struct trace_array_cpu, trace_head_idx));
+
+	check_pages(tr1);
+	check_pages(tr2);
+	list_splice_init(&tr1->trace_pages, &flip_pages);
+	list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
+	list_splice_init(&flip_pages, &tr2->trace_pages);
+	BUG_ON(!list_empty(&flip_pages));
+	check_pages(tr1);
+	check_pages(tr2);
+}
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data;
+	int i;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	__raw_spin_lock(&ftrace_max_lock);
+	/* clear out all the previous traces */
+	for_each_tracing_cpu(i) {
+		data = tr->data[i];
+		flip_trace(max_tr.data[i], data);
+		tracing_reset(data);
+	}
+
+	__update_max_tr(tr, tsk, cpu);
+	__raw_spin_unlock(&ftrace_max_lock);
+}
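Roughly how a latency tracer is expected to drive this, as a simplified sketch (the variable names are illustrative; callers run with interrupts disabled, as the WARN_ON_ONCE above requires):

	delta = ftrace_now(cpu) - data->preempt_timestamp;

	if (delta > tracing_max_latency) {
		tracing_max_latency = delta;
		update_max_tr(tr, current, cpu);	/* snapshot every CPU buffer */
	}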
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr - tracer
+ * @tsk - task with the latency
+ * @cpu - the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+	int i;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(i)
+		tracing_reset(max_tr.data[i]);
+
+	flip_trace(max_tr.data[cpu], data);
+	tracing_reset(data);
+
+	__update_max_tr(tr, tsk, cpu);
+	__raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type - the plugin for the tracer
+ *
+ * Register a new plugin tracer.
+ */
+int register_tracer(struct tracer *type)
+{
+	struct tracer *t;
+	int len;
+	int ret = 0;
+
+	if (!type->name) {
+		pr_info("Tracer must have a name\n");
+		return -1;
+	}
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(type->name, t->name) == 0) {
+			/* already found */
+			pr_info("Trace %s already registered\n",
+				type->name);
+			ret = -1;
+			goto out;
+		}
+	}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	if (type->selftest) {
+		struct tracer *saved_tracer = current_trace;
+		struct trace_array_cpu *data;
+		struct trace_array *tr = &global_trace;
+		int saved_ctrl = tr->ctrl;
+		int i;
+		/*
+		 * Run a selftest on this tracer.
+		 * Here we reset the trace buffer, and set the current
+		 * tracer to be this tracer. The tracer can then run some
+		 * internal tracing to verify that everything is in order.
+		 * If we fail, we do not register this tracer.
+		 */
+		for_each_tracing_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
+		current_trace = type;
+		tr->ctrl = 0;
+		/* the test is responsible for initializing and enabling */
+		pr_info("Testing tracer %s: ", type->name);
+		ret = type->selftest(type, tr);
+		/* the test is responsible for resetting too */
+		current_trace = saved_tracer;
+		tr->ctrl = saved_ctrl;
+		if (ret) {
+			printk(KERN_CONT "FAILED!\n");
+			goto out;
+		}
+		/* Only reset on passing, to avoid touching corrupted buffers */
+		for_each_tracing_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
+		printk(KERN_CONT "PASSED\n");
+	}
+#endif
+
+	type->next = trace_types;
+	trace_types = type;
+	len = strlen(type->name);
+	if (len > max_tracer_type_len)
+		max_tracer_type_len = len;
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
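For reference, a plugin registers itself much like the dummy no_tracer above; a minimal, hypothetical tracer could look like this (register_tracer() itself only insists on .name, real plugins fill in more callbacks):

static void my_tracer_init(struct trace_array *tr)
{
	int cpu;

	if (tr->ctrl)
		for_each_online_cpu(cpu)
			tracing_reset(tr->data[cpu]);
}

static struct tracer my_tracer __read_mostly = {
	.name	= "mytracer",
	.init	= my_tracer_init,
};

static int __init init_my_tracer(void)
{
	return register_tracer(&my_tracer);
}
device_initcall(init_my_tracer);

Selecting the registered tracer at run time is then handled by the debugfs interface set up later in this file.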
+
+void unregister_tracer(struct tracer *type)
+{
+	struct tracer **t;
+	int len;
+
+	mutex_lock(&trace_types_lock);
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		if (*t == type)
+			goto found;
+	}
+	pr_info("Trace %s not registered\n", type->name);
+	goto out;
+
+ found:
+	*t = (*t)->next;
+	if (strlen(type->name) != max_tracer_type_len)
+		goto out;
+
+	max_tracer_type_len = 0;
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		len = strlen((*t)->name);
+		if (len > max_tracer_type_len)
+			max_tracer_type_len = len;
+	}
+ out:
+	mutex_unlock(&trace_types_lock);
+}
+
+void tracing_reset(struct trace_array_cpu *data)
+{
+	data->trace_idx = 0;
+	data->overrun = 0;
+	data->trace_head = data->trace_tail = head_page(data);
+	data->trace_head_idx = 0;
+	data->trace_tail_idx = 0;
+}
+
+#define SAVED_CMDLINES 128
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+static int cmdline_idx;
+static DEFINE_SPINLOCK(trace_cmdline_lock);
+
+/* temporarily disable recording */
+atomic_t trace_record_cmdline_disabled __read_mostly;
+
+static void trace_init_cmdlines(void)
+{
+	memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+	memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+	cmdline_idx = 0;
+}
+
+void trace_stop_cmdline_recording(void);
+
+static void trace_save_cmdline(struct task_struct *tsk)
+{
+	unsigned map;
+	unsigned idx;
+
+	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+		return;
+
+	/*
+	 * It's not the end of the world if we don't get
+	 * the lock, but we also don't want to spin
+	 * nor do we want to disable interrupts,
+	 * so if we miss here, then better luck next time.
+	 */
+	if (!spin_trylock(&trace_cmdline_lock))
+		return;
+
+	idx = map_pid_to_cmdline[tsk->pid];
+	if (idx >= SAVED_CMDLINES) {
+		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+
+		map = map_cmdline_to_pid[idx];
+		if (map <= PID_MAX_DEFAULT)
+			map_pid_to_cmdline[map] = (unsigned)-1;
+
+		map_pid_to_cmdline[tsk->pid] = idx;
+
+		cmdline_idx = idx;
+	}
+
+	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+
+	spin_unlock(&trace_cmdline_lock);
+}
+
+static char *trace_find_cmdline(int pid)
+{
+	char *cmdline = "<...>";
+	unsigned map;
+
+	if (!pid)
+		return "<idle>";
+
+	if (pid > PID_MAX_DEFAULT)
+		goto out;
+
+	map = map_pid_to_cmdline[pid];
+	if (map >= SAVED_CMDLINES)
+		goto out;
+
+	cmdline = saved_cmdlines[map];
+
+ out:
+	return cmdline;
+}
+
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+	if (atomic_read(&trace_record_cmdline_disabled))
+		return;
+
+	trace_save_cmdline(tsk);
+}
+
+static inline struct list_head *
+trace_next_list(struct trace_array_cpu *data, struct list_head *next)
+{
+	/*
+	 * Round-robin - but skip the head (which is not a real page):
+	 */
+	next = next->next;
+	if (unlikely(next == &data->trace_pages))
+		next = next->next;
+	BUG_ON(next == &data->trace_pages);
+
+	return next;
+}
+
+static inline void *
+trace_next_page(struct trace_array_cpu *data, void *addr)
+{
+	struct list_head *next;
+	struct page *page;
+
+	page = virt_to_page(addr);
+
+	next = trace_next_list(data, &page->lru);
+	page = list_entry(next, struct page, lru);
+
+	return page_address(page);
+}
+
+static inline struct trace_entry *
+tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	unsigned long idx, idx_next;
+	struct trace_entry *entry;
+
+	data->trace_idx++;
+	idx = data->trace_head_idx;
+	idx_next = idx + 1;
+
+	BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
+
+	entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
+
+	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
+		data->trace_head = trace_next_page(data, data->trace_head);
+		idx_next = 0;
+	}
+
+	if (data->trace_head == data->trace_tail &&
+	    idx_next == data->trace_tail_idx) {
+		/* overrun */
+		data->overrun++;
+		data->trace_tail_idx++;
+		if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+			data->trace_tail =
+				trace_next_page(data, data->trace_tail);
+			data->trace_tail_idx = 0;
+		}
+	}
+
+	data->trace_head_idx = idx_next;
+
+	return entry;
+}
+
+static inline void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
+{
+	struct task_struct *tsk = current;
+	unsigned long pc;
+
+	pc = preempt_count();
+
+	entry->preempt_count	= pc & 0xff;
+	entry->pid		= (tsk) ? tsk->pid : 0;
+	entry->t		= ftrace_now(raw_smp_processor_id());
+	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+}
+
+void
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+	       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_FN;
+	entry->fn.ip		= ip;
+	entry->fn.parent_ip	= parent_ip;
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+}
+
+void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	if (likely(!atomic_read(&data->disabled)))
+		trace_function(tr, data, ip, parent_ip, flags);
+}
+
+void __trace_stack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags,
+		   int skip)
+{
+	struct trace_entry *entry;
+	struct stack_trace trace;
+
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_STACK;
+
+	memset(&entry->stack, 0, sizeof(entry->stack));
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= FTRACE_STACK_ENTRIES;
+	trace.skip		= skip;
+	trace.entries		= entry->stack.caller;
+
+	save_stack_trace(&trace);
+}
+
+void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array_cpu *data = __data;
+	struct trace_array *tr = __tr;
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_SPECIAL;
+	entry->special.arg1	= arg1;
+	entry->special.arg2	= arg2;
+	entry->special.arg3	= arg3;
+	__trace_stack(tr, data, irq_flags, 4);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *prev,
+			   struct task_struct *next,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_CTX;
+	entry->ctx.prev_pid	= prev->pid;
+	entry->ctx.prev_prio	= prev->prio;
+	entry->ctx.prev_state	= prev->state;
+	entry->ctx.next_pid	= next->pid;
+	entry->ctx.next_prio	= next->prio;
+	entry->ctx.next_state	= next->state;
+	__trace_stack(tr, data, flags, 5);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+}
+
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *wakee,
+			   struct task_struct *curr,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_WAKE;
+	entry->ctx.prev_pid	= curr->pid;
+	entry->ctx.prev_prio	= curr->prio;
+	entry->ctx.prev_state	= curr->state;
+	entry->ctx.next_pid	= wakee->pid;
+	entry->ctx.next_prio	= wakee->prio;
+	entry->ctx.next_state	= wakee->state;
+	__trace_stack(tr, data, flags, 6);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		__trace_special(tr, data, arg1, arg2, arg3);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FTRACE
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	if (skip_trace(ip))
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+
+void tracing_start_function_trace(void)
+{
+	register_ftrace_function(&trace_ops);
+}
+
+void tracing_stop_function_trace(void)
+{
+	unregister_ftrace_function(&trace_ops);
+}
+#endif
+
+enum trace_file_type {
+	TRACE_FILE_LAT_FMT	= 1,
+};
+
+static struct trace_entry *
+trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
+		struct trace_iterator *iter, int cpu)
+{
+	struct page *page;
+	struct trace_entry *array;
+
+	if (iter->next_idx[cpu] >= tr->entries ||
+	    iter->next_idx[cpu] >= data->trace_idx ||
+	    (data->trace_head == data->trace_tail &&
+	     data->trace_head_idx == data->trace_tail_idx))
+		return NULL;
+
+	if (!iter->next_page[cpu]) {
+		/* Initialize the iterator for this cpu trace buffer */
+		WARN_ON(!data->trace_tail);
+		page = virt_to_page(data->trace_tail);
+		iter->next_page[cpu] = &page->lru;
+		iter->next_page_idx[cpu] = data->trace_tail_idx;
+	}
+
+	page = list_entry(iter->next_page[cpu], struct page, lru);
+	BUG_ON(&data->trace_pages == &page->lru);
+
+	array = page_address(page);
+
+	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
+	return &array[iter->next_page_idx[cpu]];
+}
+
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+{
+	struct trace_array *tr = iter->tr;
+	struct trace_entry *ent, *next = NULL;
+	int next_cpu = -1;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		if (!head_page(tr->data[cpu]))
+			continue;
+		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+		/*
+		 * Pick the entry with the smallest timestamp:
+		 */
+		if (ent && (!next || ent->t < next->t)) {
+			next = ent;
+			next_cpu = cpu;
+		}
+	}
+
+	if (ent_cpu)
+		*ent_cpu = next_cpu;
+
+	return next;
+}
+
+static void trace_iterator_increment(struct trace_iterator *iter)
+{
+	iter->idx++;
+	iter->next_idx[iter->cpu]++;
+	iter->next_page_idx[iter->cpu]++;
+
+	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
+		struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+		iter->next_page_idx[iter->cpu] = 0;
+		iter->next_page[iter->cpu] =
+			trace_next_list(data, iter->next_page[iter->cpu]);
+	}
+}
+
+static void trace_consume(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+	data->trace_tail_idx++;
+	if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+		data->trace_tail = trace_next_page(data, data->trace_tail);
+		data->trace_tail_idx = 0;
+	}
+
+	/* Check if we emptied it, then reset the index */
+	if (data->trace_head == data->trace_tail &&
+	    data->trace_head_idx == data->trace_tail_idx)
+		data->trace_idx = 0;
+}
+
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+	struct trace_entry *next;
+	int next_cpu = -1;
+
+	next = find_next_entry(iter, &next_cpu);
+
+	iter->prev_ent = iter->ent;
+	iter->prev_cpu = iter->cpu;
+
+	iter->ent = next;
+	iter->cpu = next_cpu;
+
+	if (next)
+		trace_iterator_increment(iter);
+
+	return next ? iter : NULL;
+}
+
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *last_ent = iter->ent;
+	int i = (int)*pos;
+	void *ent;
+
+	(*pos)++;
+
+	/* can't go backwards */
+	if (iter->idx > i)
+		return NULL;
+
+	if (iter->idx < 0)
+		ent = find_next_entry_inc(iter);
+	else
+		ent = iter;
+
+	while (ent && iter->idx < i)
+		ent = find_next_entry_inc(iter);
+
+	iter->pos = *pos;
+
+	if (last_ent && !ent)
+		seq_puts(m, "\n\nvim:ft=help\n");
+
+	return ent;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = 0;
+	int i;
+
+	mutex_lock(&trace_types_lock);
+
+	if (!current_trace || current_trace != iter->trace) {
+		mutex_unlock(&trace_types_lock);
+		return NULL;
+	}
+
+	atomic_inc(&trace_record_cmdline_disabled);
+
+	/* let the tracer grab locks here if needed */
+	if (current_trace->start)
+		current_trace->start(iter);
+
+	if (*pos != iter->pos) {
+		iter->ent = NULL;
+		iter->cpu = 0;
+		iter->idx = -1;
+		iter->prev_ent = NULL;
+		iter->prev_cpu = -1;
+
+		for_each_tracing_cpu(i) {
+			iter->next_idx[i] = 0;
+			iter->next_page[i] = NULL;
+		}
+
+		for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+			;
+
+	} else {
+		l = *pos - 1;
+		p = s_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+	struct trace_iterator *iter = m->private;
+
+	atomic_dec(&trace_record_cmdline_disabled);
+
+	/* let the tracer release locks here if needed */
+	if (current_trace && current_trace == iter->trace && iter->trace->stop)
+		iter->trace->stop(iter);
+
+	mutex_unlock(&trace_types_lock);
+}
+
+#define KRETPROBE_MSG "[unknown/kretprobe'd]"
+
+#ifdef CONFIG_KRETPROBES
+static inline int kretprobed(unsigned long addr)
+{
+	return addr == (unsigned long)kretprobe_trampoline;
+}
+#else
+static inline int kretprobed(unsigned long addr)
+{
+	return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+	return trace_seq_printf(s, fmt, str);
+#endif
+	return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+		     unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(str, address);
+	return trace_seq_printf(s, fmt, str);
+#endif
+	return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+static int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+	int ret;
+
+	if (!ip)
+		return trace_seq_printf(s, "0");
+
+	if (sym_flags & TRACE_ITER_SYM_OFFSET)
+		ret = seq_print_sym_offset(s, "%s", ip);
+	else
+		ret = seq_print_sym_short(s, "%s", ip);
+
+	if (!ret)
+		return 0;
+
+	if (sym_flags & TRACE_ITER_SYM_ADDR)
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+static void print_lat_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#                _------=> CPU#            \n");
+	seq_puts(m, "#               / _-----=> irqs-off        \n");
+	seq_puts(m, "#              | / _----=> need-resched    \n");
+	seq_puts(m, "#              || / _---=> hardirq/softirq \n");
+	seq_puts(m, "#              ||| / _--=> preempt-depth   \n");
+	seq_puts(m, "#              |||| /                      \n");
+	seq_puts(m, "#              |||||     delay             \n");
+	seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
+}
+
+static void print_func_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
+	seq_puts(m, "#              | |      |          |         |\n");
+}
+
+
+static void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_array *tr = iter->tr;
+	struct trace_array_cpu *data = tr->data[tr->cpu];
+	struct tracer *type = current_trace;
+	unsigned long total   = 0;
+	unsigned long entries = 0;
+	int cpu;
+	const char *name = "preemption";
+
+	if (type)
+		name = type->name;
+
+	for_each_tracing_cpu(cpu) {
+		if (head_page(tr->data[cpu])) {
+			total += tr->data[cpu]->trace_idx;
+			if (tr->data[cpu]->trace_idx > tr->entries)
+				entries += tr->entries;
+			else
+				entries += tr->data[cpu]->trace_idx;
+		}
+	}
+
+	seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+		   name, UTS_RELEASE);
+	seq_puts(m, "-----------------------------------"
+		 "---------------------------------\n");
+	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+		   nsecs_to_usecs(data->saved_latency),
+		   entries,
+		   total,
+		   tr->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+		   "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+		   "desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+		   "preempt",
+#else
+		   "unknown",
+#endif
+		   /* These are reserved for later use */
+		   0, 0, 0, 0);
+#ifdef CONFIG_SMP
+	seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+	seq_puts(m, ")\n");
+#endif
+	seq_puts(m, "    -----------------\n");
+	seq_printf(m, "    | task: %.16s-%d "
+		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+		   data->comm, data->pid, data->uid, data->nice,
+		   data->policy, data->rt_priority);
+	seq_puts(m, "    -----------------\n");
+
+	if (data->critical_start) {
+		seq_puts(m, " => started at: ");
+		seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+		trace_print_seq(m, &iter->seq);
+		seq_puts(m, "\n => ended at:   ");
+		seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+		trace_print_seq(m, &iter->seq);
+		seq_puts(m, "\n");
+	}
+
+	seq_puts(m, "\n");
+}
+
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	int hardirq, softirq;
+	char *comm;
+
+	comm = trace_find_cmdline(entry->pid);
+
+	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "%d", cpu);
+	trace_seq_printf(s, "%c%c",
+			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq) {
+		trace_seq_putc(s, 'H');
+	} else {
+		if (hardirq) {
+			trace_seq_putc(s, 'h');
+		} else {
+			if (softirq)
+				trace_seq_putc(s, 's');
+			else
+				trace_seq_putc(s, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		trace_seq_printf(s, "%x", entry->preempt_count);
+	else
+		trace_seq_puts(s, ".");
+}
+
+unsigned long preempt_mark_thresh = 100;
+
+static void
+lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+		    unsigned long rel_usecs)
+{
+	trace_seq_printf(s, " %4lldus", abs_usecs);
+	if (rel_usecs > preempt_mark_thresh)
+		trace_seq_puts(s, "!: ");
+	else if (rel_usecs > 1)
+		trace_seq_puts(s, "+: ");
+	else
+		trace_seq_puts(s, " : ");
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *next_entry = find_next_entry(iter, NULL);
+	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+	struct trace_entry *entry = iter->ent;
+	unsigned long abs_usecs;
+	unsigned long rel_usecs;
+	char *comm;
+	int S, T;
+	int i;
+	unsigned state;
+
+	if (!next_entry)
+		next_entry = entry;
+	rel_usecs = ns2usecs(next_entry->t - entry->t);
+	abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+
+	if (verbose) {
+		comm = trace_find_cmdline(entry->pid);
+		trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
+				 " %ld.%03ldms (+%ld.%03ldms): ",
+				 comm,
+				 entry->pid, cpu, entry->flags,
+				 entry->preempt_count, trace_idx,
+				 ns2usecs(entry->t),
+				 abs_usecs/1000,
+				 abs_usecs % 1000, rel_usecs/1000,
+				 rel_usecs % 1000);
+	} else {
+		lat_print_generic(s, entry, cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	}
+	switch (entry->type) {
+	case TRACE_FN:
+		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		trace_seq_puts(s, " (");
+		if (kretprobed(entry->fn.parent_ip))
+			trace_seq_puts(s, KRETPROBE_MSG);
+		else
+			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+		trace_seq_puts(s, ")\n");
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+
+		state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+		comm = trace_find_cmdline(entry->ctx.next_pid);
+		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S, entry->type == TRACE_CTX ? "==>" : "  +",
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio,
+				 T, comm);
+		break;
+	case TRACE_SPECIAL:
+		trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i)
+				trace_seq_puts(s, " <= ");
+			seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+		}
+		trace_seq_puts(s, "\n");
+		break;
+	default:
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	}
+	return 1;
+}
+
+static int print_trace_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *entry;
+	unsigned long usec_rem;
+	unsigned long long t;
+	unsigned long secs;
+	char *comm;
+	int ret;
+	int S, T;
+	int i;
+
+	entry = iter->ent;
+
+	comm = trace_find_cmdline(iter->ent->pid);
+
+	t = ns2usecs(entry->t);
+	usec_rem = do_div(t, 1000000ULL);
+	secs = (unsigned long)t;
+
+	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		if (!ret)
+			return 0;
+		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+						entry->fn.parent_ip) {
+			ret = trace_seq_printf(s, " <-");
+			if (!ret)
+				return 0;
+			if (kretprobed(entry->fn.parent_ip))
+				ret = trace_seq_puts(s, KRETPROBE_MSG);
+			else
+				ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+						       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->type == TRACE_CTX ? "==>" : "  +",
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio,
+				       T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i) {
+				ret = trace_seq_puts(s, " <= ");
+				if (!ret)
+					return 0;
+			}
+			ret = seq_print_ip_sym(s, entry->stack.caller[i],
+					       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_puts(s, "\n");
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
+static int print_raw_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+	int ret;
+	int S, T;
+
+	entry = iter->ent;
+
+	ret = trace_seq_printf(s, "%d %d %llu ",
+		entry->pid, iter->cpu, entry->t);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = trace_seq_printf(s, "%x %x\n",
+					entry->fn.ip, entry->fn.parent_ip);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
+		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio,
+				       T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+static int print_hex_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned char newline = '\n';
+	struct trace_entry *entry;
+	int S, T;
+
+	entry = iter->ent;
+
+	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, S);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		SEQ_PUT_HEX_FIELD_RET(s, T);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	SEQ_PUT_FIELD_RET(s, newline);
+
+	return 1;
+}
+
+static int print_bin_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+
+	entry = iter->ent;
+
+	SEQ_PUT_FIELD_RET(s, entry->pid);
+	SEQ_PUT_FIELD_RET(s, entry->cpu);
+	SEQ_PUT_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	return 1;
+}
+
+static int trace_empty(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (head_page(data) && data->trace_idx &&
+		    (data->trace_tail != data->trace_head ||
+		     data->trace_tail_idx != data->trace_head_idx))
+			return 0;
+	}
+	return 1;
+}
+
+static int print_trace_line(struct trace_iterator *iter)
+{
+	if (iter->trace && iter->trace->print_line)
+		return iter->trace->print_line(iter);
+
+	if (trace_flags & TRACE_ITER_BIN)
+		return print_bin_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_HEX)
+		return print_hex_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_RAW)
+		return print_raw_fmt(iter);
+
+	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+		return print_lat_fmt(iter, iter->idx, iter->cpu);
+
+	return print_trace_fmt(iter);
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+	struct trace_iterator *iter = v;
+
+	if (iter->ent == NULL) {
+		if (iter->tr) {
+			seq_printf(m, "# tracer: %s\n", iter->trace->name);
+			seq_puts(m, "#\n");
+		}
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			/* print nothing if the buffers are empty */
+			if (trace_empty(iter))
+				return 0;
+			print_trace_header(m, iter);
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_lat_help_header(m);
+		} else {
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_func_help_header(m);
+		}
+	} else {
+		print_trace_line(iter);
+		trace_print_seq(m, &iter->seq);
+	}
+
+	return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {
+	.start		= s_start,
+	.next		= s_next,
+	.stop		= s_stop,
+	.show		= s_show,
+};
+
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled) {
+		*ret = -ENODEV;
+		return NULL;
+	}
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter) {
+		*ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace && current_trace->print_max)
+		iter->tr = &max_tr;
+	else
+		iter->tr = inode->i_private;
+	iter->trace = current_trace;
+	iter->pos = -1;
+
+	/* TODO stop tracer */
+	*ret = seq_open(file, &tracer_seq_ops);
+	if (!*ret) {
+		struct seq_file *m = file->private_data;
+		m->private = iter;
+
+		/* stop the trace while dumping */
+		if (iter->tr->ctrl)
+			tracer_enabled = 0;
+
+		if (iter->trace && iter->trace->open)
+			iter->trace->open(iter);
+	} else {
+		kfree(iter);
+		iter = NULL;
+	}
+	mutex_unlock(&trace_types_lock);
+
+ out:
+	return iter;
+}
+
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+	if (tracing_disabled)
+		return -ENODEV;
+
+	filp->private_data = inode->i_private;
+	return 0;
+}
+
+int tracing_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct trace_iterator *iter = m->private;
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace && iter->trace->close)
+		iter->trace->close(iter);
+
+	/* reenable tracing if it was previously enabled */
+	if (iter->tr->ctrl)
+		tracer_enabled = 1;
+	mutex_unlock(&trace_types_lock);
+
+	seq_release(inode, file);
+	kfree(iter);
+	return 0;
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	__tracing_open(inode, file, &ret);
+
+	return ret;
+}
+
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter;
+	int ret;
+
+	iter = __tracing_open(inode, file, &ret);
+
+	if (!ret)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+	return ret;
+}
+
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct tracer *t = m->private;
+
+	(*pos)++;
+
+	if (t)
+		t = t->next;
+
+	m->private = t;
+
+	return t;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct tracer *t = m->private;
+	loff_t l = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&trace_types_lock);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct tracer *t = v;
+
+	if (!t)
+		return 0;
+
+	seq_printf(m, "%s", t->name);
+	if (t->next)
+		seq_putc(m, ' ');
+	else
+		seq_putc(m, '\n');
+
+	return 0;
+}
+
+static struct seq_operations show_traces_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	ret = seq_open(file, &show_traces_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = trace_types;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_fops = {
+	.open		= tracing_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
+};
+
+static struct file_operations tracing_lt_fops = {
+	.open		= tracing_lt_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
+};
+
+static struct file_operations show_traces_fops = {
+	.open		= show_traces_open,
+	.read		= seq_read,
+	.release	= seq_release,
+};
+
+/*
+ * Only trace on a CPU if the bitmask is set:
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+
+/*
+ * When tracing/tracing_cpumask is modified, this holds
+ * the new bitmask we are about to install:
+ */
+static cpumask_t tracing_cpumask_new;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (and one more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	int len;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+
+	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	if (count - len < 2) {
+		count = -EINVAL;
+		goto out_err;
+	}
+	len += sprintf(mask_str + len, "\n");
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+		      size_t count, loff_t *ppos)
+{
+	int err, cpu;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+	if (err)
+		goto err_unlock;
+
+	raw_local_irq_disable();
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(cpu) {
+		/*
+		 * Increase/decrease the disabled counter if we are
+		 * about to flip a bit in the cpumask:
+		 */
+		if (cpu_isset(cpu, tracing_cpumask) &&
+				!cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_inc(&global_trace.data[cpu]->disabled);
+		}
+		if (!cpu_isset(cpu, tracing_cpumask) &&
+				cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_enable();
+
+	tracing_cpumask = tracing_cpumask_new;
+
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+
+err_unlock:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return err;
+}
+
+static struct file_operations tracing_cpumask_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_cpumask_read,
+	.write		= tracing_cpumask_write,
+};
+
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char *buf;
+	int r = 0;
+	int len = 0;
+	int i;
+
+	/* calculate max size */
+	for (i = 0; trace_options[i]; i++) {
+		len += strlen(trace_options[i]);
+		len += 3; /* "no" and space */
+	}
+
+	/* +2 for \n and \0 */
+	buf = kmalloc(len + 2, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	for (i = 0; trace_options[i]; i++) {
+		if (trace_flags & (1 << i))
+			r += sprintf(buf + r, "%s ", trace_options[i]);
+		else
+			r += sprintf(buf + r, "no%s ", trace_options[i]);
+	}
+
+	r += sprintf(buf + r, "\n");
+	WARN_ON(r >= len + 2);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+	kfree(buf);
+
+	return r;
+}
+
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	char *cmp = buf;
+	int neg = 0;
+	int i;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (strncmp(buf, "no", 2) == 0) {
+		neg = 1;
+		cmp += 2;
+	}
+
+	for (i = 0; trace_options[i]; i++) {
+		int len = strlen(trace_options[i]);
+
+		if (strncmp(cmp, trace_options[i], len) == 0) {
+			if (neg)
+				trace_flags &= ~(1 << i);
+			else
+				trace_flags |= (1 << i);
+			break;
+		}
+	}
+	/*
+	 * If no option could be set, return an error:
+	 */
+	if (!trace_options[i])
+		return -EINVAL;
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_iter_ctrl_read,
+	.write		= tracing_iter_ctrl_write,
+};
+
+static const char readme_msg[] =
+	"tracing mini-HOWTO:\n\n"
+	"# mkdir /debug\n"
+	"# mount -t debugfs nodev /debug\n\n"
+	"# cat /debug/tracing/available_tracers\n"
+	"wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"none\n"
+	"# echo sched_switch > /debug/tracing/current_tracer\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"sched_switch\n"
+	"# cat /debug/tracing/iter_ctrl\n"
+	"noprint-parent nosym-offset nosym-addr noverbose\n"
+	"# echo print-parent > /debug/tracing/iter_ctrl\n"
+	"# echo 1 > /debug/tracing/tracing_enabled\n"
+	"# cat /debug/tracing/trace > /tmp/trace.txt\n"
+	"echo 0 > /debug/tracing/tracing_enabled\n"
+;
+
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+					readme_msg, strlen(readme_msg));
+}
+
+static struct file_operations tracing_readme_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_readme_read,
+};
+
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", tr->ctrl);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&trace_types_lock);
+	if (tr->ctrl ^ val) {
+		if (val)
+			tracer_enabled = 1;
+		else
+			tracer_enabled = 0;
+
+		tr->ctrl = val;
+
+		if (current_trace && current_trace->ctrl_update)
+			current_trace->ctrl_update(tr);
+	}
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char buf[max_tracer_type_len+2];
+	int r;
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace)
+		r = sprintf(buf, "%s\n", current_trace->name);
+	else
+		r = sprintf(buf, "\n");
+	mutex_unlock(&trace_types_lock);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = &global_trace;
+	struct tracer *t;
+	char buf[max_tracer_type_len+1];
+	int i;
+
+	if (cnt > max_tracer_type_len)
+		cnt = max_tracer_type_len;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	/* strip ending whitespace. */
+	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+		buf[i] = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(t->name, buf) == 0)
+			break;
+	}
+	if (!t || t == current_trace)
+		goto out;
+
+	if (current_trace && current_trace->reset)
+		current_trace->reset(tr);
+
+	current_trace = t;
+	if (t->init)
+		t->init(tr);
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long *ptr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = snprintf(buf, sizeof(buf), "%ld\n",
+		     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
+	if (r > sizeof(buf))
+		r = sizeof(buf);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	long *ptr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	*ptr = val * 1000;
+
+	return cnt;
+}
+
+static atomic_t tracing_reader;
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	/* We only allow one reader of the pipe */
+	if (atomic_inc_return(&tracing_reader) != 1) {
+		atomic_dec(&tracing_reader);
+		return -EBUSY;
+	}
+
+	/* create a buffer to store the information to pass to userspace */
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&trace_types_lock);
+	iter->tr = &global_trace;
+	iter->trace = current_trace;
+	filp->private_data = iter;
+
+	if (iter->trace->pipe_open)
+		iter->trace->pipe_open(iter);
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter = file->private_data;
+
+	kfree(iter);
+	atomic_dec(&tracing_reader);
+
+	return 0;
+}
+
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	if (trace_flags & TRACE_ITER_BLOCK) {
+		/*
+		 * Always select as readable when in blocking mode
+		 */
+		return POLLIN | POLLRDNORM;
+	} else {
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+		poll_wait(filp, &trace_wait, poll_table);
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+
+		return 0;
+	}
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	struct trace_array_cpu *data;
+	static cpumask_t mask;
+	unsigned long flags;
+#ifdef CONFIG_FTRACE
+	int ftrace_save;
+#endif
+	int cpu;
+	ssize_t sret;
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+	sret = 0;
+
+	trace_seq_reset(&iter->seq);
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
+			goto out;
+	}
+
+	while (trace_empty(iter)) {
+
+		if ((filp->f_flags & O_NONBLOCK)) {
+			sret = -EAGAIN;
+			goto out;
+		}
+
+		/*
+		 * This is a make-shift waitqueue. The reason we don't use
+		 * an actual wait queue is because:
+		 *  1) we only ever have one waiter
+		 *  2) the tracer traces all functions; we don't want
+		 *     the overhead of calling wake_up and friends
+		 *     (and tracing them too)
+		 *     Anyway, this is really a very primitive wakeup.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		iter->tr->waiter = current;
+
+		mutex_unlock(&trace_types_lock);
+
+		/* sleep for 100 msecs, and try again. */
+		schedule_timeout(HZ/10);
+
+		mutex_lock(&trace_types_lock);
+
+		iter->tr->waiter = NULL;
+
+		if (signal_pending(current)) {
+			sret = -EINTR;
+			goto out;
+		}
+
+		if (iter->trace != current_trace)
+			goto out;
+
+		/*
+		 * We block until we read something and tracing is disabled.
+		 * We still block if tracing is disabled, but we have never
+		 * read anything. This allows a user to cat this file, and
+		 * then enable tracing. But after we have read something,
+		 * we give an EOF when tracing is again disabled.
+		 *
+		 * iter->pos will be 0 if we haven't read anything.
+		 */
+		if (!tracer_enabled && iter->pos)
+			break;
+
+		continue;
+	}
+
+	/* stop when tracing is finished */
+	if (trace_empty(iter))
+		goto out;
+
+	if (cnt >= PAGE_SIZE)
+		cnt = PAGE_SIZE - 1;
+
+	/* reset all but tr, trace, and overruns */
+	memset(&iter->seq, 0,
+	       sizeof(struct trace_iterator) -
+	       offsetof(struct trace_iterator, seq));
+	iter->pos = -1;
+
+	/*
+	 * We need to stop all tracing on all CPUs to read
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill in all that we can read,
+	 * and then release the locks again.
+	 */
+
+	cpus_clear(mask);
+	local_irq_save(flags);
+#ifdef CONFIG_FTRACE
+	ftrace_save = ftrace_enabled;
+	ftrace_enabled = 0;
+#endif
+	smp_wmb();
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (!head_page(data) || !data->trace_idx)
+			continue;
+
+		atomic_inc(&data->disabled);
+		cpu_set(cpu, mask);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_lock(&data->lock);
+
+		if (data->overrun > iter->last_overrun[cpu])
+			iter->overrun[cpu] +=
+				data->overrun - iter->last_overrun[cpu];
+		iter->last_overrun[cpu] = data->overrun;
+	}
+
+	while (find_next_entry_inc(iter) != NULL) {
+		int ret;
+		int len = iter->seq.len;
+
+		ret = print_trace_line(iter);
+		if (!ret) {
+			/* don't print partial lines */
+			iter->seq.len = len;
+			break;
+		}
+
+		trace_consume(iter);
+
+		if (iter->seq.len >= cnt)
+			break;
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_unlock(&data->lock);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		atomic_dec(&data->disabled);
+	}
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = ftrace_save;
+#endif
+	local_irq_restore(flags);
+
+	/* Now copy what we have to the user */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (iter->seq.readpos >= iter->seq.len)
+		trace_seq_reset(&iter->seq);
+	if (sret == -EBUSY)
+		sret = 0;
+
+out:
+	mutex_unlock(&trace_types_lock);
+
+	return sret;
+}
+
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%lu\n", tr->entries);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int i, ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	/* must have at least 1 entry */
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&trace_types_lock);
+
+	if (current_trace != &no_tracer) {
+		cnt = -EBUSY;
+		pr_info("ftrace: set current_tracer to none"
+			" before modifying buffer size\n");
+		goto out;
+	}
+
+	if (val > global_trace.entries) {
+		long pages_requested;
+		unsigned long freeable_pages;
+
+		/* make sure we have enough memory before mapping */
+		pages_requested =
+			(val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+
+		/* account for each buffer (and max_tr) */
+		pages_requested *= tracing_nr_buffers * 2;
+
+		/* Check for overflow */
+		if (pages_requested < 0) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		freeable_pages = determine_dirtyable_memory();
+
+		/* we only allow requesting 1/4 of usable memory */
+		if (pages_requested >
+		    ((freeable_pages + tracing_pages_allocated) / 4)) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		while (global_trace.entries < val) {
+			if (trace_alloc_page()) {
+				cnt = -ENOMEM;
+				goto out;
+			}
+			/* double check that we don't go over the known pages */
+			if (tracing_pages_allocated > pages_requested)
+				break;
+		}
+
+	} else {
+		/* include the number of entries in val (inc of page entries) */
+		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+			trace_free_page();
+	}
+
+	/* check integrity */
+	for_each_tracing_cpu(i)
+		check_pages(global_trace.data[i]);
+
+	filp->f_pos += cnt;
+
+	/* If check pages failed, return ENOMEM */
+	if (tracing_disabled)
+		cnt = -ENOMEM;
+ out:
+	max_tr.entries = global_trace.entries;
+	mutex_unlock(&trace_types_lock);
+
+	return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_max_lat_read,
+	.write		= tracing_max_lat_write,
+};
+
+static struct file_operations tracing_ctrl_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_ctrl_read,
+	.write		= tracing_ctrl_write,
+};
+
+static struct file_operations set_tracer_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_set_trace_read,
+	.write		= tracing_set_trace_write,
+};
+
+static struct file_operations tracing_pipe_fops = {
+	.open		= tracing_open_pipe,
+	.poll		= tracing_poll_pipe,
+	.read		= tracing_read_pipe,
+	.release	= tracing_release_pipe,
+};
+
+static struct file_operations tracing_entries_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_entries_read,
+	.write		= tracing_entries_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	unsigned long *p = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", *p);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static struct file_operations tracing_read_long_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_read_long,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+struct dentry *tracing_init_dentry(void)
+{
+	static int once;
+
+	if (d_tracer)
+		return d_tracer;
+
+	d_tracer = debugfs_create_dir("tracing", NULL);
+
+	if (!d_tracer && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'tracing'\n");
+		return NULL;
+	}
+
+	return d_tracer;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+static __init void tracer_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+				    &global_trace, &tracing_ctrl_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+	entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+				    NULL, &tracing_iter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+				    NULL, &tracing_cpumask_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
+	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+				    &global_trace, &tracing_lt_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+	entry = debugfs_create_file("trace", 0444, d_tracer,
+				    &global_trace, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+				    &global_trace, &show_traces_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+				    &global_trace, &set_tracer_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+	entry = debugfs_create_file("README", 0644, d_tracer,
+				    NULL, &tracing_readme_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'README' entry\n");
+
+	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+				    NULL, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+				    &global_trace, &tracing_entries_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+				    &ftrace_update_tot_cnt,
+				    &tracing_read_long_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'dyn_ftrace_total_info' entry\n");
+#endif
+}
+
+static int trace_alloc_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page, *tmp;
+	LIST_HEAD(pages);
+	void *array;
+	unsigned pages_allocated = 0;
+	int i;
+
+	/* first allocate a page for each CPU */
+	for_each_tracing_cpu(i) {
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+#endif
+	}
+
+	/* Now that we successfully allocated a page per CPU, add them */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+	tracing_pages_allocated += pages_allocated;
+	global_trace.entries += ENTRIES_PER_PAGE;
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		list_del_init(&page->lru);
+		__free_page(page);
+	}
+	return -ENOMEM;
+}
+
+static int trace_free_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page;
+	struct list_head *p;
+	int i;
+	int ret = 0;
+
+	/* free one page from each buffer */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		tracing_pages_allocated--;
+		tracing_pages_allocated--;
+		__free_page(page);
+
+		tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+#endif
+	}
+	global_trace.entries -= ENTRIES_PER_PAGE;
+
+	return ret;
+}
+
+__init static int tracer_alloc_buffers(void)
+{
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page;
+	int pages = 0;
+	int ret = -ENOMEM;
+	int i;
+
+	/* TODO: make the number of buffers hot pluggable with CPUS */
+	tracing_nr_buffers = num_possible_cpus();
+	tracing_buffer_mask = cpu_possible_map;
+
+	/* Allocate the first page for all buffers */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		/* set the array to the list */
+		INIT_LIST_HEAD(&data->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &data->trace_pages);
+		/* use the LRU flag to differentiate the two buffers */
+		ClearPageLRU(page);
+
+		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &max_tr.data[i]->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+
+	/*
+	 * Since we allocate by orders of pages, we may be able to
+	 * round up a bit.
+	 */
+	global_trace.entries = ENTRIES_PER_PAGE;
+	pages++;
+
+	while (global_trace.entries < trace_nr_entries) {
+		if (trace_alloc_page())
+			break;
+		pages++;
+	}
+	max_tr.entries = global_trace.entries;
+
+	pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
+		pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
+	pr_info("   actual entries %ld\n", global_trace.entries);
+
+	tracer_init_debugfs();
+
+	trace_init_cmdlines();
+
+	register_tracer(&no_tracer);
+	current_trace = &no_tracer;
+
+	/* All seems OK, enable tracing */
+	global_trace.ctrl = tracer_enabled;
+	tracing_disabled = 0;
+
+	return 0;
+
+ free_buffers:
+	for (i-- ; i >= 0; i--) {
+		struct page *page, *tmp;
+		struct trace_array_cpu *data = global_trace.data[i];
+
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+#endif
+	}
+	return ret;
+}
+fs_initcall(tracer_alloc_buffers);

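The ftrace_special() helper added above gives arbitrary kernel code a way to drop free-form markers into the ring buffer; they are stored as TRACE_SPECIAL entries and rendered by the formatters above as "# arg1 arg2 arg3" lines. A minimal sketch of a caller follows — the function and file names are hypothetical, and it assumes ftrace_special() is declared in a header the caller can see (the merge adds include/linux/ftrace.h, which is the natural place):

/* Hypothetical caller, not part of this patch. */
#include <linux/ftrace.h>

static void my_driver_complete_request(unsigned long reqno, unsigned long bytes)
{
	/*
	 * Records a TRACE_SPECIAL entry; whenever a tracer other than
	 * "none" is active (and tracing is enabled), it appears in the
	 * trace output as "# <reqno> <bytes> 0".
	 */
	ftrace_special(reqno, bytes, 0);
}
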
+ 313 - 0
kernel/trace/trace.h

@@ -0,0 +1,313 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+
+enum trace_type {
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_CTX,
+	TRACE_WAKE,
+	TRACE_STACK,
+	TRACE_SPECIAL,
+
+	__TRACE_LAST_TYPE
+};
+
+/*
+ * Function trace entry - function address and parent function address:
+ */
+struct ftrace_entry {
+	unsigned long		ip;
+	unsigned long		parent_ip;
+};
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ */
+struct ctx_switch_entry {
+	unsigned int		prev_pid;
+	unsigned char		prev_prio;
+	unsigned char		prev_state;
+	unsigned int		next_pid;
+	unsigned char		next_prio;
+	unsigned char		next_state;
+};
+
+/*
+ * Special (free-form) trace entry:
+ */
+struct special_entry {
+	unsigned long		arg1;
+	unsigned long		arg2;
+	unsigned long		arg3;
+};
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES	8
+
+struct stack_entry {
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	char			type;
+	char			cpu;
+	char			flags;
+	char			preempt_count;
+	int			pid;
+	cycle_t			t;
+	union {
+		struct ftrace_entry		fn;
+		struct ctx_switch_entry		ctx;
+		struct special_entry		special;
+		struct stack_entry		stack;
+	};
+};
+
+#define TRACE_ENTRY_SIZE	sizeof(struct trace_entry)
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ */
+struct trace_array_cpu {
+	struct list_head	trace_pages;
+	atomic_t		disabled;
+	raw_spinlock_t		lock;
+	struct lock_class_key	lock_key;
+
+	/* these fields get copied into max-trace: */
+	unsigned		trace_head_idx;
+	unsigned		trace_tail_idx;
+	void			*trace_head; /* producer */
+	void			*trace_tail; /* consumer */
+	unsigned long		trace_idx;
+	unsigned long		overrun;
+	unsigned long		saved_latency;
+	unsigned long		critical_start;
+	unsigned long		critical_end;
+	unsigned long		critical_sequence;
+	unsigned long		nice;
+	unsigned long		policy;
+	unsigned long		rt_priority;
+	cycle_t			preempt_timestamp;
+	pid_t			pid;
+	uid_t			uid;
+	char			comm[TASK_COMM_LEN];
+};
+
+struct trace_iterator;
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ */
+struct trace_array {
+	unsigned long		entries;
+	long			ctrl;
+	int			cpu;
+	cycle_t			time_start;
+	struct task_struct	*waiter;
+	struct trace_array_cpu	*data[NR_CPUS];
+};
+
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ */
+struct tracer {
+	const char		*name;
+	void			(*init)(struct trace_array *tr);
+	void			(*reset)(struct trace_array *tr);
+	void			(*open)(struct trace_iterator *iter);
+	void			(*pipe_open)(struct trace_iterator *iter);
+	void			(*close)(struct trace_iterator *iter);
+	void			(*start)(struct trace_iterator *iter);
+	void			(*stop)(struct trace_iterator *iter);
+	ssize_t			(*read)(struct trace_iterator *iter,
+					struct file *filp, char __user *ubuf,
+					size_t cnt, loff_t *ppos);
+	void			(*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	int			(*selftest)(struct tracer *trace,
+					    struct trace_array *tr);
+#endif
+	int			(*print_line)(struct trace_iterator *iter);
+	struct tracer		*next;
+	int			print_max;
+};
+
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+	unsigned int		readpos;
+};
+
+/*
+ * Trace iterator - used by printout routines that present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	void			*private;
+	long			last_overrun[NR_CPUS];
+	long			overrun[NR_CPUS];
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	int			cpu;
+
+	struct trace_entry	*prev_ent;
+	int			prev_cpu;
+
+	unsigned long		iter_flags;
+	loff_t			pos;
+	unsigned long		next_idx[NR_CPUS];
+	struct list_head	*next_page[NR_CPUS];
+	unsigned		next_page_idx[NR_CPUS];
+	long			idx;
+};
+
+void tracing_reset(struct trace_array_cpu *data);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void ftrace(struct trace_array *tr,
+			    struct trace_array_cpu *data,
+			    unsigned long ip,
+			    unsigned long parent_ip,
+			    unsigned long flags);
+void tracing_sched_switch_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned long flags);
+void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *wakee,
+				struct task_struct *cur,
+				unsigned long flags);
+void trace_special(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long arg1,
+		   unsigned long arg2,
+		   unsigned long arg3);
+void trace_function(struct trace_array *tr,
+		    struct trace_array_cpu *data,
+		    unsigned long ip,
+		    unsigned long parent_ip,
+		    unsigned long flags);
+
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+			  struct task_struct *tsk, int cpu);
+
+extern cycle_t ftrace_now(int cpu);
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+typedef void
+(*tracer_switch_func_t)(void *private,
+			void *__rq,
+			struct task_struct *prev,
+			struct task_struct *next);
+
+struct tracer_switch_ops {
+	tracer_switch_func_t		func;
+	void				*private;
+	struct tracer_switch_ops	*next;
+};
+
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#endif
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+#ifdef CONFIG_FTRACE
+extern int trace_selftest_startup_function(struct tracer *trace,
+					   struct trace_array *tr);
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+					  struct trace_array *tr);
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+					     struct trace_array *tr);
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+						 struct trace_array *tr);
+#endif
+#ifdef CONFIG_SCHED_TRACER
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+					 struct trace_array *tr);
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+extern long ns2usecs(cycle_t nsec);
+
+extern unsigned long trace_flags;
+
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ *       trace.c.
+ */
+enum trace_iterator_flags {
+	TRACE_ITER_PRINT_PARENT		= 0x01,
+	TRACE_ITER_SYM_OFFSET		= 0x02,
+	TRACE_ITER_SYM_ADDR		= 0x04,
+	TRACE_ITER_VERBOSE		= 0x08,
+	TRACE_ITER_RAW			= 0x10,
+	TRACE_ITER_HEX			= 0x20,
+	TRACE_ITER_BIN			= 0x40,
+	TRACE_ITER_BLOCK		= 0x80,
+	TRACE_ITER_STACKTRACE		= 0x100,
+	TRACE_ITER_SCHED_TREE		= 0x200,
+};
+
+#endif /* _LINUX_KERNEL_TRACE_H */
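
The TRACE_ITER_* bits above are tested against the global trace_flags to switch output features on and off. The fragment below is only a sketch of that pattern built from the declarations in this header; the helper name seq_print_parent is invented for illustration.

    /* Hypothetical helper: emits the parent IP only when the
     * TRACE_ITER_PRINT_PARENT bit is set in trace_flags. */
    static void seq_print_parent(struct trace_seq *s, unsigned long parent_ip)
    {
            if (trace_flags & TRACE_ITER_PRINT_PARENT)
                    trace_seq_printf(s, " <-0x%lx", parent_ip);
    }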

+ 78 - 0
kernel/trace/trace_functions.c

@@ -0,0 +1,78 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static void function_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void start_function_trace(struct trace_array *tr)
+{
+	function_reset(tr);
+	tracing_start_cmdline_record();
+	tracing_start_function_trace();
+}
+
+static void stop_function_trace(struct trace_array *tr)
+{
+	tracing_stop_function_trace();
+	tracing_stop_cmdline_record();
+}
+
+static void function_trace_init(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+}
+
+static void function_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_function_trace(tr);
+}
+
+static void function_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+	else
+		stop_function_trace(tr);
+}
+
+static struct tracer function_trace __read_mostly =
+{
+	.name	     = "ftrace",
+	.init	     = function_trace_init,
+	.reset	     = function_trace_reset,
+	.ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_function,
+#endif
+};
+
+static __init int init_function_trace(void)
+{
+	return register_tracer(&function_trace);
+}
+
+device_initcall(init_function_trace);
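
trace_functions.c is effectively the template for a tracer plugin: implement init/reset/ctrl_update keyed off tr->ctrl, describe them in a struct tracer, and register it from an initcall. The fragment below sketches a do-nothing plugin against that same interface; every noop_example_* name is invented.

    /* Invented example plugin: registers against the interface declared
     * in trace.h but records nothing; it only shows the required hooks. */
    static void noop_example_init(struct trace_array *tr) { }
    static void noop_example_reset(struct trace_array *tr) { }
    static void noop_example_ctrl_update(struct trace_array *tr) { }

    static struct tracer noop_example_tracer __read_mostly =
    {
            .name        = "noop_example",
            .init        = noop_example_init,
            .reset       = noop_example_reset,
            .ctrl_update = noop_example_ctrl_update,
    };

    static __init int init_noop_example_tracer(void)
    {
            return register_tracer(&noop_example_tracer);
    }
    device_initcall(init_noop_example_tracer);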

+ 486 - 0
kernel/trace/trace_irqsoff.c

@@ -0,0 +1,486 @@
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static struct trace_array		*irqsoff_trace __read_mostly;
+static int				tracer_enabled __read_mostly;
+
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+static DEFINE_SPINLOCK(max_trace_lock);
+
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp	unsigned long max_sequence;
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	/*
+	 * Does not matter if we preempt. We test the flags
+	 * afterward, to see if irqs are disabled or not.
+	 * If we preempt and get a false positive, the flags
+	 * test will fail.
+	 */
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
+		return;
+
+	local_save_flags(flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(flags))
+		return;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void
+check_critical_timing(struct trace_array *tr,
+		      struct trace_array_cpu *data,
+		      unsigned long parent_ip,
+		      int cpu)
+{
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	spin_lock_irqsave(&max_trace_lock, flags);
+
+	/* check if we are still the max latency */
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+
+	latency = nsecs_to_usecs(delta);
+
+	if (data->critical_sequence != max_sequence)
+		goto out_unlock;
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	data->critical_end = parent_ip;
+
+	update_max_tr_single(tr, current, cpu);
+
+	max_sequence++;
+
+out_unlock:
+	spin_unlock_irqrestore(&max_trace_lock, flags);
+
+out:
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	tracing_reset(data);
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+
+	if (per_cpu(tracing_cpu, cpu))
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	data->critical_start = parent_ip ? : ip;
+	tracing_reset(data);
+
+	local_save_flags(flags);
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+	per_cpu(tracing_cpu, cpu) = 1;
+
+	atomic_dec(&data->disabled);
+}
+
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	cpu = raw_smp_processor_id();
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(per_cpu(tracing_cpu, cpu)))
+		per_cpu(tracing_cpu, cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!head_page(data)) ||
+	    !data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	local_save_flags(flags);
+	trace_function(tr, data, ip, parent_ip, flags);
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+	data->critical_start = 0;
+	atomic_dec(&data->disabled);
+}
+
+/* start and stop critical timings, used to exclude periods such as idle from the measurement */
+void start_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+void stop_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(a0, a1);
+}
+
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void trace_hardirqs_on(void)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+	register_ftrace_function(&trace_ops);
+	tracer_enabled = 1;
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+}
+
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+	irqsoff_trace = tr;
+	/* make sure that the tracer is visible */
+	smp_wmb();
+
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+	else
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_irqsoff_tracer(iter->tr);
+}
+
+static void irqsoff_tracer_close(struct trace_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		start_irqsoff_tracer(iter->tr);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+static struct tracer irqsoff_tracer __read_mostly =
+{
+	.name		= "irqsoff",
+	.init		= irqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_irqsoff,
+#endif
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptoff,
+#endif
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+	defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+	.name		= "preemptirqsoff",
+	.init		= preemptirqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptirqsoff,
+#endif
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+
+__init static int init_irqsoff_tracer(void)
+{
+	register_irqsoff(irqsoff_tracer);
+	register_preemptoff(preemptoff_tracer);
+	register_preemptirqsoff(preemptirqsoff_tracer);
+
+	return 0;
+}
+device_initcall(init_irqsoff_tracer);
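
The CALLER_ADDR0/CALLER_ADDR1 values used by the irqsoff hooks are taken here to be the usual GCC return-address lookups. The definitions below show that common form only as an assumption; the authoritative macros live in the ftrace header.

    /* Assumed (typical) form of the caller-address macros used above;
     * check the ftrace header for the authoritative definitions. */
    #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
    #define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))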

+ 286 - 0
kernel/trace/trace_sched_switch.c

@@ -0,0 +1,286 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*ctx_trace;
+static int __read_mostly	tracer_enabled;
+static atomic_t			sched_ref;
+
+static void
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+			struct task_struct *next)
+{
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	tracing_record_cmdline(prev);
+	tracing_record_cmdline(next);
+
+	if (!tracer_enabled)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	if (!atomic_read(&sched_ref))
+		return;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	sched_switch_func(probe_data, __rq, prev, next);
+}
+
+static void
+wakeup_func(void *private, void *__rq, struct task_struct *wakee,
+			struct task_struct *curr)
+{
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	tracing_record_cmdline(curr);
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
+{
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
+
+	wakeup_func(probe_data, __rq, task, curr);
+}
+
+static void sched_switch_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static int tracing_sched_register(void)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return ret;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&ctx_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	return ret;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+	return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+}
+
+static void tracing_start_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_inc_return(&sched_ref);
+	if (ref == 1)
+		tracing_sched_register();
+}
+
+static void tracing_stop_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&sched_ref);
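+	/* atomic_dec_and_test() returns true only when the count reaches zero */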
+	if (ref)
+		tracing_sched_unregister();
+}
+
+void tracing_start_cmdline_record(void)
+{
+	tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+	tracing_stop_sched_switch();
+}
+
+static void start_sched_trace(struct trace_array *tr)
+{
+	sched_switch_reset(tr);
+	tracer_enabled = 1;
+	tracing_start_cmdline_record();
+}
+
+static void stop_sched_trace(struct trace_array *tr)
+{
+	tracing_stop_cmdline_record();
+	tracer_enabled = 0;
+}
+
+static void sched_switch_trace_init(struct trace_array *tr)
+{
+	ctx_trace = tr;
+
+	if (tr->ctrl)
+		start_sched_trace(tr);
+}
+
+static void sched_switch_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_sched_trace(tr);
+}
+
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_sched_trace(tr);
+	else
+		stop_sched_trace(tr);
+}
+
+static struct tracer sched_switch_trace __read_mostly =
+{
+	.name		= "sched_switch",
+	.init		= sched_switch_trace_init,
+	.reset		= sched_switch_trace_reset,
+	.ctrl_update	= sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_sched_switch,
+#endif
+};
+
+__init static int init_sched_switch_trace(void)
+{
+	int ret = 0;
+
+	if (atomic_read(&sched_ref))
+		ret = tracing_sched_register();
+	if (ret) {
+		pr_info("error registering scheduler trace\n");
+		return ret;
+	}
+	return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
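
The markers probed above are a generic hook point, so other code can attach additional probes with the same format strings. The fragment below sketches an extra probe that merely counts context switches; the count_switches_* names are invented, and the registration mirrors the pattern used by this tracer.

    /* Invented extra consumer of the kernel_sched_schedule marker,
     * mirroring the registration pattern used by the tracer above
     * (assumes the same includes as trace_sched_switch.c). */
    static atomic_t switch_count = ATOMIC_INIT(0);

    static notrace void count_switches_probe(void *probe_data, void *call_data,
                                             const char *format, va_list *args)
    {
            atomic_inc(&switch_count);
    }

    static int __init count_switches_init(void)
    {
            return marker_probe_register("kernel_sched_schedule",
                            "prev_pid %d next_pid %d prev_state %ld "
                            "## rq %p prev %p next %p",
                            count_switches_probe, NULL);
    }
    device_initcall(count_switches_init);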

+ 447 - 0
kernel/trace/trace_sched_wakeup.c

@@ -0,0 +1,447 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/marker.h>
+
+#include "trace.h"
+
+static struct trace_array	*wakeup_trace;
+static int __read_mostly	tracer_enabled;
+
+static struct task_struct	*wakeup_task;
+static int			wakeup_cpu;
+static unsigned			wakeup_prio = -1;
+
+static DEFINE_SPINLOCK(wakeup_lock);
+
+static void __wakeup_reset(struct trace_array *tr);
+
+#ifdef CONFIG_FTRACE
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+
+	if (likely(!wakeup_task))
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	if (unlikely(!wakeup_task))
+		goto unlock;
+
+	/*
+	 * The task can't disappear because it needs to
+	 * wake up first, and we have the wakeup_lock.
+	 */
+	if (task_cpu(wakeup_task) != cpu)
+		goto unlock;
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+ unlock:
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+ out:
+	atomic_dec(&data->disabled);
+
+	/*
+	 * To prevent recursion from the scheduler, if the
+	 * resched flag was set before we entered, then
+	 * don't reschedule.
+	 */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+	struct task_struct *next)
+{
+	unsigned long latency = 0, t0 = 0, t1 = 0;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	/*
+	 * When we start a new trace, we set wakeup_task to NULL
+	 * and then set tracer_enabled = 1. We want to make sure
+	 * that another CPU does not see the tracer_enabled = 1
+	 * and the wakeup_task with an older task, that might
+	 * actually be the same as next.
+	 */
+	smp_rmb();
+
+	if (next != wakeup_task)
+		return;
+
+	/* The task we are waiting for is waking up */
+	data = tr->data[wakeup_cpu];
+
+	/* disable local data, not wakeup_cpu data */
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (likely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	/* We could race with grabbing wakeup_lock */
+	if (unlikely(!tracer_enabled || next != wakeup_task))
+		goto out_unlock;
+
+	trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	latency = nsecs_to_usecs(delta);
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+out_unlock:
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+
+static void __wakeup_reset(struct trace_array *tr)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	assert_spin_locked(&wakeup_lock);
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	wakeup_cpu = -1;
+	wakeup_prio = -1;
+
+	if (wakeup_task)
+		put_task_struct(wakeup_task);
+
+	wakeup_task = NULL;
+}
+
+static void wakeup_reset(struct trace_array *tr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+static void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+		   struct task_struct *curr)
+{
+	int cpu = smp_processor_id();
+	unsigned long flags;
+	long disabled;
+
+	if (likely(!rt_task(p)) ||
+			p->prio >= wakeup_prio ||
+			p->prio >= curr->prio)
+		return;
+
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	/* interrupts should be off from try_to_wake_up */
+	spin_lock(&wakeup_lock);
+
+	/* check for races. */
+	if (!tracer_enabled || p->prio >= wakeup_prio)
+		goto out_locked;
+
+	/* reset the trace */
+	__wakeup_reset(tr);
+
+	wakeup_cpu = task_cpu(p);
+	wakeup_prio = p->prio;
+
+	wakeup_task = p;
+	get_task_struct(wakeup_task);
+
+	local_save_flags(flags);
+
+	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+	trace_function(tr, tr->data[wakeup_cpu],
+		       CALLER_ADDR1, CALLER_ADDR2, flags);
+
+out_locked:
+	spin_unlock(&wakeup_lock);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
+{
+	struct trace_array **ptr = probe_data;
+	struct trace_array *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
+
+	wakeup_check_start(tr, task, curr);
+}
+
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&wakeup_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	wakeup_reset(tr);
+
+	/*
+	 * Don't let the tracer_enabled = 1 show up before
+	 * the wakeup_task is reset. This may be overkill since
+	 * wakeup_reset does a spin_unlock after setting the
+	 * wakeup_task to NULL, but I want to be safe.
+	 * This is a slow path anyway.
+	 */
+	smp_wmb();
+
+	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
+
+	return;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
+}
+
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
+}
+
+static void wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_trace = tr;
+
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl) {
+		stop_wakeup_tracer(tr);
+		/* make sure we put back any tasks we are tracing */
+		wakeup_reset(tr);
+	}
+}
+
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+	else
+		stop_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_wakeup_tracer(iter->tr);
+}
+
+static void wakeup_tracer_close(struct trace_iterator *iter)
+{
+	/* forget about any processes we were recording */
+	if (iter->tr->ctrl)
+		start_wakeup_tracer(iter->tr);
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+	.name		= "wakeup",
+	.init		= wakeup_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.open		= wakeup_tracer_open,
+	.close		= wakeup_tracer_close,
+	.ctrl_update	= wakeup_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+
+__init static int init_wakeup_tracer(void)
+{
+	int ret;
+
+	ret = register_tracer(&wakeup_tracer);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+device_initcall(init_wakeup_tracer);
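
wakeup_tracer_call() above shows the guard sequence an mcount-driven callback needs: sample need_resched() first, use the notrace preempt primitives, and gate any real work on the per-cpu disabled counter. The fragment below keeps only those guards as a sketch; the my_trace_* names are invented.

    /* Invented minimal callback keeping only the reentrancy guards
     * used by wakeup_tracer_call() above. */
    static void my_trace_call(unsigned long ip, unsigned long parent_ip)
    {
            int resched = need_resched();

            preempt_disable_notrace();
            /* real work would go here, itself gated on
             * atomic_inc_return(&data->disabled) == 1 */
            if (resched)
                    preempt_enable_no_resched_notrace();
            else
                    preempt_enable_notrace();
    }

    static struct ftrace_ops my_trace_ops __read_mostly =
    {
            .func = my_trace_call,
    };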

+ 540 - 0
kernel/trace/trace_selftest.c

@@ -0,0 +1,540 @@
+/* Include in trace.c */
+
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+	switch (entry->type) {
+	case TRACE_FN:
+	case TRACE_CTX:
+	case TRACE_WAKE:
+	case TRACE_STACK:
+	case TRACE_SPECIAL:
+		return 1;
+	}
+	return 0;
+}
+
+static int
+trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	struct trace_entry *entries;
+	struct page *page;
+	int idx = 0;
+	int i;
+
+	BUG_ON(list_empty(&data->trace_pages));
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	entries = page_address(page);
+
+	check_pages(data);
+	if (head_page(data) != entries)
+		goto failed;
+
+	/*
+	 * The starting trace buffer always has valid elements,
+	 * if any element exists.
+	 */
+	entries = head_page(data);
+
+	for (i = 0; i < tr->entries; i++) {
+
+		if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+			printk(KERN_CONT ".. invalid entry %d ",
+				entries[idx].type);
+			goto failed;
+		}
+
+		idx++;
+		if (idx >= ENTRIES_PER_PAGE) {
+			page = virt_to_page(entries);
+			if (page->lru.next == &data->trace_pages) {
+				if (i != tr->entries - 1) {
+					printk(KERN_CONT ".. entries buffer mismatch");
+					goto failed;
+				}
+			} else {
+				page = list_entry(page->lru.next, struct page, lru);
+				entries = page_address(page);
+			}
+			idx = 0;
+		}
+	}
+
+	page = virt_to_page(entries);
+	if (page->lru.next != &data->trace_pages) {
+		printk(KERN_CONT ".. too many entries");
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	/* disable tracing */
+	tracing_disabled = 1;
+	printk(KERN_CONT ".. corrupted trace buffer .. ");
+	return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+	unsigned long flags, cnt = 0;
+	int cpu, ret = 0;
+
+	/* Don't allow flipping of max traces now */
+	raw_local_irq_save(flags);
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_possible_cpu(cpu) {
+		if (!head_page(tr->data[cpu]))
+			continue;
+
+		cnt += tr->data[cpu]->trace_idx;
+
+		ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+		if (ret)
+			break;
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_restore(flags);
+
+	if (count)
+		*count = cnt;
+
+	return ret;
+}
+
+#ifdef CONFIG_FTRACE
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+/* Test dynamic code modification and ftrace filters */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+					   struct trace_array *tr,
+					   int (*func)(void))
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+	char *func_name;
+
+	/* The ftrace test PASSED */
+	printk(KERN_CONT "PASSED\n");
+	pr_info("Testing dynamic ftrace: ");
+
+	/* enable tracing, and record the filter function */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	/* passed in by parameter to keep gcc from optimizing the call away */
+	func();
+
+	/* update the records */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/*
+	 * Some archs *cough*PowerPC*cough* add characters to the
+	 * start of the function names. We simply put a '*' to
+	 * accommodate them.
+	 */
+	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+
+	/* filter only on our function */
+	ftrace_set_filter(func_name, strlen(func_name), 1);
+
+	/* enable tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for 1/10 of a second */
+	msleep(100);
+
+	/* we should have nothing in the buffer */
+	ret = trace_test_buffer(tr, &count);
+	if (ret)
+		goto out;
+
+	if (count) {
+		ret = -1;
+		printk(KERN_CONT ".. filter did not filter .. ");
+		goto out;
+	}
+
+	/* call our function again */
+	func();
+
+	/* sleep again */
+	msleep(100);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	/* we should only have one item */
+	if (!ret && count != 1) {
+		printk(KERN_CONT ".. filter failed count=%ld ..", count);
+		ret = -1;
+		goto out;
+	}
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* Enable tracing on all functions again */
+	ftrace_set_filter(NULL, 0, 1);
+
+	return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * Simple verification test of the ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+
+	/* make sure msleep has been recorded */
+	msleep(1);
+
+	/* force the recorded functions to be traced */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/* start the tracing */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+						     DYN_FTRACE_TEST_NAME);
+
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* kill ftrace totally if we failed */
+	if (ret)
+		ftrace_kill();
+
+	return ret;
+}
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_IRQSOFF_TRACER
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable interrupts for a bit */
+	local_irq_disable();
+	udelay(100);
+	local_irq_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable preemption for a bit */
+	preempt_disable();
+	udelay(100);
+	preempt_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* disable preemption and interrupts for a bit */
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+	if (ret)
+		goto out;
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	/* do the test by disabling interrupts first this time */
+	tracing_max_latency = 0;
+	tr->ctrl = 1;
+	trace->ctrl_update(tr);
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+ out:
+	trace->reset(tr);
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_SCHED_TRACER
+static int trace_wakeup_test_thread(void *data)
+{
+	/* Make this an RT thread; the priority doesn't need to be very high */
+	struct sched_param param = { .sched_priority = 5 };
+	struct completion *x = data;
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+
+	/* Signal the test that we now have RT priority */
+	complete(x);
+
+	/* now go to sleep and let the test wake us up */
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule();
+
+	/* we are awake, now wait to disappear */
+	while (!kthread_should_stop()) {
+		/*
+		 * This is an RT task, do short sleeps to let
+		 * others run.
+		 */
+		msleep(100);
+	}
+
+	return 0;
+}
+
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	struct task_struct *p;
+	struct completion isrt;
+	unsigned long count;
+	int ret;
+
+	init_completion(&isrt);
+
+	/* create a high prio thread */
+	p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+	if (IS_ERR(p)) {
+		printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+		return -1;
+	}
+
+	/* make sure the thread is running at an RT prio */
+	wait_for_completion(&isrt);
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* sleep to let the RT thread sleep too */
+	msleep(100);
+
+	/*
+	 * Yes, this is slightly racy. It is possible that for some
+	 * strange reason the RT thread we created did not call
+	 * schedule for 100ms after doing the completion, and we do
+	 * a wakeup on a task that is already awake. But that is
+	 * extremely unlikely, and the worst thing that happens in
+	 * such a case is that we disable tracing. Honestly, if this
+	 * race does happen, something is horribly wrong with the
+	 * system.
+	 */
+
+	wake_up_process(p);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+
+
+	trace->reset(tr);
+
+	tracing_max_latency = save_max;
+
+	/* kill the thread */
+	kthread_stop(p);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
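
The __STR()/STR() pair used by the dynamic selftest is the standard two-level stringification idiom: the extra macro level forces DYN_FTRACE_TEST_NAME to be expanded before it is turned into a string. A small illustration, repeating the definitions from the code above:

    #define __STR(x) #x
    #define STR(x)   __STR(x)
    #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func

    /* STR(DYN_FTRACE_TEST_NAME)   -> "trace_selftest_dynamic_test_func"
     * __STR(DYN_FTRACE_TEST_NAME) -> "DYN_FTRACE_TEST_NAME" (no expansion) */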

+ 7 - 0
kernel/trace/trace_selftest_dynamic.c

@@ -0,0 +1,7 @@
+#include "trace.h"
+
+int DYN_FTRACE_TEST_NAME(void)
+{
+	/* used to call mcount */
+	return 0;
+}

+ 2 - 0
lib/Kconfig.debug

@@ -634,6 +634,8 @@ config LATENCYTOP
 	  Enable this option if you want to use the LatencyTOP tool
 	  to find out which userspace is blocking on what kernel operations.
 
+source kernel/trace/Kconfig
+
 config PROVIDE_OHCI1394_DMA_INIT
 	bool "Remote debugging over FireWire early on boot"
 	depends on PCI && X86

+ 9 - 0
lib/Makefile

@@ -8,6 +8,15 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o
 
+ifdef CONFIG_FTRACE
+# Do not profile string.o, since it may be used in early boot or vdso
+CFLAGS_REMOVE_string.o = -pg
+# Also do not profile any debug utilities
+CFLAGS_REMOVE_spinlock_debug.o = -pg
+CFLAGS_REMOVE_list_debug.o = -pg
+CFLAGS_REMOVE_debugobjects.o = -pg
+endif
+
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 

+ 3 - 3
lib/smp_processor_id.c

@@ -7,7 +7,7 @@
 #include <linux/kallsyms.h>
 #include <linux/sched.h>
 
-unsigned int debug_smp_processor_id(void)
+notrace unsigned int debug_smp_processor_id(void)
 {
 	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
@@ -37,7 +37,7 @@ unsigned int debug_smp_processor_id(void)
 	/*
 	 * Avoid recursion:
 	 */
-	preempt_disable();
+	preempt_disable_notrace();
 
 	if (!printk_ratelimit())
 		goto out_enable;
@@ -49,7 +49,7 @@ unsigned int debug_smp_processor_id(void)
 	dump_stack();
 
 out_enable:
-	preempt_enable_no_resched();
+	preempt_enable_no_resched_notrace();
 
 out:
 	return this_cpu;
 }
mm/page-writeback.c

@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 

+ 2 - 1
scripts/Makefile.lib

@@ -96,7 +96,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
-_c_flags       = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
+orig_c_flags   = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
+_c_flags       = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags))
 _a_flags       = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o)
 _cpp_flags     = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F))