
Merge branch 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (241 commits)
  sched, trace: update trace_sched_wakeup()
  tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3
  Revert "x86: disable X86_PTRACE_BTS"
  ring-buffer: prevent false positive warning
  ring-buffer: fix dangling commit race
  ftrace: enable format arguments checking
  x86, bts: memory accounting
  x86, bts: add fork and exit handling
  ftrace: introduce tracing_reset_online_cpus() helper
  tracing: fix warnings in kernel/trace/trace_sched_switch.c
  tracing: fix warning in kernel/trace/trace.c
  tracing/ring-buffer: remove unused ring_buffer size
  trace: fix task state printout
  ftrace: add not to regex on filtering functions
  trace: better use of stack_trace_enabled for boot up code
  trace: add a way to enable or disable the stack tracer
  x86: entry_64 - introduce FTRACE_ frame macro v2
  tracing/ftrace: add the printk-msg-only option
  tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()
  x86, bts: correctly report invalid bts records
  ...

Fixed up trivial conflict in scripts/recordmcount.pl due to SH bits
being already partly merged by the SH merge.
Linus Torvalds 16 years ago
parent
commit
b0f4b285d7
100 changed files with 5789 additions and 2109 deletions
  1. 117 32
      Documentation/ftrace.txt
  2. 12 0
      Documentation/kernel-parameters.txt
  3. 24 5
      Documentation/markers.txt
  4. 54 40
      Documentation/tracepoints.txt
  5. 13 1
      arch/powerpc/include/asm/ftrace.h
  6. 15 1
      arch/powerpc/include/asm/module.h
  7. 1 0
      arch/powerpc/kernel/Makefile
  8. 9 31
      arch/powerpc/kernel/entry_32.S
  9. 0 12
      arch/powerpc/kernel/entry_64.S
  10. 421 40
      arch/powerpc/kernel/ftrace.c
  11. 5 0
      arch/powerpc/kernel/idle.c
  12. 10 0
      arch/powerpc/kernel/module_32.c
  13. 13 0
      arch/powerpc/kernel/module_64.c
  14. 3 0
      arch/powerpc/lib/Makefile
  15. 3 0
      arch/x86/Kconfig
  16. 1 1
      arch/x86/Kconfig.cpu
  17. 0 4
      arch/x86/Kconfig.debug
  18. 172 140
      arch/x86/include/asm/ds.h
  19. 60 1
      arch/x86/include/asm/ftrace.h
  20. 2 1
      arch/x86/include/asm/msr.h
  21. 13 0
      arch/x86/include/asm/processor.h
  22. 7 36
      arch/x86/include/asm/ptrace.h
  23. 3 4
      arch/x86/include/asm/thread_info.h
  24. 1 0
      arch/x86/kernel/Makefile
  25. 2 1
      arch/x86/kernel/apic.c
  26. 5 0
      arch/x86/kernel/cpu/Makefile
  27. 4 0
      arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
  28. 0 4
      arch/x86/kernel/cpu/intel.c
  29. 590 488
      arch/x86/kernel/ds.c
  30. 33 1
      arch/x86/kernel/dumpstack.c
  31. 1 1
      arch/x86/kernel/dumpstack.h
  32. 4 1
      arch/x86/kernel/dumpstack_32.c
  33. 4 3
      arch/x86/kernel/dumpstack_64.c
  34. 50 1
      arch/x86/kernel/entry_32.S
  35. 69 29
      arch/x86/kernel/entry_64.S
  36. 380 10
      arch/x86/kernel/ftrace.c
  37. 2 1
      arch/x86/kernel/irq_64.c
  38. 16 0
      arch/x86/kernel/process.c
  39. 16 51
      arch/x86/kernel/process_32.c
  40. 16 42
      arch/x86/kernel/process_64.c
  41. 152 279
      arch/x86/kernel/ptrace.c
  42. 1 1
      arch/x86/kernel/smpboot.c
  43. 64 0
      arch/x86/kernel/stacktrace.c
  44. 1 0
      arch/x86/kernel/vmlinux_32.lds.S
  45. 1 0
      arch/x86/kernel/vmlinux_64.lds.S
  46. 3 0
      arch/x86/kernel/vsyscall_64.c
  47. 1 2
      arch/x86/mm/Makefile
  48. 1 1
      arch/x86/mm/fault.c
  49. 3 0
      arch/x86/vdso/vclock_gettime.c
  50. 1 0
      block/Kconfig
  51. 27 19
      block/blk-core.c
  52. 329 3
      block/blktrace.c
  53. 9 3
      block/elevator.c
  54. 17 1
      drivers/char/sysrq.c
  55. 5 3
      drivers/md/dm.c
  56. 4 1
      fs/bio.c
  57. 13 1
      fs/seq_file.c
  58. 30 1
      include/asm-generic/vmlinux.lds.h
  59. 3 169
      include/linux/blktrace_api.h
  60. 82 2
      include/linux/compiler.h
  61. 267 26
      include/linux/ftrace.h
  62. 13 0
      include/linux/ftrace_irq.h
  63. 13 2
      include/linux/hardirq.h
  64. 63 12
      include/linux/marker.h
  65. 2 0
      include/linux/mm.h
  66. 2 2
      include/linux/pid.h
  67. 22 0
      include/linux/ptrace.h
  68. 2 0
      include/linux/rcupdate.h
  69. 12 4
      include/linux/ring_buffer.h
  70. 31 0
      include/linux/sched.h
  71. 1 0
      include/linux/seq_file.h
  72. 8 0
      include/linux/stacktrace.h
  73. 38 19
      include/linux/tracepoint.h
  74. 1 1
      include/linux/tty.h
  75. 76 0
      include/trace/block.h
  76. 60 0
      include/trace/boot.h
  77. 18 18
      include/trace/sched.h
  78. 1 0
      init/Kconfig
  79. 20 15
      init/main.c
  80. 4 1
      kernel/exit.c
  81. 3 2
      kernel/extable.c
  82. 12 2
      kernel/fork.c
  83. 3 0
      kernel/kthread.c
  84. 1 0
      kernel/lockdep.c
  85. 127 65
      kernel/marker.c
  86. 2 11
      kernel/module.c
  87. 3 10
      kernel/power/disk.c
  88. 1 4
      kernel/power/main.c
  89. 1 1
      kernel/profile.c
  90. 12 0
      kernel/ptrace.c
  91. 11 3
      kernel/sched.c
  92. 2 0
      kernel/signal.c
  93. 20 0
      kernel/sysctl.c
  94. 111 4
      kernel/trace/Kconfig
  95. 9 0
      kernel/trace/Makefile
  96. 642 130
      kernel/trace/ftrace.c
  97. 302 184
      kernel/trace/ring_buffer.c
  98. 611 66
      kernel/trace/trace.c
  99. 255 10
      kernel/trace/trace.h
  100. 109 49
      kernel/trace/trace_boot.c

+ 117 - 32
Documentation/ftrace.txt

@@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files:
 		tracer is not adding more data, they will display
 		tracer is not adding more data, they will display
 		the same information every time they are read.
 		the same information every time they are read.
 
 
-  iter_ctrl: This file lets the user control the amount of data
+  trace_options: This file lets the user control the amount of data
 		that is displayed in one of the above output
 		that is displayed in one of the above output
 		files.
 		files.
 
 
@@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files:
 		only be recorded if the latency is greater than
 		only be recorded if the latency is greater than
 		the value in this file. (in microseconds)
 		the value in this file. (in microseconds)
 
 
-  trace_entries: This sets or displays the number of bytes each CPU
+  buffer_size_kb: This sets or displays the number of kilobytes each CPU
 		buffer can hold. The tracer buffers are the same size
 		buffer can hold. The tracer buffers are the same size
 		for each CPU. The displayed number is the size of the
 		for each CPU. The displayed number is the size of the
-		 CPU buffer and not total size of all buffers. The
+		CPU buffer and not total size of all buffers. The
 		trace buffers are allocated in pages (blocks of memory
 		trace buffers are allocated in pages (blocks of memory
 		that the kernel uses for allocation, usually 4 KB in size).
 		that the kernel uses for allocation, usually 4 KB in size).
 		If the last page allocated has room for more bytes
 		If the last page allocated has room for more bytes
@@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files:
 		be traced. If a function exists in both set_ftrace_filter
 		be traced. If a function exists in both set_ftrace_filter
 		and set_ftrace_notrace,	the function will _not_ be traced.
 		and set_ftrace_notrace,	the function will _not_ be traced.
 
 
+  set_ftrace_pid: Have the function tracer only trace a single thread.
+
   available_filter_functions: This lists the functions that ftrace
   available_filter_functions: This lists the functions that ftrace
 		has processed and can trace. These are the function
 		has processed and can trace. These are the function
 		names that you can pass to "set_ftrace_filter" or
 		names that you can pass to "set_ftrace_filter" or
@@ -316,23 +318,23 @@ The above is mostly meaningful for kernel developers.
   The rest is the same as the 'trace' file.
   The rest is the same as the 'trace' file.
 
 
 
 
-iter_ctrl
----------
+trace_options
+-------------
 
 
-The iter_ctrl file is used to control what gets printed in the trace
+The trace_options file is used to control what gets printed in the trace
 output. To see what is available, simply cat the file:
 output. To see what is available, simply cat the file:
 
 
-  cat /debug/tracing/iter_ctrl
+  cat /debug/tracing/trace_options
   print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
   print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
- noblock nostacktrace nosched-tree
+ noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
 
 
 To disable one of the options, echo in the option prepended with "no".
 To disable one of the options, echo in the option prepended with "no".
 
 
-  echo noprint-parent > /debug/tracing/iter_ctrl
+  echo noprint-parent > /debug/tracing/trace_options
 
 
 To enable an option, leave off the "no".
 To enable an option, leave off the "no".
 
 
-  echo sym-offset > /debug/tracing/iter_ctrl
+  echo sym-offset > /debug/tracing/trace_options
 
 
 Here are the available options:
 Here are the available options:
 
 
@@ -378,6 +380,20 @@ Here are the available options:
 		When a trace is recorded, so is the stack of functions.
 		When a trace is recorded, so is the stack of functions.
 		This allows for back traces of trace sites.
 		This allows for back traces of trace sites.
 
 
+  userstacktrace - This option changes the trace.
+		   It records a stacktrace of the current userspace thread.
+
+  sym-userobj - when user stacktrace are enabled, look up which object the
+		address belongs to, and print a relative address
+		This is especially useful when ASLR is on, otherwise you don't
+		get a chance to resolve the address to object/file/line after the app is no
+		longer running
+
+		The lookup is performed when you read trace,trace_pipe,latency_trace. Example:
+
+		a.out-1623  [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
+x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
+
   sched-tree - TBD (any users??)
   sched-tree - TBD (any users??)
 
 
 
 
@@ -1059,6 +1075,83 @@ For simple one time traces, the above is sufficent. For anything else,
 a search through /proc/mounts may be needed to find where the debugfs
 a search through /proc/mounts may be needed to find where the debugfs
 file-system is mounted.
 file-system is mounted.
 
 
+
+Single thread tracing
+---------------------
+
+By writing into /debug/tracing/set_ftrace_pid you can trace a
+single thread. For example:
+
+# cat /debug/tracing/set_ftrace_pid
+no pid
+# echo 3111 > /debug/tracing/set_ftrace_pid
+# cat /debug/tracing/set_ftrace_pid
+3111
+# echo function > /debug/tracing/current_tracer
+# cat /debug/tracing/trace | head
+ # tracer: function
+ #
+ #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+ #              | |       |          |         |
+     yum-updatesd-3111  [003]  1637.254676: finish_task_switch <-thread_return
+     yum-updatesd-3111  [003]  1637.254681: hrtimer_cancel <-schedule_hrtimeout_range
+     yum-updatesd-3111  [003]  1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel
+     yum-updatesd-3111  [003]  1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
+     yum-updatesd-3111  [003]  1637.254685: fget_light <-do_sys_poll
+     yum-updatesd-3111  [003]  1637.254686: pipe_poll <-do_sys_poll
+# echo -1 > /debug/tracing/set_ftrace_pid
+# cat /debug/tracing/trace |head
+ # tracer: function
+ #
+ #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+ #              | |       |          |         |
+ ##### CPU 3 buffer started ####
+     yum-updatesd-3111  [003]  1701.957688: free_poll_entry <-poll_freewait
+     yum-updatesd-3111  [003]  1701.957689: remove_wait_queue <-free_poll_entry
+     yum-updatesd-3111  [003]  1701.957691: fput <-free_poll_entry
+     yum-updatesd-3111  [003]  1701.957692: audit_syscall_exit <-sysret_audit
+     yum-updatesd-3111  [003]  1701.957693: path_put <-audit_syscall_exit
+
+If you want to trace a function when executing, you could use
+something like this simple program:
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+int main (int argc, char **argv)
+{
+        if (argc < 1)
+                exit(-1);
+
+        if (fork() > 0) {
+                int fd, ffd;
+                char line[64];
+                int s;
+
+                ffd = open("/debug/tracing/current_tracer", O_WRONLY);
+                if (ffd < 0)
+                        exit(-1);
+                write(ffd, "nop", 3);
+
+                fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY);
+                s = sprintf(line, "%d\n", getpid());
+                write(fd, line, s);
+
+                write(ffd, "function", 8);
+
+                close(fd);
+                close(ffd);
+
+                execvp(argv[1], argv+1);
+        }
+
+        return 0;
+}
+
 dynamic ftrace
 dynamic ftrace
 --------------
 --------------
 
 
@@ -1158,7 +1251,11 @@ These are the only wild cards which are supported.
 
 
   <match>*<match> will not work.
   <match>*<match> will not work.
 
 
- # echo hrtimer_* > /debug/tracing/set_ftrace_filter
+Note: It is better to use quotes to enclose the wild cards, otherwise
+  the shell may expand the parameters into names of files in the local
+  directory.
+
+ # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
 
 
 Produces:
 Produces:
 
 
@@ -1213,7 +1310,7 @@ Again, now we want to append.
  # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
  # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
 sys_nanosleep
 sys_nanosleep
- # echo hrtimer_* >> /debug/tracing/set_ftrace_filter
+ # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
 hrtimer_run_queues
 hrtimer_run_queues
 hrtimer_run_pending
 hrtimer_run_pending
@@ -1299,41 +1396,29 @@ trace entries
 -------------
 -------------
 
 
 Having too much or not enough data can be troublesome in diagnosing
 Having too much or not enough data can be troublesome in diagnosing
-an issue in the kernel. The file trace_entries is used to modify
+an issue in the kernel. The file buffer_size_kb is used to modify
 the size of the internal trace buffers. The number listed
 the size of the internal trace buffers. The number listed
 is the number of entries that can be recorded per CPU. To know
 is the number of entries that can be recorded per CPU. To know
 the full size, multiply the number of possible CPUS with the
 the full size, multiply the number of possible CPUS with the
 number of entries.
 number of entries.
 
 
- # cat /debug/tracing/trace_entries
-65620
+ # cat /debug/tracing/buffer_size_kb
+1408 (units kilobytes)
 
 
 Note, to modify this, you must have tracing completely disabled. To do that,
 Note, to modify this, you must have tracing completely disabled. To do that,
 echo "nop" into the current_tracer. If the current_tracer is not set
 echo "nop" into the current_tracer. If the current_tracer is not set
 to "nop", an EINVAL error will be returned.
 to "nop", an EINVAL error will be returned.
 
 
  # echo nop > /debug/tracing/current_tracer
  # echo nop > /debug/tracing/current_tracer
- # echo 100000 > /debug/tracing/trace_entries
- # cat /debug/tracing/trace_entries
-100045
-
-
-Notice that we echoed in 100,000 but the size is 100,045. The entries
-are held in individual pages. It allocates the number of pages it takes
-to fulfill the request. If more entries may fit on the last page
-then they will be added.
-
- # echo 1 > /debug/tracing/trace_entries
- # cat /debug/tracing/trace_entries
-85
-
-This shows us that 85 entries can fit in a single page.
+ # echo 10000 > /debug/tracing/buffer_size_kb
+ # cat /debug/tracing/buffer_size_kb
+10000 (units kilobytes)
 
 
 The number of pages which will be allocated is limited to a percentage
 The number of pages which will be allocated is limited to a percentage
 of available memory. Allocating too much will produce an error.
 of available memory. Allocating too much will produce an error.
 
 
- # echo 1000000000000 > /debug/tracing/trace_entries
+ # echo 1000000000000 > /debug/tracing/buffer_size_kb
 -bash: echo: write error: Cannot allocate memory
 -bash: echo: write error: Cannot allocate memory
- # cat /debug/tracing/trace_entries
+ # cat /debug/tracing/buffer_size_kb
 85
 85
 
 

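The trace_options, buffer_size_kb and trace_pipe files described above can also be driven programmatically. Below is a minimal userspace sketch (illustration only, not part of this patch) that resizes the per-CPU buffers and streams the function tracer's output; it assumes debugfs is mounted at /debug as in the examples above, and keeps error handling minimal.

/*
 * Illustrative sketch: resize the per-CPU trace buffers and stream the
 * function tracer's output using the debugfs files documented above.
 * Assumes debugfs is mounted at /debug; short writes are ignored.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return;
        write(fd, val, strlen(val));
        close(fd);
}

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        /* buffer_size_kb may only be changed while the tracer is "nop" */
        write_str("/debug/tracing/current_tracer", "nop");
        write_str("/debug/tracing/buffer_size_kb", "10000");

        /* start the function tracer and read the live trace */
        write_str("/debug/tracing/current_tracer", "function");

        fd = open("/debug/tracing/trace_pipe", O_RDONLY);
        if (fd < 0)
                return 1;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        close(fd);

        return 0;
}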
+ 12 - 0
Documentation/kernel-parameters.txt

@@ -89,6 +89,7 @@ parameter is applicable:
 	SPARC	Sparc architecture is enabled.
 	SPARC	Sparc architecture is enabled.
 	SWSUSP	Software suspend (hibernation) is enabled.
 	SWSUSP	Software suspend (hibernation) is enabled.
 	SUSPEND	System suspend states are enabled.
 	SUSPEND	System suspend states are enabled.
+	FTRACE	Function tracing enabled.
 	TS	Appropriate touchscreen support is enabled.
 	TS	Appropriate touchscreen support is enabled.
 	USB	USB support is enabled.
 	USB	USB support is enabled.
 	USBHID	USB Human Interface Device support is enabled.
 	USBHID	USB Human Interface Device support is enabled.
@@ -753,6 +754,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			parameter will force ia64_sal_cache_flush to call
 			parameter will force ia64_sal_cache_flush to call
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
 
+	ftrace=[tracer]
+			[ftrace] will set and start the specified tracer
+			as early as possible in order to facilitate early
+			boot debugging.
+
+	ftrace_dump_on_oops
+			[ftrace] will dump the trace buffers on oops.
+
 	gamecon.map[2|3]=
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			support via parallel port (up to 5 devices per port)
 			support via parallel port (up to 5 devices per port)
@@ -2196,6 +2205,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	st=		[HW,SCSI] SCSI tape parameters (buffers, etc.)
 	st=		[HW,SCSI] SCSI tape parameters (buffers, etc.)
 			See Documentation/scsi/st.txt.
 			See Documentation/scsi/st.txt.
 
 
+	stacktrace	[FTRACE]
+			Enabled the stack tracer on boot up.
+
 	sti=		[PARISC,HW]
 	sti=		[PARISC,HW]
 			Format: <num>
 			Format: <num>
 			Set the STI (builtin display/keyboard on the HP-PARISC
 			Set the STI (builtin display/keyboard on the HP-PARISC

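Taken together, the new parameters let early boot be instrumented without any userspace help. A hypothetical kernel command line using them might append:

	ftrace=function ftrace_dump_on_oops stacktrace

which starts the function tracer as early as possible, dumps the trace buffers if an oops occurs, and enables the stack tracer at boot.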
+ 24 - 5
Documentation/markers.txt

@@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be
 activated by calling marker_arm(). Marker deactivation can be done by calling
 activated by calling marker_arm(). Marker deactivation can be done by calling
 marker_disarm() as many times as marker_arm() has been called. Removing a probe
 marker_disarm() as many times as marker_arm() has been called. Removing a probe
 is done through marker_probe_unregister(); it will disarm the probe.
 is done through marker_probe_unregister(); it will disarm the probe.
-marker_synchronize_unregister() must be called before the end of the module exit
-function to make sure there is no caller left using the probe. This, and the
-fact that preemption is disabled around the probe call, make sure that probe
-removal and module unload are safe. See the "Probe example" section below for a
-sample probe module.
+
+marker_synchronize_unregister() must be called between probe unregistration and
+the first occurrence of
+- the end of module exit function,
+  to make sure there is no caller left using the probe;
+- the free of any resource used by the probes,
+  to make sure the probes wont be accessing invalid data.
+This, and the fact that preemption is disabled around the probe call, make sure
+that probe removal and module unload are safe. See the "Probe example" section
+below for a sample probe module.
 
 
 The marker mechanism supports inserting multiple instances of the same marker.
 The marker mechanism supports inserting multiple instances of the same marker.
 Markers can be put in inline functions, inlined static functions, and
 Markers can be put in inline functions, inlined static functions, and
@@ -70,6 +75,20 @@ a printk warning which identifies the inconsistency:
 
 
 "Format mismatch for probe probe_name (format), marker (format)"
 "Format mismatch for probe probe_name (format), marker (format)"
 
 
+Another way to use markers is to simply define the marker without generating any
+function call to actually call into the marker. This is useful in combination
+with tracepoint probes in a scheme like this :
+
+void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
+
+DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
+	"arg1 %u pid %d");
+
+notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
+{
+	struct marker *marker = &GET_MARKER(kernel_irq_entry);
+	/* write data to trace buffers ... */
+}
 
 
 * Probe / marker example
 * Probe / marker example
 
 

+ 54 - 40
Documentation/tracepoints.txt

@@ -3,28 +3,30 @@
 			    Mathieu Desnoyers
 			    Mathieu Desnoyers
 
 
 
 
-This document introduces Linux Kernel Tracepoints and their use. It provides
-examples of how to insert tracepoints in the kernel and connect probe functions
-to them and provides some examples of probe functions.
+This document introduces Linux Kernel Tracepoints and their use. It
+provides examples of how to insert tracepoints in the kernel and
+connect probe functions to them and provides some examples of probe
+functions.
 
 
 
 
 * Purpose of tracepoints
 * Purpose of tracepoints
 
 
-A tracepoint placed in code provides a hook to call a function (probe) that you
-can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or
-"off" (no probe is attached). When a tracepoint is "off" it has no effect,
-except for adding a tiny time penalty (checking a condition for a branch) and
-space penalty (adding a few bytes for the function call at the end of the
-instrumented function and adds a data structure in a separate section).  When a
-tracepoint is "on", the function you provide is called each time the tracepoint
-is executed, in the execution context of the caller. When the function provided
-ends its execution, it returns to the caller (continuing from the tracepoint
-site).
+A tracepoint placed in code provides a hook to call a function (probe)
+that you can provide at runtime. A tracepoint can be "on" (a probe is
+connected to it) or "off" (no probe is attached). When a tracepoint is
+"off" it has no effect, except for adding a tiny time penalty
+(checking a condition for a branch) and space penalty (adding a few
+bytes for the function call at the end of the instrumented function
+and adds a data structure in a separate section).  When a tracepoint
+is "on", the function you provide is called each time the tracepoint
+is executed, in the execution context of the caller. When the function
+provided ends its execution, it returns to the caller (continuing from
+the tracepoint site).
 
 
 You can put tracepoints at important locations in the code. They are
 You can put tracepoints at important locations in the code. They are
 lightweight hooks that can pass an arbitrary number of parameters,
 lightweight hooks that can pass an arbitrary number of parameters,
-which prototypes are described in a tracepoint declaration placed in a header
-file.
+which prototypes are described in a tracepoint declaration placed in a
+header file.
 
 
 They can be used for tracing and performance accounting.
 They can be used for tracing and performance accounting.
 
 
@@ -42,14 +44,16 @@ In include/trace/subsys.h :
 
 
 #include <linux/tracepoint.h>
 #include <linux/tracepoint.h>
 
 
-DEFINE_TRACE(subsys_eventname,
-	TPPTOTO(int firstarg, struct task_struct *p),
+DECLARE_TRACE(subsys_eventname,
+	TPPROTO(int firstarg, struct task_struct *p),
 	TPARGS(firstarg, p));
 	TPARGS(firstarg, p));
 
 
 In subsys/file.c (where the tracing statement must be added) :
 In subsys/file.c (where the tracing statement must be added) :
 
 
 #include <trace/subsys.h>
 #include <trace/subsys.h>
 
 
+DEFINE_TRACE(subsys_eventname);
+
 void somefct(void)
 void somefct(void)
 {
 {
 	...
 	...
@@ -61,31 +65,41 @@ Where :
 - subsys_eventname is an identifier unique to your event
 - subsys_eventname is an identifier unique to your event
     - subsys is the name of your subsystem.
     - subsys is the name of your subsystem.
     - eventname is the name of the event to trace.
     - eventname is the name of the event to trace.
-- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function
-  called by this tracepoint.
-- TPARGS(firstarg, p) are the parameters names, same as found in the prototype.
 
 
-Connecting a function (probe) to a tracepoint is done by providing a probe
-(function to call) for the specific tracepoint through
-register_trace_subsys_eventname().  Removing a probe is done through
-unregister_trace_subsys_eventname(); it will remove the probe sure there is no
-caller left using the probe when it returns. Probe removal is preempt-safe
-because preemption is disabled around the probe call. See the "Probe example"
-section below for a sample probe module.
-
-The tracepoint mechanism supports inserting multiple instances of the same
-tracepoint, but a single definition must be made of a given tracepoint name over
-all the kernel to make sure no type conflict will occur. Name mangling of the
-tracepoints is done using the prototypes to make sure typing is correct.
-Verification of probe type correctness is done at the registration site by the
-compiler. Tracepoints can be put in inline functions, inlined static functions,
-and unrolled loops as well as regular functions.
-
-The naming scheme "subsys_event" is suggested here as a convention intended
-to limit collisions. Tracepoint names are global to the kernel: they are
-considered as being the same whether they are in the core kernel image or in
-modules.
+- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the
+  function called by this tracepoint.
 
 
+- TPARGS(firstarg, p) are the parameters names, same as found in the
+  prototype.
+
+Connecting a function (probe) to a tracepoint is done by providing a
+probe (function to call) for the specific tracepoint through
+register_trace_subsys_eventname().  Removing a probe is done through
+unregister_trace_subsys_eventname(); it will remove the probe.
+
+tracepoint_synchronize_unregister() must be called before the end of
+the module exit function to make sure there is no caller left using
+the probe. This, and the fact that preemption is disabled around the
+probe call, make sure that probe removal and module unload are safe.
+See the "Probe example" section below for a sample probe module.
+
+The tracepoint mechanism supports inserting multiple instances of the
+same tracepoint, but a single definition must be made of a given
+tracepoint name over all the kernel to make sure no type conflict will
+occur. Name mangling of the tracepoints is done using the prototypes
+to make sure typing is correct. Verification of probe type correctness
+is done at the registration site by the compiler. Tracepoints can be
+put in inline functions, inlined static functions, and unrolled loops
+as well as regular functions.
+
+The naming scheme "subsys_event" is suggested here as a convention
+intended to limit collisions. Tracepoint names are global to the
+kernel: they are considered as being the same whether they are in the
+core kernel image or in modules.
+
+If the tracepoint has to be used in kernel modules, an
+EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be
+used to export the defined tracepoints.
 
 
 * Probe / tracepoint example
 * Probe / tracepoint example
 
 

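For completeness, here is a minimal probe-module sketch (not from this patch) that assembles the pieces described above for the hypothetical subsys_eventname tracepoint; the header path, probe body and names are placeholders taken from the example in the text.

/*
 * Hypothetical probe module for the subsys_eventname tracepoint
 * declared in include/trace/subsys.h, following the registration and
 * unregistration sequence described above.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <trace/subsys.h>	/* DECLARE_TRACE(subsys_eventname, ...) */

/* the probe prototype matches the TPPROTO() given in the declaration */
static void probe_subsys_eventname(int firstarg, struct task_struct *p)
{
	/* runs in the caller's context each time the tracepoint fires */
	printk(KERN_INFO "subsys_eventname: arg=%d comm=%s\n",
	       firstarg, p->comm);
}

static int __init probe_init(void)
{
	/* register_trace_<name>() is generated by DECLARE_TRACE() */
	return register_trace_subsys_eventname(probe_subsys_eventname);
}

static void __exit probe_exit(void)
{
	unregister_trace_subsys_eventname(probe_subsys_eventname);
	/* wait until no caller is still executing the probe */
	tracepoint_synchronize_unregister();
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");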
+ 13 - 1
arch/powerpc/include/asm/ftrace.h

@@ -7,7 +7,19 @@
 
 
 #ifndef __ASSEMBLY__
 #ifndef __ASSEMBLY__
 extern void _mcount(void);
 extern void _mcount(void);
-#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+       /* reloction of mcount call site is the same as the address */
+       return addr;
+}
+
+struct dyn_arch_ftrace {
+	struct module *mod;
+};
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 
 
 #endif
 #endif
 
 

+ 15 - 1
arch/powerpc/include/asm/module.h

@@ -34,11 +34,19 @@ struct mod_arch_specific {
 #ifdef __powerpc64__
 #ifdef __powerpc64__
 	unsigned int stubs_section;	/* Index of stubs section in module */
 	unsigned int stubs_section;	/* Index of stubs section in module */
 	unsigned int toc_section;	/* What section is the TOC? */
 	unsigned int toc_section;	/* What section is the TOC? */
-#else
+#ifdef CONFIG_DYNAMIC_FTRACE
+	unsigned long toc;
+	unsigned long tramp;
+#endif
+
+#else /* powerpc64 */
 	/* Indices of PLT sections within module. */
 	/* Indices of PLT sections within module. */
 	unsigned int core_plt_section;
 	unsigned int core_plt_section;
 	unsigned int init_plt_section;
 	unsigned int init_plt_section;
+#ifdef CONFIG_DYNAMIC_FTRACE
+	unsigned long tramp;
 #endif
 #endif
+#endif /* powerpc64 */
 
 
 	/* List of BUG addresses, source line numbers and filenames */
 	/* List of BUG addresses, source line numbers and filenames */
 	struct list_head bug_list;
 	struct list_head bug_list;
@@ -68,6 +76,12 @@ struct mod_arch_specific {
 #    endif	/* MODULE */
 #    endif	/* MODULE */
 #endif
 #endif
 
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+#    ifdef MODULE
+	asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous");
+#    endif	/* MODULE */
+#endif
+
 
 
 struct exception_table_entry;
 struct exception_table_entry;
 void sort_ex_table(struct exception_table_entry *start,
 void sort_ex_table(struct exception_table_entry *start,

+ 1 - 0
arch/powerpc/kernel/Makefile

@@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
 
 
 ifdef CONFIG_DYNAMIC_FTRACE
 ifdef CONFIG_DYNAMIC_FTRACE
 # dynamic ftrace setup.
 # dynamic ftrace setup.

+ 9 - 31
arch/powerpc/kernel/entry_32.S

@@ -1162,39 +1162,17 @@ machine_check_in_rtas:
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
 _GLOBAL(_mcount)
-	stwu	r1,-48(r1)
-	stw	r3, 12(r1)
-	stw	r4, 16(r1)
-	stw	r5, 20(r1)
-	stw	r6, 24(r1)
-	mflr	r3
-	stw	r7, 28(r1)
-	mfcr	r5
-	stw	r8, 32(r1)
-	stw	r9, 36(r1)
-	stw	r10,40(r1)
-	stw	r3, 44(r1)
-	stw	r5, 8(r1)
-	subi	r3, r3, MCOUNT_INSN_SIZE
-	.globl mcount_call
-mcount_call:
-	bl	ftrace_stub
-	nop
-	lwz	r6, 8(r1)
-	lwz	r0, 44(r1)
-	lwz	r3, 12(r1)
+	/*
+	 * It is required that _mcount on PPC32 must preserve the
+	 * link register. But we have r0 to play with. We use r0
+	 * to push the return address back to the caller of mcount
+	 * into the ctr register, restore the link register and
+	 * then jump back using the ctr register.
+	 */
+	mflr	r0
 	mtctr	r0
 	mtctr	r0
-	lwz	r4, 16(r1)
-	mtcr	r6
-	lwz	r5, 20(r1)
-	lwz	r6, 24(r1)
-	lwz	r0, 52(r1)
-	lwz	r7, 28(r1)
-	lwz	r8, 32(r1)
+	lwz	r0, 4(r1)
 	mtlr	r0
 	mtlr	r0
-	lwz	r9, 36(r1)
-	lwz	r10,40(r1)
-	addi	r1, r1, 48
 	bctr
 	bctr
 
 
 _GLOBAL(ftrace_caller)
 _GLOBAL(ftrace_caller)

+ 0 - 12
arch/powerpc/kernel/entry_64.S

@@ -894,18 +894,6 @@ _GLOBAL(enter_prom)
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
 _GLOBAL(_mcount)
-	/* Taken from output of objdump from lib64/glibc */
-	mflr	r3
-	stdu	r1, -112(r1)
-	std	r3, 128(r1)
-	subi	r3, r3, MCOUNT_INSN_SIZE
-	.globl mcount_call
-mcount_call:
-	bl	ftrace_stub
-	nop
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
 	blr
 	blr
 
 
 _GLOBAL(ftrace_caller)
 _GLOBAL(ftrace_caller)

+ 421 - 40
arch/powerpc/kernel/ftrace.c

@@ -9,22 +9,30 @@
 
 
 #include <linux/spinlock.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
 #include <linux/ftrace.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/list.h>
 
 
 #include <asm/cacheflush.h>
 #include <asm/cacheflush.h>
+#include <asm/code-patching.h>
 #include <asm/ftrace.h>
 #include <asm/ftrace.h>
 
 
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , ...)	do { } while (0)
+#endif
 
 
-static unsigned int ftrace_nop = 0x60000000;
+static unsigned int ftrace_nop = PPC_NOP_INSTR;
 
 
 #ifdef CONFIG_PPC32
 #ifdef CONFIG_PPC32
 # define GET_ADDR(addr) addr
 # define GET_ADDR(addr) addr
 #else
 #else
 /* PowerPC64's functions are data that points to the functions */
 /* PowerPC64's functions are data that points to the functions */
-# define GET_ADDR(addr) *(unsigned long *)addr
+# define GET_ADDR(addr) (*(unsigned long *)addr)
 #endif
 #endif
 
 
 
 
@@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)
 	return (int)(addr - ip);
 	return (int)(addr - ip);
 }
 }
 
 
-unsigned char *ftrace_nop_replace(void)
+static unsigned char *ftrace_nop_replace(void)
 {
 {
 	return (char *)&ftrace_nop;
 	return (char *)&ftrace_nop;
 }
 }
 
 
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
 {
 	static unsigned int op;
 	static unsigned int op;
 
 
@@ -68,49 +76,422 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 # define _ASM_PTR	" .long "
 # define _ASM_PTR	" .long "
 #endif
 #endif
 
 
-int
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 		   unsigned char *new_code)
 {
 {
-	unsigned replaced;
-	unsigned old = *(unsigned *)old_code;
-	unsigned new = *(unsigned *)new_code;
-	int faulted = 0;
+	unsigned char replaced[MCOUNT_INSN_SIZE];
 
 
 	/*
 	/*
 	 * Note: Due to modules and __init, code can
 	 * Note: Due to modules and __init, code can
 	 *  disappear and change, we need to protect against faulting
 	 *  disappear and change, we need to protect against faulting
-	 *  as well as code changing.
+	 *  as well as code changing. We do this by using the
+	 *  probe_kernel_* functions.
 	 *
 	 *
 	 * No real locking needed, this code is run through
 	 * No real locking needed, this code is run through
-	 * kstop_machine.
+	 * kstop_machine, or before SMP starts.
 	 */
 	 */
-	asm volatile (
-		"1: lwz		%1, 0(%2)\n"
-		"   cmpw	%1, %5\n"
-		"   bne		2f\n"
-		"   stwu	%3, 0(%2)\n"
-		"2:\n"
-		".section .fixup, \"ax\"\n"
-		"3:	li %0, 1\n"
-		"	b 2b\n"
-		".previous\n"
-		".section __ex_table,\"a\"\n"
-		_ASM_ALIGN "\n"
-		_ASM_PTR "1b, 3b\n"
-		".previous"
-		: "=r"(faulted), "=r"(replaced)
-		: "r"(ip), "r"(new),
-		  "0"(faulted), "r"(old)
-		: "memory");
-
-	if (replaced != old && replaced != new)
-		faulted = 2;
-
-	if (!faulted)
-		flush_icache_range(ip, ip + 8);
-
-	return faulted;
+
+	/* read the text we want to modify */
+	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure it is what we expect it to be */
+	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+		return -EINVAL;
+
+	/* replace the text with the new text */
+	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+	flush_icache_range(ip, ip + 8);
+
+	return 0;
+}
+
+/*
+ * Helper functions that are the same for both PPC64 and PPC32.
+ */
+static int test_24bit_addr(unsigned long ip, unsigned long addr)
+{
+
+	/* use the create_branch to verify that this offset can be branched */
+	return create_branch((unsigned int *)ip, addr, 0);
+}
+
+static int is_bl_op(unsigned int op)
+{
+	return (op & 0xfc000003) == 0x48000001;
+}
+
+static unsigned long find_bl_target(unsigned long ip, unsigned int op)
+{
+	static int offset;
+
+	offset = (op & 0x03fffffc);
+	/* make it signed */
+	if (offset & 0x02000000)
+		offset |= 0xfe000000;
+
+	return ip + (long)offset;
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_nop(struct module *mod,
+		  struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned int jmp[5];
+	unsigned long ptr;
+	unsigned long ip = rec->ip;
+	unsigned long tramp;
+	int offset;
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
+		return -EFAULT;
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+
+	/*
+	 * On PPC64 the trampoline looks like:
+	 * 0x3d, 0x82, 0x00, 0x00,    addis   r12,r2, <high>
+	 * 0x39, 0x8c, 0x00, 0x00,    addi    r12,r12, <low>
+	 *   Where the bytes 2,3,6 and 7 make up the 32bit offset
+	 *   to the TOC that holds the pointer.
+	 *   to jump to.
+	 * 0xf8, 0x41, 0x00, 0x28,    std     r2,40(r1)
+	 * 0xe9, 0x6c, 0x00, 0x20,    ld      r11,32(r12)
+	 *   The actually address is 32 bytes from the offset
+	 *   into the TOC.
+	 * 0xe8, 0x4c, 0x00, 0x28,    ld      r2,40(r12)
+	 */
+
+	DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
+
+	/* Find where the trampoline jumps to */
+	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+		printk(KERN_ERR "Failed to read %lx\n", tramp);
+		return -EFAULT;
+	}
+
+	DEBUGP(" %08x %08x", jmp[0], jmp[1]);
+
+	/* verify that this is what we expect it to be */
+	if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
+	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+	    (jmp[2] != 0xf8410028) ||
+	    (jmp[3] != 0xe96c0020) ||
+	    (jmp[4] != 0xe84c0028)) {
+		printk(KERN_ERR "Not a trampoline\n");
+		return -EINVAL;
+	}
+
+	offset = (unsigned)((unsigned short)jmp[0]) << 16 |
+		(unsigned)((unsigned short)jmp[1]);
+
+	DEBUGP(" %x ", offset);
+
+	/* get the address this jumps too */
+	tramp = mod->arch.toc + offset + 32;
+	DEBUGP("toc: %lx", tramp);
+
+	if (probe_kernel_read(jmp, (void *)tramp, 8)) {
+		printk(KERN_ERR "Failed to read %lx\n", tramp);
+		return -EFAULT;
+	}
+
+	DEBUGP(" %08x %08x\n", jmp[0], jmp[1]);
+
+	ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
+
+	/* This should match what was called */
+	if (ptr != GET_ADDR(addr)) {
+		printk(KERN_ERR "addr does not match %lx\n", ptr);
+		return -EINVAL;
+	}
+
+	/*
+	 * We want to nop the line, but the next line is
+	 *  0xe8, 0x41, 0x00, 0x28   ld r2,40(r1)
+	 * This needs to be turned to a nop too.
+	 */
+	if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	if (op != 0xe8410028) {
+		printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
+		return -EINVAL;
+	}
+
+	/*
+	 * Milton Miller pointed out that we can not blindly do nops.
+	 * If a task was preempted when calling a trace function,
+	 * the nops will remove the way to restore the TOC in r2
+	 * and the r2 TOC will get corrupted.
+	 */
+
+	/*
+	 * Replace:
+	 *   bl <tramp>  <==== will be replaced with "b 1f"
+	 *   ld r2,40(r1)
+	 *  1:
+	 */
+	op = 0x48000008;	/* b +8 */
+
+	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+
+	flush_icache_range(ip, ip + 8);
+
+	return 0;
+}
+
+#else /* !PPC64 */
+static int
+__ftrace_make_nop(struct module *mod,
+		  struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned int jmp[4];
+	unsigned long ip = rec->ip;
+	unsigned long tramp;
+
+	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+
+	/*
+	 * On PPC32 the trampoline looks like:
+	 *  0x3d, 0x60, 0x00, 0x00  lis r11,sym@ha
+	 *  0x39, 0x6b, 0x00, 0x00  addi r11,r11,sym@l
+	 *  0x7d, 0x69, 0x03, 0xa6  mtctr r11
+	 *  0x4e, 0x80, 0x04, 0x20  bctr
+	 */
+
+	DEBUGP("ip:%lx jumps to %lx", ip, tramp);
+
+	/* Find where the trampoline jumps to */
+	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+		printk(KERN_ERR "Failed to read %lx\n", tramp);
+		return -EFAULT;
+	}
+
+	DEBUGP(" %08x %08x ", jmp[0], jmp[1]);
+
+	/* verify that this is what we expect it to be */
+	if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
+	    ((jmp[1] & 0xffff0000) != 0x396b0000) ||
+	    (jmp[2] != 0x7d6903a6) ||
+	    (jmp[3] != 0x4e800420)) {
+		printk(KERN_ERR "Not a trampoline\n");
+		return -EINVAL;
+	}
+
+	tramp = (jmp[1] & 0xffff) |
+		((jmp[0] & 0xffff) << 16);
+	if (tramp & 0x8000)
+		tramp -= 0x10000;
+
+	DEBUGP(" %x ", tramp);
+
+	if (tramp != addr) {
+		printk(KERN_ERR
+		       "Trampoline location %08lx does not match addr\n",
+		       tramp);
+		return -EINVAL;
+	}
+
+	op = PPC_NOP_INSTR;
+
+	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+	flush_icache_range(ip, ip + 8);
+
+	return 0;
+}
+#endif /* PPC64 */
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *old, *new;
+	unsigned long ip = rec->ip;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		old = ftrace_call_replace(ip, addr);
+		new = ftrace_nop_replace();
+		return ftrace_modify_code(ip, old, new);
+	}
+
+	/*
+	 * Out of range jumps are called from modules.
+	 * We should either already have a pointer to the module
+	 * or it has been passed in.
+	 */
+	if (!rec->arch.mod) {
+		if (!mod) {
+			printk(KERN_ERR "No module loaded addr=%lx\n",
+			       addr);
+			return -EFAULT;
+		}
+		rec->arch.mod = mod;
+	} else if (mod) {
+		if (mod != rec->arch.mod) {
+			printk(KERN_ERR
+			       "Record mod %p not equal to passed in mod %p\n",
+			       rec->arch.mod, mod);
+			return -EINVAL;
+		}
+		/* nothing to do if mod == rec->arch.mod */
+	} else
+		mod = rec->arch.mod;
+
+	return __ftrace_make_nop(mod, rec, addr);
+
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op[2];
+	unsigned long ip = rec->ip;
+
+	/* read where this goes */
+	if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
+		return -EFAULT;
+
+	/*
+	 * It should be pointing to two nops or
+	 *  b +8; ld r2,40(r1)
+	 */
+	if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
+	    ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) {
+		printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
+		return -EINVAL;
+	}
+
+	/* If we never set up a trampoline to ftrace_caller, then bail */
+	if (!rec->arch.mod->arch.tramp) {
+		printk(KERN_ERR "No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+	/* create the branch to the trampoline */
+	op[0] = create_branch((unsigned int *)ip,
+			      rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+	if (!op[0]) {
+		printk(KERN_ERR "REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	/* ld r2,40(r1) */
+	op[1] = 0xe8410028;
+
+	DEBUGP("write to %lx\n", rec->ip);
+
+	if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
+		return -EPERM;
+
+	flush_icache_range(ip, ip + 8);
+
+	return 0;
+}
+#else
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned long ip = rec->ip;
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* It should be pointing to a nop */
+	if (op != PPC_NOP_INSTR) {
+		printk(KERN_ERR "Expected NOP but have %x\n", op);
+		return -EINVAL;
+	}
+
+	/* If we never set up a trampoline to ftrace_caller, then bail */
+	if (!rec->arch.mod->arch.tramp) {
+		printk(KERN_ERR "No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+	/* create the branch to the trampoline */
+	op = create_branch((unsigned int *)ip,
+			   rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+	if (!op) {
+		printk(KERN_ERR "REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	DEBUGP("write to %lx\n", rec->ip);
+
+	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+	flush_icache_range(ip, ip + 8);
+
+	return 0;
+}
+#endif /* CONFIG_PPC64 */
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *old, *new;
+	unsigned long ip = rec->ip;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		old = ftrace_nop_replace();
+		new = ftrace_call_replace(ip, addr);
+		return ftrace_modify_code(ip, old, new);
+	}
+
+	/*
+	 * Out of range jumps are called from modules.
+	 * Being that we are converting from nop, it had better
+	 * already have a module defined.
+	 */
+	if (!rec->arch.mod) {
+		printk(KERN_ERR "No module loaded\n");
+		return -EINVAL;
+	}
+
+	return __ftrace_make_call(rec, addr);
 }
 }
 
 
 int ftrace_update_ftrace_func(ftrace_func_t func)
 int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -128,10 +509,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 
 
 int __init ftrace_dyn_arch_init(void *data)
 int __init ftrace_dyn_arch_init(void *data)
 {
 {
-	/* This is running in kstop_machine */
+	/* caller expects data to be zero */
+	unsigned long *p = data;
 
 
-	ftrace_mcount_set(data);
+	*p = 0;
 
 
 	return 0;
 	return 0;
 }
 }
-

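The is_bl_op()/find_bl_target() pair above is the heart of the call-site analysis: it recognizes a PowerPC relative branch-and-link instruction and recovers its target. The standalone sketch below (illustration only, not part of the patch, with a made-up call-site address and opcode) applies the same masks to show the decoding.

/*
 * Decode a PowerPC "bl" instruction the same way is_bl_op() and
 * find_bl_target() above do, recovering the branch target from the
 * call-site address and the opcode word.
 */
#include <stdio.h>

static int is_bl_op(unsigned int op)
{
	/* primary opcode 18 (branch), AA = 0, LK = 1 */
	return (op & 0xfc000003) == 0x48000001;
}

static unsigned long find_bl_target(unsigned long ip, unsigned int op)
{
	long offset = op & 0x03fffffc;	/* LI field, already shifted left by 2 */

	/* sign-extend the 26-bit displacement */
	if (offset & 0x02000000)
		offset -= 0x04000000;

	return ip + offset;
}

int main(void)
{
	unsigned long ip = 0xc0001000ul;	/* hypothetical call site */
	unsigned int op = 0x4bfff7e9;		/* hypothetical "bl" opcode */

	if (is_bl_op(op))
		printf("bl at 0x%lx branches to 0x%lx\n",
		       ip, find_bl_target(ip, op));
	return 0;
}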
+ 5 - 0
arch/powerpc/kernel/idle.c

@@ -69,10 +69,15 @@ void cpu_idle(void)
 				smp_mb();
 				smp_mb();
 				local_irq_disable();
 				local_irq_disable();
 
 
+				/* Don't trace irqs off for idle */
+				stop_critical_timings();
+
 				/* check again after disabling irqs */
 				/* check again after disabling irqs */
 				if (!need_resched() && !cpu_should_die())
 				if (!need_resched() && !cpu_should_die())
 					ppc_md.power_save();
 					ppc_md.power_save();
 
 
+				start_critical_timings();
+
 				local_irq_enable();
 				local_irq_enable();
 				set_thread_flag(TIF_POLLING_NRFLAG);
 				set_thread_flag(TIF_POLLING_NRFLAG);
 
 

+ 10 - 0
arch/powerpc/kernel/module_32.c

@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/kernel.h>
+#include <linux/ftrace.h>
 #include <linux/cache.h>
 #include <linux/cache.h>
 #include <linux/bug.h>
 #include <linux/bug.h>
 #include <linux/sort.h>
 #include <linux/sort.h>
@@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
 			r_addend = rela[i].r_addend;
 			r_addend = rela[i].r_addend;
 		}
 		}
 
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+	_count_relocs++;	/* add one for ftrace_caller */
+#endif
 	return _count_relocs;
 	return _count_relocs;
 }
 }
 
 
@@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
 			return -ENOEXEC;
 			return -ENOEXEC;
 		}
 		}
 	}
 	}
+#ifdef CONFIG_DYNAMIC_FTRACE
+	module->arch.tramp =
+		do_plt_call(module->module_core,
+			    (unsigned long)ftrace_caller,
+			    sechdrs, module);
+#endif
 	return 0;
 	return 0;
 }
 }

+ 13 - 0
arch/powerpc/kernel/module_64.c

@@ -20,6 +20,7 @@
 #include <linux/moduleloader.h>
 #include <linux/moduleloader.h>
 #include <linux/err.h>
 #include <linux/err.h>
 #include <linux/vmalloc.h>
 #include <linux/vmalloc.h>
+#include <linux/ftrace.h>
 #include <linux/bug.h>
 #include <linux/bug.h>
 #include <asm/module.h>
 #include <asm/module.h>
 #include <asm/firmware.h>
 #include <asm/firmware.h>
@@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
 		}
 		}
 	}
 	}
 
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+	/* make the trampoline to the ftrace_caller */
+	relocs++;
+#endif
+
 	DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
 	DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
 	return relocs * sizeof(struct ppc64_stub_entry);
 	return relocs * sizeof(struct ppc64_stub_entry);
 }
 }
@@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 		}
 		}
 	}
 	}
 
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+	me->arch.toc = my_r2(sechdrs, me);
+	me->arch.tramp = stub_for_addr(sechdrs,
+				       (unsigned long)ftrace_caller,
+				       me);
+#endif
+
 	return 0;
 	return 0;
 }
 }

+ 3 - 0
arch/powerpc/lib/Makefile

@@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS		+= -mno-minimal-toc
 EXTRA_CFLAGS		+= -mno-minimal-toc
 endif
 endif
 
 
+CFLAGS_REMOVE_code-patching.o = -pg
+CFLAGS_REMOVE_feature-fixups.o = -pg
+
 obj-y			:= string.o alloc.o \
 obj-y			:= string.o alloc.o \
 			   checksum_$(CONFIG_WORD_SIZE).o
 			   checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o
 obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o

+ 3 - 0
arch/x86/Kconfig

@@ -31,11 +31,14 @@ config X86
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select USER_STACKTRACE_SUPPORT
 
 
 config ARCH_DEFCONFIG
 config ARCH_DEFCONFIG
 	string
 	string

+ 1 - 1
arch/x86/Kconfig.cpu

@@ -515,12 +515,12 @@ config CPU_SUP_UMC_32
 config X86_DS
 config X86_DS
 	def_bool X86_PTRACE_BTS
 	def_bool X86_PTRACE_BTS
 	depends on X86_DEBUGCTLMSR
 	depends on X86_DEBUGCTLMSR
+	select HAVE_HW_BRANCH_TRACER
 
 
 config X86_PTRACE_BTS
 config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	bool "Branch Trace Store"
 	default y
 	default y
 	depends on X86_DEBUGCTLMSR
 	depends on X86_DEBUGCTLMSR
-	depends on BROKEN
 	help
 	help
 	  This adds a ptrace interface to the hardware's branch trace store.
 	  This adds a ptrace interface to the hardware's branch trace store.
 
 

+ 0 - 4
arch/x86/Kconfig.debug

@@ -174,14 +174,10 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
 
-config MMIOTRACE_HOOKS
-	bool
-
 config MMIOTRACE
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	bool "Memory mapped IO tracing"
 	depends on DEBUG_KERNEL && PCI
 	depends on DEBUG_KERNEL && PCI
 	select TRACING
 	select TRACING
-	select MMIOTRACE_HOOKS
 	help
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  debugging and reverse engineering. It is called from the ioremap
 	  debugging and reverse engineering. It is called from the ioremap

+ 172 - 140
arch/x86/include/asm/ds.h

@@ -6,14 +6,13 @@
  * precise-event based sampling (PEBS).
  * precise-event based sampling (PEBS).
  *
  *
  * It manages:
  * It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - DS and BTS hardware configuration
+ * - buffer overflow handling (to be done)
  * - buffer access
  * - buffer access
  *
  *
- * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
  *
  *
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -26,11 +25,51 @@
 
 
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 
 
 
 #ifdef CONFIG_X86_DS
 #ifdef CONFIG_X86_DS
 
 
 struct task_struct;
 struct task_struct;
+struct ds_context;
+struct ds_tracer;
+struct bts_tracer;
+struct pebs_tracer;
+
+typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
+typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
+
+
+/*
+ * A list of features plus corresponding macros to talk about them in
+ * the ds_request function's flags parameter.
+ *
+ * We use the enum to index an array of corresponding control bits;
+ * we use the macro to index a flags bit-vector.
+ */
+enum ds_feature {
+	dsf_bts = 0,
+	dsf_bts_kernel,
+#define BTS_KERNEL (1 << dsf_bts_kernel)
+	/* trace kernel-mode branches */
+
+	dsf_bts_user,
+#define BTS_USER (1 << dsf_bts_user)
+	/* trace user-mode branches */
+
+	dsf_bts_overflow,
+	dsf_bts_max,
+	dsf_pebs = dsf_bts_max,
+
+	dsf_pebs_max,
+	dsf_ctl_max = dsf_pebs_max,
+	dsf_bts_timestamps = dsf_ctl_max,
+#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
+	/* add timestamps into BTS trace */
+
+#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
+};
+
 
 
 /*
 /*
  * Request BTS or PEBS
  * Request BTS or PEBS
@@ -38,163 +77,169 @@ struct task_struct;
  * Due to alignement constraints, the actual buffer may be slightly
  * Due to alignement constraints, the actual buffer may be slightly
  * smaller than the requested or provided buffer.
  * smaller than the requested or provided buffer.
  *
  *
- * Returns 0 on success; -Eerrno otherwise
+ * Returns a pointer to a tracer structure on success, or
+ * ERR_PTR(errcode) on failure.
+ *
+ * The interrupt threshold is independent from the overflow callback
+ * to allow users to use their own overflow interrupt handling mechanism.
  *
  *
  * task: the task to request recording for;
  * task: the task to request recording for;
  *       NULL for per-cpu recording on the current cpu
  *       NULL for per-cpu recording on the current cpu
  * base: the base pointer for the (non-pageable) buffer;
  * base: the base pointer for the (non-pageable) buffer;
- *       NULL if buffer allocation requested
- * size: the size of the requested or provided buffer
+ * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
  * ovfl: pointer to a function to be called on buffer overflow;
  *       NULL if cyclic buffer requested
  *       NULL if cyclic buffer requested
+ * th: the interrupt threshold in records from the end of the buffer;
+ *     -1 if no interrupt threshold is requested.
+ * flags: a bit-mask of the above flags
  */
  */
-typedef void (*ds_ovfl_callback_t)(struct task_struct *);
-extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
-			  ds_ovfl_callback_t ovfl);
-extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-			   ds_ovfl_callback_t ovfl);
+extern struct bts_tracer *ds_request_bts(struct task_struct *task,
+					 void *base, size_t size,
+					 bts_ovfl_callback_t ovfl,
+					 size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+					   void *base, size_t size,
+					   pebs_ovfl_callback_t ovfl,
+					   size_t th, unsigned int flags);
 
 
 /*
 /*
  * Release BTS or PEBS resources
  * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
  *
  *
- * Frees buffers allocated on ds_request.
- *
- * Returns 0 on success; -Eerrno otherwise
- *
- * task: the task to release resources for;
- *       NULL to release resources for the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
  */
-extern int ds_release_bts(struct task_struct *task);
-extern int ds_release_pebs(struct task_struct *task);
+extern void ds_release_bts(struct bts_tracer *tracer);
+extern void ds_suspend_bts(struct bts_tracer *tracer);
+extern void ds_resume_bts(struct bts_tracer *tracer);
+extern void ds_release_pebs(struct pebs_tracer *tracer);
+extern void ds_suspend_pebs(struct pebs_tracer *tracer);
+extern void ds_resume_pebs(struct pebs_tracer *tracer);
 
 
-/*
- * Return the (array) index of the write pointer.
- * (assuming an array of BTS/PEBS records)
- *
- * Returns -Eerrno on error
- *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
- */
-extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
 
 
 /*
- * Return the (array) index one record beyond the end of the array.
- * (assuming an array of BTS/PEBS records)
+ * The raw DS buffer state as it is used for BTS and PEBS recording.
  *
- * Returns -Eerrno on error
- *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * This is the low-level, arch-dependent interface for working
+ * directly on the raw trace data.
  */
-extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
+struct ds_trace {
+	/* the number of bts/pebs records */
+	size_t n;
+	/* the size of a bts/pebs record in bytes */
+	size_t size;
+	/* pointers into the raw buffer:
+	   - to the first entry */
+	void *begin;
+	/* - one beyond the last entry */
+	void *end;
+	/* - one beyond the newest entry */
+	void *top;
+	/* - the interrupt threshold */
+	void *ith;
+	/* flags given on ds_request() */
+	unsigned int flags;
+};
 
 
 /*
 /*
- * Provide a pointer to the BTS/PEBS record at parameter index.
- * (assuming an array of BTS/PEBS records)
- *
- * The pointer points directly into the buffer. The user is
- * responsible for copying the record.
- *
- * Returns the size of a single record on success; -Eerrno on error
- *
- * task: the task to access;
- *       NULL to access the current cpu
- * index: the index of the requested record
- * record (out): pointer to the requested record
+ * An arch-independent view on branch trace data.
  */
  */
-extern int ds_access_bts(struct task_struct *task,
-			 size_t index, const void **record);
-extern int ds_access_pebs(struct task_struct *task,
-			  size_t index, const void **record);
+enum bts_qualifier {
+	bts_invalid,
+#define BTS_INVALID bts_invalid
+
+	bts_branch,
+#define BTS_BRANCH bts_branch
+
+	bts_task_arrives,
+#define BTS_TASK_ARRIVES bts_task_arrives
+
+	bts_task_departs,
+#define BTS_TASK_DEPARTS bts_task_departs
+
+	bts_qual_bit_size = 4,
+	bts_qual_max = (1 << bts_qual_bit_size),
+};
+
+struct bts_struct {
+	__u64 qualifier;
+	union {
+		/* BTS_BRANCH */
+		struct {
+			__u64 from;
+			__u64 to;
+		} lbr;
+		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
+		struct {
+			__u64 jiffies;
+			pid_t pid;
+		} timestamp;
+	} variant;
+};
 
 
-/*
- * Write one or more BTS/PEBS records at the write pointer index and
- * advance the write pointer.
- *
- * If size is not a multiple of the record size, trailing bytes are
- * zeroed out.
- *
- * May result in one or more overflow notifications.
- *
- * If called during overflow handling, that is, with index >=
- * interrupt threshold, the write will wrap around.
- *
- * An overflow notification is given if and when the interrupt
- * threshold is reached during or after the write.
- *
- * Returns the number of bytes written or -Eerrno.
- *
- * task: the task to access;
- *       NULL to access the current cpu
- * buffer: the buffer to write
- * size: the size of the buffer
- */
-extern int ds_write_bts(struct task_struct *task,
-			const void *buffer, size_t size);
-extern int ds_write_pebs(struct task_struct *task,
-			 const void *buffer, size_t size);
 
 
 /*
- * Same as ds_write_bts/pebs, but omit ownership checks.
+ * The BTS state.
  *
- * This is needed to have some other task than the owner of the
- * BTS/PEBS buffer or the parameter task itself write into the
- * respective buffer.
+ * This gives access to the raw DS state and adds functions to provide
+ * an arch-independent view of the BTS data.
  */
-extern int ds_unchecked_write_bts(struct task_struct *task,
-				  const void *buffer, size_t size);
-extern int ds_unchecked_write_pebs(struct task_struct *task,
-				   const void *buffer, size_t size);
+struct bts_trace {
+	struct ds_trace ds;
+
+	int (*read)(struct bts_tracer *tracer, const void *at,
+		    struct bts_struct *out);
+	int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
+};
+
 
 
 /*
- * Reset the write pointer of the BTS/PEBS buffer.
+ * The PEBS state.
  *
- * Returns 0 on success; -Eerrno on error
- *
- * task: the task to access;
- *       NULL to access the current cpu
+ * This gives access to the raw DS state and the PEBS-specific counter
+ * reset value.
  */
-extern int ds_reset_bts(struct task_struct *task);
-extern int ds_reset_pebs(struct task_struct *task);
+struct pebs_trace {
+	struct ds_trace ds;
+
+	/* the PEBS reset value */
+	unsigned long long reset_value;
+};
+
 
 
 /*
- * Clear the BTS/PEBS buffer and reset the write pointer.
- * The entire buffer will be zeroed out.
+ * Read the BTS or PEBS trace.
  *
- * Returns 0 on success; -Eerrno on error
+ * Returns a view on the trace collected for the parameter tracer.
+ *
+ * The view remains valid as long as the traced task is not running or
+ * the tracer is suspended.
+ * Writes into the trace buffer are not reflected.
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_clear_bts(struct task_struct *task);
-extern int ds_clear_pebs(struct task_struct *task);
+extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
+extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
+
 
 
 /*
- * Provide the PEBS counter reset value.
+ * Reset the write pointer of the BTS/PEBS buffer.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * value (out): the counter reset value
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
+extern int ds_reset_bts(struct bts_tracer *tracer);
+extern int ds_reset_pebs(struct pebs_tracer *tracer);
 
 
 /*
  * Set the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
 
 
 /*
  * Initialization
@@ -202,39 +247,26 @@ extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
 struct cpuinfo_x86;
 extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
 
 
-
-
 /*
- * The DS context - part of struct thread_struct.
+ * Context switch work
  */
-struct ds_context {
-	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-	unsigned char *ds;
-	/* the owner of the BTS and PEBS configuration, respectively */
-	struct task_struct *owner[2];
-	/* buffer overflow notification function for BTS and PEBS */
-	ds_ovfl_callback_t callback[2];
-	/* the original buffer address */
-	void *buffer[2];
-	/* the number of allocated pages for on-request allocated buffers */
-	unsigned int pages[2];
-	/* use count */
-	unsigned long count;
-	/* a pointer to the context location inside the thread_struct
-	 * or the per_cpu context array */
-	struct ds_context **this;
-	/* a pointer to the task owning this context, or NULL, if the
-	 * context is owned by a cpu */
-	struct task_struct *task;
-};
+extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
 
-/* called by exit_thread() to free leftover contexts */
-extern void ds_free(struct ds_context *context);
+/*
+ * Task clone/init and cleanup work
+ */
+extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
+extern void ds_exit_thread(struct task_struct *tsk);
 
 
 #else /* CONFIG_X86_DS */
 #else /* CONFIG_X86_DS */
 
 
 struct cpuinfo_x86;
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
+static inline void ds_switch_to(struct task_struct *prev,
+				struct task_struct *next) {}
+static inline void ds_copy_thread(struct task_struct *tsk,
+				  struct task_struct *father) {}
+static inline void ds_exit_thread(struct task_struct *tsk) {}
 
 
 #endif /* CONFIG_X86_DS */
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
 #endif /* _ASM_X86_DS_H */
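A minimal usage sketch of the reworked interface above, for orientation. The tracer handle, the request flags, the bts_trace view and its read() hook are taken from the declarations in this header; the function name, buffer handling and error paths are illustrative only.

#include <linux/kernel.h>
#include <linux/err.h>
#include <asm/ds.h>

static void bts_dump_branches(struct task_struct *task, void *buf, size_t size)
{
	struct bts_tracer *tracer;
	const struct bts_trace *trace;
	const unsigned char *at;

	/* cyclic buffer: no overflow callback, no interrupt threshold */
	tracer = ds_request_bts(task, buf, size, NULL, (size_t)-1,
				BTS_USER_FLAGS);
	if (IS_ERR(tracer))
		return;

	trace = ds_read_bts(tracer);
	if (trace && trace->read) {
		for (at = trace->ds.begin; (void *)at < trace->ds.top;
		     at += trace->ds.size) {
			struct bts_struct bts;

			if (trace->read(tracer, at, &bts) < 0)
				break;
			if (bts.qualifier == BTS_BRANCH)
				printk(KERN_DEBUG "branch %llx -> %llx\n",
				       bts.variant.lbr.from,
				       bts.variant.lbr.to);
		}
	}

	ds_release_bts(tracer);
}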

+ 60 - 1
arch/x86/include/asm/ftrace.h

@@ -1,6 +1,33 @@
 #ifndef _ASM_X86_FTRACE_H
 #ifndef _ASM_X86_FTRACE_H
 #define _ASM_X86_FTRACE_H
 #define _ASM_X86_FTRACE_H
 
 
+#ifdef __ASSEMBLY__
+
+	.macro MCOUNT_SAVE_FRAME
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+	.endm
+
+	.macro MCOUNT_RESTORE_FRAME
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+	.endm
+
+#endif
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
 #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
@@ -17,8 +44,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	 */
 	 */
 	return addr - 1;
 	return addr - 1;
 }
 }
-#endif
 
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+	/* No extra data needed for x86 */
+};
+
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 #endif /* CONFIG_FUNCTION_TRACER */
 
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+	unsigned long ret;
+	unsigned long func;
+	unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relies on ftrace_return_to_handler.
+ * Defined in entry_32/64.S
+ */
+extern void return_to_handler(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 #endif /* _ASM_X86_FTRACE_H */
 #endif /* _ASM_X86_FTRACE_H */
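The ftrace_ret_stack entries above are what the stack dumping changes later in this merge consult when they find return_to_handler on a kernel stack. A rough, non-kernel sketch of that substitution, assuming the per-task ret_stack and curr_ret_stack fields used by the function graph tracer:

static unsigned long real_return_address(struct task_struct *task, int depth)
{
	int index;

	if (!task->ret_stack)
		return 0;

	index = task->curr_ret_stack - depth;
	if (index < 0)
		return 0;

	/* the address the traced function will really return to */
	return task->ret_stack[index].ret;
}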

+ 2 - 1
arch/x86/include/asm/msr.h

@@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr,
 	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
 	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
 }
 }
 
 
-static inline int native_write_msr_safe(unsigned int msr,
+/* Can be uninlined because referenced by paravirt */
+notrace static inline int native_write_msr_safe(unsigned int msr,
 					unsigned low, unsigned high)
 					unsigned low, unsigned high)
 {
 {
 	int err;
 	int err;

+ 13 - 0
arch/x86/include/asm/processor.h

@@ -756,6 +756,19 @@ extern void switch_to_new_gdt(void);
 extern void cpu_init(void);
 extern void cpu_init(void);
 extern void init_gdt(int cpu);
 extern void init_gdt(int cpu);
 
 
+static inline unsigned long get_debugctlmsr(void)
+{
+    unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return 0;
+#endif
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+    return debugctlmsr;
+}
+
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
 #ifndef CONFIG_X86_DEBUGCTLMSR
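get_debugctlmsr() is meant to pair with the existing update_debugctlmsr() for read-modify-write access to MSR_IA32_DEBUGCTLMSR. A short sketch of that pattern; the mask parameter stands in for whichever control bit a caller wants to set:

static void debugctl_set_bits(unsigned long mask)
{
	unsigned long debugctl = get_debugctlmsr();

	debugctl |= mask;
	update_debugctlmsr(debugctl);
}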

+ 7 - 36
arch/x86/include/asm/ptrace.h

@@ -6,7 +6,6 @@
 #include <asm/processor-flags.h>
 #include <asm/processor-flags.h>
 
 
 #ifdef __KERNEL__
 #ifdef __KERNEL__
-#include <asm/ds.h>		/* the DS BTS struct is used for ptrace too */
 #include <asm/segment.h>
 #include <asm/segment.h>
 #endif
 #endif
 
 
@@ -128,34 +127,6 @@ struct pt_regs {
 #endif /* !__i386__ */
 #endif /* !__i386__ */
 
 
 
 
-#ifdef CONFIG_X86_PTRACE_BTS
-/* a branch trace record entry
- *
- * In order to unify the interface between various processor versions,
- * we use the below data structure for all processors.
- */
-enum bts_qualifier {
-	BTS_INVALID = 0,
-	BTS_BRANCH,
-	BTS_TASK_ARRIVES,
-	BTS_TASK_DEPARTS
-};
-
-struct bts_struct {
-	__u64 qualifier;
-	union {
-		/* BTS_BRANCH */
-		struct {
-			__u64 from_ip;
-			__u64 to_ip;
-		} lbr;
-		/* BTS_TASK_ARRIVES or
-		   BTS_TASK_DEPARTS */
-		__u64 jiffies;
-	} variant;
-};
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 #ifdef __KERNEL__
 #ifdef __KERNEL__
 
 
 #include <linux/init.h>
 #include <linux/init.h>
@@ -163,13 +134,6 @@ struct bts_struct {
 struct cpuinfo_x86;
 struct cpuinfo_x86;
 struct task_struct;
 struct task_struct;
 
 
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
-extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
-#else
-#define ptrace_bts_init_intel(config) do {} while (0)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 extern unsigned long profile_pc(struct pt_regs *regs);
 extern unsigned long profile_pc(struct pt_regs *regs);
 
 
 extern unsigned long
 extern unsigned long
@@ -271,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 			      struct user_desc __user *info, int can_allocate);
 
 
+extern void x86_ptrace_untrace(struct task_struct *);
+extern void x86_ptrace_fork(struct task_struct *child,
+			    unsigned long clone_flags);
+
+#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
+#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */
 
 
 #endif /* !__ASSEMBLY__ */
 #endif /* !__ASSEMBLY__ */
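arch_ptrace_untrace() and arch_ptrace_fork() are hooks for the generic ptrace and fork paths; their actual call sites (kernel/ptrace.c and kernel/fork.c) are outside this excerpt. An illustrative caller only:

static void stop_hw_tracing_on_detach(struct task_struct *child)
{
	/* tear down per-task hardware tracing state, e.g. a BTS tracer */
	arch_ptrace_untrace(child);
}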

+ 3 - 4
arch/x86/include/asm/thread_info.h

@@ -20,6 +20,8 @@
 struct task_struct;
 struct task_struct;
 struct exec_domain;
 struct exec_domain;
 #include <asm/processor.h>
 #include <asm/processor.h>
+#include <asm/ftrace.h>
+#include <asm/atomic.h>
 
 
 struct thread_info {
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
 	struct task_struct	*task;		/* main task structure */
@@ -91,7 +93,6 @@ struct thread_info {
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
-#define TIF_BTS_TRACE_TS	27      /* record scheduling event timestamps */
 
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -113,7 +114,6 @@ struct thread_info {
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
-#define _TIF_BTS_TRACE_TS	(1 << TIF_BTS_TRACE_TS)
 
 
 /* work to do in syscall_trace_enter() */
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -139,8 +139,7 @@ struct thread_info {
 
 
 /* flags to check in __switch_to() */
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
-								_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
 
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)

+ 1 - 0
arch/x86/kernel/Makefile

@@ -66,6 +66,7 @@ obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o

+ 2 - 1
arch/x86/kernel/apic.c

@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 #include <linux/dmar.h>
+#include <linux/ftrace.h>
 
 
 #include <asm/atomic.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/smp.h>
@@ -790,7 +791,7 @@ static void local_apic_timer_interrupt(void)
  * [ if a single-CPU system runs an SMP kernel then we call the local
  * [ if a single-CPU system runs an SMP kernel then we call the local
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
  */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 {
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
 

+ 5 - 0
arch/x86/kernel/cpu/Makefile

@@ -2,6 +2,11 @@
 # Makefile for x86-compatible CPU details and quirks
 # Makefile for x86-compatible CPU details and quirks
 #
 #
 
 
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+endif
+
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o
 obj-y			+= vmware.o hypervisor.o
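Removing -pg from common.o excludes the whole file from mcount instrumentation. For a single function the same effect can be had with the notrace attribute, as the msr.h change earlier in this merge does for native_write_msr_safe(); a hypothetical example:

/* hypothetical early-boot helper that must not call into the tracer */
static void notrace secondary_cpu_early_setup(void)
{
	/* runs before this CPU's tracing state is usable */
}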

+ 4 - 0
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c

@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 
 #include <linux/acpi.h>
 #include <linux/acpi.h>
 #include <acpi/processor.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	unsigned int i;
 	int result = 0;
 	int result = 0;
+	struct power_trace it;
 
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 		}
 	}
 	}
 
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
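The power tracing calls added here and in process.c further down share one pattern; a condensed sketch with illustrative state values:

#include <linux/ftrace.h>

static void power_trace_example(unsigned int next_perf_state)
{
	struct power_trace it;

	/* mark a one-shot event, e.g. a P-state transition */
	trace_power_mark(&it, POWER_PSTATE, next_perf_state);

	/* bracket a power-saving interval, e.g. entering C1 from idle */
	trace_power_start(&it, POWER_CSTATE, 1);
	/* ... hlt or mwait ... */
	trace_power_end(&it);
}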

+ 0 - 4
arch/x86/kernel/cpu/intel.c

@@ -11,7 +11,6 @@
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/msr.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
-#include <asm/ptrace.h>
 #include <asm/ds.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
 #include <asm/bugs.h>
 
 
@@ -326,9 +325,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_P3);
 		set_cpu_cap(c, X86_FEATURE_P3);
 #endif
 #endif
 
 
-	if (cpu_has_bts)
-		ptrace_bts_init_intel(c);
-
 	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
 	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
 		/*
 		/*
 		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
 		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology

File diff too large to display
+ 590 - 488
arch/x86/kernel/ds.c


+ 33 - 1
arch/x86/kernel/dumpstack.c

@@ -30,6 +30,37 @@ void printk_address(unsigned long address, int reliable)
 			reliable ? "" : "? ", (void *) address);
 			reliable ? "" : "? ", (void *) address);
 }
 }
 
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+			const struct stacktrace_ops *ops,
+			struct thread_info *tinfo, int *graph)
+{
+	struct task_struct *task = tinfo->task;
+	unsigned long ret_addr;
+	int index = task->curr_ret_stack;
+
+	if (addr != (unsigned long)return_to_handler)
+		return;
+
+	if (!task->ret_stack || index < *graph)
+		return;
+
+	index -= *graph;
+	ret_addr = task->ret_stack[index].ret;
+
+	ops->address(data, ret_addr, 1);
+
+	(*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+			const struct stacktrace_ops *ops,
+			struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
 /*
 /*
  * x86-64 can have up to three kernel stacks:
  * x86-64 can have up to three kernel stacks:
  * process stack
  * process stack
@@ -54,7 +85,7 @@ unsigned long
 print_context_stack(struct thread_info *tinfo,
 print_context_stack(struct thread_info *tinfo,
 		unsigned long *stack, unsigned long bp,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data,
 		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end)
+		unsigned long *end, int *graph)
 {
 {
 	struct stack_frame *frame = (struct stack_frame *)bp;
 	struct stack_frame *frame = (struct stack_frame *)bp;
 
 
@@ -70,6 +101,7 @@ print_context_stack(struct thread_info *tinfo,
 			} else {
 			} else {
 				ops->address(data, addr, bp == 0);
 				ops->address(data, addr, bp == 0);
 			}
 			}
+			print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
 		}
 		}
 		stack++;
 		stack++;
 	}
 	}

+ 1 - 1
arch/x86/kernel/dumpstack.h

@@ -18,7 +18,7 @@ extern unsigned long
 print_context_stack(struct thread_info *tinfo,
 print_context_stack(struct thread_info *tinfo,
 		unsigned long *stack, unsigned long bp,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data,
 		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end);
+		unsigned long *end, int *graph);
 
 
 extern void
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,

+ 4 - 1
arch/x86/kernel/dumpstack_32.c

@@ -23,6 +23,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 		const struct stacktrace_ops *ops, void *data)
 {
 {
+	int graph = 0;
+
 	if (!task)
 	if (!task)
 		task = current;
 		task = current;
 
 
@@ -50,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
 
 		context = (struct thread_info *)
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		bp = print_context_stack(context, stack, bp, ops, data, NULL);
+		bp = print_context_stack(context, stack, bp, ops,
+					 data, NULL, &graph);
 
 
 		stack = (unsigned long *)context->previous_esp;
 		stack = (unsigned long *)context->previous_esp;
 		if (!stack)
 		if (!stack)

+ 4 - 3
arch/x86/kernel/dumpstack_64.c

@@ -109,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	unsigned used = 0;
 	struct thread_info *tinfo;
 	struct thread_info *tinfo;
+	int graph = 0;
 
 
 	if (!task)
 	if (!task)
 		task = current;
 		task = current;
@@ -149,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				break;
 				break;
 
 
 			bp = print_context_stack(tinfo, stack, bp, ops,
 			bp = print_context_stack(tinfo, stack, bp, ops,
-							data, estack_end);
+						 data, estack_end, &graph);
 			ops->stack(data, "<EOE>");
 			ops->stack(data, "<EOE>");
 			/*
 			/*
 			 * We link to the next stack via the
 			 * We link to the next stack via the
@@ -168,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				if (ops->stack(data, "IRQ") < 0)
 				if (ops->stack(data, "IRQ") < 0)
 					break;
 					break;
 				bp = print_context_stack(tinfo, stack, bp,
 				bp = print_context_stack(tinfo, stack, bp,
-						ops, data, irqstack_end);
+					ops, data, irqstack_end, &graph);
 				/*
 				/*
 				 * We link to the next stack (which would be
 				 * We link to the next stack (which would be
 				 * the process stack normally) the last
 				 * the process stack normally) the last
@@ -186,7 +187,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	/*
 	/*
 	 * This handles the process stack:
 	 * This handles the process stack:
 	 */
 	 */
-	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
 	put_cpu();
 	put_cpu();
 }
 }
 EXPORT_SYMBOL(dump_trace);
 EXPORT_SYMBOL(dump_trace);

+ 50 - 1
arch/x86/kernel/entry_32.S

@@ -954,6 +954,9 @@ ENTRY(mcount)
 END(mcount)
 END(mcount)
 
 
 ENTRY(ftrace_caller)
 ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	pushl %eax
 	pushl %eax
 	pushl %ecx
 	pushl %ecx
 	pushl %edx
 	pushl %edx
@@ -968,6 +971,11 @@ ftrace_call:
 	popl %edx
 	popl %edx
 	popl %ecx
 	popl %ecx
 	popl %eax
 	popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp ftrace_stub
+#endif
 
 
 .globl ftrace_stub
 .globl ftrace_stub
 ftrace_stub:
 ftrace_stub:
@@ -977,8 +985,18 @@ END(ftrace_caller)
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 
 ENTRY(mcount)
 ENTRY(mcount)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	cmpl $ftrace_stub, ftrace_trace_function
 	cmpl $ftrace_stub, ftrace_trace_function
 	jnz trace
 	jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	cmpl $ftrace_stub, ftrace_graph_return
+	jnz ftrace_graph_caller
+
+	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz ftrace_graph_caller
+#endif
 .globl ftrace_stub
 .globl ftrace_stub
 ftrace_stub:
 ftrace_stub:
 	ret
 	ret
@@ -997,12 +1015,43 @@ trace:
 	popl %edx
 	popl %edx
 	popl %ecx
 	popl %ecx
 	popl %eax
 	popl %eax
-
 	jmp ftrace_stub
 	jmp ftrace_stub
 END(mcount)
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 #endif /* CONFIG_FUNCTION_TRACER */
 
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+	cmpl $0, function_trace_stop
+	jne ftrace_stub
+
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %edx
+	lea 0x4(%ebp), %eax
+	subl $MCOUNT_INSN_SIZE, %edx
+	call prepare_ftrace_return
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+	pushl $0
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	call ftrace_return_to_handler
+	movl %eax, 0xc(%esp)
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+#endif
+
 .section .rodata,"a"
 .section .rodata,"a"
 #include "syscall_table_32.S"
 #include "syscall_table_32.S"
 
 

+ 69 - 29
arch/x86/kernel/entry_64.S

@@ -67,16 +67,10 @@ ENTRY(mcount)
 END(mcount)
 END(mcount)
 
 
 ENTRY(ftrace_caller)
 ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
 
 
-	/* taken from glibc */
-	subq $0x38, %rsp
-	movq %rax, (%rsp)
-	movq %rcx, 8(%rsp)
-	movq %rdx, 16(%rsp)
-	movq %rsi, 24(%rsp)
-	movq %rdi, 32(%rsp)
-	movq %r8, 40(%rsp)
-	movq %r9, 48(%rsp)
+	MCOUNT_SAVE_FRAME
 
 
 	movq 0x38(%rsp), %rdi
 	movq 0x38(%rsp), %rdi
 	movq 8(%rbp), %rsi
 	movq 8(%rbp), %rsi
@@ -86,14 +80,13 @@ ENTRY(ftrace_caller)
 ftrace_call:
 ftrace_call:
 	call ftrace_stub
 	call ftrace_stub
 
 
-	movq 48(%rsp), %r9
-	movq 40(%rsp), %r8
-	movq 32(%rsp), %rdi
-	movq 24(%rsp), %rsi
-	movq 16(%rsp), %rdx
-	movq 8(%rsp), %rcx
-	movq (%rsp), %rax
-	addq $0x38, %rsp
+	MCOUNT_RESTORE_FRAME
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp ftrace_stub
+#endif
 
 
 .globl ftrace_stub
 .globl ftrace_stub
 ftrace_stub:
 ftrace_stub:
@@ -102,15 +95,63 @@ END(ftrace_caller)
 
 
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 ENTRY(mcount)
 ENTRY(mcount)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	cmpq $ftrace_stub, ftrace_trace_function
 	cmpq $ftrace_stub, ftrace_trace_function
 	jnz trace
 	jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	cmpq $ftrace_stub, ftrace_graph_return
+	jnz ftrace_graph_caller
+
+	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz ftrace_graph_caller
+#endif
+
 .globl ftrace_stub
 .globl ftrace_stub
 ftrace_stub:
 ftrace_stub:
 	retq
 	retq
 
 
 trace:
 trace:
-	/* taken from glibc */
-	subq $0x38, %rsp
+	MCOUNT_SAVE_FRAME
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rdi
+
+	call   *ftrace_trace_function
+
+	MCOUNT_RESTORE_FRAME
+
+	jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+	cmpl $0, function_trace_stop
+	jne ftrace_stub
+
+	MCOUNT_SAVE_FRAME
+
+	leaq 8(%rbp), %rdi
+	movq 0x38(%rsp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rsi
+
+	call	prepare_ftrace_return
+
+	MCOUNT_RESTORE_FRAME
+
+	retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+	subq  $80, %rsp
+
 	movq %rax, (%rsp)
 	movq %rax, (%rsp)
 	movq %rcx, 8(%rsp)
 	movq %rcx, 8(%rsp)
 	movq %rdx, 16(%rsp)
 	movq %rdx, 16(%rsp)
@@ -118,13 +159,14 @@ trace:
 	movq %rdi, 32(%rsp)
 	movq %rdi, 32(%rsp)
 	movq %r8, 40(%rsp)
 	movq %r8, 40(%rsp)
 	movq %r9, 48(%rsp)
 	movq %r9, 48(%rsp)
+	movq %r10, 56(%rsp)
+	movq %r11, 64(%rsp)
 
 
-	movq 0x38(%rsp), %rdi
-	movq 8(%rbp), %rsi
-	subq $MCOUNT_INSN_SIZE, %rdi
-
-	call   *ftrace_trace_function
+	call ftrace_return_to_handler
 
 
+	movq %rax, 72(%rsp)
+	movq 64(%rsp), %r11
+	movq 56(%rsp), %r10
 	movq 48(%rsp), %r9
 	movq 48(%rsp), %r9
 	movq 40(%rsp), %r8
 	movq 40(%rsp), %r8
 	movq 32(%rsp), %rdi
 	movq 32(%rsp), %rdi
@@ -132,12 +174,10 @@ trace:
 	movq 16(%rsp), %rdx
 	movq 16(%rsp), %rdx
 	movq 8(%rsp), %rcx
 	movq 8(%rsp), %rcx
 	movq (%rsp), %rax
 	movq (%rsp), %rax
-	addq $0x38, %rsp
+	addq $72, %rsp
+	retq
+#endif
 
 
-	jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
 
 
 #ifndef CONFIG_PREEMPT
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #define retint_kernel retint_restore_args

+ 380 - 10
arch/x86/kernel/ftrace.c

@@ -14,14 +14,17 @@
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/list.h>
 
 
 #include <asm/ftrace.h>
 #include <asm/ftrace.h>
+#include <linux/ftrace.h>
 #include <asm/nops.h>
 #include <asm/nops.h>
+#include <asm/nmi.h>
 
 
 
 
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#ifdef CONFIG_DYNAMIC_FTRACE
 
 
 union ftrace_code_union {
 union ftrace_code_union {
 	char code[MCOUNT_INSN_SIZE];
 	char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
 	} __attribute__((packed));
 	} __attribute__((packed));
 };
 };
 
 
-
 static int ftrace_calc_offset(long ip, long addr)
 static int ftrace_calc_offset(long ip, long addr)
 {
 {
 	return (int)(addr - ip);
 	return (int)(addr - ip);
 }
 }
 
 
-unsigned char *ftrace_nop_replace(void)
-{
-	return ftrace_nop;
-}
-
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
 {
 	static union ftrace_code_union calc;
 	static union ftrace_code_union calc;
 
 
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 	return calc.code;
 	return calc.code;
 }
 }
 
 
-int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUs from executing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ *    and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status;		/* holds return value of text write */
+static int mod_code_write;		/* set when NMI should do the write */
+static void *mod_code_ip;		/* holds the IP to write to */
+static void *mod_code_newcode;		/* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+	int r;
+
+	r = snprintf(buf, size, "%u %u",
+		     nmi_wait_count,
+		     atomic_read(&nmi_update_count));
+	return r;
+}
+
+static void ftrace_mod_code(void)
+{
+	/*
+	 * Yes, more than one CPU process can be writing to mod_code_status.
+	 *    (and the code itself)
+	 * But if one were to fail, then they all should, and if one were
+	 * to succeed, then they all should.
+	 */
+	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+					     MCOUNT_INSN_SIZE);
+}
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+	/* Must have in_nmi seen before reading write flag */
+	smp_mb();
+	if (mod_code_write) {
+		ftrace_mod_code();
+		atomic_inc(&nmi_update_count);
+	}
+}
+
+void ftrace_nmi_exit(void)
+{
+	/* Finish all executions before clearing in_nmi */
+	smp_wmb();
+	atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+	int waited = 0;
+
+	while (atomic_read(&in_nmi)) {
+		waited = 1;
+		cpu_relax();
+	}
+
+	if (waited)
+		nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+	mod_code_ip = (void *)ip;
+	mod_code_newcode = new_code;
+
+	/* The buffers need to be visible before we let NMIs write them */
+	smp_wmb();
+
+	mod_code_write = 1;
+
+	/* Make sure write bit is visible before we wait on NMIs */
+	smp_mb();
+
+	wait_for_nmi();
+
+	/* Make sure all running NMIs have finished before we write the code */
+	smp_mb();
+
+	ftrace_mod_code();
+
+	/* Make sure the write happens before clearing the bit */
+	smp_wmb();
+
+	mod_code_write = 0;
+
+	/* make sure NMIs see the cleared bit */
+	smp_mb();
+
+	wait_for_nmi();
+
+	return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+static unsigned char *ftrace_nop_replace(void)
+{
+	return ftrace_nop;
+}
+
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 		   unsigned char *new_code)
 {
 {
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		return -EINVAL;
 		return -EINVAL;
 
 
 	/* replace the text with the new text */
 	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+	if (do_ftrace_mod_code(ip, new_code))
 		return -EPERM;
 		return -EPERM;
 
 
 	sync_core();
 	sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	return 0;
 	return 0;
 }
 }
 
 
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *new, *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_call_replace(ip, addr);
+	new = ftrace_nop_replace();
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *new, *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_nop_replace();
+	new = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
 int ftrace_update_ftrace_func(ftrace_func_t func)
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 {
 	unsigned long ip = (unsigned long)(&ftrace_call);
 	unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
 
 
 	return 0;
 	return 0;
 }
 }
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+			  int old_offset, int new_offset)
+{
+	unsigned char code[MCOUNT_INSN_SIZE];
+
+	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+		return -EINVAL;
+
+	*(int *)(&code[1]) = new_offset;
+
+	if (do_ftrace_mod_code(ip, &code))
+		return -EPERM;
+
+	return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	int old_offset, new_offset;
+
+	old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+	new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	int old_offset, new_offset;
+
+	old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+	new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+	atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+				unsigned long func, int *depth)
+{
+	int index;
+
+	if (!current->ret_stack)
+		return -EBUSY;
+
+	/* The return trace stack is full */
+	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+		atomic_inc(&current->trace_overrun);
+		return -EBUSY;
+	}
+
+	index = ++current->curr_ret_stack;
+	barrier();
+	current->ret_stack[index].ret = ret;
+	current->ret_stack[index].func = func;
+	current->ret_stack[index].calltime = time;
+	*depth = index;
+
+	return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+	int index;
+
+	index = current->curr_ret_stack;
+
+	if (unlikely(index < 0)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic, otherwise we have nowhere to go */
+		*ret = (unsigned long)panic;
+		return;
+	}
+
+	*ret = current->ret_stack[index].ret;
+	trace->func = current->ret_stack[index].func;
+	trace->calltime = current->ret_stack[index].calltime;
+	trace->overrun = atomic_read(&current->trace_overrun);
+	trace->depth = index;
+	barrier();
+	current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+	struct ftrace_graph_ret trace;
+	unsigned long ret;
+
+	pop_return_trace(&trace, &ret);
+	trace.rettime = cpu_clock(raw_smp_processor_id());
+	ftrace_graph_return(&trace);
+
+	if (unlikely(!ret)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic. What else to do? */
+		ret = (unsigned long)panic;
+	}
+
+	return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+	unsigned long old;
+	unsigned long long calltime;
+	int faulted;
+	struct ftrace_graph_ent trace;
+	unsigned long return_hooker = (unsigned long)
+				&return_to_handler;
+
+	/* Nmi's are currently unsupported */
+	if (unlikely(atomic_read(&in_nmi)))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	/*
+	 * Protect against a fault, even if it shouldn't
+	 * happen. This tool is too intrusive to go
+	 * without such a protection.
+	 */
+	asm volatile(
+		"1: " _ASM_MOV " (%[parent_old]), %[old]\n"
+		"2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
+		"   movl $0, %[faulted]\n"
+
+		".section .fixup, \"ax\"\n"
+		"3: movl $1, %[faulted]\n"
+		".previous\n"
+
+		_ASM_EXTABLE(1b, 3b)
+		_ASM_EXTABLE(2b, 3b)
+
+		: [parent_replaced] "=r" (parent), [old] "=r" (old),
+		  [faulted] "=r" (faulted)
+		: [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+		: "memory"
+	);
+
+	if (unlikely(faulted)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		return;
+	}
+
+	if (unlikely(!__kernel_text_address(old))) {
+		ftrace_graph_stop();
+		*parent = old;
+		WARN_ON(1);
+		return;
+	}
+
+	calltime = cpu_clock(raw_smp_processor_id());
+
+	if (push_return_trace(old, calltime,
+				self_addr, &trace.depth) == -EBUSY) {
+		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
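prepare_ftrace_return() and ftrace_return_to_handler() above feed the ftrace_graph_entry and ftrace_graph_return hooks. A sketch of a consumer, assuming the register_ftrace_graph() interface this series adds in include/linux/ftrace.h; the callback shapes follow the ftrace_graph_ent and ftrace_graph_ret structures used above:

static int example_graph_entry(struct ftrace_graph_ent *trace)
{
	return 1;	/* non-zero: do trace this function */
}

static void example_graph_return(struct ftrace_graph_ret *trace)
{
	printk(KERN_DEBUG "%pF took %llu ns\n",
	       (void *)trace->func, trace->rettime - trace->calltime);
}

static int __init example_graph_init(void)
{
	/* assumed ordering: return callback first, entry callback second */
	return register_ftrace_graph(example_graph_return, example_graph_entry);
}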

+ 2 - 1
arch/x86/kernel/irq_64.c

@@ -13,6 +13,7 @@
 #include <linux/seq_file.h>
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/io_apic.h>
 #include <asm/io_apic.h>
 #include <asm/idle.h>
 #include <asm/idle.h>
@@ -45,7 +46,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
  * SMP cross-CPU interrupts have their own specific
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  * handlers).
  */
  */
-asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 {
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct irq_desc *desc;
 	struct irq_desc *desc;

+ 16 - 0
arch/x86/kernel/process.c

@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/apic.h>
 
 
@@ -102,6 +103,9 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 void default_idle(void)
 {
 {
 	if (hlt_use_halt()) {
 	if (hlt_use_halt()) {
+		struct power_trace it;
+
+		trace_power_start(&it, POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		/*
 		 * TS_POLLING-cleared state must be visible before we
 		 * TS_POLLING-cleared state must be visible before we
@@ -114,6 +118,7 @@ void default_idle(void)
 		else
 		else
 			local_irq_enable();
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
 		current_thread_info()->status |= TS_POLLING;
+		trace_power_end(&it);
 	} else {
 	} else {
 		local_irq_enable();
 		local_irq_enable();
 		/* loop is done by the caller */
 		/* loop is done by the caller */
@@ -171,24 +176,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 	if (!need_resched()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		smp_mb();
 		if (!need_resched())
 		if (!need_resched())
 			__mwait(ax, cx);
 			__mwait(ax, cx);
 	}
 	}
+	trace_power_end(&it);
 }
 }
 
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 static void mwait_idle(void)
 {
 {
+	struct power_trace it;
 	if (!need_resched()) {
 	if (!need_resched()) {
+		trace_power_start(&it, POWER_CSTATE, 1);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		smp_mb();
 		if (!need_resched())
 		if (!need_resched())
 			__sti_mwait(0, 0);
 			__sti_mwait(0, 0);
 		else
 		else
 			local_irq_enable();
 			local_irq_enable();
+		trace_power_end(&it);
 	} else
 	} else
 		local_irq_enable();
 		local_irq_enable();
 }
 }
@@ -200,9 +212,13 @@ static void mwait_idle(void)
  */
  */
 static void poll_idle(void)
 static void poll_idle(void)
 {
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, 0);
 	local_irq_enable();
 	local_irq_enable();
 	while (!need_resched())
 	while (!need_resched())
 		cpu_relax();
 		cpu_relax();
+	trace_power_end(&it);
 }
 }
 
 
 /*
 /*

+ 16 - 51
arch/x86/kernel/process_32.c

@@ -38,6 +38,7 @@
 #include <linux/percpu.h>
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/prctl.h>
 #include <linux/dmi.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
@@ -59,6 +60,7 @@
 #include <asm/idle.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/syscalls.h>
 #include <asm/smp.h>
 #include <asm/smp.h>
+#include <asm/ds.h>
 
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 
@@ -250,14 +252,8 @@ void exit_thread(void)
 		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 		put_cpu();
 	}
 	}
-#ifdef CONFIG_X86_DS
-	/* Free any DS contexts that have not been properly released. */
-	if (unlikely(current->thread.ds_ctx)) {
-		/* we clear debugctl to make sure DS is not used. */
-		update_debugctlmsr(0);
-		ds_free(current->thread.ds_ctx);
-	}
-#endif /* CONFIG_X86_DS */
+
+	ds_exit_thread(current);
 }
 }
 
 
 void flush_thread(void)
 void flush_thread(void)
@@ -339,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		kfree(p->thread.io_bitmap_ptr);
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 		p->thread.io_bitmap_max = 0;
 	}
 	}
+
+	ds_copy_thread(p, current);
+
+	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+	p->thread.debugctlmsr = 0;
+
 	return err;
 	return err;
 }
 }
 
 
@@ -419,48 +421,19 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 	return 0;
 }
 }
 
 
-#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
-			struct thread_struct *next, unsigned long debugctl)
-{
-	unsigned long ds_prev = 0;
-	unsigned long ds_next = 0;
-
-	if (prev->ds_ctx)
-		ds_prev = (unsigned long)prev->ds_ctx->ds;
-	if (next->ds_ctx)
-		ds_next = (unsigned long)next->ds_ctx->ds;
-
-	if (ds_next != ds_prev) {
-		/* we clear debugctl to make sure DS
-		 * is not in use when we change it */
-		debugctl = 0;
-		update_debugctlmsr(0);
-		wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
-	}
-	return debugctl;
-}
-#else
-static int update_debugctl(struct thread_struct *prev,
-			struct thread_struct *next, unsigned long debugctl)
-{
-	return debugctl;
-}
-#endif /* CONFIG_X86_DS */
-
 static noinline void
 static noinline void
 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 struct tss_struct *tss)
 		 struct tss_struct *tss)
 {
 {
 	struct thread_struct *prev, *next;
 	struct thread_struct *prev, *next;
-	unsigned long debugctl;
 
 
 	prev = &prev_p->thread;
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 	next = &next_p->thread;
 
 
-	debugctl = update_debugctl(prev, next, prev->debugctlmsr);
-
-	if (next->debugctlmsr != debugctl)
+	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+		ds_switch_to(prev_p, next_p);
+	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 		update_debugctlmsr(next->debugctlmsr);
 
 
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 			hard_enable_TSC();
 			hard_enable_TSC();
 	}
 	}
 
 
-#ifdef CONFIG_X86_PTRACE_BTS
-	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
-	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
-
-
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
 		/*
 		/*
 		 * Disable the bitmap via an invalid offset. We still cache
 		 * Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +512,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  * for example.
  */
  */
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 {
 	struct thread_struct *prev = &prev_p->thread,
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
 				 *next = &next_p->thread;

+ 16 - 42
arch/x86/kernel/process_64.c

@@ -39,6 +39,7 @@
 #include <linux/prctl.h>
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/io.h>
+#include <linux/ftrace.h>
 
 
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
 #include <asm/system.h>
 #include <asm/system.h>
@@ -52,6 +53,7 @@
 #include <asm/ia32.h>
 #include <asm/ia32.h>
 #include <asm/idle.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/syscalls.h>
+#include <asm/ds.h>
 
 
 asmlinkage extern void ret_from_fork(void);
 asmlinkage extern void ret_from_fork(void);
 
 
@@ -235,14 +237,8 @@ void exit_thread(void)
 		t->io_bitmap_max = 0;
 		t->io_bitmap_max = 0;
 		put_cpu();
 		put_cpu();
 	}
 	}
-#ifdef CONFIG_X86_DS
-	/* Free any DS contexts that have not been properly released. */
-	if (unlikely(t->ds_ctx)) {
-		/* we clear debugctl to make sure DS is not used. */
-		update_debugctlmsr(0);
-		ds_free(t->ds_ctx);
-	}
-#endif /* CONFIG_X86_DS */
+
+	ds_exit_thread(current);
 }
 }
 
 
 void flush_thread(void)
 void flush_thread(void)
@@ -372,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		if (err)
 		if (err)
 			goto out;
 			goto out;
 	}
 	}
+
+	ds_copy_thread(p, me);
+
+	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+	p->thread.debugctlmsr = 0;
+
 	err = 0;
 	err = 0;
 out:
 out:
 	if (err && p->thread.io_bitmap_ptr) {
 	if (err && p->thread.io_bitmap_ptr) {
@@ -470,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 				    struct tss_struct *tss)
 				    struct tss_struct *tss)
 {
 {
 	struct thread_struct *prev, *next;
 	struct thread_struct *prev, *next;
-	unsigned long debugctl;
 
 
 	prev = &prev_p->thread,
 	prev = &prev_p->thread,
 	next = &next_p->thread;
 	next = &next_p->thread;
 
 
-	debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
-	{
-		unsigned long ds_prev = 0, ds_next = 0;
-
-		if (prev->ds_ctx)
-			ds_prev = (unsigned long)prev->ds_ctx->ds;
-		if (next->ds_ctx)
-			ds_next = (unsigned long)next->ds_ctx->ds;
-
-		if (ds_next != ds_prev) {
-			/*
-			 * We clear debugctl to make sure DS
-			 * is not in use when we change it:
-			 */
-			debugctl = 0;
-			update_debugctlmsr(0);
-			wrmsrl(MSR_IA32_DS_AREA, ds_next);
-		}
-	}
-#endif /* CONFIG_X86_DS */
-
-	if (next->debugctlmsr != debugctl)
+	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+		ds_switch_to(prev_p, next_p);
+	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 		update_debugctlmsr(next->debugctlmsr);
 
 
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -533,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 		 */
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
 	}
-
-#ifdef CONFIG_X86_PTRACE_BTS
-	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
-	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
 }
 }
 
 
 /*
 /*
@@ -551,8 +524,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
  * - could test fs/gs bitsliced
  * - could test fs/gs bitsliced
  *
  *
  * Kprobes not supported here. Set the probe on schedule instead.
  * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
  */
  */
-struct task_struct *
+__notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 {
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *prev = &prev_p->thread;

+ 152 - 279
arch/x86/kernel/ptrace.c

@@ -581,158 +581,91 @@ static int ioperm_get(struct task_struct *target,
 }
 }
 
 
 #ifdef CONFIG_X86_PTRACE_BTS
 #ifdef CONFIG_X86_PTRACE_BTS
-/*
- * The configuration for a particular BTS hardware implementation.
- */
-struct bts_configuration {
-	/* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
-	unsigned char  sizeof_bts;
-	/* the size of a field in the BTS record in bytes */
-	unsigned char  sizeof_field;
-	/* a bitmask to enable/disable BTS in DEBUGCTL MSR */
-	unsigned long debugctl_mask;
-};
-static struct bts_configuration bts_cfg;
-
-#define BTS_MAX_RECORD_SIZE (8 * 3)
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- */
-
-enum bts_field {
-	bts_from = 0,
-	bts_to,
-	bts_flags,
-
-	bts_escape = (unsigned long)-1,
-	bts_qual = bts_to,
-	bts_jiffies = bts_flags
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
-	base += (bts_cfg.sizeof_field * field);
-	return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
-{
-	base += (bts_cfg.sizeof_field * field);;
-	(*(unsigned long *)base) = val;
-}
-
-/*
- * Translate a BTS record from the raw format into the bts_struct format
- *
- * out (out): bts_struct interpretation
- * raw: raw BTS record
- */
-static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
-{
-	memset(out, 0, sizeof(*out));
-	if (bts_get(raw, bts_from) == bts_escape) {
-		out->qualifier       = bts_get(raw, bts_qual);
-		out->variant.jiffies = bts_get(raw, bts_jiffies);
-	} else {
-		out->qualifier = BTS_BRANCH;
-		out->variant.lbr.from_ip = bts_get(raw, bts_from);
-		out->variant.lbr.to_ip   = bts_get(raw, bts_to);
-	}
-}
-
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 				  struct bts_struct __user *out)
 {
-	struct bts_struct ret;
-	const void *bts_record;
-	size_t bts_index, bts_end;
+	const struct bts_trace *trace;
+	struct bts_struct bts;
+	const unsigned char *at;
 	int error;
 
-	error = ds_get_bts_end(child, &bts_end);
-	if (error < 0)
-		return error;
-
-	if (bts_end <= index)
-		return -EINVAL;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
 
-	error = ds_get_bts_index(child, &bts_index);
-	if (error < 0)
-		return error;
+	at = trace->ds.top - ((index + 1) * trace->ds.size);
+	if ((void *)at < trace->ds.begin)
+		at += (trace->ds.n * trace->ds.size);
 
 
-	/* translate the ptrace bts index into the ds bts index */
-	bts_index += bts_end - (index + 1);
-	if (bts_end <= bts_index)
-		bts_index -= bts_end;
+	if (!trace->read)
+		return -EOPNOTSUPP;
 
 
-	error = ds_access_bts(child, bts_index, &bts_record);
+	error = trace->read(child->bts, at, &bts);
 	if (error < 0)
 		return error;
 
 
-	ptrace_bts_translate_record(&ret, bts_record);
-
-	if (copy_to_user(out, &ret, sizeof(ret)))
+	if (copy_to_user(out, &bts, sizeof(bts)))
 		return -EFAULT;
 
-	return sizeof(ret);
+	return sizeof(bts);
 }
 
 static int ptrace_bts_drain(struct task_struct *child,
 			    long size,
 			    struct bts_struct __user *out)
 {
-	struct bts_struct ret;
-	const unsigned char *raw;
-	size_t end, i;
-	int error;
+	const struct bts_trace *trace;
+	const unsigned char *at;
+	int error, drained = 0;
 
 
-	error = ds_get_bts_index(child, &end);
-	if (error < 0)
-		return error;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
 
-	if (size < (end * sizeof(struct bts_struct)))
+	if (!trace->read)
+		return -EOPNOTSUPP;
+
+	if (size < (trace->ds.top - trace->ds.begin))
 		return -EIO;
 
-	error = ds_access_bts(child, 0, (const void **)&raw);
-	if (error < 0)
-		return error;
+	for (at = trace->ds.begin; (void *)at < trace->ds.top;
+	     out++, drained++, at += trace->ds.size) {
+		struct bts_struct bts;
+		int error;
 
 
-	for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
-		ptrace_bts_translate_record(&ret, raw);
+		error = trace->read(child->bts, at, &bts);
+		if (error < 0)
+			return error;
 
 
-		if (copy_to_user(out, &ret, sizeof(ret)))
+		if (copy_to_user(out, &bts, sizeof(bts)))
 			return -EFAULT;
 	}
 
 
-	error = ds_clear_bts(child);
+	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
+
+	error = ds_reset_bts(child->bts);
 	if (error < 0)
 		return error;
 
 
-	return end;
+	return drained;
 }
 
-static void ptrace_bts_ovfl(struct task_struct *child)
+static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
 {
-	send_sig(child->thread.bts_ovfl_signal, child, 0);
+	child->bts_buffer = alloc_locked_buffer(size);
+	if (!child->bts_buffer)
+		return -ENOMEM;
+
+	child->bts_size = size;
+
+	return 0;
+}
+
+static void ptrace_bts_free_buffer(struct task_struct *child)
+{
+	free_locked_buffer(child->bts_buffer, child->bts_size);
+	child->bts_buffer = NULL;
+	child->bts_size = 0;
 }
 
 static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +673,86 @@ static int ptrace_bts_config(struct task_struct *child,
 			     const struct ptrace_bts_config __user *ucfg)
 {
 	struct ptrace_bts_config cfg;
-	int error = 0;
-
-	error = -EOPNOTSUPP;
-	if (!bts_cfg.sizeof_bts)
-		goto errout;
+	unsigned int flags = 0;
 
 
-	error = -EIO;
 	if (cfg_size < sizeof(cfg))
-		goto errout;
+		return -EIO;
 
 
-	error = -EFAULT;
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-		goto errout;
+		return -EFAULT;
 
 
-	error = -EINVAL;
-	if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
-	    !(cfg.flags & PTRACE_BTS_O_ALLOC))
-		goto errout;
+	if (child->bts) {
+		ds_release_bts(child->bts);
+		child->bts = NULL;
+	}
 
 
-	if (cfg.flags & PTRACE_BTS_O_ALLOC) {
-		ds_ovfl_callback_t ovfl = NULL;
-		unsigned int sig = 0;
+	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+		if (!cfg.signal)
+			return -EINVAL;
 
 
-		/* we ignore the error in case we were not tracing child */
-		(void)ds_release_bts(child);
+		return -EOPNOTSUPP;
 
 
-		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
-			if (!cfg.signal)
-				goto errout;
+		child->thread.bts_ovfl_signal = cfg.signal;
+	}
 
 
-			sig  = cfg.signal;
-			ovfl = ptrace_bts_ovfl;
-		}
+	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
+	    (cfg.size != child->bts_size)) {
+		int error;
 
 
-		error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
-		if (error < 0)
-			goto errout;
+		ptrace_bts_free_buffer(child);
 
 
-		child->thread.bts_ovfl_signal = sig;
+		error = ptrace_bts_allocate_buffer(child, cfg.size);
+		if (error < 0)
+			return error;
 	}
 
-	error = -EINVAL;
-	if (!child->thread.ds_ctx && cfg.flags)
-		goto errout;
-
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
-		child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
-	else
-		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+		flags |= BTS_USER;
 
 
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
-		set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-	else
-		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+		flags |= BTS_TIMESTAMPS;
 
 
-	error = sizeof(cfg);
+	child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
+				    /* ovfl = */ NULL, /* th = */ (size_t)-1,
+				    flags);
+	if (IS_ERR(child->bts)) {
+		int error = PTR_ERR(child->bts);
 
 
-out:
-	if (child->thread.debugctlmsr)
-		set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-	else
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+		ptrace_bts_free_buffer(child);
+		child->bts = NULL;
 
 
-	return error;
+		return error;
+	}
 
 
-errout:
-	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-	goto out;
+	return sizeof(cfg);
 }
 
 static int ptrace_bts_status(struct task_struct *child,
 			     long cfg_size,
 			     struct ptrace_bts_config __user *ucfg)
 {
+	const struct bts_trace *trace;
 	struct ptrace_bts_config cfg;
-	size_t end;
-	const void *base, *max;
-	int error;
 
 
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
 
-	error = ds_get_bts_end(child, &end);
-	if (error < 0)
-		return error;
-
-	error = ds_access_bts(child, /* index = */ 0, &base);
-	if (error < 0)
-		return error;
-
-	error = ds_access_bts(child, /* index = */ end, &max);
-	if (error < 0)
-		return error;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
 
 	memset(&cfg, 0, sizeof(cfg));
-	cfg.size = (max - base);
+	cfg.size = trace->ds.end - trace->ds.begin;
 	cfg.signal = child->thread.bts_ovfl_signal;
 	cfg.bts_size = sizeof(struct bts_struct);
 
 
 	if (cfg.signal)
 		cfg.flags |= PTRACE_BTS_O_SIGNAL;
 
 
-	if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
-	    child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+	if (trace->ds.flags & BTS_USER)
 		cfg.flags |= PTRACE_BTS_O_TRACE;
 
-	if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+	if (trace->ds.flags & BTS_TIMESTAMPS)
 		cfg.flags |= PTRACE_BTS_O_SCHED;
 
 	if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,109 +761,77 @@ static int ptrace_bts_status(struct task_struct *child,
 	return sizeof(cfg);
 }
 
 
-static int ptrace_bts_write_record(struct task_struct *child,
-				   const struct bts_struct *in)
+static int ptrace_bts_clear(struct task_struct *child)
 {
-	unsigned char bts_record[BTS_MAX_RECORD_SIZE];
+	const struct bts_trace *trace;
 
 
-	BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
 
-	memset(bts_record, 0, bts_cfg.sizeof_bts);
-	switch (in->qualifier) {
-	case BTS_INVALID:
-		break;
+	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
 
-	case BTS_BRANCH:
-		bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
-		bts_set(bts_record, bts_to,   in->variant.lbr.to_ip);
-		break;
+	return ds_reset_bts(child->bts);
+}
 
 
-	case BTS_TASK_ARRIVES:
-	case BTS_TASK_DEPARTS:
-		bts_set(bts_record, bts_from,    bts_escape);
-		bts_set(bts_record, bts_qual,    in->qualifier);
-		bts_set(bts_record, bts_jiffies, in->variant.jiffies);
-		break;
+static int ptrace_bts_size(struct task_struct *child)
+{
+	const struct bts_trace *trace;
 
 
-	default:
-		return -EINVAL;
-	}
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
 
-	/* The writing task will be the switched-to task on a context
-	 * switch. It needs to write into the switched-from task's BTS
-	 * buffer. */
-	return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-void ptrace_bts_take_timestamp(struct task_struct *tsk,
-			       enum bts_qualifier qualifier)
+static void ptrace_bts_fork(struct task_struct *tsk)
 {
-	struct bts_struct rec = {
-		.qualifier = qualifier,
-		.variant.jiffies = jiffies_64
-	};
-
-	ptrace_bts_write_record(tsk, &rec);
+	tsk->bts = NULL;
+	tsk->bts_buffer = NULL;
+	tsk->bts_size = 0;
+	tsk->thread.bts_ovfl_signal = 0;
 }
 
-static const struct bts_configuration bts_cfg_netburst = {
-	.sizeof_bts    = sizeof(long) * 3,
-	.sizeof_field  = sizeof(long),
-	.debugctl_mask = (1<<2)|(1<<3)|(1<<5)
-};
+static void ptrace_bts_untrace(struct task_struct *child)
+{
+	if (unlikely(child->bts)) {
+		ds_release_bts(child->bts);
+		child->bts = NULL;
+
+		/* We cannot update total_vm and locked_vm since
+		   child's mm is already gone. But we can reclaim the
+		   memory. */
+		kfree(child->bts_buffer);
+		child->bts_buffer = NULL;
+		child->bts_size = 0;
+	}
+}
 
 
-static const struct bts_configuration bts_cfg_pentium_m = {
-	.sizeof_bts    = sizeof(long) * 3,
-	.sizeof_field  = sizeof(long),
-	.debugctl_mask = (1<<6)|(1<<7)
-};
+static void ptrace_bts_detach(struct task_struct *child)
+{
+	if (unlikely(child->bts)) {
+		ds_release_bts(child->bts);
+		child->bts = NULL;
 
 
-static const struct bts_configuration bts_cfg_core2 = {
-	.sizeof_bts    = 8 * 3,
-	.sizeof_field  = 8,
-	.debugctl_mask = (1<<6)|(1<<7)|(1<<9)
-};
+		ptrace_bts_free_buffer(child);
+	}
+}
+#else
+static inline void ptrace_bts_fork(struct task_struct *tsk) {}
+static inline void ptrace_bts_detach(struct task_struct *child) {}
+static inline void ptrace_bts_untrace(struct task_struct *child) {}
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 
-static inline void bts_configure(const struct bts_configuration *cfg)
+void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
 {
-	bts_cfg = *cfg;
+	ptrace_bts_fork(child);
 }
 
 
-void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+void x86_ptrace_untrace(struct task_struct *child)
 {
-	switch (c->x86) {
-	case 0x6:
-		switch (c->x86_model) {
-		case 0 ... 0xC:
-			/* sorry, don't know about them */
-			break;
-		case 0xD:
-		case 0xE: /* Pentium M */
-			bts_configure(&bts_cfg_pentium_m);
-			break;
-		default: /* Core2, Atom, ... */
-			bts_configure(&bts_cfg_core2);
-			break;
-		}
-		break;
-	case 0xF:
-		switch (c->x86_model) {
-		case 0x0:
-		case 0x1:
-		case 0x2: /* Netburst */
-			bts_configure(&bts_cfg_netburst);
-			break;
-		default:
-			/* sorry, don't know about them */
-			break;
-		}
-		break;
-	default:
-		/* sorry, don't know about them */
-		break;
-	}
+	ptrace_bts_untrace(child);
 }
-#endif /* CONFIG_X86_PTRACE_BTS */
 
 
 /*
  * Called by kernel/ptrace.c when detaching..
@@ -971,15 +844,7 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-#ifdef CONFIG_X86_PTRACE_BTS
-	(void)ds_release_bts(child);
-
-	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-	if (!child->thread.debugctlmsr)
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
-	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-#endif /* CONFIG_X86_PTRACE_BTS */
+	ptrace_bts_detach(child);
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1111,7 +976,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	case PTRACE_BTS_SIZE:
-		ret = ds_get_bts_index(child, /* pos = */ NULL);
+		ret = ptrace_bts_size(child);
 		break;
 
 	case PTRACE_BTS_GET:
@@ -1120,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	case PTRACE_BTS_CLEAR:
-		ret = ds_clear_bts(child);
+		ret = ptrace_bts_clear(child);
 		break;
 
 	case PTRACE_BTS_DRAIN:
@@ -1383,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 
 	case PTRACE_GET_THREAD_AREA:
 	case PTRACE_SET_THREAD_AREA:
+#ifdef CONFIG_X86_PTRACE_BTS
+	case PTRACE_BTS_CONFIG:
+	case PTRACE_BTS_STATUS:
+	case PTRACE_BTS_SIZE:
+	case PTRACE_BTS_GET:
+	case PTRACE_BTS_CLEAR:
+	case PTRACE_BTS_DRAIN:
+#endif /* CONFIG_X86_PTRACE_BTS */
 		return arch_ptrace(child, request, addr, data);
 
 	default:

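The rewritten ptrace BTS helpers treat child->bts as a circular buffer described by trace->ds: begin/end bound the buffer, top points one slot past the newest record, and n/size give the slot count and record size. The index-to-address arithmetic used by ptrace_bts_read_record(), shown stand-alone with made-up types (a sketch, not kernel code):

struct ds_view {
        unsigned char *begin;   /* first record slot */
        unsigned char *top;     /* one past the newest record */
        size_t n;               /* number of slots */
        size_t size;            /* bytes per record */
};

/* Address of the index-th most recent record (index 0 = newest). */
static unsigned char *bts_record_addr(const struct ds_view *ds, size_t index)
{
        unsigned char *at = ds->top - ((index + 1) * ds->size);

        if (at < ds->begin)             /* wrap around the ring */
                at += ds->n * ds->size;
        return at;
}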
+ 1 - 1
arch/x86/kernel/smpboot.c

@@ -288,7 +288,7 @@ static int __cpuinitdata unsafe_smp;
 /*
  * Activate a secondary processor.
  */
-static void __cpuinit start_secondary(void *unused)
+notrace static void __cpuinit start_secondary(void *unused)
 {
 	/*
 	 * Don't put *anything* before cpu_init(), SMP booting is too

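Marking start_secondary() notrace keeps the earliest part of secondary-CPU bring-up out of the function tracer. The same annotation works for any function that must never call back into the tracer; a hedged, generic example (the helper name is invented):

/* Illustrative only: exempt a fragile early helper from mcount tracing. */
static notrace void early_percpu_setup(void)
{
        /* runs before this CPU can safely enter the tracer */
}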
+ 64 - 0
arch/x86/kernel/stacktrace.c

@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 
 static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+	const struct pt_regs *regs = task_pt_regs(current);
+	const void __user *fp = (const void __user *)regs->bp;
+
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = regs->ip;
+
+	while (trace->nr_entries < trace->max_entries) {
+		struct stack_frame frame;
+
+		frame.next_fp = NULL;
+		frame.ret_addr = 0;
+		if (!copy_stack_frame(fp, &frame))
+			break;
+		if ((unsigned long)fp < regs->sp)
+			break;
+		if (frame.ret_addr) {
+			trace->entries[trace->nr_entries++] =
+				frame.ret_addr;
+		}
+		if (fp == frame.next_fp)
+			break;
+		fp = frame.next_fp;
+	}
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+	/*
+	 * Trace user stack if we are not a kernel thread
+	 */
+	if (current->mm) {
+		__save_stack_trace_user(trace);
+	}
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+

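save_stack_trace_user() walks the saved user-mode frame pointers of current, so it is only meaningful from task context with a live mm and on kernels built with user stack trace support. A minimal, illustrative caller (the depth and surrounding function are hypothetical):

#include <linux/stacktrace.h>

#define DEPTH 32                        /* hypothetical depth */

static void sample_user_stack(void)
{
        unsigned long entries[DEPTH];
        struct stack_trace trace = {
                .nr_entries  = 0,
                .max_entries = DEPTH,
                .entries     = entries,
        };

        save_stack_trace_user(&trace);
        /*
         * entries[0..trace.nr_entries-1] now holds user return addresses,
         * with a ULONG_MAX sentinel appended when there was room for it.
         */
}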
+ 1 - 0
arch/x86/kernel/vmlinux_32.lds.S

@@ -44,6 +44,7 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 	KPROBES_TEXT
+	IRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
   	_etext = .;			/* End of text section */

+ 1 - 0
arch/x86/kernel/vmlinux_64.lds.S

@@ -35,6 +35,7 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 	KPROBES_TEXT
+	IRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	_etext = .;		/* End of text section */

+ 3 - 0
arch/x86/kernel/vsyscall_64.c

@@ -17,6 +17,9 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>

+ 1 - 2
arch/x86/mm/Makefile

@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP)	+= dump_pagetables.o
 
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
-obj-$(CONFIG_MMIOTRACE_HOOKS)	+= kmmio.o
 obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
-mmiotrace-y			:= pf_in.o mmio-mod.o
+mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 
 obj-$(CONFIG_NUMA)		+= numa_$(BITS).o

+ 1 - 1
arch/x86/mm/fault.c

@@ -53,7 +53,7 @@
 
 
 static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
 	if (unlikely(is_kmmio_active()))
 		if (kmmio_handler(regs, addr) == 1)
 			return -1;

+ 3 - 0
arch/x86/vdso/vclock_gettime.c

@@ -9,6 +9,9 @@
  * Also alternative() doesn't work.
  */
 
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
 #include <linux/time.h>

+ 1 - 0
block/Kconfig

@@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select TRACEPOINTS
 	help
 	  Say Y here if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening

+ 27 - 19
block/blk-core.c

@@ -28,9 +28,23 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
+#include <trace/block.h>
 
 
 #include "blk.h"
 
+DEFINE_TRACE(block_plug);
+DEFINE_TRACE(block_unplug_io);
+DEFINE_TRACE(block_unplug_timer);
+DEFINE_TRACE(block_getrq);
+DEFINE_TRACE(block_sleeprq);
+DEFINE_TRACE(block_rq_requeue);
+DEFINE_TRACE(block_bio_backmerge);
+DEFINE_TRACE(block_bio_frontmerge);
+DEFINE_TRACE(block_bio_queue);
+DEFINE_TRACE(block_rq_complete);
+DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+
 static int __make_request(struct request_queue *q, struct bio *bio);
 
 /*
@@ -205,7 +219,7 @@ void blk_plug_device(struct request_queue *q)
 
 
 	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_block_plug(q);
 	}
 }
 EXPORT_SYMBOL(blk_plug_device);
@@ -292,9 +306,7 @@ void blk_unplug_work(struct work_struct *work)
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-				q->rq.count[READ] + q->rq.count[WRITE]);
-
+	trace_block_unplug_io(q);
 	q->unplug_fn(q);
 }
 
 
@@ -302,9 +314,7 @@ void blk_unplug_timeout(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *)data;
 
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
-				q->rq.count[READ] + q->rq.count[WRITE]);
-
+	trace_block_unplug_timer(q);
 	kblockd_schedule_work(q, &q->unplug_work);
 }
 
 
@@ -314,9 +324,7 @@ void blk_unplug(struct request_queue *q)
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-					q->rq.count[READ] + q->rq.count[WRITE]);
-
+		trace_block_unplug_io(q);
 		q->unplug_fn(q);
 	}
 }
@@ -822,7 +830,7 @@ rq_starved:
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_block_getrq(q, bio, rw);
 out:
 	return rq;
 }
@@ -848,7 +856,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
 				TASK_UNINTERRUPTIBLE);
 
 
-		blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+		trace_block_sleeprq(q, bio, rw);
 
 
 		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
@@ -928,7 +936,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_block_rq_requeue(q, rq);
 
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -1167,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!ll_back_merge_fn(q, req, bio))
 			break;
 
 
-		blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+		trace_block_bio_backmerge(q, bio);
 
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
@@ -1186,7 +1194,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!ll_front_merge_fn(q, req, bio))
 			break;
 
 
-		blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+		trace_block_bio_frontmerge(q, bio);
 
 
 		bio->bi_next = req->bio;
 		req->bio = bio;
@@ -1269,7 +1277,7 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
 
-		blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
+		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
 				    bdev->bd_dev, bio->bi_sector,
 				    bio->bi_sector - p->start_sect);
 	}
@@ -1441,10 +1449,10 @@ end_io:
 			goto end_io;
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+			trace_block_remap(q, bio, old_dev, bio->bi_sector,
 					    old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_block_bio_queue(q, bio);
 
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -1678,7 +1686,7 @@ static int __end_that_request_first(struct request *req, int error,
 	int total_bytes, bio_nbytes, next_idx = 0;
 	struct bio *bio;
 
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_block_rq_complete(req->q, req);
 
 
 	/*
 	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual

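The DEFINE_TRACE()/trace_block_*() pairs above follow the standard tracepoint recipe: declare the tracepoint and its probe signature once in a header (trace/block.h here), instantiate it with exactly one DEFINE_TRACE() in a .c file, fire it with trace_<name>() at the instrumentation site, and consume it with register_trace_<name>(). A condensed example of the pattern with a hypothetical tracepoint (the names are made up; this is not one of the block tracepoints):

/* in a header */
DECLARE_TRACE(myevent,
        TPPROTO(struct request_queue *q),
        TPARGS(q));

/* in exactly one .c file */
DEFINE_TRACE(myevent);

/* instrumentation site: a no-op until a probe is registered */
void instrumented_path(struct request_queue *q)
{
        trace_myevent(q);
}

/* consumer side */
static void myevent_probe(struct request_queue *q)
{
        /* record whatever is interesting about q */
}

static int consumer_attach(void)
{
        return register_trace_myevent(myevent_probe);
}

static void consumer_detach(void)
{
        unregister_trace_myevent(myevent_probe);
        tracepoint_synchronize_unregister();    /* wait for in-flight probes */
}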
+ 329 - 3
block/blktrace.c

@@ -23,10 +23,18 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <trace/block.h>
 #include <asm/uaccess.h>
 
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
 /*
  * Send out a notify message.
  */
@@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
  * The worker for the various blk_add_trace*() types. Fills out a
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
-void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 {
 	struct task_struct *tsk = current;
@@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	local_irq_restore(flags);
 }
 
 
-EXPORT_SYMBOL_GPL(__blk_add_trace);
-
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
@@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
 
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1) {
+		ret = blk_register_tracepoints();
+		if (ret)
+			goto probe_err;
+	}
+	mutex_unlock(&blk_probe_mutex);
+
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt) {
@@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 
 	return 0;
+probe_err:
+	atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
 err:
 	if (dir)
 		blk_remove_tree(dir);
@@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q)
 		blk_trace_remove(q);
 	}
 }
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+				sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				rw, what, rq->errors, 0, NULL);
+	}
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+	}
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
+	}
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt)
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+				unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:		queue the io is for
+ * @rq:		io request
+ * @data:	driver-specific data
+ * @len:	length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+			 struct request *rq,
+			 void *data, size_t len)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq))
+		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+				rq->errors, len, data);
+	else
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static int blk_register_tracepoints(void)
+{
+	int ret;
+
+	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+	WARN_ON(ret);
+	ret = register_trace_block_getrq(blk_add_trace_getrq);
+	WARN_ON(ret);
+	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+	WARN_ON(ret);
+	ret = register_trace_block_plug(blk_add_trace_plug);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+	WARN_ON(ret);
+	ret = register_trace_block_split(blk_add_trace_split);
+	WARN_ON(ret);
+	ret = register_trace_block_remap(blk_add_trace_remap);
+	WARN_ON(ret);
+	return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+	unregister_trace_block_remap(blk_add_trace_remap);
+	unregister_trace_block_split(blk_add_trace_split);
+	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	unregister_trace_block_plug(blk_add_trace_plug);
+	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+	unregister_trace_block_getrq(blk_add_trace_getrq);
+	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+	tracepoint_synchronize_unregister();
+}

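blk_add_driver_data() is now an exported function rather than a header inline, so low-level drivers can still attach per-request binary payloads to the trace stream. A hedged usage sketch (the driver, its payload structure, and the values are invented):

static void mydrv_request_done(struct request *rq)
{
        struct mydrv_trace_blob {
                u32 retries;
                u32 fw_status;
        } blob = {
                .retries   = 1,         /* made-up values */
                .fw_status = 0,
        };

        blk_add_driver_data(rq->q, rq, &blob, sizeof(blob));
}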
+ 9 - 3
block/elevator.c

@@ -33,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
 
@@ -41,6 +42,8 @@
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
 
+DEFINE_TRACE(block_rq_abort);
+
 /*
  * Merge hash stuff.
  */
@@ -52,6 +55,9 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
 
+DEFINE_TRACE(block_rq_insert);
+DEFINE_TRACE(block_rq_issue);
+
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
@@ -586,7 +592,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 	unsigned ordseq;
 	int unplug_it = 1;
 
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_block_rq_insert(q, rq);
 
 
 	rq->q = q;
 
@@ -772,7 +778,7 @@ struct request *elv_next_request(struct request_queue *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_block_rq_issue(q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -914,7 +920,7 @@ void elv_abort_queue(struct request_queue *q)
 	while (!list_empty(&q->queue_head)) {
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
-		blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+		trace_block_rq_abort(q, rq);
 		__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
 	}
 }

+ 17 - 1
drivers/char/sysrq.c

@@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
 	.enable_mask	= SYSRQ_ENABLE_DUMP,
 };
 
 
+#ifdef CONFIG_TRACING
+#include <linux/ftrace.h>
+
+static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
+{
+	ftrace_dump();
+}
+static struct sysrq_key_op sysrq_ftrace_dump_op = {
+	.handler	= sysrq_ftrace_dump,
+	.help_msg	= "dumpZ-ftrace-buffer",
+	.action_msg	= "Dump ftrace buffer",
+	.enable_mask	= SYSRQ_ENABLE_DUMP,
+};
+#else
+#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0)
+#endif
 
 
 static void sysrq_handle_showmem(int key, struct tty_struct *tty)
 {
@@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
 	NULL,				/* x */
 	/* y: May be registered on sparc64 for global register dump */
 	NULL,				/* y */
-	NULL				/* z */
+	&sysrq_ftrace_dump_op,		/* z */
 };
 
 /* key2index calculation, -1 on invalid index */

+ 5 - 3
drivers/md/dm.c

@@ -21,6 +21,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 
 
 #define DM_MSG_PREFIX "core"
 
@@ -51,6 +52,8 @@ struct dm_target_io {
 	union map_info info;
 };
 
 
+DEFINE_TRACE(block_bio_complete);
+
 union map_info *dm_get_mapinfo(struct bio *bio)
 {
 	if (bio && bio->bi_private)
@@ -504,8 +507,7 @@ static void dec_pending(struct dm_io *io, int error)
 		end_io_acct(io);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_block_bio_complete(io->md->queue, io->bio);
 
 
 			bio_endio(io->bio, io->error);
 		}
@@ -598,7 +600,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
 
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
 				    tio->io->bio->bi_bdev->bd_dev,
 				    clone->bi_sector, sector);
 
 

+ 4 - 1
fs/bio.c

@@ -26,8 +26,11 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
+DEFINE_TRACE(block_split);
+
 static struct kmem_cache *bio_slab __read_mostly;
 
 static mempool_t *bio_split_pool __read_mostly;
@@ -1263,7 +1266,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	if (!bp)
 		return bp;
 
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
 				bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);

+ 13 - 1
fs/seq_file.c

@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
 }
 EXPORT_SYMBOL(seq_printf);
 
 
-static char *mangle_path(char *s, char *p, char *esc)
+/**
+ *	mangle_path -	mangle and copy path to buffer beginning
+ *	@s: buffer start
+ *	@p: beginning of path in above buffer
+ *	@esc: set of characters that need escaping
+ *
+ *      Copy the path from @p to @s, replacing each occurrence of character from
+ *      @esc with usual octal escape.
+ *      Returns pointer past last written character in @s, or NULL in case of
+ *      failure.
+ */
+char *mangle_path(char *s, char *p, char *esc)
 {
 	while (s <= p) {
 		char c = *p++;
@@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(mangle_path);
 
 
 /*
  * return the absolute path of 'dentry' residing in mount 'mnt'.

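mangle_path() is exported here so callers outside seq_file.c (the tracing code wants it for printing mount paths) can reuse the escaping helper. It copies the path forward from @p to @s within the same buffer, turning each character found in @esc into a three-digit octal escape, and returns a pointer just past the output; no NUL is written. A small, hedged illustration of the behaviour (buffer layout and path are invented):

        char buf[64];
        char *path = buf + 32;          /* pretend the path was assembled here */
        char *end;

        strcpy(path, "/mnt/usb disk");  /* contains a space */
        end = mangle_path(buf, path, " \t\n\\");
        /* on success, buf now begins with "/mnt/usb\040disk" and end points past it */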
+ 30 - 1
include/asm-generic/vmlinux.lds.h

@@ -45,6 +45,22 @@
 #define MCOUNT_REC()
 #endif
 
 
+#ifdef CONFIG_TRACE_BRANCH_PROFILING
+#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \
+				*(_ftrace_annotated_branch)			      \
+				VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+#define BRANCH_PROFILE()	VMLINUX_SYMBOL(__start_branch_profile) = .;   \
+				*(_ftrace_branch)			      \
+				VMLINUX_SYMBOL(__stop_branch_profile) = .;
+#else
+#define BRANCH_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -60,9 +76,12 @@
 	VMLINUX_SYMBOL(__start___markers) = .;				\
 	*(__markers)							\
 	VMLINUX_SYMBOL(__stop___markers) = .;				\
+	. = ALIGN(32);							\
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
-	VMLINUX_SYMBOL(__stop___tracepoints) = .;
+	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
+	LIKELY_PROFILE()		       				\
+	BRANCH_PROFILE()
 
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
@@ -269,6 +288,16 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#define IRQENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__irqentry_text_start) = .;		\
+		*(.irqentry.text)					\
+		VMLINUX_SYMBOL(__irqentry_text_end) = .;
+#else
+#define IRQENTRY_TEXT
+#endif
+
 /* Section used for early init (in .S files) */
 #define HEAD_TEXT  *(.head.text)
 
 

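IRQENTRY_TEXT gathers .irqentry.text between __irqentry_text_start and __irqentry_text_end (only when the function graph tracer is enabled) so the tracer can recognise return addresses that belong to interrupt entry paths. A classification check against the new section bounds looks roughly like this (sketch; the real helper lives in the tracing code, not in this header):

extern char __irqentry_text_start[];
extern char __irqentry_text_end[];

/* True if ip falls inside the .irqentry.text section. */
static inline int in_irqentry_text(unsigned long ip)
{
        return ip >= (unsigned long)__irqentry_text_start &&
               ip <  (unsigned long)__irqentry_text_end;
}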
+ 3 - 169
include/linux/blktrace_api.h

@@ -160,7 +160,6 @@ struct blk_trace {
 
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 extern int do_blk_trace_setup(struct request_queue *q,
 	char *name, dev_t dev, struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
@@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 	} while (0)
 #define BLK_TN_MAX_MSG		128
 
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_discard_rq(rq))
-		rw |= (1 << BIO_RW_DISCARD);
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
-/**
- * blk_add_driver_data - Add binary message with driver-specific data
- * @q:		queue the io is for
- * @rq:		io request
- * @data:	driver-specific data
- * @len:	length of driver-specific data
- *
- * Description:
- *     Some drivers might want to write driver-specific data per request.
- *
- **/
-static inline void blk_add_driver_data(struct request_queue *q,
-				       struct request *rq,
-				       void *data, size_t len)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq))
-		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-				rq->errors, len, data);
-	else
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-				0, BLK_TA_DRV_DATA, rq->errors, len, data);
-}
-
+extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
+				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q);
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
-#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
 #define do_blk_trace_setup(q, name, dev, buts)	(-ENOTTY)
+#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
 #define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
 #define blk_trace_startstop(q, start)		(-ENOTTY)
 #define blk_trace_remove(q)			(-ENOTTY)

+ 82 - 2
include/linux/compiler.h

@@ -59,8 +59,88 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * specific implementations come from the above header files
  */
 
 
-#define likely(x)	__builtin_expect(!!(x), 1)
-#define unlikely(x)	__builtin_expect(!!(x), 0)
+struct ftrace_branch_data {
+	const char *func;
+	const char *file;
+	unsigned line;
+	union {
+		struct {
+			unsigned long correct;
+			unsigned long incorrect;
+		};
+		struct {
+			unsigned long miss;
+			unsigned long hit;
+		};
+	};
+};
+
+/*
+ * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
+ * to disable branch tracing on a per file basis.
+ */
+#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
+
+#define likely_notrace(x)	__builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
+
+#define __branch_check__(x, expect) ({					\
+			int ______r;					\
+			static struct ftrace_branch_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_annotated_branch"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______r = likely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, expect); \
+			______r;					\
+		})
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same.  This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
+# endif
+# ifndef unlikely
+#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
+# endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+/*
+ * "Define 'is'", Bill Clinton
+ * "Define 'if'", Steven Rostedt
+ */
+#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) :		\
+	({								\
+		int ______r;						\
+		static struct ftrace_branch_data			\
+			__attribute__((__aligned__(4)))			\
+			__attribute__((section("_ftrace_branch")))	\
+			______f = {					\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+		______r = !!(cond);					\
+		if (______r)						\
+			______f.hit++;					\
+		else							\
+			______f.miss++;					\
+		______r;						\
+	}))
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+#else
+# define likely(x)	__builtin_expect(!!(x), 1)
+# define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif
 
 
 /* Optimization barrier */
 /* Optimization barrier */
 #ifndef barrier
 #ifndef barrier

+ 267 - 26
include/linux/ftrace.h

@@ -8,6 +8,8 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kallsyms.h>
+#include <linux/bitops.h>
+#include <linux/sched.h>
 
 
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_FUNCTION_TRACER
 
 
@@ -24,6 +26,45 @@ struct ftrace_ops {
 	struct ftrace_ops *next;
 	struct ftrace_ops *next;
 };
 };
 
 
+extern int function_trace_stop;
+
+/*
+ * Type of the current tracing.
+ */
+enum ftrace_tracing_type_t {
+	FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
+	FTRACE_TYPE_RETURN,	/* Hook the return of the function */
+};
+
+/* Current tracing type, default is FTRACE_TYPE_ENTER */
+extern enum ftrace_tracing_type_t ftrace_tracing_type;
+
+/**
+ * ftrace_stop - stop function tracer.
+ *
+ * A quick way to stop the function tracer. Note this an on off switch,
+ * it is not something that is recursive like preempt_disable.
+ * This does not disable the calling of mcount, it only stops the
+ * calling of functions from mcount.
+ */
+static inline void ftrace_stop(void)
+{
+	function_trace_stop = 1;
+}
+
+/**
+ * ftrace_start - start the function tracer.
+ *
+ * This function is the inverse of ftrace_stop. This does not enable
+ * the function tracing if the function tracer is disabled. This only
+ * sets the function tracer flag to continue calling the functions
+ * from mcount.
+ */
+static inline void ftrace_start(void)
+{
+	function_trace_stop = 0;
+}
+
 /*
 /*
  * The ftrace_ops must be a static and should also
  * The ftrace_ops must be a static and should also
  * be read_mostly.  These functions do modify read_mostly variables
  * be read_mostly.  These functions do modify read_mostly variables
@@ -42,9 +83,21 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define unregister_ftrace_function(ops) do { } while (0)
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
 static inline void ftrace_kill(void) { }
 static inline void ftrace_kill(void) { }
+static inline void ftrace_stop(void) { }
+static inline void ftrace_start(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 #endif /* CONFIG_FUNCTION_TRACER */
 
 
+#ifdef CONFIG_STACK_TRACER
+extern int stack_tracer_enabled;
+int
+stack_trace_sysctl(struct ctl_table *table, int write,
+		   struct file *file, void __user *buffer, size_t *lenp,
+		   loff_t *ppos);
+#endif
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
+/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
+#include <asm/ftrace.h>
 
 
 enum {
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
 	FTRACE_FL_FREE		= (1 << 0),
@@ -60,6 +113,7 @@ struct dyn_ftrace {
 	struct list_head	list;
 	struct list_head	list;
 	unsigned long		ip; /* address of mcount call-site */
 	unsigned long		ip; /* address of mcount call-site */
 	unsigned long		flags;
 	unsigned long		flags;
+	struct dyn_arch_ftrace	arch;
 };
 };
 
 
 int ftrace_force_update(void);
 int ftrace_force_update(void);
@@ -67,19 +121,48 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset);
 
 
 /* defined in arch */
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern int ftrace_ip_converted(unsigned long ip);
-extern unsigned char *ftrace_nop_replace(void);
-extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
 extern int ftrace_dyn_arch_init(void *data);
 extern int ftrace_dyn_arch_init(void *data);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
 extern void mcount_call(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern void ftrace_graph_caller(void);
+extern int ftrace_enable_ftrace_graph_caller(void);
+extern int ftrace_disable_ftrace_graph_caller(void);
+#else
+static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
+static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
+#endif
+
+/**
+ * ftrace_make_nop - convert code into top
+ * @mod: module structure if called by module load initialization
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should be calling
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch.  The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * The code segment at @rec->ip should be a caller to @addr
+ *
+ * Return must be:
+ *  0 on success
+ *  -EFAULT on error reading the location
+ *  -EINVAL on a failed compare of the contents
+ *  -EPERM  on error writing to the location
+ * Any other value will be considered a failure.
+ */
+extern int ftrace_make_nop(struct module *mod,
+			   struct dyn_ftrace *rec, unsigned long addr);
 
 
 /**
 /**
- * ftrace_modify_code - modify code segment
- * @ip: the address of the code segment
- * @old_code: the contents of what is expected to be there
- * @new_code: the code to patch in
+ * ftrace_make_call - convert a nop call site into a call to addr
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should call
  *
  *
  * This is a very sensitive operation and great care needs
  * This is a very sensitive operation and great care needs
  * to be taken by the arch.  The operation should carefully
  * to be taken by the arch.  The operation should carefully
@@ -87,6 +170,8 @@ extern void mcount_call(void);
  * what we expect it to be, and then on success of the compare,
  * what we expect it to be, and then on success of the compare,
  * it should write to the location.
  * it should write to the location.
  *
  *
+ * The code segment at @rec->ip should be a nop
+ *
  * Return must be:
  * Return must be:
  *  0 on success
  *  0 on success
  *  -EFAULT on error reading the location
  *  -EFAULT on error reading the location
@@ -94,8 +179,11 @@ extern void mcount_call(void);
  *  -EPERM  on error writing to the location
  *  -EPERM  on error writing to the location
  * Any other value will be considered a failure.
  * Any other value will be considered a failure.
  */
  */
-extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-			      unsigned char *new_code);
+extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
+
+
+/* May be defined in arch */
+extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 
 extern int skip_trace(unsigned long ip);
 extern int skip_trace(unsigned long ip);
 
 
@@ -103,7 +191,6 @@ extern void ftrace_release(void *start, unsigned long size);
 
 
 extern void ftrace_disable_daemon(void);
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 extern void ftrace_enable_daemon(void);
-
 #else
 #else
 # define skip_trace(ip)				({ 0; })
 # define skip_trace(ip)				({ 0; })
 # define ftrace_force_update()			({ 0; })
 # define ftrace_force_update()			({ 0; })
@@ -182,6 +269,12 @@ static inline void __ftrace_enabled_restore(int enabled)
 #endif
 #endif
 
 
 #ifdef CONFIG_TRACING
 #ifdef CONFIG_TRACING
+extern int ftrace_dump_on_oops;
+
+extern void tracing_start(void);
+extern void tracing_stop(void);
+extern void ftrace_off_permanent(void);
+
 extern void
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
 
@@ -210,8 +303,11 @@ extern void ftrace_dump(void);
 static inline void
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
 static inline int
-ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
 
 
+static inline void tracing_start(void) { }
+static inline void tracing_stop(void) { }
+static inline void ftrace_off_permanent(void) { }
 static inline int
 static inline int
 ftrace_printk(const char *fmt, ...)
 ftrace_printk(const char *fmt, ...)
 {
 {
@@ -222,33 +318,178 @@ static inline void ftrace_dump(void) { }
 
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
 extern void ftrace_init(void);
-extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+extern void ftrace_init_module(struct module *mod,
+			       unsigned long *start, unsigned long *end);
 #else
 #else
 static inline void ftrace_init(void) { }
 static inline void ftrace_init(void) { }
 static inline void
 static inline void
-ftrace_init_module(unsigned long *start, unsigned long *end) { }
+ftrace_init_module(struct module *mod,
+		   unsigned long *start, unsigned long *end) { }
+#endif
+
+enum {
+	POWER_NONE = 0,
+	POWER_CSTATE = 1,
+	POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+	ktime_t			stamp;
+	ktime_t			end;
+	int			type;
+	int			state;
 #endif
 #endif
+};
 
 
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
+
+/*
+ * Structure that defines an entry function trace.
+ */
+struct ftrace_graph_ent {
+	unsigned long func; /* Current function */
+	int depth;
+};
 
 
-struct boot_trace {
-	pid_t			caller;
-	char			func[KSYM_SYMBOL_LEN];
-	int			result;
-	unsigned long long	duration;		/* usecs */
-	ktime_t			calltime;
-	ktime_t			rettime;
+/*
+ * Structure that defines a return function trace.
+ */
+struct ftrace_graph_ret {
+	unsigned long func; /* Current function */
+	unsigned long long calltime;
+	unsigned long long rettime;
+	/* Number of functions that overran the depth limit for current task */
+	unsigned long overrun;
+	int depth;
 };
 };
 
 
-#ifdef CONFIG_BOOT_TRACER
-extern void trace_boot(struct boot_trace *it, initcall_t fn);
-extern void start_boot_trace(void);
-extern void stop_boot_trace(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/*
+ * Sometimes we don't want to trace a function with the function
+ * graph tracer but we want them to keep traced by the usual function
+ * tracer if the function graph tracer is not configured.
+ */
+#define __notrace_funcgraph		notrace
+
+/*
+ * We want to which function is an entrypoint of a hardirq.
+ * That will help us to put a signal on output.
+ */
+#define __irq_entry		 __attribute__((__section__(".irqentry.text")))
+
+/* Limits of hardirq entrypoints */
+extern char __irqentry_text_start[];
+extern char __irqentry_text_end[];
+
+#define FTRACE_RETFUNC_DEPTH 50
+#define FTRACE_RETSTACK_ALLOC_SIZE 32
+/* Type of the callback handlers for tracing function graph*/
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+
+extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+				trace_func_graph_ent_t entryfunc);
+
+extern void ftrace_graph_stop(void);
+
+/* The current handlers in use */
+extern trace_func_graph_ret_t ftrace_graph_return;
+extern trace_func_graph_ent_t ftrace_graph_entry;
+
+extern void unregister_ftrace_graph(void);
+
+extern void ftrace_graph_init_task(struct task_struct *t);
+extern void ftrace_graph_exit_task(struct task_struct *t);
+
+static inline int task_curr_ret_stack(struct task_struct *t)
+{
+	return t->curr_ret_stack;
+}
+
+static inline void pause_graph_tracing(void)
+{
+	atomic_inc(&current->tracing_graph_pause);
+}
+
+static inline void unpause_graph_tracing(void)
+{
+	atomic_dec(&current->tracing_graph_pause);
+}
 #else
 #else
-static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
-static inline void start_boot_trace(void) { }
-static inline void stop_boot_trace(void) { }
+
+#define __notrace_funcgraph
+#define __irq_entry
+
+static inline void ftrace_graph_init_task(struct task_struct *t) { }
+static inline void ftrace_graph_exit_task(struct task_struct *t) { }
+
+static inline int task_curr_ret_stack(struct task_struct *tsk)
+{
+	return -1;
+}
+
+static inline void pause_graph_tracing(void) { }
+static inline void unpause_graph_tracing(void) { }
 #endif
 #endif
 
 
+#ifdef CONFIG_TRACING
+#include <linux/sched.h>
+
+/* flags for current->trace */
+enum {
+	TSK_TRACE_FL_TRACE_BIT	= 0,
+	TSK_TRACE_FL_GRAPH_BIT	= 1,
+};
+enum {
+	TSK_TRACE_FL_TRACE	= 1 << TSK_TRACE_FL_TRACE_BIT,
+	TSK_TRACE_FL_GRAPH	= 1 << TSK_TRACE_FL_GRAPH_BIT,
+};
+
+static inline void set_tsk_trace_trace(struct task_struct *tsk)
+{
+	set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_trace(struct task_struct *tsk)
+{
+	clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_trace(struct task_struct *tsk)
+{
+	return tsk->trace & TSK_TRACE_FL_TRACE;
+}
+
+static inline void set_tsk_trace_graph(struct task_struct *tsk)
+{
+	set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_graph(struct task_struct *tsk)
+{
+	clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_graph(struct task_struct *tsk)
+{
+	return tsk->trace & TSK_TRACE_FL_GRAPH;
+}
 
 
+#endif /* CONFIG_TRACING */
 
 
 #endif /* _LINUX_FTRACE_H */
 #endif /* _LINUX_FTRACE_H */

+ 13 - 0
include/linux/ftrace_irq.h

@@ -0,0 +1,13 @@
+#ifndef _LINUX_FTRACE_IRQ_H
+#define _LINUX_FTRACE_IRQ_H
+
+
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+extern void ftrace_nmi_enter(void);
+extern void ftrace_nmi_exit(void);
+#else
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
+#endif
+
+#endif /* _LINUX_FTRACE_IRQ_H */

+ 13 - 2
include/linux/hardirq.h

@@ -4,6 +4,7 @@
 #include <linux/preempt.h>
 #include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <linux/smp_lock.h>
 #include <linux/lockdep.h>
 #include <linux/lockdep.h>
+#include <linux/ftrace_irq.h>
 #include <asm/hardirq.h>
 #include <asm/hardirq.h>
 #include <asm/system.h>
 #include <asm/system.h>
 
 
@@ -161,7 +162,17 @@ extern void irq_enter(void);
  */
  */
 extern void irq_exit(void);
 extern void irq_exit(void);
 
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()				\
+	do {					\
+		ftrace_nmi_enter();		\
+		lockdep_off();			\
+		__irq_enter();			\
+	} while (0)
+#define nmi_exit()				\
+	do {					\
+		__irq_exit();			\
+		lockdep_on();			\
+		ftrace_nmi_exit();		\
+	} while (0)
 
 
 #endif /* LINUX_HARDIRQ_H */
 #endif /* LINUX_HARDIRQ_H */

+ 63 - 12
include/linux/marker.h

@@ -12,6 +12,7 @@
  * See the file COPYING for more details.
  * See the file COPYING for more details.
  */
  */
 
 
+#include <stdarg.h>
 #include <linux/types.h>
 #include <linux/types.h>
 
 
 struct module;
 struct module;
@@ -48,10 +49,28 @@ struct marker {
 	void (*call)(const struct marker *mdata, void *call_private, ...);
 	void (*call)(const struct marker *mdata, void *call_private, ...);
 	struct marker_probe_closure single;
 	struct marker_probe_closure single;
 	struct marker_probe_closure *multi;
 	struct marker_probe_closure *multi;
+	const char *tp_name;	/* Optional tracepoint name */
+	void *tp_cb;		/* Optional tracepoint callback */
 } __attribute__((aligned(8)));
 } __attribute__((aligned(8)));
 
 
 #ifdef CONFIG_MARKERS
 #ifdef CONFIG_MARKERS
 
 
+#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format)		\
+		static const char __mstrtab_##name[]			\
+		__attribute__((section("__markers_strings")))		\
+		= #name "\0" format;					\
+		static struct marker __mark_##name			\
+		__attribute__((section("__markers"), aligned(8))) =	\
+		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
+		  0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
+		  NULL, tp_name_str, tp_cb }
+
+#define DEFINE_MARKER(name, format)					\
+		_DEFINE_MARKER(name, NULL, NULL, format)
+
+#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format)			\
+		_DEFINE_MARKER(name, #tp_name, tp_cb, format)
+
 /*
 /*
  * Note : the empty asm volatile with read constraint is used here instead of a
  * Note : the empty asm volatile with read constraint is used here instead of a
  * "used" attribute to fix a gcc 4.1.x bug.
  * "used" attribute to fix a gcc 4.1.x bug.
@@ -65,14 +84,7 @@ struct marker {
  */
  */
 #define __trace_mark(generic, name, call_private, format, args...)	\
 #define __trace_mark(generic, name, call_private, format, args...)	\
 	do {								\
 	do {								\
-		static const char __mstrtab_##name[]			\
-		__attribute__((section("__markers_strings")))		\
-		= #name "\0" format;					\
-		static struct marker __mark_##name			\
-		__attribute__((section("__markers"), aligned(8))) =	\
-		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
-		0, 0, marker_probe_cb,					\
-		{ __mark_empty_function, NULL}, NULL };			\
+		DEFINE_MARKER(name, format);				\
 		__mark_check_format(format, ## args);			\
 		__mark_check_format(format, ## args);			\
 		if (unlikely(__mark_##name.state)) {			\
 		if (unlikely(__mark_##name.state)) {			\
 			(*__mark_##name.call)				\
 			(*__mark_##name.call)				\
@@ -80,14 +92,39 @@ struct marker {
 		}							\
 		}							\
 	} while (0)
 	} while (0)
 
 
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+	do {								\
+		void __check_tp_type(void)				\
+		{							\
+			register_trace_##tp_name(tp_cb);		\
+		}							\
+		DEFINE_MARKER_TP(name, tp_name, tp_cb, format);		\
+		__mark_check_format(format, ## args);			\
+		(*__mark_##name.call)(&__mark_##name, call_private,	\
+					## args);			\
+	} while (0)
+
 extern void marker_update_probe_range(struct marker *begin,
 extern void marker_update_probe_range(struct marker *begin,
 	struct marker *end);
 	struct marker *end);
+
+#define GET_MARKER(name)	(__mark_##name)
+
 #else /* !CONFIG_MARKERS */
 #else /* !CONFIG_MARKERS */
+#define DEFINE_MARKER(name, tp_name, tp_cb, format)
 #define __trace_mark(generic, name, call_private, format, args...) \
 #define __trace_mark(generic, name, call_private, format, args...) \
 		__mark_check_format(format, ## args)
 		__mark_check_format(format, ## args)
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+	do {								\
+		void __check_tp_type(void)				\
+		{							\
+			register_trace_##tp_name(tp_cb);		\
+		}							\
+		__mark_check_format(format, ## args);			\
+	} while (0)
 static inline void marker_update_probe_range(struct marker *begin,
 static inline void marker_update_probe_range(struct marker *begin,
 	struct marker *end)
 	struct marker *end)
 { }
 { }
+#define GET_MARKER(name)
 #endif /* CONFIG_MARKERS */
 #endif /* CONFIG_MARKERS */
 
 
 /**
 /**
@@ -116,6 +153,20 @@ static inline void marker_update_probe_range(struct marker *begin,
 #define _trace_mark(name, format, args...) \
 #define _trace_mark(name, format, args...) \
 	__trace_mark(1, name, NULL, format, ## args)
 	__trace_mark(1, name, NULL, format, ## args)
 
 
+/**
+ * trace_mark_tp - Marker in a tracepoint callback
+ * @name: marker name, not quoted.
+ * @tp_name: tracepoint name, not quoted.
+ * @tp_cb: tracepoint callback. Should have an associated global symbol so it
+ *         is not optimized away by the compiler (should not be static).
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker in a tracepoint callback.
+ */
+#define trace_mark_tp(name, tp_name, tp_cb, format, args...)	\
+	__trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
+
 /**
 /**
  * MARK_NOARGS - Format string for a marker with no argument.
  * MARK_NOARGS - Format string for a marker with no argument.
  */
  */
@@ -136,8 +187,6 @@ extern marker_probe_func __mark_empty_function;
 
 
 extern void marker_probe_cb(const struct marker *mdata,
 extern void marker_probe_cb(const struct marker *mdata,
 	void *call_private, ...);
 	void *call_private, ...);
-extern void marker_probe_cb_noarg(const struct marker *mdata,
-	void *call_private, ...);
 
 
 /*
 /*
  * Connect a probe to a marker.
  * Connect a probe to a marker.
@@ -162,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
 
 
 /*
 /*
  * marker_synchronize_unregister must be called between the last marker probe
  * marker_synchronize_unregister must be called between the last marker probe
- * unregistration and the end of module exit to make sure there is no caller
- * executing a probe when it is freed.
+ * unregistration and the first one of
+ * - the end of module exit function
+ * - the free of any resource used by the probes
+ * to ensure the code and data are valid for any possibly running probes.
  */
  */
 #define marker_synchronize_unregister() synchronize_sched()
 #define marker_synchronize_unregister() synchronize_sched()
 
 

+ 2 - 0
include/linux/mm.h

@@ -1305,5 +1305,7 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 void vmemmap_populate_print_last(void);
 
 
+extern void *alloc_locked_buffer(size_t size);
+extern void free_locked_buffer(void *buffer, size_t size);
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
 #endif /* _LINUX_MM_H */

+ 2 - 2
include/linux/pid.h

@@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid);
 #define do_each_pid_task(pid, type, task)				\
 #define do_each_pid_task(pid, type, task)				\
 	do {								\
 	do {								\
 		struct hlist_node *pos___;				\
 		struct hlist_node *pos___;				\
-		if (pid != NULL)					\
+		if ((pid) != NULL)					\
 			hlist_for_each_entry_rcu((task), pos___,	\
 			hlist_for_each_entry_rcu((task), pos___,	\
-				&pid->tasks[type], pids[type].node) {
+				&(pid)->tasks[type], pids[type].node) {
 
 
 			/*
 			/*
 			 * Both old and new leaders may be attached to
 			 * Both old and new leaders may be attached to

+ 22 - 0
include/linux/ptrace.h

@@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void __ptrace_unlink(struct task_struct *child);
+extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
 /* Returns 0 on success, -errno on denial. */
@@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 #endif
 
 
+#ifndef arch_ptrace_untrace
+/*
+ * Do machine-specific work before untracing child.
+ *
+ * This is called for a normal detach as well as from ptrace_exit()
+ * when the tracing task dies.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+#define arch_ptrace_untrace(task)		do { } while (0)
+#endif
+
+#ifndef arch_ptrace_fork
+/*
+ * Do machine-specific work to initialize a new task.
+ *
+ * This is called from copy_process().
+ */
+#define arch_ptrace_fork(child, clone_flags)	do { } while (0)
+#endif
+
 extern int task_current_syscall(struct task_struct *target, long *callno,
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
 				unsigned long *sp, unsigned long *pc);

+ 2 - 0
include/linux/rcupdate.h

@@ -142,6 +142,7 @@ struct rcu_head {
  * on the write-side to insure proper synchronization.
  * on the write-side to insure proper synchronization.
  */
  */
 #define rcu_read_lock_sched() preempt_disable()
 #define rcu_read_lock_sched() preempt_disable()
+#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
 
 
 /*
 /*
  * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
  * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
@@ -149,6 +150,7 @@ struct rcu_head {
  * See rcu_read_lock_sched for more information.
  * See rcu_read_lock_sched for more information.
  */
  */
 #define rcu_read_unlock_sched() preempt_enable()
 #define rcu_read_unlock_sched() preempt_enable()
+#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
 
 
 
 
 
 

+ 12 - 4
include/linux/ring_buffer.h

@@ -28,17 +28,19 @@ struct ring_buffer_event {
  *				 size = 8 bytes
  *				 size = 8 bytes
  *
  *
  * @RINGBUF_TYPE_TIME_STAMP:	Sync time stamp with external clock
  * @RINGBUF_TYPE_TIME_STAMP:	Sync time stamp with external clock
- *				 array[0] = tv_nsec
- *				 array[1] = tv_sec
+ *				 array[0]    = tv_nsec
+ *				 array[1..2] = tv_sec
  *				 size = 16 bytes
  *				 size = 16 bytes
  *
  *
  * @RINGBUF_TYPE_DATA:		Data record
  * @RINGBUF_TYPE_DATA:		Data record
  *				 If len is zero:
  *				 If len is zero:
  *				  array[0] holds the actual length
  *				  array[0] holds the actual length
- *				  array[1..(length+3)/4-1] holds data
+ *				  array[1..(length+3)/4] holds data
+ *				  size = 4 + 4 + length (bytes)
  *				 else
  *				 else
  *				  length = len << 2
  *				  length = len << 2
- *				  array[0..(length+3)/4] holds data
+ *				  array[0..(length+3)/4-1] holds data
+ *				  size = 4 + length (bytes)
  */
  */
 enum ring_buffer_type {
 enum ring_buffer_type {
 	RINGBUF_TYPE_PADDING,
 	RINGBUF_TYPE_PADDING,
@@ -122,6 +124,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
 
 void tracing_on(void);
 void tracing_on(void);
 void tracing_off(void);
 void tracing_off(void);
+void tracing_off_permanent(void);
+
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
+int ring_buffer_read_page(struct ring_buffer *buffer,
+			  void **data_page, int cpu, int full);
 
 
 enum ring_buffer_flags {
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 	RB_FL_OVERWRITE		= 1 << 0,

+ 31 - 0
include/linux/sched.h

@@ -96,6 +96,7 @@ struct exec_domain;
 struct futex_pi_state;
 struct futex_pi_state;
 struct robust_list_head;
 struct robust_list_head;
 struct bio;
 struct bio;
+struct bts_tracer;
 
 
 /*
 /*
  * List of flags we want to share for kernel threads,
  * List of flags we want to share for kernel threads,
@@ -1130,6 +1131,19 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 	struct list_head ptrace_entry;
 
 
+#ifdef CONFIG_X86_PTRACE_BTS
+	/*
+	 * This is the tracer handle for the ptrace BTS extension.
+	 * This field actually belongs to the ptracer task.
+	 */
+	struct bts_tracer *bts;
+	/*
+	 * The buffer to hold the BTS data.
+	 */
+	void *bts_buffer;
+	size_t bts_size;
+#endif /* CONFIG_X86_PTRACE_BTS */
+
 	/* PID/PID hash table linkage. */
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
 	struct list_head thread_group;
@@ -1313,6 +1327,23 @@ struct task_struct {
 	unsigned long default_timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
 
 	struct list_head	*scm_work_list;
 	struct list_head	*scm_work_list;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/* Index of current stored adress in ret_stack */
+	int curr_ret_stack;
+	/* Stack of return addresses for return function tracing */
+	struct ftrace_ret_stack	*ret_stack;
+	/*
+	 * Number of functions that haven't been traced
+	 * because of depth overrun.
+	 */
+	atomic_t trace_overrun;
+	/* Pause for the tracing */
+	atomic_t tracing_graph_pause;
+#endif
+#ifdef CONFIG_TRACING
+	/* state flags for use by tracers */
+	unsigned long trace;
+#endif
 };
 };
 
 
 /*
 /*

+ 1 - 0
include/linux/seq_file.h

@@ -34,6 +34,7 @@ struct seq_operations {
 
 
 #define SEQ_SKIP 1
 #define SEQ_SKIP 1
 
 
+char *mangle_path(char *s, char *p, char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
 loff_t seq_lseek(struct file *, loff_t, int);

+ 8 - 0
include/linux/stacktrace.h

@@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
 				struct stack_trace *trace);
 				struct stack_trace *trace);
 
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+extern void save_stack_trace_user(struct stack_trace *trace);
+#else
+# define save_stack_trace_user(trace)              do { } while (0)
+#endif
+
 #else
 #else
 # define save_stack_trace(trace)			do { } while (0)
 # define save_stack_trace(trace)			do { } while (0)
 # define save_stack_trace_tsk(tsk, trace)		do { } while (0)
 # define save_stack_trace_tsk(tsk, trace)		do { } while (0)
+# define save_stack_trace_user(trace)			do { } while (0)
 # define print_stack_trace(trace, spaces)		do { } while (0)
 # define print_stack_trace(trace, spaces)		do { } while (0)
 #endif
 #endif
 
 

+ 38 - 19
include/linux/tracepoint.h

@@ -24,8 +24,12 @@ struct tracepoint {
 	const char *name;		/* Tracepoint name */
 	const char *name;		/* Tracepoint name */
 	int state;			/* State. */
 	int state;			/* State. */
 	void **funcs;
 	void **funcs;
-} __attribute__((aligned(8)));
-
+} __attribute__((aligned(32)));		/*
+					 * Aligned on 32 bytes because it is
+					 * globally visible and gcc happily
+					 * align these on the structure size.
+					 * Keep in sync with vmlinux.lds.h.
+					 */
 
 
 #define TPPROTO(args...)	args
 #define TPPROTO(args...)	args
 #define TPARGS(args...)		args
 #define TPARGS(args...)		args
@@ -40,14 +44,14 @@ struct tracepoint {
 	do {								\
 	do {								\
 		void **it_func;						\
 		void **it_func;						\
 									\
 									\
-		rcu_read_lock_sched();					\
+		rcu_read_lock_sched_notrace();				\
 		it_func = rcu_dereference((tp)->funcs);			\
 		it_func = rcu_dereference((tp)->funcs);			\
 		if (it_func) {						\
 		if (it_func) {						\
 			do {						\
 			do {						\
 				((void(*)(proto))(*it_func))(args);	\
 				((void(*)(proto))(*it_func))(args);	\
 			} while (*(++it_func));				\
 			} while (*(++it_func));				\
 		}							\
 		}							\
-		rcu_read_unlock_sched();				\
+		rcu_read_unlock_sched_notrace();			\
 	} while (0)
 	} while (0)
 
 
 /*
 /*
@@ -55,35 +59,40 @@ struct tracepoint {
  * not add unwanted padding between the beginning of the section and the
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  * structure. Force alignment to the same alignment as the section start.
  */
  */
-#define DEFINE_TRACE(name, proto, args)					\
+#define DECLARE_TRACE(name, proto, args)				\
+	extern struct tracepoint __tracepoint_##name;			\
 	static inline void trace_##name(proto)				\
 	static inline void trace_##name(proto)				\
 	{								\
 	{								\
-		static const char __tpstrtab_##name[]			\
-		__attribute__((section("__tracepoints_strings")))	\
-		= #name ":" #proto;					\
-		static struct tracepoint __tracepoint_##name		\
-		__attribute__((section("__tracepoints"), aligned(8))) =	\
-		{ __tpstrtab_##name, 0, NULL };				\
 		if (unlikely(__tracepoint_##name.state))		\
 		if (unlikely(__tracepoint_##name.state))		\
 			__DO_TRACE(&__tracepoint_##name,		\
 			__DO_TRACE(&__tracepoint_##name,		\
 				TPPROTO(proto), TPARGS(args));		\
 				TPPROTO(proto), TPARGS(args));		\
 	}								\
 	}								\
 	static inline int register_trace_##name(void (*probe)(proto))	\
 	static inline int register_trace_##name(void (*probe)(proto))	\
 	{								\
 	{								\
-		return tracepoint_probe_register(#name ":" #proto,	\
-			(void *)probe);					\
+		return tracepoint_probe_register(#name, (void *)probe);	\
 	}								\
 	}								\
-	static inline void unregister_trace_##name(void (*probe)(proto))\
+	static inline int unregister_trace_##name(void (*probe)(proto))	\
 	{								\
 	{								\
-		tracepoint_probe_unregister(#name ":" #proto,		\
-			(void *)probe);					\
+		return tracepoint_probe_unregister(#name, (void *)probe);\
 	}
 	}
 
 
+#define DEFINE_TRACE(name)						\
+	static const char __tpstrtab_##name[]				\
+	__attribute__((section("__tracepoints_strings"))) = #name;	\
+	struct tracepoint __tracepoint_##name				\
+	__attribute__((section("__tracepoints"), aligned(32))) =	\
+		{ __tpstrtab_##name, 0, NULL }
+
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)				\
+	EXPORT_SYMBOL_GPL(__tracepoint_##name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)					\
+	EXPORT_SYMBOL(__tracepoint_##name)
+
 extern void tracepoint_update_probe_range(struct tracepoint *begin,
 extern void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end);
 	struct tracepoint *end);
 
 
 #else /* !CONFIG_TRACEPOINTS */
 #else /* !CONFIG_TRACEPOINTS */
-#define DEFINE_TRACE(name, proto, args)			\
+#define DECLARE_TRACE(name, proto, args)				\
 	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
 	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
 	{ }								\
 	{ }								\
 	static inline void trace_##name(proto)				\
 	static inline void trace_##name(proto)				\
@@ -92,8 +101,14 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
 	{								\
 	{								\
 		return -ENOSYS;						\
 		return -ENOSYS;						\
 	}								\
 	}								\
-	static inline void unregister_trace_##name(void (*probe)(proto))\
-	{ }
+	static inline int unregister_trace_##name(void (*probe)(proto))	\
+	{								\
+		return -ENOSYS;						\
+	}
+
+#define DEFINE_TRACE(name)
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)
 
 
 static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end)
 	struct tracepoint *end)
@@ -112,6 +127,10 @@ extern int tracepoint_probe_register(const char *name, void *probe);
  */
  */
 extern int tracepoint_probe_unregister(const char *name, void *probe);
 extern int tracepoint_probe_unregister(const char *name, void *probe);
 
 
+extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
+extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
+extern void tracepoint_probe_update_all(void);
+
 struct tracepoint_iter {
 struct tracepoint_iter {
 	struct module *module;
 	struct module *module;
 	struct tracepoint *tracepoint;
 	struct tracepoint *tracepoint;

+ 1 - 1
include/linux/tty.h

@@ -325,7 +325,7 @@ extern struct class *tty_class;
  *	go away
  *	go away
  */
  */
 
 
-extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
+static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
 {
 	if (tty)
 	if (tty)
 		kref_get(&tty->kref);
 		kref_get(&tty->kref);

+ 76 - 0
include/trace/block.h

@@ -0,0 +1,76 @@
+#ifndef _TRACE_BLOCK_H
+#define _TRACE_BLOCK_H
+
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(block_rq_abort,
+	TPPROTO(struct request_queue *q, struct request *rq),
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_insert,
+	TPPROTO(struct request_queue *q, struct request *rq),
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_issue,
+	TPPROTO(struct request_queue *q, struct request *rq),
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_requeue,
+	TPPROTO(struct request_queue *q, struct request *rq),
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_complete,
+	TPPROTO(struct request_queue *q, struct request *rq),
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_bio_bounce,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_complete,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_backmerge,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_frontmerge,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_queue,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_getrq,
+	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
+		TPARGS(q, bio, rw));
+
+DECLARE_TRACE(block_sleeprq,
+	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
+		TPARGS(q, bio, rw));
+
+DECLARE_TRACE(block_plug,
+	TPPROTO(struct request_queue *q),
+		TPARGS(q));
+
+DECLARE_TRACE(block_unplug_timer,
+	TPPROTO(struct request_queue *q),
+		TPARGS(q));
+
+DECLARE_TRACE(block_unplug_io,
+	TPPROTO(struct request_queue *q),
+		TPARGS(q));
+
+DECLARE_TRACE(block_split,
+	TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
+		TPARGS(q, bio, pdu));
+
+DECLARE_TRACE(block_remap,
+	TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		sector_t from, sector_t to),
+		TPARGS(q, bio, dev, from, to));
+
+#endif

+ 60 - 0
include/trace/boot.h

@@ -0,0 +1,60 @@
+#ifndef _LINUX_TRACE_BOOT_H
+#define _LINUX_TRACE_BOOT_H
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/init.h>
+
+/*
+ * Structure which defines the trace of an initcall
+ * while it is called.
+ * You don't have to fill the func field since it is
+ * only used internally by the tracer.
+ */
+struct boot_trace_call {
+	pid_t			caller;
+	char			func[KSYM_SYMBOL_LEN];
+};
+
+/*
+ * Structure which defines the trace of an initcall
+ * while it returns.
+ */
+struct boot_trace_ret {
+	char			func[KSYM_SYMBOL_LEN];
+	int				result;
+	unsigned long long	duration;		/* nsecs */
+};
+
+#ifdef CONFIG_BOOT_TRACER
+/* Append the traces on the ring-buffer */
+extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn);
+extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn);
+
+/* Tells the tracer that smp_pre_initcall is finished.
+ * So we can start the tracing
+ */
+extern void start_boot_trace(void);
+
+/* Resume the tracing of other necessary events
+ * such as sched switches
+ */
+extern void enable_boot_trace(void);
+
+/* Suspend this tracing. Actually, only sched_switches tracing have
+ * to be suspended. Initcalls doesn't need it.)
+ */
+extern void disable_boot_trace(void);
+#else
+static inline
+void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { }
+
+static inline
+void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { }
+
+static inline void start_boot_trace(void) { }
+static inline void enable_boot_trace(void) { }
+static inline void disable_boot_trace(void) { }
+#endif /* CONFIG_BOOT_TRACER */
+
+#endif /* __LINUX_TRACE_BOOT_H */

+ 18 - 18
include/trace/sched.h

@@ -4,52 +4,52 @@
 #include <linux/sched.h>
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 #include <linux/tracepoint.h>
 
 
-DEFINE_TRACE(sched_kthread_stop,
+DECLARE_TRACE(sched_kthread_stop,
 	TPPROTO(struct task_struct *t),
 	TPPROTO(struct task_struct *t),
 		TPARGS(t));
 		TPARGS(t));
 
 
-DEFINE_TRACE(sched_kthread_stop_ret,
+DECLARE_TRACE(sched_kthread_stop_ret,
 	TPPROTO(int ret),
 	TPPROTO(int ret),
 		TPARGS(ret));
 		TPARGS(ret));
 
 
-DEFINE_TRACE(sched_wait_task,
+DECLARE_TRACE(sched_wait_task,
 	TPPROTO(struct rq *rq, struct task_struct *p),
 	TPPROTO(struct rq *rq, struct task_struct *p),
 		TPARGS(rq, p));
 		TPARGS(rq, p));
 
 
-DEFINE_TRACE(sched_wakeup,
-	TPPROTO(struct rq *rq, struct task_struct *p),
-		TPARGS(rq, p));
+DECLARE_TRACE(sched_wakeup,
+	TPPROTO(struct rq *rq, struct task_struct *p, int success),
+		TPARGS(rq, p, success));
 
 
-DEFINE_TRACE(sched_wakeup_new,
-	TPPROTO(struct rq *rq, struct task_struct *p),
-		TPARGS(rq, p));
+DECLARE_TRACE(sched_wakeup_new,
+	TPPROTO(struct rq *rq, struct task_struct *p, int success),
+		TPARGS(rq, p, success));
 
 
-DEFINE_TRACE(sched_switch,
+DECLARE_TRACE(sched_switch,
 	TPPROTO(struct rq *rq, struct task_struct *prev,
 	TPPROTO(struct rq *rq, struct task_struct *prev,
 		struct task_struct *next),
 		struct task_struct *next),
 		TPARGS(rq, prev, next));
 		TPARGS(rq, prev, next));
 
 
-DEFINE_TRACE(sched_migrate_task,
-	TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu),
-		TPARGS(rq, p, dest_cpu));
+DECLARE_TRACE(sched_migrate_task,
+	TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+		TPARGS(p, orig_cpu, dest_cpu));
 
 
-DEFINE_TRACE(sched_process_free,
+DECLARE_TRACE(sched_process_free,
 	TPPROTO(struct task_struct *p),
 	TPPROTO(struct task_struct *p),
 		TPARGS(p));
 		TPARGS(p));
 
 
-DEFINE_TRACE(sched_process_exit,
+DECLARE_TRACE(sched_process_exit,
 	TPPROTO(struct task_struct *p),
 	TPPROTO(struct task_struct *p),
 		TPARGS(p));
 		TPARGS(p));
 
 
-DEFINE_TRACE(sched_process_wait,
+DECLARE_TRACE(sched_process_wait,
 	TPPROTO(struct pid *pid),
 	TPPROTO(struct pid *pid),
 		TPARGS(pid));
 		TPARGS(pid));
 
 
-DEFINE_TRACE(sched_process_fork,
+DECLARE_TRACE(sched_process_fork,
 	TPPROTO(struct task_struct *parent, struct task_struct *child),
 	TPPROTO(struct task_struct *parent, struct task_struct *child),
 		TPARGS(parent, child));
 		TPARGS(parent, child));
 
 
-DEFINE_TRACE(sched_signal_send,
+DECLARE_TRACE(sched_signal_send,
 	TPPROTO(int sig, struct task_struct *p),
 	TPPROTO(int sig, struct task_struct *p),
 		TPARGS(sig, p));
 		TPARGS(sig, p));
 
 

+ 1 - 0
init/Kconfig

@@ -808,6 +808,7 @@ config TRACEPOINTS
 
 
 config MARKERS
 config MARKERS
 	bool "Activate markers"
 	bool "Activate markers"
+	depends on TRACEPOINTS
 	help
 	help
 	  Place an empty function call at each marker site. Can be
 	  Place an empty function call at each marker site. Can be
 	  dynamically changed for a probe function.
 	  dynamically changed for a probe function.

+ 20 - 15
init/main.c

@@ -63,6 +63,7 @@
 #include <linux/signal.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/idr.h>
 #include <linux/ftrace.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 
 #include <asm/io.h>
 #include <asm/io.h>
 #include <asm/bugs.h>
 #include <asm/bugs.h>
@@ -704,31 +705,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644);
 int do_one_initcall(initcall_t fn)
 int do_one_initcall(initcall_t fn)
 {
 {
 	int count = preempt_count();
 	int count = preempt_count();
-	ktime_t delta;
+	ktime_t calltime, delta, rettime;
 	char msgbuf[64];
 	char msgbuf[64];
-	struct boot_trace it;
+	struct boot_trace_call call;
+	struct boot_trace_ret ret;
 
 
 	if (initcall_debug) {
 	if (initcall_debug) {
-		it.caller = task_pid_nr(current);
-		printk("calling  %pF @ %i\n", fn, it.caller);
-		it.calltime = ktime_get();
+		call.caller = task_pid_nr(current);
+		printk("calling  %pF @ %i\n", fn, call.caller);
+		calltime = ktime_get();
+		trace_boot_call(&call, fn);
+		enable_boot_trace();
 	}
 	}
 
 
-	it.result = fn();
+	ret.result = fn();
 
 
 	if (initcall_debug) {
 	if (initcall_debug) {
-		it.rettime = ktime_get();
-		delta = ktime_sub(it.rettime, it.calltime);
-		it.duration = (unsigned long long) delta.tv64 >> 10;
+		disable_boot_trace();
+		rettime = ktime_get();
+		delta = ktime_sub(rettime, calltime);
+		ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+		trace_boot_ret(&ret, fn);
 		printk("initcall %pF returned %d after %Ld usecs\n", fn,
 		printk("initcall %pF returned %d after %Ld usecs\n", fn,
-			it.result, it.duration);
-		trace_boot(&it, fn);
+			ret.result, ret.duration);
 	}
 	}
 
 
 	msgbuf[0] = 0;
 	msgbuf[0] = 0;
 
 
-	if (it.result && it.result != -ENODEV && initcall_debug)
-		sprintf(msgbuf, "error code %d ", it.result);
+	if (ret.result && ret.result != -ENODEV && initcall_debug)
+		sprintf(msgbuf, "error code %d ", ret.result);
 
 
 	if (preempt_count() != count) {
 	if (preempt_count() != count) {
 		strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
 		strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@ -742,7 +747,7 @@ int do_one_initcall(initcall_t fn)
 		printk("initcall %pF returned with %s\n", fn, msgbuf);
 		printk("initcall %pF returned with %s\n", fn, msgbuf);
 	}
 	}
 
 
-	return it.result;
+	return ret.result;
 }
 }
 
 
 
 
@@ -883,7 +888,7 @@ static int __init kernel_init(void * unused)
 	 * we're essentially up and running. Get rid of the
 	 * we're essentially up and running. Get rid of the
 	 * initmem segments and start the user-mode stuff..
 	 * initmem segments and start the user-mode stuff..
 	 */
 	 */
-	stop_boot_trace();
+
 	init_post();
 	init_post();
 	return 0;
 	return 0;
 }
 }

+ 4 - 1
kernel/exit.c

@@ -55,6 +55,10 @@
 #include <asm/mmu_context.h>
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 #include "cred-internals.h"
 
 
+DEFINE_TRACE(sched_process_free);
+DEFINE_TRACE(sched_process_exit);
+DEFINE_TRACE(sched_process_wait);
+
 static void exit_mm(struct task_struct * tsk);
 static void exit_mm(struct task_struct * tsk);
 
 
 static inline int task_detached(struct task_struct *p)
 static inline int task_detached(struct task_struct *p)
@@ -1127,7 +1131,6 @@ NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
 	tsk->state = TASK_DEAD;
-
 	schedule();
 	schedule();
 	BUG();
 	BUG();
 	/* Avoid "noreturn function does return".  */
 	/* Avoid "noreturn function does return".  */

+ 3 - 2
kernel/extable.c

@@ -17,6 +17,7 @@
 */
 */
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/init.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/sections.h>
 #include <asm/sections.h>
 
 
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 	return e;
 }
 }
 
 
-int core_kernel_text(unsigned long addr)
+__notrace_funcgraph int core_kernel_text(unsigned long addr)
 {
 {
 	if (addr >= (unsigned long)_stext &&
 	if (addr >= (unsigned long)_stext &&
 	    addr <= (unsigned long)_etext)
 	    addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
 	return 0;
 	return 0;
 }
 }
 
 
-int __kernel_text_address(unsigned long addr)
+__notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 {
 	if (core_kernel_text(addr))
 	if (core_kernel_text(addr))
 		return 1;
 		return 1;

+ 12 - 2
kernel/fork.c

@@ -47,6 +47,7 @@
 #include <linux/mount.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
 #include <linux/memcontrol.h>
+#include <linux/ftrace.h>
 #include <linux/profile.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/acct.h>
@@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 
+DEFINE_TRACE(sched_process_fork);
+
 int nr_processes(void)
 int nr_processes(void)
 {
 {
 	int cpu;
 	int cpu;
@@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk)
 	prop_local_destroy_single(&tsk->dirties);
 	prop_local_destroy_single(&tsk->dirties);
 	free_thread_info(tsk->stack);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	rt_mutex_debug_task_free(tsk);
+	ftrace_graph_exit_task(tsk);
 	free_task_struct(tsk);
 	free_task_struct(tsk);
 }
 }
 EXPORT_SYMBOL(free_task);
 EXPORT_SYMBOL(free_task);
@@ -1080,6 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
 #endif
+	if (unlikely(ptrace_reparented(current)))
+		ptrace_fork(p, clone_flags);
 
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 	sched_fork(p, clone_flags);
@@ -1120,6 +1126,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 		}
 	}
 	}
 
 
+	ftrace_graph_init_task(p);
+
 	p->pid = pid_nr(pid);
 	p->pid = pid_nr(pid);
 	p->tgid = p->pid;
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
 	if (clone_flags & CLONE_THREAD)
@@ -1128,7 +1136,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (current->nsproxy != p->nsproxy) {
 	if (current->nsproxy != p->nsproxy) {
 		retval = ns_cgroup_clone(p, pid);
 		retval = ns_cgroup_clone(p, pid);
 		if (retval)
 		if (retval)
-			goto bad_fork_free_pid;
+			goto bad_fork_free_graph;
 	}
 	}
 
 
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1221,7 +1229,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_free_graph;
 	}
 	}
 
 
 	if (clone_flags & CLONE_THREAD) {
 	if (clone_flags & CLONE_THREAD) {
@@ -1258,6 +1266,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	cgroup_post_fork(p);
 	cgroup_post_fork(p);
 	return p;
 	return p;
 
 
+bad_fork_free_graph:
+	ftrace_graph_exit_task(p);
 bad_fork_free_pid:
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 		free_pid(pid);

+ 3 - 0
kernel/kthread.c

@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 struct task_struct *kthreadd_task;
 
 
+DEFINE_TRACE(sched_kthread_stop);
+DEFINE_TRACE(sched_kthread_stop_ret);
+
 struct kthread_create_info
 struct kthread_create_info
 {
 {
 	/* Information passed to kthread() from kthreadd. */
 	/* Information passed to kthread() from kthreadd. */

+ 1 - 0
kernel/lockdep.c

@@ -25,6 +25,7 @@
  * Thanks to Arjan van de Ven for coming up with the initial idea of
  * Thanks to Arjan van de Ven for coming up with the initial idea of
  * mapping lock dependencies runtime.
  * mapping lock dependencies runtime.
  */
  */
+#define DISABLE_BRANCH_PROFILING
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/delay.h>

+ 127 - 65
kernel/marker.c

@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
  */
  */
 #define MARKER_HASH_BITS 6
 #define MARKER_HASH_BITS 6
 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
 
 
 /*
 /*
  * Note about RCU :
  * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
 	void *oldptr;
 	void *oldptr;
 	int rcu_pending;
 	int rcu_pending;
 	unsigned char ptype:1;
 	unsigned char ptype:1;
+	unsigned char format_allocated:1;
 	char name[0];	/* Contains name'\0'format'\0' */
 	char name[0];	/* Contains name'\0'format'\0' */
 };
 };
 
 
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-
 /**
 /**
  * __mark_empty_function - Empty probe callback
  * __mark_empty_function - Empty probe callback
  * @probe_private: probe private data
  * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
  * though the function pointer change and the marker enabling are two distinct
  * though the function pointer change and the marker enabling are two distinct
  * operations that modifies the execution flow of preemptible code.
  * operations that modifies the execution flow of preemptible code.
  */
  */
-void __mark_empty_function(void *probe_private, void *call_private,
+notrace void __mark_empty_function(void *probe_private, void *call_private,
 	const char *fmt, va_list *args)
 	const char *fmt, va_list *args)
 {
 {
 }
 }
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * rcu_dereference() for the pointer read.
  * rcu_dereference() for the pointer read.
  */
  */
-void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
+notrace void marker_probe_cb(const struct marker *mdata,
+		void *call_private, ...)
 {
 {
 	va_list args;
 	va_list args;
 	char ptype;
 	char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 	 * sure the teardown of the callbacks can be done correctly when they
 	 * sure the teardown of the callbacks can be done correctly when they
 	 * are in modules and they insure RCU read coherency.
 	 * are in modules and they insure RCU read coherency.
 	 */
 	 */
-	rcu_read_lock_sched();
+	rcu_read_lock_sched_notrace();
 	ptype = mdata->ptype;
 	ptype = mdata->ptype;
 	if (likely(!ptype)) {
 	if (likely(!ptype)) {
 		marker_probe_func *func;
 		marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 			va_end(args);
 			va_end(args);
 		}
 		}
 	}
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock_sched_notrace();
 }
 }
 EXPORT_SYMBOL_GPL(marker_probe_cb);
 EXPORT_SYMBOL_GPL(marker_probe_cb);
 
 
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  *
  *
  * Should be connected to markers "MARK_NOARGS".
  * Should be connected to markers "MARK_NOARGS".
  */
  */
-void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+		void *call_private, ...)
 {
 {
 	va_list args;	/* not initialized */
 	va_list args;	/* not initialized */
 	char ptype;
 	char ptype;
 
 
-	rcu_read_lock_sched();
+	rcu_read_lock_sched_notrace();
 	ptype = mdata->ptype;
 	ptype = mdata->ptype;
 	if (likely(!ptype)) {
 	if (likely(!ptype)) {
 		marker_probe_func *func;
 		marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 			multi[i].func(multi[i].probe_private, call_private,
 			multi[i].func(multi[i].probe_private, call_private,
 				mdata->format, &args);
 				mdata->format, &args);
 	}
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock_sched_notrace();
 }
 }
-EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
 
 
 static void free_old_closure(struct rcu_head *head)
 static void free_old_closure(struct rcu_head *head)
 {
 {
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
 	e->single.probe_private = NULL;
 	e->single.probe_private = NULL;
 	e->multi = NULL;
 	e->multi = NULL;
 	e->ptype = 0;
 	e->ptype = 0;
+	e->format_allocated = 0;
 	e->refcount = 0;
 	e->refcount = 0;
 	e->rcu_pending = 0;
 	e->rcu_pending = 0;
 	hlist_add_head(&e->hlist, head);
 	hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
 	if (e->single.func != __mark_empty_function)
 	if (e->single.func != __mark_empty_function)
 		return -EBUSY;
 		return -EBUSY;
 	hlist_del(&e->hlist);
 	hlist_del(&e->hlist);
+	if (e->format_allocated)
+		kfree(e->format);
 	/* Make sure the call_rcu has been executed */
 	/* Make sure the call_rcu has been executed */
 	if (e->rcu_pending)
 	if (e->rcu_pending)
 		rcu_barrier_sched();
 		rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
 /*
 /*
  * Set the mark_entry format to the format found in the element.
  * Set the mark_entry format to the format found in the element.
  */
  */
-static int marker_set_format(struct marker_entry **entry, const char *format)
+static int marker_set_format(struct marker_entry *entry, const char *format)
 {
 {
-	struct marker_entry *e;
-	size_t name_len = strlen((*entry)->name) + 1;
-	size_t format_len = strlen(format) + 1;
-
-
-	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
-			GFP_KERNEL);
-	if (!e)
+	entry->format = kstrdup(format, GFP_KERNEL);
+	if (!entry->format)
 		return -ENOMEM;
 		return -ENOMEM;
-	memcpy(&e->name[0], (*entry)->name, name_len);
-	e->format = &e->name[name_len];
-	memcpy(e->format, format, format_len);
-	if (strcmp(e->format, MARK_NOARGS) == 0)
-		e->call = marker_probe_cb_noarg;
-	else
-		e->call = marker_probe_cb;
-	e->single = (*entry)->single;
-	e->multi = (*entry)->multi;
-	e->ptype = (*entry)->ptype;
-	e->refcount = (*entry)->refcount;
-	e->rcu_pending = 0;
-	hlist_add_before(&e->hlist, &(*entry)->hlist);
-	hlist_del(&(*entry)->hlist);
-	/* Make sure the call_rcu has been executed */
-	if ((*entry)->rcu_pending)
-		rcu_barrier_sched();
-	kfree(*entry);
-	*entry = e;
+	entry->format_allocated = 1;
+
 	trace_mark(core_marker_format, "name %s format %s",
 	trace_mark(core_marker_format, "name %s format %s",
-			e->name, e->format);
+			entry->name, entry->format);
 	return 0;
 	return 0;
 }
 }
 
 
 /*
 /*
  * Sets the probe callback corresponding to one marker.
  * Sets the probe callback corresponding to one marker.
  */
  */
-static int set_marker(struct marker_entry **entry, struct marker *elem,
+static int set_marker(struct marker_entry *entry, struct marker *elem,
 		int active)
 		int active)
 {
 {
-	int ret;
-	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+	int ret = 0;
+	WARN_ON(strcmp(entry->name, elem->name) != 0);
 
 
-	if ((*entry)->format) {
-		if (strcmp((*entry)->format, elem->format) != 0) {
+	if (entry->format) {
+		if (strcmp(entry->format, elem->format) != 0) {
 			printk(KERN_NOTICE
 			printk(KERN_NOTICE
 				"Format mismatch for probe %s "
 				"Format mismatch for probe %s "
 				"(%s), marker (%s)\n",
 				"(%s), marker (%s)\n",
-				(*entry)->name,
-				(*entry)->format,
+				entry->name,
+				entry->format,
 				elem->format);
 				elem->format);
 			return -EPERM;
 			return -EPERM;
 		}
 		}
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
 	 * pass from a "safe" callback (with argument) to an "unsafe"
 	 * pass from a "safe" callback (with argument) to an "unsafe"
 	 * callback (does not set arguments).
 	 * callback (does not set arguments).
 	 */
 	 */
-	elem->call = (*entry)->call;
+	elem->call = entry->call;
 	/*
 	/*
 	 * Sanity check :
 	 * Sanity check :
 	 * We only update the single probe private data when the ptr is
 	 * We only update the single probe private data when the ptr is
 	 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
 	 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
 	 */
 	 */
 	WARN_ON(elem->single.func != __mark_empty_function
 	WARN_ON(elem->single.func != __mark_empty_function
-		&& elem->single.probe_private
-		!= (*entry)->single.probe_private &&
-		!elem->ptype);
-	elem->single.probe_private = (*entry)->single.probe_private;
+		&& elem->single.probe_private != entry->single.probe_private
+		&& !elem->ptype);
+	elem->single.probe_private = entry->single.probe_private;
 	/*
 	/*
 	 * Make sure the private data is valid when we update the
 	 * Make sure the private data is valid when we update the
 	 * single probe ptr.
 	 * single probe ptr.
 	 */
 	 */
 	smp_wmb();
 	smp_wmb();
-	elem->single.func = (*entry)->single.func;
+	elem->single.func = entry->single.func;
 	/*
 	/*
 	 * We also make sure that the new probe callbacks array is consistent
 	 * We also make sure that the new probe callbacks array is consistent
 	 * before setting a pointer to it.
 	 * before setting a pointer to it.
 	 */
 	 */
-	rcu_assign_pointer(elem->multi, (*entry)->multi);
+	rcu_assign_pointer(elem->multi, entry->multi);
 	/*
 	/*
 	 * Update the function or multi probe array pointer before setting the
 	 * Update the function or multi probe array pointer before setting the
 	 * ptype.
 	 * ptype.
 	 */
 	 */
 	smp_wmb();
 	smp_wmb();
-	elem->ptype = (*entry)->ptype;
+	elem->ptype = entry->ptype;
+
+	if (elem->tp_name && (active ^ elem->state)) {
+		WARN_ON(!elem->tp_cb);
+		/*
+		 * It is ok to directly call the probe registration because type
+		 * checking has been done in the __trace_mark_tp() macro.
+		 */
+
+		if (active) {
+			/*
+			 * try_module_get should always succeed because we hold
+			 * lock_module() to get the tp_cb address.
+			 */
+			ret = try_module_get(__module_text_address(
+				(unsigned long)elem->tp_cb));
+			BUG_ON(!ret);
+			ret = tracepoint_probe_register_noupdate(
+				elem->tp_name,
+				elem->tp_cb);
+		} else {
+			ret = tracepoint_probe_unregister_noupdate(
+				elem->tp_name,
+				elem->tp_cb);
+			/*
+			 * tracepoint_probe_update_all() must be called
+			 * before the module containing tp_cb is unloaded.
+			 */
+			module_put(__module_text_address(
+				(unsigned long)elem->tp_cb));
+		}
+	}
 	elem->state = active;
 	elem->state = active;
 
 
-	return 0;
+	return ret;
 }
 }
 
 
 /*
 /*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
  */
  */
 static void disable_marker(struct marker *elem)
 static void disable_marker(struct marker *elem)
 {
 {
+	int ret;
+
 	/* leave "call" as is. It is known statically. */
 	/* leave "call" as is. It is known statically. */
+	if (elem->tp_name && elem->state) {
+		WARN_ON(!elem->tp_cb);
+		/*
+		 * It is ok to directly call the probe registration because type
+		 * checking has been done in the __trace_mark_tp() macro.
+		 */
+		ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+			elem->tp_cb);
+		WARN_ON(ret);
+		/*
+		 * tracepoint_probe_update_all() must be called
+		 * before the module containing tp_cb is unloaded.
+		 */
+		module_put(__module_text_address((unsigned long)elem->tp_cb));
+	}
 	elem->state = 0;
 	elem->state = 0;
 	elem->single.func = __mark_empty_function;
 	elem->single.func = __mark_empty_function;
 	/* Update the function before setting the ptype */
 	/* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
 	for (iter = begin; iter < end; iter++) {
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_marker(iter->name);
 		mark_entry = get_marker(iter->name);
 		if (mark_entry) {
 		if (mark_entry) {
-			set_marker(&mark_entry, iter,
-					!!mark_entry->refcount);
+			set_marker(mark_entry, iter, !!mark_entry->refcount);
 			/*
 			/*
 			 * ignore error, continue
 			 * ignore error, continue
 			 */
 			 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
 	marker_update_probe_range(__start___markers, __stop___markers);
 	marker_update_probe_range(__start___markers, __stop___markers);
 	/* Markers in modules. */
 	/* Markers in modules. */
 	module_update_markers();
 	module_update_markers();
+	tracepoint_probe_update_all();
 }
 }
 
 
 /**
 /**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
 			ret = PTR_ERR(entry);
 			ret = PTR_ERR(entry);
 	} else if (format) {
 	} else if (format) {
 		if (!entry->format)
 		if (!entry->format)
-			ret = marker_set_format(&entry, format);
+			ret = marker_set_format(entry, format);
 		else if (strcmp(entry->format, format))
 		else if (strcmp(entry->format, format))
 			ret = -EPERM;
 			ret = -EPERM;
 	}
 	}
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
 		goto end;
 		goto end;
 	}
 	}
 	mutex_unlock(&markers_mutex);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes();		/* may update entry */
+	marker_update_probes();
 	mutex_lock(&markers_mutex);
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
 	entry = get_marker(name);
-	WARN_ON(!entry);
+	if (!entry)
+		goto end;
 	if (entry->rcu_pending)
 	if (entry->rcu_pending)
 		rcu_barrier_sched();
 		rcu_barrier_sched();
 	entry->oldptr = old;
 	entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
 		rcu_barrier_sched();
 		rcu_barrier_sched();
 	old = marker_entry_remove_probe(entry, probe, probe_private);
 	old = marker_entry_remove_probe(entry, probe, probe_private);
 	mutex_unlock(&markers_mutex);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes();		/* may update entry */
+	marker_update_probes();
 	mutex_lock(&markers_mutex);
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
 	entry = get_marker(name);
 	if (!entry)
 	if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
 		rcu_barrier_sched();
 		rcu_barrier_sched();
 	old = marker_entry_remove_probe(entry, NULL, probe_private);
 	old = marker_entry_remove_probe(entry, NULL, probe_private);
 	mutex_unlock(&markers_mutex);
 	mutex_unlock(&markers_mutex);
-	marker_update_probes();		/* may update entry */
+	marker_update_probes();
 	mutex_lock(&markers_mutex);
 	mutex_lock(&markers_mutex);
 	entry = get_marker_from_private_data(probe, probe_private);
 	entry = get_marker_from_private_data(probe, probe_private);
-	WARN_ON(!entry);
+	if (!entry)
+		goto end;
 	if (entry->rcu_pending)
 	if (entry->rcu_pending)
 		rcu_barrier_sched();
 		rcu_barrier_sched();
 	entry->oldptr = old;
 	entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
 			if (!e->ptype) {
 			if (!e->ptype) {
 				if (num == 0 && e->single.func == probe)
 				if (num == 0 && e->single.func == probe)
 					return e->single.probe_private;
 					return e->single.probe_private;
-				else
-					break;
 			} else {
 			} else {
 				struct marker_probe_closure *closure;
 				struct marker_probe_closure *closure;
 				int match = 0;
 				int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
 						return closure[i].probe_private;
 						return closure[i].probe_private;
 				}
 				}
 			}
 			}
+			break;
 		}
 		}
 	}
 	}
 	return ERR_PTR(-ENOENT);
 	return ERR_PTR(-ENOENT);
 }
 }
 EXPORT_SYMBOL_GPL(marker_get_private_data);
 EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+#ifdef CONFIG_MODULES
+
+int marker_module_notify(struct notifier_block *self,
+			 unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		marker_update_probe_range(mod->markers,
+			mod->markers + mod->num_markers);
+		break;
+	case MODULE_STATE_GOING:
+		marker_update_probe_range(mod->markers,
+			mod->markers + mod->num_markers);
+		break;
+	}
+	return 0;
+}
+
+struct notifier_block marker_module_nb = {
+	.notifier_call = marker_module_notify,
+	.priority = 0,
+};
+
+static int init_markers(void)
+{
+	return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
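The marker.c changes above keep the probe API intact while simplifying format handling (the format string is now kstrdup()'d and freed in remove_marker()) and moving module coverage to a notifier. For context, a minimal sketch of how that API is consumed, following the Documentation/markers.txt conventions of this series; the "subsystem_event" name, its format string and the probe body are illustrative, not part of this patch:

#include <linux/module.h>
#include <linux/marker.h>

/* Instrumentation site (would normally live in the traced subsystem). */
static void do_something(int someint, const char *somestring)
{
	trace_mark(subsystem_event, "integer %d string %s",
		   someint, somestring);
}

/* Probe callback; like __mark_empty_function() above it runs with
 * preemption disabled. */
static void probe_subsystem_event(void *probe_private, void *call_private,
				  const char *fmt, va_list *args)
{
	int value = va_arg(*args, int);
	const char *str = va_arg(*args, const char *);

	printk(KERN_DEBUG "subsystem_event: %d %s\n", value, str);
}

static int __init probe_init(void)
{
	int ret;

	/* The format string must match the marker site, otherwise
	 * set_marker() above rejects the connection with -EPERM. */
	ret = marker_probe_register("subsystem_event",
				    "integer %d string %s",
				    probe_subsystem_event, NULL);
	if (!ret)
		do_something(42, "hello");	/* fires the marker once */
	return ret;
}

static void __exit probe_exit(void)
{
	marker_probe_unregister("subsystem_event",
				probe_subsystem_event, NULL);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");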

+ 2 - 11
kernel/module.c

@@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod,
 		struct mod_debug *debug;
 		struct mod_debug *debug;
 		unsigned int num_debug;
 		unsigned int num_debug;
 
 
-#ifdef CONFIG_MARKERS
-		marker_update_probe_range(mod->markers,
-			mod->markers + mod->num_markers);
-#endif
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 				     sizeof(*debug), &num_debug);
 				     sizeof(*debug), &num_debug);
 		dynamic_printk_setup(debug, num_debug);
 		dynamic_printk_setup(debug, num_debug);
-
-#ifdef CONFIG_TRACEPOINTS
-		tracepoint_update_probe_range(mod->tracepoints,
-			mod->tracepoints + mod->num_tracepoints);
-#endif
 	}
 	}
 
 
 	/* sechdrs[0].sh_size is always zero */
 	/* sechdrs[0].sh_size is always zero */
 	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
 	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
 			    sizeof(*mseg), &num_mcount);
 			    sizeof(*mseg), &num_mcount);
-	ftrace_init_module(mseg, mseg + num_mcount);
+	ftrace_init_module(mod, mseg, mseg + num_mcount);
 
 
 	err = module_finalize(hdr, sechdrs, mod);
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
 	if (err < 0)
@@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr)
 
 
 
 
 /* Is this a valid kernel address? */
 /* Is this a valid kernel address? */
-struct module *__module_text_address(unsigned long addr)
+__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
 {
 {
 	struct module *mod;
 	struct module *mod;
 
 

+ 3 - 10
kernel/power/disk.c

@@ -22,7 +22,6 @@
 #include <linux/console.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 
 
 #include "power.h"
 #include "power.h"
 
 
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
 
 
 int hibernation_snapshot(int platform_mode)
 int hibernation_snapshot(int platform_mode)
 {
 {
-	int error, ftrace_save;
+	int error;
 
 
 	/* Free memory before shutting down devices. */
 	/* Free memory before shutting down devices. */
 	error = swsusp_shrink_memory();
 	error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
 		goto Close;
 		goto Close;
 
 
 	suspend_console();
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_FREEZE);
 	error = device_suspend(PMSG_FREEZE);
 	if (error)
 	if (error)
 		goto Recover_platform;
 		goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
  Resume_devices:
  Resume_devices:
 	device_resume(in_suspend ?
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
 	resume_console();
  Close:
  Close:
 	platform_end(platform_mode);
 	platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
 
 
 int hibernation_restore(int platform_mode)
 int hibernation_restore(int platform_mode)
 {
 {
-	int error, ftrace_save;
+	int error;
 
 
 	pm_prepare_console();
 	pm_prepare_console();
 	suspend_console();
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_QUIESCE);
 	error = device_suspend(PMSG_QUIESCE);
 	if (error)
 	if (error)
 		goto Finish;
 		goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
 	platform_restore_cleanup(platform_mode);
 	platform_restore_cleanup(platform_mode);
 	device_resume(PMSG_RECOVER);
 	device_resume(PMSG_RECOVER);
  Finish:
  Finish:
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
 	resume_console();
 	pm_restore_console();
 	pm_restore_console();
 	return error;
 	return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
 
 
 int hibernation_platform_enter(void)
 int hibernation_platform_enter(void)
 {
 {
-	int error, ftrace_save;
+	int error;
 
 
 	if (!hibernation_ops)
 	if (!hibernation_ops)
 		return -ENOSYS;
 		return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
 		goto Close;
 		goto Close;
 
 
 	suspend_console();
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_HIBERNATE);
 	error = device_suspend(PMSG_HIBERNATE);
 	if (error) {
 	if (error) {
 		if (hibernation_ops->recover)
 		if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
 	hibernation_ops->finish();
 	hibernation_ops->finish();
  Resume_devices:
  Resume_devices:
 	device_resume(PMSG_RESTORE);
 	device_resume(PMSG_RESTORE);
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
 	resume_console();
  Close:
  Close:
 	hibernation_ops->end();
 	hibernation_ops->end();

+ 1 - 4
kernel/power/main.c

@@ -22,7 +22,6 @@
 #include <linux/freezer.h>
 #include <linux/freezer.h>
 #include <linux/vmstat.h>
 #include <linux/vmstat.h>
 #include <linux/syscalls.h>
 #include <linux/syscalls.h>
-#include <linux/ftrace.h>
 
 
 #include "power.h"
 #include "power.h"
 
 
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
  */
  */
 int suspend_devices_and_enter(suspend_state_t state)
 int suspend_devices_and_enter(suspend_state_t state)
 {
 {
-	int error, ftrace_save;
+	int error;
 
 
 	if (!suspend_ops)
 	if (!suspend_ops)
 		return -ENOSYS;
 		return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
 			goto Close;
 			goto Close;
 	}
 	}
 	suspend_console();
 	suspend_console();
-	ftrace_save = __ftrace_enabled_save();
 	suspend_test_start();
 	suspend_test_start();
 	error = device_suspend(PMSG_SUSPEND);
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
 	if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
 	suspend_test_start();
 	suspend_test_start();
 	device_resume(PMSG_RESUME);
 	device_resume(PMSG_RESUME);
 	suspend_test_finish("resume devices");
 	suspend_test_finish("resume devices");
-	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
 	resume_console();
  Close:
  Close:
 	if (suspend_ops->end)
 	if (suspend_ops->end)

+ 1 - 1
kernel/profile.c

@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
 };
 };
 
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
-static inline void profile_nop(void *unused)
+static void profile_nop(void *unused)
 {
 {
 }
 }
 
 

+ 12 - 0
kernel/ptrace.c

@@ -25,6 +25,17 @@
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 
 
+
+/*
+ * Initialize a new task whose father had been ptraced.
+ *
+ * Called from copy_process().
+ */
+void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
+{
+	arch_ptrace_fork(child, clone_flags);
+}
+
 /*
 /*
  * ptrace a task: make the debugger its new parent and
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
  * move it to the ptrace list.
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
 	child->parent = child->real_parent;
 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);
 	list_del_init(&child->ptrace_entry);
 
 
+	arch_ptrace_untrace(child);
 	if (task_is_traced(child))
 	if (task_is_traced(child))
 		ptrace_untrace(child);
 		ptrace_untrace(child);
 }
 }
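ptrace_fork() and __ptrace_unlink() now funnel into arch hooks so that x86 can manage its BTS buffers across fork and detach. Architectures without branch-trace hardware are expected to see no-op fallbacks; a sketch of what the generic-header side of that looks like (this fragment is illustrative, not part of this hunk):

/* If <asm/ptrace.h> does not provide the hooks, fall back to no-ops. */
#ifndef arch_ptrace_fork
#define arch_ptrace_fork(child, clone_flags)	do { } while (0)
#endif

#ifndef arch_ptrace_untrace
#define arch_ptrace_untrace(child)		do { } while (0)
#endif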

+ 11 - 3
kernel/sched.c

@@ -118,6 +118,12 @@
  */
  */
 #define RUNTIME_INF	((u64)~0ULL)
 #define RUNTIME_INF	((u64)~0ULL)
 
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 /*
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -1847,6 +1853,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 
 	clock_offset = old_rq->clock - new_rq->clock;
 	clock_offset = old_rq->clock - new_rq->clock;
 
 
+	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
 	if (p->se.wait_start)
 		p->se.wait_start -= clock_offset;
 		p->se.wait_start -= clock_offset;
@@ -2318,7 +2326,7 @@ out_activate:
 	success = 1;
 	success = 1;
 
 
 out_running:
 out_running:
-	trace_sched_wakeup(rq, p);
+	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
 	check_preempt_curr(rq, p, sync);
 
 
 	p->state = TASK_RUNNING;
 	p->state = TASK_RUNNING;
@@ -2451,7 +2459,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 		inc_nr_running(rq);
 	}
 	}
-	trace_sched_wakeup_new(rq, p);
+	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 	if (p->sched_class->task_wake_up)
@@ -2864,7 +2872,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(!cpu_active(dest_cpu)))
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 		goto out;
 
 
-	trace_sched_migrate_task(rq, p, dest_cpu);
 	/* force the process onto the specified CPU */
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -5912,6 +5919,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	 */
 	idle->sched_class = &idle_sched_class;
 	idle->sched_class = &idle_sched_class;
+	ftrace_graph_init_task(idle);
 }
 }
 
 
 /*
 /*
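The DEFINE_TRACE() lines above give each scheduler tracepoint its single definition site, and trace_sched_wakeup()/trace_sched_wakeup_new() now carry a success flag. A sketch of a probe attached to the updated tracepoint, assuming the include/trace/sched.h prototypes from this series; the probe body and module boilerplate are illustrative:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/sched.h>

/* Must match DECLARE_TRACE(sched_wakeup, ...) exactly:
 * (struct rq *rq, struct task_struct *p, int success). */
static void probe_sched_wakeup(struct rq *rq, struct task_struct *p,
			       int success)
{
	/* Runs at the tracepoint site with preemption disabled. */
	if (success)
		pr_debug("woke up %s [%d]\n", p->comm, p->pid);
}

static int __init wakeup_probe_init(void)
{
	return register_trace_sched_wakeup(probe_sched_wakeup);
}

static void __exit wakeup_probe_exit(void)
{
	unregister_trace_sched_wakeup(probe_sched_wakeup);
	/* Wait for in-flight probe calls before this code can go away. */
	tracepoint_synchronize_unregister();
}

module_init(wakeup_probe_init);
module_exit(wakeup_probe_exit);
MODULE_LICENSE("GPL");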

+ 2 - 0
kernel/signal.c

@@ -41,6 +41,8 @@
 
 
 static struct kmem_cache *sigqueue_cachep;
 static struct kmem_cache *sigqueue_cachep;
 
 
+DEFINE_TRACE(sched_signal_send);
+
 static void __user *sig_handler(struct task_struct *t, int sig)
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
 	return t->sighand->action[sig - 1].sa.sa_handler;

+ 20 - 0
kernel/sysctl.c

@@ -487,6 +487,26 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &ftrace_enable_sysctl,
 		.proc_handler	= &ftrace_enable_sysctl,
 	},
 	},
 #endif
 #endif
+#ifdef CONFIG_STACK_TRACER
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "stack_tracer_enabled",
+		.data		= &stack_tracer_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &stack_trace_sysctl,
+	},
+#endif
+#ifdef CONFIG_TRACING
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_dump_on_oops",
+		.data		= &ftrace_dump_on_oops,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_MODULES
 #ifdef CONFIG_MODULES
 	{
 	{
 		.ctl_name	= KERN_MODPROBE,
 		.ctl_name	= KERN_MODPROBE,

+ 111 - 4
kernel/trace/Kconfig

@@ -3,18 +3,34 @@
 #  select HAVE_FUNCTION_TRACER:
 #  select HAVE_FUNCTION_TRACER:
 #
 #
 
 
+config USER_STACKTRACE_SUPPORT
+	bool
+
 config NOP_TRACER
 config NOP_TRACER
 	bool
 	bool
 
 
 config HAVE_FUNCTION_TRACER
 config HAVE_FUNCTION_TRACER
 	bool
 	bool
 
 
+config HAVE_FUNCTION_GRAPH_TRACER
+	bool
+
+config HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	bool
+	help
+	 This gets selected when the arch tests the function_trace_stop
+	 variable at the mcount call site. Otherwise, this variable
+	 is tested by the called function.
+
 config HAVE_DYNAMIC_FTRACE
 config HAVE_DYNAMIC_FTRACE
 	bool
 	bool
 
 
 config HAVE_FTRACE_MCOUNT_RECORD
 config HAVE_FTRACE_MCOUNT_RECORD
 	bool
 	bool
 
 
+config HAVE_HW_BRANCH_TRACER
+	bool
+
 config TRACER_MAX_TRACE
 config TRACER_MAX_TRACE
 	bool
 	bool
 
 
@@ -47,6 +63,20 @@ config FUNCTION_TRACER
 	  (the bootup default), then the overhead of the instructions is very
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 	  small and not measurable even in micro-benchmarks.
 
 
+config FUNCTION_GRAPH_TRACER
+	bool "Kernel Function Graph Tracer"
+	depends on HAVE_FUNCTION_GRAPH_TRACER
+	depends on FUNCTION_TRACER
+	default y
+	help
+	  Enable the kernel to trace a function at both its return
+	  and its entry.
+	  It's first purpose is to trace the duration of functions and
+	  draw a call graph for each thread with some informations like
+	  the return value.
+	  This is done by setting the current return address on the current
+	  task structure into a stack of calls.
+
 config IRQSOFF_TRACER
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	bool "Interrupts-off Latency Tracer"
 	default n
 	default n
@@ -138,6 +168,70 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 	    would invalidate the boot trace. )
 
 
+config TRACE_BRANCH_PROFILING
+	bool "Trace likely/unlikely profiler"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer profiles all the likely and unlikely macros
+	  in the kernel. It will display the results in:
+
+	  /debugfs/tracing/profile_annotated_branch
+
+	  Note: this will add a significant overhead, only turn this
+	  on if you need to profile the system's use of these macros.
+
+	  Say N if unsure.
+
+config PROFILE_ALL_BRANCHES
+	bool "Profile all if conditionals"
+	depends on TRACE_BRANCH_PROFILING
+	help
+	  This tracer profiles all branch conditions. Every if ()
+	  taken in the kernel is recorded whether it hit or miss.
+	  The results will be displayed in:
+
+	  /debugfs/tracing/profile_branch
+
+	  This configuration, when enabled, will impose a great overhead
+	  on the system. This should only be enabled when the system
+	  is to be analyzed.
+
+	  Say N if unsure.
+
+config TRACING_BRANCHES
+	bool
+	help
+	  Selected by tracers that will trace the likely and unlikely
+	  conditions. This prevents the tracers themselves from being
+	  profiled. Profiling the tracing infrastructure can only happen
+	  when the likelys and unlikelys are not being traced.
+
+config BRANCH_TRACER
+	bool "Trace likely/unlikely instances"
+	depends on TRACE_BRANCH_PROFILING
+	select TRACING_BRANCHES
+	help
+	  This traces the events of likely and unlikely condition
+	  calls in the kernel.  The difference between this and the
+	  "Trace likely/unlikely profiler" is that this is not a
+	  histogram of the callers, but actually places the calling
+	  events into a running trace buffer to see when and where the
+	  events happened, as well as their results.
+
+	  Say N if unsure.
+
+config POWER_TRACER
+	bool "Trace power consumption behavior"
+	depends on DEBUG_KERNEL
+	depends on X86
+	select TRACING
+	help
+	  This tracer helps developers to analyze and optimize the kernel's
+	  power management decisions, specifically the C-state and P-state
+	  behavior.
+
+
 config STACK_TRACER
 config STACK_TRACER
 	bool "Trace max stack"
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
@@ -150,13 +244,26 @@ config STACK_TRACER
 
 
 	  This tracer works by hooking into every function call that the
 	  This tracer works by hooking into every function call that the
 	  kernel executes, and keeping a maximum stack depth value and
 	  kernel executes, and keeping a maximum stack depth value and
-	  stack-trace saved. Because this logic has to execute in every
-	  kernel function, all the time, this option can slow down the
-	  kernel measurably and is generally intended for kernel
-	  developers only.
+	  stack-trace saved.  If this is configured with DYNAMIC_FTRACE
+	  then it will not have any overhead while the stack tracer
+	  is disabled.
+
+	  To enable the stack tracer on bootup, pass in 'stacktrace'
+	  on the kernel command line.
+
+	  The stack tracer can also be enabled or disabled via the
+	  sysctl kernel.stack_tracer_enabled
 
 
 	  Say N if unsure.
 	  Say N if unsure.
 
 
+config HW_BRANCH_TRACER
+	depends on HAVE_HW_BRANCH_TRACER
+	bool "Trace hw branches"
+	select TRACING
+	help
+	  This tracer records all branches on the system in a circular
+	  buffer giving access to the last N branches for each cpu.
+
 config DYNAMIC_FTRACE
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
 	depends on FUNCTION_TRACER

+ 9 - 0
kernel/trace/Makefile

@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 obj-y += trace_selftest_dynamic.o
 endif
 endif
 
 
+# If unlikely tracing is enabled, do not trace these files
+ifdef CONFIG_TRACING_BRANCHES
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+endif
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
+obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 
 libftrace-y := ftrace.o
 libftrace-y := ftrace.o
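kernel/trace is now built with -DDISABLE_BRANCH_PROFILING so the branch profiler never instruments its own likely()/unlikely() uses (the recursion the TRACING_BRANCHES help text above warns about). Very roughly, the mechanism looks like the following sketch; the names are hypothetical, and the real hooks live in include/linux/compiler.h and kernel/trace/trace_branch.c, with the per-site records exposed through /debugfs/tracing/profile_annotated_branch:

struct branch_site {
	const char	*file;
	unsigned int	line;
	unsigned long	correct;
	unsigned long	incorrect;
};

static inline int branch_profile(struct branch_site *site, int cond)
{
	if (cond)
		site->correct++;	/* the likely() hint was right */
	else
		site->incorrect++;
	return cond;
}

#ifdef DISABLE_BRANCH_PROFILING
/* Tracer code itself: plain hint, no accounting, no recursion. */
# define my_likely(x)	__builtin_expect(!!(x), 1)
#else
/* Everything else: account per call site, then emit the same hint. */
# define my_likely(x) ({						\
	static struct branch_site __site = {				\
		.file = __FILE__,					\
		.line = __LINE__,					\
	};								\
	__builtin_expect(branch_profile(&__site, !!(x)), 1);		\
})
#endif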

File diff too large to display
+ 642 - 130
kernel/trace/ftrace.c


File diff too large to display
+ 302 - 184
kernel/trace/ring_buffer.c


File diff too large to display
+ 611 - 66
kernel/trace/trace.c


+ 255 - 10
kernel/trace/trace.h

@@ -8,6 +8,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/ring_buffer.h>
 #include <linux/mmiotrace.h>
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 
 enum trace_type {
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 	__TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_MMIO_MAP,
-	TRACE_BOOT,
+	TRACE_BRANCH,
+	TRACE_BOOT_CALL,
+	TRACE_BOOT_RET,
+	TRACE_GRAPH_RET,
+	TRACE_GRAPH_ENT,
+	TRACE_USER_STACK,
+	TRACE_HW_BRANCHES,
+	TRACE_POWER,
 
 
 	__TRACE_LAST_TYPE
 	__TRACE_LAST_TYPE
 };
 };
@@ -38,6 +46,7 @@ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	unsigned char		preempt_count;
 	int			pid;
 	int			pid;
+	int			tgid;
 };
 };
 
 
 /*
 /*
@@ -48,6 +57,18 @@ struct ftrace_entry {
 	unsigned long		ip;
 	unsigned long		ip;
 	unsigned long		parent_ip;
 	unsigned long		parent_ip;
 };
 };
+
+/* Function call entry */
+struct ftrace_graph_ent_entry {
+	struct trace_entry			ent;
+	struct ftrace_graph_ent		graph_ent;
+};
+
+/* Function return entry */
+struct ftrace_graph_ret_entry {
+	struct trace_entry			ent;
+	struct ftrace_graph_ret		ret;
+};
 extern struct tracer boot_tracer;
 extern struct tracer boot_tracer;
 
 
 /*
 /*
@@ -85,12 +106,18 @@ struct stack_entry {
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 };
 };
 
 
+struct userstack_entry {
+	struct trace_entry	ent;
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
 /*
 /*
  * ftrace_printk entry:
  * ftrace_printk entry:
  */
  */
 struct print_entry {
 struct print_entry {
 	struct trace_entry	ent;
 	struct trace_entry	ent;
 	unsigned long		ip;
 	unsigned long		ip;
+	int			depth;
 	char			buf[];
 	char			buf[];
 };
 };
 
 
@@ -112,9 +139,35 @@ struct trace_mmiotrace_map {
 	struct mmiotrace_map	map;
 	struct mmiotrace_map	map;
 };
 };
 
 
-struct trace_boot {
+struct trace_boot_call {
 	struct trace_entry	ent;
 	struct trace_entry	ent;
-	struct boot_trace	initcall;
+	struct boot_trace_call boot_call;
+};
+
+struct trace_boot_ret {
+	struct trace_entry	ent;
+	struct boot_trace_ret boot_ret;
+};
+
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_branch {
+	struct trace_entry	ent;
+	unsigned	        line;
+	char			func[TRACE_FUNC_SIZE+1];
+	char			file[TRACE_FILE_SIZE+1];
+	char			correct;
+};
+
+struct hw_branch_entry {
+	struct trace_entry	ent;
+	u64			from;
+	u64			to;
+};
+
+struct trace_power {
+	struct trace_entry	ent;
+	struct power_trace	state_data;
 };
 };
 
 
 /*
 /*
@@ -172,7 +225,6 @@ struct trace_iterator;
 struct trace_array {
 struct trace_array {
 	struct ring_buffer	*buffer;
 	struct ring_buffer	*buffer;
 	unsigned long		entries;
 	unsigned long		entries;
-	long			ctrl;
 	int			cpu;
 	int			cpu;
 	cycle_t			time_start;
 	cycle_t			time_start;
 	struct task_struct	*waiter;
 	struct task_struct	*waiter;
@@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
 		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
 		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
+		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 			  TRACE_MMIO_MAP);				\
-		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
+		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
+		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
+		IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,	\
+			  TRACE_GRAPH_ENT);		\
+		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
+			  TRACE_GRAPH_RET);		\
+		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
+ 		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		__ftrace_bad_type();					\
 		__ftrace_bad_type();					\
 	} while (0)
 	} while (0)
 
 
@@ -229,29 +290,56 @@ enum print_line_t {
 	TRACE_TYPE_UNHANDLED	= 2	/* Relay to other output functions */
 	TRACE_TYPE_UNHANDLED	= 2	/* Relay to other output functions */
 };
 };
 
 
+
+/*
+ * An option specific to a tracer. This is a boolean value.
+ * The bit is the bit index that sets its value on the
+ * flags value in struct tracer_flags.
+ */
+struct tracer_opt {
+	const char 	*name; /* Will appear on the trace_options file */
+	u32 		bit; /* Mask assigned in val field in tracer_flags */
+};
+
+/*
+ * The set of specific options for a tracer. Your tracer
+ * have to set the initial value of the flags val.
+ */
+struct tracer_flags {
+	u32			val;
+	struct tracer_opt 	*opts;
+};
+
+/* Makes it easier to define a tracer opt */
+#define TRACER_OPT(s, b)	.name = #s, .bit = b
+
 /*
 /*
  * A specific tracer, represented by methods that operate on a trace array:
  * A specific tracer, represented by methods that operate on a trace array:
  */
  */
 struct tracer {
 struct tracer {
 	const char		*name;
 	const char		*name;
-	void			(*init)(struct trace_array *tr);
+	/* Your tracer should raise a warning if init fails */
+	int			(*init)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
+	void			(*start)(struct trace_array *tr);
+	void			(*stop)(struct trace_array *tr);
 	void			(*open)(struct trace_iterator *iter);
 	void			(*open)(struct trace_iterator *iter);
 	void			(*pipe_open)(struct trace_iterator *iter);
 	void			(*pipe_open)(struct trace_iterator *iter);
 	void			(*close)(struct trace_iterator *iter);
 	void			(*close)(struct trace_iterator *iter);
-	void			(*start)(struct trace_iterator *iter);
-	void			(*stop)(struct trace_iterator *iter);
 	ssize_t			(*read)(struct trace_iterator *iter,
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
 					struct file *filp, char __user *ubuf,
 					size_t cnt, loff_t *ppos);
 					size_t cnt, loff_t *ppos);
-	void			(*ctrl_update)(struct trace_array *tr);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	int			(*selftest)(struct tracer *trace,
 	int			(*selftest)(struct tracer *trace,
 					    struct trace_array *tr);
 					    struct trace_array *tr);
 #endif
 #endif
+	void			(*print_header)(struct seq_file *m);
 	enum print_line_t	(*print_line)(struct trace_iterator *iter);
 	enum print_line_t	(*print_line)(struct trace_iterator *iter);
+	/* If you handled the flag setting, return 0 */
+	int			(*set_flag)(u32 old_flags, u32 bit, int set);
 	struct tracer		*next;
 	struct tracer		*next;
 	int			print_max;
 	int			print_max;
+	struct tracer_flags 	*flags;
 };
 };
 
 
 struct trace_seq {
 struct trace_seq {
@@ -279,10 +367,14 @@ struct trace_iterator {
 	unsigned long		iter_flags;
 	unsigned long		iter_flags;
 	loff_t			pos;
 	loff_t			pos;
 	long			idx;
 	long			idx;
+
+	cpumask_t		started;
 };
 };
 
 
+int tracing_is_enabled(void);
 void trace_wake_up(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset(struct trace_array *tr, int cpu);
+void tracing_reset_online_cpus(struct trace_array *tr);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *tracing_init_dentry(void);
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
@@ -321,8 +413,15 @@ void trace_function(struct trace_array *tr,
 		    unsigned long parent_ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
 		    unsigned long flags, int pc);
 
 
+void trace_graph_return(struct ftrace_graph_ret *trace);
+int trace_graph_entry(struct ftrace_graph_ent *trace);
+void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
+
 void tracing_start_cmdline_record(void);
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
+void tracing_sched_switch_assign_trace(struct trace_array *tr);
+void tracing_stop_sched_switch_record(void);
+void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
 
@@ -358,6 +457,7 @@ struct tracer_switch_ops {
 	struct tracer_switch_ops	*next;
 	struct tracer_switch_ops	*next;
 };
 };
 
 
+char *trace_find_cmdline(int pid);
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -383,19 +483,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 					       struct trace_array *tr);
 					       struct trace_array *tr);
 extern int trace_selftest_startup_sysprof(struct tracer *trace,
 extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 					       struct trace_array *tr);
+extern int trace_selftest_startup_branch(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 
 extern void *head_page(struct trace_array_cpu *data);
 extern void *head_page(struct trace_array_cpu *data);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
 extern void trace_seq_print_cont(struct trace_seq *s,
 extern void trace_seq_print_cont(struct trace_seq *s,
 				 struct trace_iterator *iter);
 				 struct trace_iterator *iter);
+
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+		unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
 				 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
 extern long ns2usecs(cycle_t nsec);
-extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+extern int
+trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
 
 
 extern unsigned long trace_flags;
 extern unsigned long trace_flags;
 
 
+/* Standard output formatting function used for function return traces */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* TODO: make this variable */
+#define FTRACE_GRAPH_MAX_FUNCS		32
+extern int ftrace_graph_count;
+extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
+
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+	int i;
+
+	if (!ftrace_graph_count || test_tsk_trace_graph(current))
+		return 1;
+
+	for (i = 0; i < ftrace_graph_count; i++) {
+		if (addr == ftrace_graph_funcs[i])
+			return 1;
+	}
+
+	return 0;
+}
+#else
+static inline int ftrace_trace_addr(unsigned long addr)
+{
+	return 1;
+}
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+	return 1;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#else /* CONFIG_FUNCTION_GRAPH_TRACER */
+static inline enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+	return TRACE_TYPE_UNHANDLED;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+extern struct pid *ftrace_pid_trace;
+
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+	if (!ftrace_pid_trace)
+		return 1;
+
+	return test_tsk_trace_trace(task);
+}
+
 /*
 /*
  * trace_iterator_flags is an enumeration that defines bit
  * trace_iterator_flags is an enumeration that defines bit
  * positions into trace_flags that controls the output.
  * positions into trace_flags that controls the output.
@@ -415,8 +575,93 @@ enum trace_iterator_flags {
 	TRACE_ITER_STACKTRACE		= 0x100,
 	TRACE_ITER_STACKTRACE		= 0x100,
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
 	TRACE_ITER_PRINTK		= 0x400,
+	TRACE_ITER_PREEMPTONLY		= 0x800,
+	TRACE_ITER_BRANCH		= 0x1000,
+	TRACE_ITER_ANNOTATE		= 0x2000,
+	TRACE_ITER_USERSTACKTRACE       = 0x4000,
+	TRACE_ITER_SYM_USEROBJ          = 0x8000,
+	TRACE_ITER_PRINTK_MSGONLY	= 0x10000
 };
 };
 
 
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
 extern struct tracer nop_trace;
 extern struct tracer nop_trace;
 
 
+/**
+ * ftrace_preempt_disable - disable preemption scheduler safe
+ *
+ * When tracing can happen inside the scheduler, there exists
+ * cases that the tracing might happen before the need_resched
+ * flag is checked. If this happens and the tracer calls
+ * preempt_enable (after a disable), a schedule might take place
+ * causing an infinite recursion.
+ *
+ * To prevent this, we read the need_resched flag before
+ * disabling preemption. When we want to enable preemption we
+ * check the flag, if it is set, then we call preempt_enable_no_resched.
+ * Otherwise, we call preempt_enable.
+ *
+ * The rationale for doing the above is that if need_resched is set
+ * and we have yet to reschedule, we are either in an atomic location
+ * (where we do not need to check for scheduling) or we are inside
+ * the scheduler and do not want to resched.
+ */
+static inline int ftrace_preempt_disable(void)
+{
+	int resched;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	return resched;
+}
+
+/**
+ * ftrace_preempt_enable - enable preemption scheduler safe
+ * @resched: the return value from ftrace_preempt_disable
+ *
+ * This is a scheduler safe way to enable preemption and not miss
+ * any preemption checks. The disabled saved the state of preemption.
+ * If resched is set, then we were either inside an atomic or
+ * are inside the scheduler (we would have already scheduled
+ * otherwise). In this case, we do not want to call normal
+ * preempt_enable, but preempt_enable_no_resched instead.
+ */
+static inline void ftrace_preempt_enable(int resched)
+{
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+#ifdef CONFIG_BRANCH_TRACER
+extern int enable_branch_tracing(struct trace_array *tr);
+extern void disable_branch_tracing(void);
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+	if (trace_flags & TRACE_ITER_BRANCH)
+		return enable_branch_tracing(tr);
+	return 0;
+}
+static inline void trace_branch_disable(void)
+{
+	/* due to races, always disable */
+	disable_branch_tracing();
+}
+#else
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+	return 0;
+}
+static inline void trace_branch_disable(void)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
 #endif /* _LINUX_KERNEL_TRACE_H */
 #endif /* _LINUX_KERNEL_TRACE_H */

+ 109 - 49
kernel/trace/trace_boot.c

@@ -13,101 +13,161 @@
 #include "trace.h"
 #include "trace.h"
 
 
 static struct trace_array *boot_trace;
 static struct trace_array *boot_trace;
-static int trace_boot_enabled;
+static bool pre_initcalls_finished;
 
 
-
-/* Should be started after do_pre_smp_initcalls() in init/main.c */
+/* Tells the boot tracer that the pre_smp_initcalls are finished.
+ * So we are ready .
+ * It doesn't enable sched events tracing however.
+ * You have to call enable_boot_trace to do so.
+ */
 void start_boot_trace(void)
 void start_boot_trace(void)
 {
 {
-	trace_boot_enabled = 1;
+	pre_initcalls_finished = true;
 }
 }
 
 
-void stop_boot_trace(void)
+void enable_boot_trace(void)
 {
 {
-	trace_boot_enabled = 0;
+	if (pre_initcalls_finished)
+		tracing_start_sched_switch_record();
 }
 }
 
 
-void reset_boot_trace(struct trace_array *tr)
+void disable_boot_trace(void)
 {
 {
-	stop_boot_trace();
+	if (pre_initcalls_finished)
+		tracing_stop_sched_switch_record();
 }
 }
 
 
-static void boot_trace_init(struct trace_array *tr)
+static int boot_trace_init(struct trace_array *tr)
 {
 {
 	int cpu;
 	int cpu;
 	boot_trace = tr;
 	boot_trace = tr;
 
 
-	trace_boot_enabled = 0;
-
 	for_each_cpu_mask(cpu, cpu_possible_map)
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 		tracing_reset(tr, cpu);
+
+	tracing_sched_switch_assign_trace(tr);
+	return 0;
 }
 }
 
 
-static void boot_trace_ctrl_update(struct trace_array *tr)
+static enum print_line_t
+initcall_call_print_line(struct trace_iterator *iter)
 {
 {
-	if (tr->ctrl)
-		start_boot_trace();
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct trace_boot_call *field;
+	struct boot_trace_call *call;
+	u64 ts;
+	unsigned long nsec_rem;
+	int ret;
+
+	trace_assign_type(field, entry);
+	call = &field->boot_call;
+	ts = iter->ts;
+	nsec_rem = do_div(ts, 1000000000);
+
+	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
+			(unsigned long)ts, nsec_rem, call->func, call->caller);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 	else
 	else
-		stop_boot_trace();
+		return TRACE_TYPE_HANDLED;
 }
 }
 
 
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+static enum print_line_t
+initcall_ret_print_line(struct trace_iterator *iter)
 {
 {
-	int ret;
 	struct trace_entry *entry = iter->ent;
 	struct trace_entry *entry = iter->ent;
-	struct trace_boot *field = (struct trace_boot *)entry;
-	struct boot_trace *it = &field->initcall;
 	struct trace_seq *s = &iter->seq;
 	struct trace_seq *s = &iter->seq;
-	struct timespec calltime = ktime_to_timespec(it->calltime);
-	struct timespec rettime = ktime_to_timespec(it->rettime);
-
-	if (entry->type == TRACE_BOOT) {
-		ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
-					  calltime.tv_sec,
-					  calltime.tv_nsec,
-					  it->func, it->caller);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
-					  "returned %d after %lld msecs\n",
-					  rettime.tv_sec,
-					  rettime.tv_nsec,
-					  it->func, it->result, it->duration);
-
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+	struct trace_boot_ret *field;
+	struct boot_trace_ret *init_ret;
+	u64 ts;
+	unsigned long nsec_rem;
+	int ret;
+
+	trace_assign_type(field, entry);
+	init_ret = &field->boot_ret;
+	ts = iter->ts;
+	nsec_rem = do_div(ts, 1000000000);
+
+	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
+			"returned %d after %llu msecs\n",
+			(unsigned long) ts,
+			nsec_rem,
+			init_ret->func, init_ret->result, init_ret->duration);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	else
 		return TRACE_TYPE_HANDLED;
 		return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+
+	switch (entry->type) {
+	case TRACE_BOOT_CALL:
+		return initcall_call_print_line(iter);
+	case TRACE_BOOT_RET:
+		return initcall_ret_print_line(iter);
+	default:
+		return TRACE_TYPE_UNHANDLED;
 	}
 	}
-	return TRACE_TYPE_UNHANDLED;
 }
 }
 
 
 struct tracer boot_tracer __read_mostly =
 struct tracer boot_tracer __read_mostly =
 {
 {
 	.name		= "initcall",
 	.name		= "initcall",
 	.init		= boot_trace_init,
 	.init		= boot_trace_init,
-	.reset		= reset_boot_trace,
-	.ctrl_update	= boot_trace_ctrl_update,
+	.reset		= tracing_reset_online_cpus,
 	.print_line	= initcall_print_line,
 	.print_line	= initcall_print_line,
 };
 };
 
 
-void trace_boot(struct boot_trace *it, initcall_t fn)
+void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 {
 {
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
-	struct trace_boot *entry;
-	struct trace_array_cpu *data;
+	struct trace_boot_call *entry;
 	unsigned long irq_flags;
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 	struct trace_array *tr = boot_trace;
 
 
-	if (!trace_boot_enabled)
+	if (!pre_initcalls_finished)
 		return;
 		return;
 
 
 	/* Get its name now since this function could
 	/* Get its name now since this function could
 	 * disappear because it is in the .init section.
 	 * disappear because it is in the .init section.
 	 */
 	 */
-	sprint_symbol(it->func, (unsigned long)fn);
+	sprint_symbol(bt->func, (unsigned long)fn);
+	preempt_disable();
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_BOOT_CALL;
+	entry->boot_call = *bt;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+
+void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
+{
+	struct ring_buffer_event *event;
+	struct trace_boot_ret *entry;
+	unsigned long irq_flags;
+	struct trace_array *tr = boot_trace;
+
+	if (!pre_initcalls_finished)
+		return;
+
+	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 	preempt_disable();
-	data = tr->data[smp_processor_id()];
 
 
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					 &irq_flags);
 					 &irq_flags);
@@ -115,8 +175,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
 		goto out;
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_BOOT;
-	entry->initcall = *it;
+	entry->ent.type = TRACE_BOOT_RET;
+	entry->boot_ret = *bt;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
 
 	trace_wake_up();
 	trace_wake_up();
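The reworked boot tracer above doubles as a template for the new struct tracer API: init() returns an error code, reset() can simply be tracing_reset_online_cpus(), and per-tracer booleans go through tracer_opt/tracer_flags instead of the removed ctrl_update(). A minimal hypothetical tracer wired up the same way (all names are illustrative):

#include <linux/init.h>
#include "trace.h"

static struct trace_array *example_trace;

static int example_trace_init(struct trace_array *tr)
{
	example_trace = tr;
	tracing_reset_online_cpus(tr);
	return 0;	/* a tracer should warn and fail here if setup breaks */
}

/* One boolean option, toggled through the trace_options file. */
static struct tracer_opt example_opts[] = {
	{ TRACER_OPT(example_verbose, 0x1) },
	{ }		/* terminator */
};

static struct tracer_flags example_flags = {
	.val	= 0,	/* initial state of the option bits */
	.opts	= example_opts,
};

static int example_set_flag(u32 old_flags, u32 bit, int set)
{
	return 0;	/* flag handled, nothing extra to do */
}

static struct tracer example_tracer __read_mostly = {
	.name		= "example",
	.init		= example_trace_init,
	.reset		= tracing_reset_online_cpus,
	.flags		= &example_flags,
	.set_flag	= example_set_flag,
};

static __init int init_example_tracer(void)
{
	return register_tracer(&example_tracer);
}
device_initcall(init_example_tracer);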

Some files were not shown because too many files changed in this diff