
Merge branch 'tracing-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'tracing-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (244 commits)
  Revert "x86, bts: reenable ptrace branch trace support"
  tracing: do not translate event helper macros in print format
  ftrace/documentation: fix typo in function grapher name
  tracing/events: convert block trace points to TRACE_EVENT(), fix !CONFIG_BLOCK
  tracing: add protection around module events unload
  tracing: add trace_seq_vprint interface
  tracing: fix the block trace points print size
  tracing/events: convert block trace points to TRACE_EVENT()
  ring-buffer: fix ret in rb_add_time_stamp
  ring-buffer: pass in lockdep class key for reader_lock
  tracing: add annotation to what type of stack trace is recorded
  tracing: fix multiple use of __print_flags and __print_symbolic
  tracing/events: fix output format of user stack
  tracing/events: fix output format of kernel stack
  tracing/trace_stack: fix the number of entries in the header
  ring-buffer: discard timestamps that are at the start of the buffer
  ring-buffer: try to discard unneeded timestamps
  ring-buffer: fix bug in ring_buffer_discard_commit
  ftrace: do not profile functions when disabled
  tracing: make trace pipe recognize latency format flag
  ...
Linus Torvalds, 16 years ago
parent
commit
8623661180
100 changed files with 8043 additions and 2549 deletions
  1. Documentation/DocBook/Makefile (+2 -1)
  2. Documentation/DocBook/tracepoint.tmpl (+89 -0)
  3. Documentation/kernel-parameters.txt (+15 -12)
  4. Documentation/trace/events.txt (+90 -0)
  5. Documentation/trace/ftrace.txt (+1 -1)
  6. Documentation/trace/power.txt (+17 -0)
  7. arch/x86/Kconfig.debug (+9 -0)
  8. arch/x86/include/asm/ds.h (+56 -26)
  9. arch/x86/include/asm/processor.h (+28 -7)
  10. arch/x86/include/asm/ptrace.h (+4 -5)
  11. arch/x86/include/asm/tlbflush.h (+4 -4)
  12. arch/x86/kernel/Makefile (+1 -0)
  13. arch/x86/kernel/ds.c (+519 -193)
  14. arch/x86/kernel/ds_selftest.c (+408 -0)
  15. arch/x86/kernel/ds_selftest.h (+15 -0)
  16. arch/x86/kernel/entry_64.S (+3 -16)
  17. arch/x86/kernel/process.c (+3 -2)
  18. arch/x86/kernel/process_32.c (+2 -1)
  19. arch/x86/kernel/process_64.c (+2 -1)
  20. arch/x86/kernel/ptrace.c (+182 -102)
  21. arch/x86/kernel/stacktrace.c (+1 -1)
  22. arch/x86/mm/kmmio.c (+61 -43)
  23. arch/x86/mm/mmio-mod.c (+2 -0)
  24. block/blk-core.c (+6 -15)
  25. block/blk-sysfs.c (+6 -1)
  26. block/compat_ioctl.c (+1 -1)
  27. block/elevator.c (+2 -6)
  28. drivers/md/dm.c (+3 -5)
  29. drivers/scsi/sg.c (+1 -0)
  30. fs/bio.c (+1 -2)
  31. include/asm-generic/vmlinux.lds.h (+1 -1)
  32. include/linux/blktrace_api.h (+33 -12)
  33. include/linux/ftrace.h (+8 -9)
  34. include/linux/ftrace_event.h (+172 -0)
  35. include/linux/init_task.h (+1 -0)
  36. include/linux/kmemtrace.h (+25 -0)
  37. include/linux/mm.h (+4 -3)
  38. include/linux/mmiotrace.h (+2 -0)
  39. include/linux/module.h (+8 -0)
  40. include/linux/ptrace.h (+0 -10)
  41. include/linux/ring_buffer.h (+58 -8)
  42. include/linux/sched.h (+7 -10)
  43. include/linux/slab_def.h (+1 -1)
  44. include/linux/slub_def.h (+1 -1)
  45. include/linux/trace_seq.h (+92 -0)
  46. include/linux/tracepoint.h (+5 -3)
  47. include/trace/block.h (+0 -76)
  48. include/trace/define_trace.h (+75 -0)
  49. include/trace/events/block.h (+498 -0)
  50. include/trace/events/irq.h (+145 -0)
  51. include/trace/events/kmem.h (+231 -0)
  52. include/trace/events/lockdep.h (+96 -0)
  53. include/trace/events/sched.h (+19 -10)
  54. include/trace/events/skb.h (+40 -0)
  55. include/trace/events/workqueue.h (+100 -0)
  56. include/trace/ftrace.h (+591 -0)
  57. include/trace/irq.h (+0 -9)
  58. include/trace/irq_event_types.h (+0 -55)
  59. include/trace/kmemtrace.h (+0 -63)
  60. include/trace/lockdep.h (+0 -9)
  61. include/trace/lockdep_event_types.h (+0 -44)
  62. include/trace/sched.h (+0 -9)
  63. include/trace/skb.h (+0 -11)
  64. include/trace/trace_event_types.h (+0 -5)
  65. include/trace/trace_events.h (+0 -5)
  66. include/trace/workqueue.h (+0 -25)
  67. init/main.c (+1 -1)
  68. kernel/Makefile (+1 -0)
  69. kernel/exit.c (+1 -5)
  70. kernel/fork.c (+4 -5)
  71. kernel/irq/handle.c (+1 -4)
  72. kernel/kthread.c (+1 -4)
  73. kernel/lockdep.c (+5 -11)
  74. kernel/module.c (+14 -12)
  75. kernel/ptrace.c (+0 -10)
  76. kernel/sched.c (+47 -8)
  77. kernel/signal.c (+1 -3)
  78. kernel/softirq.c (+3 -4)
  79. kernel/trace/Kconfig (+110 -33)
  80. kernel/trace/Makefile (+14 -6)
  81. kernel/trace/blktrace.c (+229 -43)
  82. kernel/trace/events.c (+0 -14)
  83. kernel/trace/ftrace.c (+705 -67)
  84. kernel/trace/kmemtrace.c (+7 -3)
  85. kernel/trace/ring_buffer.c (+461 -215)
  86. kernel/trace/ring_buffer_benchmark.c (+416 -0)
  87. kernel/trace/trace.c (+253 -130)
  88. kernel/trace/trace.h (+98 -145)
  89. kernel/trace/trace_boot.c (+3 -2)
  90. kernel/trace/trace_branch.c (+5 -3)
  91. kernel/trace/trace_event_profile.c (+16 -8)
  92. kernel/trace/trace_event_types.h (+7 -5)
  93. kernel/trace/trace_events.c (+700 -139)
  94. kernel/trace/trace_events_filter.c (+963 -241)
  95. kernel/trace/trace_events_stage_1.h (+0 -39)
  96. kernel/trace/trace_events_stage_2.h (+0 -176)
  97. kernel/trace/trace_events_stage_3.h (+0 -281)
  98. kernel/trace/trace_export.c (+107 -3)
  99. kernel/trace/trace_functions_graph.c (+18 -7)
  100. kernel/trace/trace_hw_branches.c (+105 -98)

+ 2 - 1
Documentation/DocBook/Makefile

@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
 	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
 	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
 	    genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
 	    genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
 	    mac80211.xml debugobjects.xml sh.xml regulator.xml \
 	    mac80211.xml debugobjects.xml sh.xml regulator.xml \
-	    alsa-driver-api.xml writing-an-alsa-driver.xml
+	    alsa-driver-api.xml writing-an-alsa-driver.xml \
+	    tracepoint.xml
 
 
 ###
 ###
 # The build process is as follows (targets):
 # The build process is as follows (targets):

+ 89 - 0
Documentation/DocBook/tracepoint.tmpl

@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Tracepoints">
+ <bookinfo>
+  <title>The Linux Kernel Tracepoint API</title>
+
+  <authorgroup>
+   <author>
+    <firstname>Jason</firstname>
+    <surname>Baron</surname>
+    <affiliation>
+     <address>
+      <email>jbaron@redhat.com</email>
+     </address>
+    </affiliation>
+   </author>
+  </authorgroup>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License as published by the Free Software Foundation; either
+     version 2 of the License, or (at your option) any later
+     version.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+  <chapter id="intro">
+   <title>Introduction</title>
+   <para>
+     Tracepoints are static probe points that are located in strategic points
+     throughout the kernel. 'Probes' register/unregister with tracepoints
+     via a callback mechanism. The 'probes' are strictly typed functions that
+     are passed a unique set of parameters defined by each tracepoint.
+   </para>
+
+   <para>
+     From this simple callback mechanism, 'probes' can be used to profile, debug,
+     and understand kernel behavior. There are a number of tools that provide a
+     framework for using 'probes'. These tools include Systemtap, ftrace, and
+     LTTng.
+   </para>
+
+   <para>
+     Tracepoints are defined in a number of header files via various macros. Thus,
+     the purpose of this document is to provide a clear accounting of the available
+     tracepoints. The intention is to understand not only what tracepoints are
+     available but also to understand where future tracepoints might be added.
+   </para>
+
+   <para>
+     The API presented has functions of the form:
+     <function>trace_tracepointname(function parameters)</function>. These are the
+     tracepoints callbacks that are found throughout the code. Registering and
+     unregistering probes with these callback sites is covered in the
+     <filename>Documentation/trace/*</filename> directory.
+   </para>
+  </chapter>
+
+  <chapter id="irq">
+   <title>IRQ</title>
+!Iinclude/trace/events/irq.h
+  </chapter>
+
+</book>
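
The introduction above describes probes as strictly typed callbacks that register and unregister with a tracepoint. A minimal probe module for the irq_handler_entry tracepoint covered by the IRQ chapter might look like the sketch below (illustrative only, not part of this merge; the register_trace_<name>() helpers are generated from the TRACE_EVENT() definitions in include/trace/events/irq.h, and the probe signature is assumed to match that header's TP_PROTO() in this kernel version):

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/interrupt.h>
	#include <linux/tracepoint.h>
	#include <trace/events/irq.h>

	/* Probe signature mirrors TP_PROTO(int irq, struct irqaction *action). */
	static void probe_irq_entry(int irq, struct irqaction *action)
	{
		printk(KERN_INFO "irq %d handled by %s\n", irq, action->name);
	}

	static int __init irq_probe_init(void)
	{
		/* Hook the probe into the tracepoint's callback list. */
		return register_trace_irq_handler_entry(probe_irq_entry);
	}

	static void __exit irq_probe_exit(void)
	{
		unregister_trace_irq_handler_entry(probe_irq_entry);
		/* Wait for in-flight probe calls before the module goes away. */
		tracepoint_synchronize_unregister();
	}

	module_init(irq_probe_init);
	module_exit(irq_probe_exit);
	MODULE_LICENSE("GPL");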

+ 15 - 12
Documentation/kernel-parameters.txt

@@ -56,7 +56,6 @@ parameter is applicable:
 	ISAPNP	ISA PnP code is enabled.
 	ISAPNP	ISA PnP code is enabled.
 	ISDN	Appropriate ISDN support is enabled.
 	ISDN	Appropriate ISDN support is enabled.
 	JOY	Appropriate joystick support is enabled.
 	JOY	Appropriate joystick support is enabled.
-	KMEMTRACE kmemtrace is enabled.
 	LIBATA  Libata driver is enabled
 	LIBATA  Libata driver is enabled
 	LP	Printer support is enabled.
 	LP	Printer support is enabled.
 	LOOP	Loopback device support is enabled.
 	LOOP	Loopback device support is enabled.
@@ -754,12 +753,25 @@ and is between 256 and 4096 characters. It is defined in the file
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
 
 	ftrace=[tracer]
 	ftrace=[tracer]
-			[ftrace] will set and start the specified tracer
+			[FTRACE] will set and start the specified tracer
 			as early as possible in order to facilitate early
 			as early as possible in order to facilitate early
 			boot debugging.
 			boot debugging.
 
 
 	ftrace_dump_on_oops
 	ftrace_dump_on_oops
-			[ftrace] will dump the trace buffers on oops.
+			[FTRACE] will dump the trace buffers on oops.
+
+	ftrace_filter=[function-list]
+			[FTRACE] Limit the functions traced by the function
+			tracer at boot up. function-list is a comma separated
+			list of functions. This list can be changed at run
+			time by the set_ftrace_filter file in the debugfs
+			tracing directory. 
+
+	ftrace_notrace=[function-list]
+			[FTRACE] Do not trace the functions specified in
+			function-list. This list can be changed at run time
+			by the set_ftrace_notrace file in the debugfs
+			tracing directory.
 
 
 	gamecon.map[2|3]=
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
@@ -1056,15 +1068,6 @@ and is between 256 and 4096 characters. It is defined in the file
 			use the HighMem zone if it exists, and the Normal
 			use the HighMem zone if it exists, and the Normal
 			zone if it does not.
 			zone if it does not.
 
 
-	kmemtrace.enable=	[KNL,KMEMTRACE] Format: { yes | no }
-				Controls whether kmemtrace is enabled
-				at boot-time.
-
-	kmemtrace.subbufs=n	[KNL,KMEMTRACE] Overrides the number of
-			subbufs kmemtrace's relay channel has. Set this
-			higher than default (KMEMTRACE_N_SUBBUFS in code) if
-			you experience buffer overruns.
-
 	kgdboc=		[HW] kgdb over consoles.
 	kgdboc=		[HW] kgdb over consoles.
 			Requires a tty driver that supports console polling.
 			Requires a tty driver that supports console polling.
 			(only serial suported for now)
 			(only serial suported for now)

+ 90 - 0
Documentation/trace/events.txt

@@ -0,0 +1,90 @@
+			     Event Tracing
+
+		Documentation written by Theodore Ts'o
+			Updated by Li Zefan
+
+1. Introduction
+===============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used
+without creating custom kernel modules to register probe functions
+using the event tracing infrastructure.
+
+Not all tracepoints can be traced using the event tracing system;
+the kernel developer must provide code snippets which define how the
+tracing information is saved into the tracing buffer, and how the
+tracing information should be printed.
+
+2. Using Event Tracing
+======================
+
+2.1 Via the 'set_event' interface
+---------------------------------
+
+The events which are available for tracing can be found in the file
+/debug/tracing/available_events.
+
+To enable a particular event, such as 'sched_wakeup', simply echo it
+to /debug/tracing/set_event. For example:
+
+	# echo sched_wakeup >> /debug/tracing/set_event
+
+[ Note: '>>' is necessary, otherwise it will firstly disable
+  all the events. ]
+
+To disable an event, echo the event name to the set_event file prefixed
+with an exclamation point:
+
+	# echo '!sched_wakeup' >> /debug/tracing/set_event
+
+To disable all events, echo an empty line to the set_event file:
+
+	# echo > /debug/tracing/set_event
+
+To enable all events, echo '*:*' or '*:' to the set_event file:
+
+	# echo *:* > /debug/tracing/set_event
+
+The events are organized into subsystems, such as ext4, irq, sched,
+etc., and a full event name looks like this: <subsystem>:<event>.  The
+subsystem name is optional, but it is displayed in the available_events
+file.  All of the events in a subsystem can be specified via the syntax
+"<subsystem>:*"; for example, to enable all irq events, you can use the
+command:
+
+	# echo 'irq:*' > /debug/tracing/set_event
+
+2.2 Via the 'enable' toggle
+---------------------------
+
+The events available are also listed in /debug/tracing/events/ hierarchy
+of directories.
+
+To enable event 'sched_wakeup':
+
+	# echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To disable it:
+
+	# echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To enable all events in sched subsystem:
+
+	# echo 1 > /debug/tracing/events/sched/enable
+
+To enable all events:
+
+	# echo 1 > /debug/tracing/events/enable
+
+When reading one of these enable files, there are four results:
+
+ 0 - all events this file affects are disabled
+ 1 - all events this file affects are enabled
+ X - there is a mixture of events enabled and disabled
+ ? - this file does not affect any event
+
+3. Defining an event-enabled tracepoint
+=======================================
+
+See the example provided in samples/trace_events
+
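
Section 3 above defers to samples/trace_events for defining an event-enabled tracepoint. The rough shape of such a definition with the TRACE_EVENT() macro introduced by this merge is sketched below (illustrative only; the event name and fields are hypothetical, and the real macro machinery is in include/trace/ftrace.h and include/trace/define_trace.h further down in this diff):

	/* Placed in a trace header, e.g. a hypothetical include/trace/events/foo.h. */
	TRACE_EVENT(foo_bar,

		TP_PROTO(const char *name, int value),

		TP_ARGS(name, value),

		/* Layout of the record stored in the ring buffer. */
		TP_STRUCT__entry(
			__array(char,	name,	16)
			__field(int,	value)
		),

		/* How the record is filled in at the trace_foo_bar() call site. */
		TP_fast_assign(
			strncpy(__entry->name, name, 16);
			__entry->value = value;
		),

		/* How the record is rendered in the trace output. */
		TP_printk("name=%s value=%d", __entry->name, __entry->value)
	);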

+ 1 - 1
Documentation/trace/ftrace.txt

@@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured.
 
 
 	Function call tracer to trace all kernel functions.
 	Function call tracer to trace all kernel functions.
 
 
-  "function_graph_tracer"
+  "function_graph"
 
 
 	Similar to the function tracer except that the
 	Similar to the function tracer except that the
 	function tracer probes the functions on their entry
 	function tracer probes the functions on their entry

+ 17 - 0
Documentation/trace/power.txt

@@ -0,0 +1,17 @@
+The power tracer collects detailed information about C-state and P-state
+transitions, instead of just looking at the high-level "average"
+information.
+
+There is a helper script found in scripts/tracing/power.pl in the kernel
+sources which can be used to parse this information and create a
+Scalable Vector Graphics (SVG) picture from the trace data.
+
+To use this tracer:
+
+	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+	echo power > /sys/kernel/debug/tracing/current_tracer
+	echo 1 > /sys/kernel/debug/tracing/tracing_enabled
+	sleep 1
+	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+	cat /sys/kernel/debug/tracing/trace | \
+		perl scripts/tracing/power.pl > out.svg

+ 9 - 0
arch/x86/Kconfig.debug

@@ -174,6 +174,15 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
 
+config X86_DS_SELFTEST
+    bool "DS selftest"
+    default y
+    depends on DEBUG_KERNEL
+    depends on X86_DS
+	---help---
+	  Perform Debug Store selftests at boot time.
+	  If in doubt, say "N".
+
 config HAVE_MMIOTRACE_SUPPORT
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 	def_bool y
 
 

+ 56 - 26
arch/x86/include/asm/ds.h

@@ -15,8 +15,8 @@
  * - buffer allocation (memory accounting)
  * - buffer allocation (memory accounting)
  *
  *
  *
  *
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
  */
 
 
 #ifndef _ASM_X86_DS_H
 #ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
  * The interrupt threshold is independent from the overflow callback
  * The interrupt threshold is independent from the overflow callback
  * to allow users to use their own overflow interrupt handling mechanism.
  * to allow users to use their own overflow interrupt handling mechanism.
  *
  *
- * task: the task to request recording for;
- *       NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu:  the cpu to request recording for
  * base: the base pointer for the (non-pageable) buffer;
  * base: the base pointer for the (non-pageable) buffer;
  * size: the size of the provided buffer in bytes
  * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
  * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
  *     -1 if no interrupt threshold is requested.
  *     -1 if no interrupt threshold is requested.
  * flags: a bit-mask of the above flags
  * flags: a bit-mask of the above flags
  */
  */
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
-					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl,
-					 size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-					   void *base, size_t size,
-					   pebs_ovfl_callback_t ovfl,
-					   size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+					      void *base, size_t size,
+					      bts_ovfl_callback_t ovfl,
+					      size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+					     bts_ovfl_callback_t ovfl,
+					     size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+						void *base, size_t size,
+						pebs_ovfl_callback_t ovfl,
+						size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+					       void *base, size_t size,
+					       pebs_ovfl_callback_t ovfl,
+					       size_t th, unsigned int flags);
 
 
 /*
 /*
  * Release BTS or PEBS resources
  * Release BTS or PEBS resources
  * Suspend and resume BTS or PEBS tracing
  * Suspend and resume BTS or PEBS tracing
  *
  *
+ * Must be called with irq's enabled.
+ *
  * tracer: the tracer handle returned from ds_request_~()
  * tracer: the tracer handle returned from ds_request_~()
  */
  */
 extern void ds_release_bts(struct bts_tracer *tracer);
 extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
 extern void ds_suspend_pebs(struct pebs_tracer *tracer);
 extern void ds_suspend_pebs(struct pebs_tracer *tracer);
 extern void ds_resume_pebs(struct pebs_tracer *tracer);
 extern void ds_resume_pebs(struct pebs_tracer *tracer);
 
 
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
 
 
 /*
 /*
  * The raw DS buffer state as it is used for BTS and PEBS recording.
  * The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
 		} lbr;
 		} lbr;
 		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
 		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
 		struct {
 		struct {
-			__u64 jiffies;
+			__u64 clock;
 			pid_t pid;
 			pid_t pid;
-		} timestamp;
+		} event;
 	} variant;
 	} variant;
 };
 };
 
 
@@ -201,8 +234,12 @@ struct bts_trace {
 struct pebs_trace {
 struct pebs_trace {
 	struct ds_trace ds;
 	struct ds_trace ds;
 
 
-	/* the PEBS reset value */
-	unsigned long long reset_value;
+	/* the number of valid counters in the below array */
+	unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+	/* the counter reset value */
+	unsigned long long counter_reset[MAX_PEBS_COUNTERS];
 };
 };
 
 
 
 
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
  * Returns 0 on success; -Eerrno on error
  * Returns 0 on success; -Eerrno on error
  *
  *
  * tracer: the tracer handle returned from ds_request_pebs()
  * tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
  * value: the new counter reset value
  * value: the new counter reset value
  */
  */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+			     unsigned int counter, u64 value);
 
 
 /*
 /*
  * Initialization
  * Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
  */
  */
 extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
 
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
 #else /* CONFIG_X86_DS */
 #else /* CONFIG_X86_DS */
 
 
 struct cpuinfo_x86;
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void ds_switch_to(struct task_struct *prev,
 static inline void ds_switch_to(struct task_struct *prev,
 				struct task_struct *next) {}
 				struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
-				  struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
 
 
 #endif /* CONFIG_X86_DS */
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
 #endif /* _ASM_X86_DS_H */
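
The header change above splits ds_request_bts() into per-task and per-cpu variants. A rough sketch of how a kernel caller might use the new per-cpu interface follows (illustrative only, not part of this diff; it mirrors the calls made in arch/x86/kernel/ds_selftest.c later in this merge, with BTS_KERNEL and the (size_t)-1 "no interrupt threshold" convention taken from asm/ds.h):

	#include <linux/err.h>
	#include <asm/ds.h>

	static unsigned char bts_buffer[4096];	/* non-pageable trace buffer */

	static int bts_trace_cpu(int cpu)
	{
		struct bts_tracer *tracer;

		/* No overflow callback, no interrupt threshold, kernel branches only. */
		tracer = ds_request_bts_cpu(cpu, bts_buffer, sizeof(bts_buffer),
					    NULL, (size_t)-1, BTS_KERNEL);
		if (IS_ERR(tracer))
			return PTR_ERR(tracer);

		/* ... let the cpu run and collect branch records ... */

		ds_suspend_bts(tracer);
		/* Inspect the collected trace via ds_read_bts(tracer) here. */
		ds_release_bts(tracer);		/* must be called with irqs enabled */

		return 0;
	}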

+ 28 - 7
arch/x86/include/asm/processor.h

@@ -462,14 +462,8 @@ struct thread_struct {
 	unsigned		io_bitmap_max;
 	unsigned		io_bitmap_max;
 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
 	unsigned long	debugctlmsr;
 	unsigned long	debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+	/* Debug Store context; see asm/ds.h */
 	struct ds_context	*ds_ctx;
 	struct ds_context	*ds_ctx;
-#endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
-	unsigned int	bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
 };
 };
 
 
 static inline unsigned long native_get_debugreg(int regno)
 static inline unsigned long native_get_debugreg(int regno)
@@ -797,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
     return debugctlmsr;
 }
 }
 
 
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+	u64 debugctlmsr = 0;
+	u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return 0;
+#endif
+	rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+	debugctlmsr = val1 | ((u64)val2 << 32);
+
+	return debugctlmsr;
+}
+
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -806,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 }
 
 
+static inline void update_debugctlmsr_on_cpu(int cpu,
+					     unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return;
+#endif
+	wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+		     (u32)((u64)debugctlmsr),
+		     (u32)((u64)debugctlmsr >> 32));
+}
+
 /*
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
  * others may find it useful:

+ 4 - 5
arch/x86/include/asm/ptrace.h

@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 			      struct user_desc __user *info, int can_allocate);
 
 
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
-			    unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
 
 
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk)	ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */
 
 

+ 4 - 4
arch/x86/include/asm/tlbflush.h

@@ -17,7 +17,7 @@
 
 
 static inline void __native_flush_tlb(void)
 static inline void __native_flush_tlb(void)
 {
 {
-	write_cr3(read_cr3());
+	native_write_cr3(native_read_cr3());
 }
 }
 
 
 static inline void __native_flush_tlb_global(void)
 static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
 	 */
 	 */
 	raw_local_irq_save(flags);
 	raw_local_irq_save(flags);
 
 
-	cr4 = read_cr4();
+	cr4 = native_read_cr4();
 	/* clear PGE */
 	/* clear PGE */
-	write_cr4(cr4 & ~X86_CR4_PGE);
+	native_write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
 	/* write old PGE again and flush TLBs */
-	write_cr4(cr4);
+	native_write_cr4(cr4);
 
 
 	raw_local_irq_restore(flags);
 	raw_local_irq_restore(flags);
 }
 }

+ 1 - 0
arch/x86/kernel/Makefile

@@ -44,6 +44,7 @@ obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
 obj-y				+= ptrace.o
 obj-$(CONFIG_X86_DS)		+= ds.o
 obj-$(CONFIG_X86_DS)		+= ds.o
+obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
 obj-y				+= step.o

File diff suppressed because it is too large
+ 519 - 193
arch/x86/kernel/ds.c


+ 408 - 0
arch/x86/kernel/ds_selftest.c

@@ -0,0 +1,408 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/ds.h>
+
+
+#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */
+
+struct ds_selftest_bts_conf {
+	struct bts_tracer *tracer;
+	int error;
+	int (*suspend)(struct bts_tracer *);
+	int (*resume)(struct bts_tracer *);
+};
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+	int error = 0;
+
+	if (!trace) {
+		printk(KERN_CONT "failed to access trace...");
+		/* Bail out. Other tests are pointless. */
+		return -1;
+	}
+
+	if (!trace->read) {
+		printk(KERN_CONT "bts read not available...");
+		error = -1;
+	}
+
+	/* Do some sanity checks on the trace configuration. */
+	if (!trace->ds.n) {
+		printk(KERN_CONT "empty bts buffer...");
+		error = -1;
+	}
+	if (!trace->ds.size) {
+		printk(KERN_CONT "bad bts trace setup...");
+		error = -1;
+	}
+	if (trace->ds.end !=
+	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+		printk(KERN_CONT "bad bts buffer setup...");
+		error = -1;
+	}
+	/*
+	 * We allow top in [begin; end], since its not clear when the
+	 * overflow adjustment happens: after the increment or before the
+	 * write.
+	 */
+	if ((trace->ds.top < trace->ds.begin) ||
+	    (trace->ds.end < trace->ds.top)) {
+		printk(KERN_CONT "bts top out of bounds...");
+		error = -1;
+	}
+
+	return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+				const struct bts_trace *trace,
+				const void *from, const void *to)
+{
+	const unsigned char *at;
+
+	/*
+	 * Check a few things which do not belong to this test.
+	 * They should be covered by other tests.
+	 */
+	if (!trace)
+		return -1;
+
+	if (!trace->read)
+		return -1;
+
+	if (to < from)
+		return -1;
+
+	if (from < trace->ds.begin)
+		return -1;
+
+	if (trace->ds.end < to)
+		return -1;
+
+	if (!trace->ds.size)
+		return -1;
+
+	/* Now to the test itself. */
+	for (at = from; (void *)at < to; at += trace->ds.size) {
+		struct bts_struct bts;
+		unsigned long index;
+		int error;
+
+		if (((void *)at - trace->ds.begin) % trace->ds.size) {
+			printk(KERN_CONT
+			       "read from non-integer index...");
+			return -1;
+		}
+		index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+		memset(&bts, 0, sizeof(bts));
+		error = trace->read(tracer, at, &bts);
+		if (error < 0) {
+			printk(KERN_CONT
+			       "error reading bts trace at [%lu] (0x%p)...",
+			       index, at);
+			return error;
+		}
+
+		switch (bts.qualifier) {
+		case BTS_BRANCH:
+			break;
+		default:
+			printk(KERN_CONT
+			       "unexpected bts entry %llu at [%lu] (0x%p)...",
+			       bts.qualifier, index, at);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static void ds_selftest_bts_cpu(void *arg)
+{
+	struct ds_selftest_bts_conf *conf = arg;
+	const struct bts_trace *trace;
+	void *top;
+
+	if (IS_ERR(conf->tracer)) {
+		conf->error = PTR_ERR(conf->tracer);
+		conf->tracer = NULL;
+
+		printk(KERN_CONT
+		       "initialization failed (err: %d)...", conf->error);
+		return;
+	}
+
+	/* We should meanwhile have enough trace. */
+	conf->error = conf->suspend(conf->tracer);
+	if (conf->error < 0)
+		return;
+
+	/* Let's see if we can access the trace. */
+	trace = ds_read_bts(conf->tracer);
+
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
+
+	/* If everything went well, we should have a few trace entries. */
+	if (trace->ds.top == trace->ds.begin) {
+		/*
+		 * It is possible but highly unlikely that we got a
+		 * buffer overflow and end up at exactly the same
+		 * position we started from.
+		 * Let's issue a warning, but continue.
+		 */
+		printk(KERN_CONT "no trace/overflow...");
+	}
+
+	/* Let's try to read the trace we collected. */
+	conf->error =
+		ds_selftest_bts_read(conf->tracer, trace,
+				     trace->ds.begin, trace->ds.top);
+	if (conf->error < 0)
+		return;
+
+	/*
+	 * Let's read the trace again.
+	 * Since we suspended tracing, we should get the same result.
+	 */
+	top = trace->ds.top;
+
+	trace = ds_read_bts(conf->tracer);
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
+
+	if (top != trace->ds.top) {
+		printk(KERN_CONT "suspend not working...");
+		conf->error = -1;
+		return;
+	}
+
+	/* Let's collect some more trace - see if resume is working. */
+	conf->error = conf->resume(conf->tracer);
+	if (conf->error < 0)
+		return;
+
+	conf->error = conf->suspend(conf->tracer);
+	if (conf->error < 0)
+		return;
+
+	trace = ds_read_bts(conf->tracer);
+
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
+
+	if (trace->ds.top == top) {
+		/*
+		 * It is possible but highly unlikely that we got a
+		 * buffer overflow and end up at exactly the same
+		 * position we started from.
+		 * Let's issue a warning and check the full trace.
+		 */
+		printk(KERN_CONT
+		       "no resume progress/overflow...");
+
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace,
+					     trace->ds.begin, trace->ds.end);
+	} else if (trace->ds.top < top) {
+		/*
+		 * We had a buffer overflow - the entire buffer should
+		 * contain trace records.
+		 */
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace,
+					     trace->ds.begin, trace->ds.end);
+	} else {
+		/*
+		 * It is quite likely that the buffer did not overflow.
+		 * Let's just check the delta trace.
+		 */
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace, top,
+					     trace->ds.top);
+	}
+	if (conf->error < 0)
+		return;
+
+	conf->error = 0;
+}
+
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+	ds_suspend_bts(tracer);
+	return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+	ds_resume_bts(tracer);
+	return 0;
+}
+
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+	(void)ds_release_bts_noirq(tracer);
+}
+
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+					     struct bts_tracer *tracer)
+{
+	int error = -EPERM;
+
+	/* Try to release the tracer on the wrong cpu. */
+	get_cpu();
+	if (cpu != smp_processor_id()) {
+		error = ds_release_bts_noirq(tracer);
+		if (error != -EPERM)
+			printk(KERN_CONT "release on wrong cpu...");
+	}
+	put_cpu();
+
+	return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+	struct bts_tracer *tracer;
+	int error;
+
+	/* Try to request cpu tracing while task tracing is active. */
+	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+				    (size_t)-1, BTS_KERNEL);
+	error = PTR_ERR(tracer);
+	if (!IS_ERR(tracer)) {
+		ds_release_bts(tracer);
+		error = 0;
+	}
+
+	if (error != -EPERM)
+		printk(KERN_CONT "cpu/task tracing overlap...");
+
+	return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+	struct bts_tracer *tracer;
+	int error;
+
+	/* Try to request cpu tracing while task tracing is active. */
+	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+				    (size_t)-1, BTS_KERNEL);
+	error = PTR_ERR(tracer);
+	if (!IS_ERR(tracer)) {
+		error = 0;
+		ds_release_bts(tracer);
+	}
+
+	if (error != -EPERM)
+		printk(KERN_CONT "task/cpu tracing overlap...");
+
+	return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+	struct ds_selftest_bts_conf conf;
+	unsigned char buffer[BUFFER_SIZE], *small_buffer;
+	unsigned long irq;
+	int cpu;
+
+	printk(KERN_INFO "[ds] bts selftest...");
+	conf.error = 0;
+
+	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		conf.suspend = ds_suspend_bts_wrap;
+		conf.resume = ds_resume_bts_wrap;
+		conf.tracer =
+			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+					   NULL, (size_t)-1, BTS_KERNEL);
+		ds_selftest_bts_cpu(&conf);
+		if (conf.error >= 0)
+			conf.error = ds_selftest_bts_bad_request_task(buffer);
+		ds_release_bts(conf.tracer);
+		if (conf.error < 0)
+			goto out;
+
+		conf.suspend = ds_suspend_bts_noirq;
+		conf.resume = ds_resume_bts_noirq;
+		conf.tracer =
+			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+					   NULL, (size_t)-1, BTS_KERNEL);
+		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+		if (conf.error >= 0) {
+			conf.error =
+				ds_selftest_bts_bad_release_noirq(cpu,
+								  conf.tracer);
+			/* We must not release the tracer twice. */
+			if (conf.error < 0)
+				conf.tracer = NULL;
+		}
+		if (conf.error >= 0)
+			conf.error = ds_selftest_bts_bad_request_task(buffer);
+		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+					 conf.tracer, 1);
+		if (conf.error < 0)
+			goto out;
+	}
+
+	conf.suspend = ds_suspend_bts_wrap;
+	conf.resume = ds_resume_bts_wrap;
+	conf.tracer =
+		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+				    NULL, (size_t)-1, BTS_KERNEL);
+	ds_selftest_bts_cpu(&conf);
+	if (conf.error >= 0)
+		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+	ds_release_bts(conf.tracer);
+	if (conf.error < 0)
+		goto out;
+
+	conf.suspend = ds_suspend_bts_noirq;
+	conf.resume = ds_resume_bts_noirq;
+	conf.tracer =
+		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
+	local_irq_save(irq);
+	ds_selftest_bts_cpu(&conf);
+	if (conf.error >= 0)
+		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+	ds_release_bts_noirq(conf.tracer);
+	local_irq_restore(irq);
+	if (conf.error < 0)
+		goto out;
+
+	conf.error = 0;
+ out:
+	put_online_cpus();
+	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+	return conf.error;
+}
+
+int ds_selftest_pebs(void)
+{
+	return 0;
+}

+ 15 - 0
arch/x86/kernel/ds_selftest.h

@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif

+ 3 - 16
arch/x86/kernel/entry_64.S

@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
 GLOBAL(return_to_handler)
 GLOBAL(return_to_handler)
 	subq  $80, %rsp
 	subq  $80, %rsp
 
 
+	/* Save the return values */
 	movq %rax, (%rsp)
 	movq %rax, (%rsp)
-	movq %rcx, 8(%rsp)
-	movq %rdx, 16(%rsp)
-	movq %rsi, 24(%rsp)
-	movq %rdi, 32(%rsp)
-	movq %r8, 40(%rsp)
-	movq %r9, 48(%rsp)
-	movq %r10, 56(%rsp)
-	movq %r11, 64(%rsp)
+	movq %rdx, 8(%rsp)
 
 
 	call ftrace_return_to_handler
 	call ftrace_return_to_handler
 
 
 	movq %rax, 72(%rsp)
 	movq %rax, 72(%rsp)
-	movq 64(%rsp), %r11
-	movq 56(%rsp), %r10
-	movq 48(%rsp), %r9
-	movq 40(%rsp), %r8
-	movq 32(%rsp), %rdi
-	movq 24(%rsp), %rsi
-	movq 16(%rsp), %rdx
-	movq 8(%rsp), %rcx
+	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	movq (%rsp), %rax
 	addq $72, %rsp
 	addq $72, %rsp
 	retq
 	retq

+ 3 - 2
arch/x86/kernel/process.c

@@ -16,6 +16,7 @@
 #include <asm/idle.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/i387.h>
+#include <asm/ds.h>
 
 
 unsigned long idle_halt;
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
 EXPORT_SYMBOL(idle_halt);
@@ -47,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 		tsk->thread.xstate = NULL;
 	}
 	}
+
+	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 }
 
 
 void free_thread_info(struct thread_info *ti)
 void free_thread_info(struct thread_info *ti)
@@ -85,8 +88,6 @@ void exit_thread(void)
 		put_cpu();
 		put_cpu();
 		kfree(bp);
 		kfree(bp);
 	}
 	}
-
-	ds_exit_thread(current);
 }
 }
 
 
 void flush_thread(void)
 void flush_thread(void)

+ 2 - 1
arch/x86/kernel/process_32.c

@@ -287,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		p->thread.io_bitmap_max = 0;
 		p->thread.io_bitmap_max = 0;
 	}
 	}
 
 
-	ds_copy_thread(p, current);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
 	p->thread.debugctlmsr = 0;

+ 2 - 1
arch/x86/kernel/process_64.c

@@ -332,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 			goto out;
 			goto out;
 	}
 	}
 
 
-	ds_copy_thread(p, me);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
 	p->thread.debugctlmsr = 0;

+ 182 - 102
arch/x86/kernel/ptrace.c

@@ -21,6 +21,7 @@
 #include <linux/audit.h>
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/signal.h>
+#include <linux/workqueue.h>
 
 
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
 }
 }
 
 
 #ifdef CONFIG_X86_PTRACE_BTS
 #ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+	/* The branch trace handle. */
+	struct bts_tracer	*tracer;
+
+	/* The buffer used to store the branch trace and its size. */
+	void			*buffer;
+	unsigned int		size;
+
+	/* The mm that paid for the above buffer. */
+	struct mm_struct	*mm;
+
+	/* The task this context belongs to. */
+	struct task_struct	*task;
+
+	/* The signal to send on a bts buffer overflow. */
+	unsigned int		bts_ovfl_signal;
+
+	/* The work struct to destroy a context. */
+	struct work_struct	work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+	void *buffer = NULL;
+	int err = -ENOMEM;
+
+	err = account_locked_memory(current->mm, current->signal->rlim, size);
+	if (err < 0)
+		return err;
+
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
+		goto out_refund;
+
+	context->buffer = buffer;
+	context->size = size;
+	context->mm = get_task_mm(current);
+
+	return 0;
+
+ out_refund:
+	refund_locked_memory(current->mm, size);
+	return err;
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+	if (!context->buffer)
+		return;
+
+	kfree(context->buffer);
+	context->buffer = NULL;
+
+	refund_locked_memory(context->mm, context->size);
+	context->size = 0;
+
+	mmput(context->mm);
+	context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+	struct bts_context *context;
+
+	context = container_of(w, struct bts_context, work);
+
+	ds_release_bts(context->tracer);
+	put_task_struct(context->task);
+	free_bts_buffer(context);
+	kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+	INIT_WORK(&context->work, free_bts_context_work);
+	schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (context) {
+		context->task = task;
+		task->bts = context;
+
+		get_task_struct(task);
+	}
+
+	return context;
+}
+
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 				  struct bts_struct __user *out)
 				  struct bts_struct __user *out)
 {
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const struct bts_trace *trace;
 	struct bts_struct bts;
 	struct bts_struct bts;
 	const unsigned char *at;
 	const unsigned char *at;
 	int error;
 	int error;
 
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 
 	at = trace->ds.top - ((index + 1) * trace->ds.size);
 	at = trace->ds.top - ((index + 1) * trace->ds.size);
 	if ((void *)at < trace->ds.begin)
 	if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 	if (!trace->read)
 	if (!trace->read)
 		return -EOPNOTSUPP;
 		return -EOPNOTSUPP;
 
 
-	error = trace->read(child->bts, at, &bts);
+	error = trace->read(context->tracer, at, &bts);
 	if (error < 0)
 	if (error < 0)
 		return error;
 		return error;
 
 
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 			    long size,
 			    long size,
 			    struct bts_struct __user *out)
 			    struct bts_struct __user *out)
 {
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const struct bts_trace *trace;
 	const unsigned char *at;
 	const unsigned char *at;
 	int error, drained = 0;
 	int error, drained = 0;
 
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 
 	if (!trace->read)
 	if (!trace->read)
 		return -EOPNOTSUPP;
 		return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     out++, drained++, at += trace->ds.size) {
 	     out++, drained++, at += trace->ds.size) {
 		struct bts_struct bts;
 		struct bts_struct bts;
-		int error;
 
 
-		error = trace->read(child->bts, at, &bts);
+		error = trace->read(context->tracer, at, &bts);
 		if (error < 0)
 		if (error < 0)
 			return error;
 			return error;
 
 
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 
 
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
 
-	error = ds_reset_bts(child->bts);
+	error = ds_reset_bts(context->tracer);
 	if (error < 0)
 	if (error < 0)
 		return error;
 		return error;
 
 
 	return drained;
 	return drained;
 }
 }
 
 
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
-	child->bts_buffer = alloc_locked_buffer(size);
-	if (!child->bts_buffer)
-		return -ENOMEM;
-
-	child->bts_size = size;
-
-	return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
-	free_locked_buffer(child->bts_buffer, child->bts_size);
-	child->bts_buffer = NULL;
-	child->bts_size = 0;
-}
-
 static int ptrace_bts_config(struct task_struct *child,
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
 			     const struct ptrace_bts_config __user *ucfg)
 {
 {
+	struct bts_context *context;
 	struct ptrace_bts_config cfg;
 	struct ptrace_bts_config cfg;
 	unsigned int flags = 0;
 	unsigned int flags = 0;
 
 
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
 		return -EFAULT;
 		return -EFAULT;
 
 
-	if (child->bts) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-	}
+	context = child->bts;
+	if (!context)
+		context = alloc_bts_context(child);
+	if (!context)
+		return -ENOMEM;
 
 
 	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 		if (!cfg.signal)
 		if (!cfg.signal)
 			return -EINVAL;
 			return -EINVAL;
 
 
-		child->thread.bts_ovfl_signal = cfg.signal;
 		return -EOPNOTSUPP;
 		return -EOPNOTSUPP;
+		context->bts_ovfl_signal = cfg.signal;
 	}
 	}
 
 
-	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-	    (cfg.size != child->bts_size)) {
-		int error;
+	ds_release_bts(context->tracer);
+	context->tracer = NULL;
 
 
-		ptrace_bts_free_buffer(child);
+	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+		int err;
 
 
-		error = ptrace_bts_allocate_buffer(child, cfg.size);
-		if (error < 0)
-			return error;
+		free_bts_buffer(context);
+		if (!cfg.size)
+			return 0;
+
+		err = alloc_bts_buffer(context, cfg.size);
+		if (err < 0)
+			return err;
 	}
 	}
 
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 		flags |= BTS_TIMESTAMPS;
 		flags |= BTS_TIMESTAMPS;
 
 
-	child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
-				    /* ovfl = */ NULL, /* th = */ (size_t)-1,
-				    flags);
-	if (IS_ERR(child->bts)) {
-		int error = PTR_ERR(child->bts);
-
-		ptrace_bts_free_buffer(child);
-		child->bts = NULL;
+	context->tracer =
+		ds_request_bts_task(child, context->buffer, context->size,
+				    NULL, (size_t)-1, flags);
+	if (unlikely(IS_ERR(context->tracer))) {
+		int error = PTR_ERR(context->tracer);
 
 
+		free_bts_buffer(context);
+		context->tracer = NULL;
 		return error;
 		return error;
 	}
 	}
 
 
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
 			     long cfg_size,
 			     long cfg_size,
 			     struct ptrace_bts_config __user *ucfg)
 			     struct ptrace_bts_config __user *ucfg)
 {
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const struct bts_trace *trace;
 	struct ptrace_bts_config cfg;
 	struct ptrace_bts_config cfg;
 
 
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
 	if (cfg_size < sizeof(cfg))
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 		return -EIO;
 
 
-	trace = ds_read_bts(child->bts);
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 
 	memset(&cfg, 0, sizeof(cfg));
 	memset(&cfg, 0, sizeof(cfg));
-	cfg.size = trace->ds.end - trace->ds.begin;
-	cfg.signal = child->thread.bts_ovfl_signal;
-	cfg.bts_size = sizeof(struct bts_struct);
+	cfg.size	= trace->ds.end - trace->ds.begin;
+	cfg.signal	= context->bts_ovfl_signal;
+	cfg.bts_size	= sizeof(struct bts_struct);
 
 
 	if (cfg.signal)
 	if (cfg.signal)
 		cfg.flags |= PTRACE_BTS_O_SIGNAL;
 		cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
 
 
 static int ptrace_bts_clear(struct task_struct *child)
 static int ptrace_bts_clear(struct task_struct *child)
 {
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const struct bts_trace *trace;
 
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
 
-	return ds_reset_bts(child->bts);
+	return ds_reset_bts(context->tracer);
 }
 }
 
 
 static int ptrace_bts_size(struct task_struct *child)
 static int ptrace_bts_size(struct task_struct *child)
 {
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const struct bts_trace *trace;
 
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 }
 
 
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
-	tsk->bts = NULL;
-	tsk->bts_buffer = NULL;
-	tsk->bts_size = 0;
-	tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
 {
 {
 	if (unlikely(child->bts)) {
 	if (unlikely(child->bts)) {
-		ds_release_bts(child->bts);
+		free_bts_context(child->bts);
 		child->bts = NULL;
 		child->bts = NULL;
-
-		/* We cannot update total_vm and locked_vm since
-		   child's mm is already gone. But we can reclaim the
-		   memory. */
-		kfree(child->bts_buffer);
-		child->bts_buffer = NULL;
-		child->bts_size = 0;
 	}
 	}
 }
 }
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
-	/*
-	 * Ptrace_detach() races with ptrace_untrace() in case
-	 * the child dies and is reaped by another thread.
-	 *
-	 * We only do the memory accounting at this point and
-	 * leave the buffer deallocation and the bts tracer
-	 * release to ptrace_bts_untrace() which will be called
-	 * later on with tasklist_lock held.
-	 */
-	release_locked_buffer(child->bts_buffer, child->bts_size);
-}
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 #endif /* CONFIG_X86_PTRACE_BTS */
 
 
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-	ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
-	ptrace_bts_untrace(child);
-}
-
 /*
 /*
  * Called by kernel/ptrace.c when detaching..
  * Called by kernel/ptrace.c when detaching..
  *
  *
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 #endif
-	ptrace_bts_detach(child);
 }
 }
 
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION

+ 1 - 1
arch/x86/kernel/stacktrace.c

@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
 
 
 static int save_stack_stack(void *data, char *name)
 static int save_stack_stack(void *data, char *name)
 {
 {
-	return -1;
+	return 0;
 }
 }
 
 
 static void save_stack_address(void *data, unsigned long addr, int reliable)
 static void save_stack_address(void *data, unsigned long addr, int reliable)

+ 61 - 43
arch/x86/mm/kmmio.c

@@ -32,7 +32,7 @@ struct kmmio_fault_page {
 	struct list_head list;
 	struct list_head list;
 	struct kmmio_fault_page *release_next;
 	struct kmmio_fault_page *release_next;
 	unsigned long page; /* location of the fault page */
 	unsigned long page; /* location of the fault page */
-	bool old_presence; /* page presence prior to arming */
+	pteval_t old_presence; /* page presence prior to arming */
 	bool armed;
 	bool armed;
 
 
 	/*
 	/*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 {
 	struct list_head *head;
-	struct kmmio_fault_page *p;
+	struct kmmio_fault_page *f;

 	page &= PAGE_MASK;
 	head = kmmio_page_list(page);
-	list_for_each_entry_rcu(p, head, list) {
-		if (p->page == page)
-			return p;
+	list_for_each_entry_rcu(f, head, list) {
+		if (f->page == page)
+			return f;
 	}
 	return NULL;
 }

-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
 {
 	pmdval_t v = pmd_val(*pmd);
-	*old = !!(v & _PAGE_PRESENT);
-	v &= ~_PAGE_PRESENT;
-	if (present)
-		v |= _PAGE_PRESENT;
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
 	set_pmd(pmd, __pmd(v));
 }

-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
 {
 	pteval_t v = pte_val(*pte);
-	*old = !!(v & _PAGE_PRESENT);
-	v &= ~_PAGE_PRESENT;
-	if (present)
-		v |= _PAGE_PRESENT;
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
 	set_pte_atomic(pte, __pte(v));
 }

-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 {
 	unsigned int level;
-	pte_t *pte = lookup_address(addr, &level);
+	pte_t *pte = lookup_address(f->page, &level);

 	if (!pte) {
-		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+		pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
 		return -1;
 	}

 	switch (level) {
 	case PG_LEVEL_2M:
-		set_pmd_presence((pmd_t *)pte, present, old);
+		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
 		break;
 	case PG_LEVEL_4K:
-		set_pte_presence(pte, present, old);
+		clear_pte_presence(pte, clear, &f->old_presence);
 		break;
 	default:
 		pr_err("kmmio: unexpected page level 0x%x.\n", level);
 		return -1;
 	}

-	__flush_tlb_one(addr);
+	__flush_tlb_one(f->page);
 	return 0;
 }
 
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 	WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
 	if (f->armed) {
 		pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
-					f->page, f->count, f->old_presence);
+					f->page, f->count, !!f->old_presence);
 	}
-	ret = set_page_presence(f->page, false, &f->old_presence);
+	ret = clear_page_presence(f, true);
 	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
 	f->armed = true;
 	return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 /** Restore the given page to saved presence state. */
 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	bool tmp;
-	int ret = set_page_presence(f->page, f->old_presence, &tmp);
+	int ret = clear_page_presence(f, false);
 	WARN_ONCE(ret < 0,
 			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
 	f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);

 	if (!ctx->active) {
-		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+		/*
+		 * debug traps without an active context are due to either
+		 * something external causing them (f.e. using a debugger while
+		 * mmio tracing enabled), or erroneous behaviour
+		 */
+		pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
 							smp_processor_id());
 		goto out;
 	}
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 						head,
 						struct kmmio_delayed_release,
 						rcu);
-	struct kmmio_fault_page *p = dr->release_list;
-	while (p) {
-		struct kmmio_fault_page *next = p->release_next;
-		BUG_ON(p->count);
-		kfree(p);
-		p = next;
+	struct kmmio_fault_page *f = dr->release_list;
+	while (f) {
+		struct kmmio_fault_page *next = f->release_next;
+		BUG_ON(f->count);
+		kfree(f);
+		f = next;
 	}
 	kfree(dr);
 }
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
 	struct kmmio_delayed_release *dr =
 		container_of(head, struct kmmio_delayed_release, rcu);
-	struct kmmio_fault_page *p = dr->release_list;
+	struct kmmio_fault_page *f = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;

 	spin_lock_irqsave(&kmmio_lock, flags);
-	while (p) {
-		if (!p->count) {
-			list_del_rcu(&p->list);
-			prevp = &p->release_next;
+	while (f) {
+		if (!f->count) {
+			list_del_rcu(&f->list);
+			prevp = &f->release_next;
 		} else {
-			*prevp = p->release_next;
+			*prevp = f->release_next;
 		}
-		p = p->release_next;
+		f = f->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
 
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 }
 EXPORT_SYMBOL(unregister_kmmio_probe);

-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-								void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
 {
 	struct die_args *arg = args;
 
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
 	.notifier_call = kmmio_die_notifier
 };

-static int __init init_kmmio(void)
+int kmmio_init(void)
 {
 	int i;
+
 	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
 		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
 	return register_die_notifier(&nb_die);
 }
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+	int i;
+
+	unregister_die_notifier(&nb_die);
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+			KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+	}
+}

+ 2 - 0
arch/x86/mm/mmio-mod.c

@@ -451,6 +451,7 @@ void enable_mmiotrace(void)

 	if (nommiotrace)
 		pr_info(NAME "MMIO tracing disabled.\n");
+	kmmio_init();
 	enter_uniprocessor();
 	spin_lock_irq(&trace_lock);
 	atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)

 	clear_trace_list(); /* guarantees: no more kmmio callbacks */
 	leave_uniprocessor();
+	kmmio_cleanup();
 	pr_info(NAME "disabled.\n");
 out:
 	mutex_unlock(&mmiotrace_mutex);

+ 6 - 15
block/blk-core.c

@@ -28,22 +28,14 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
-#include <trace/block.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/block.h>

 #include "blk.h"

-DEFINE_TRACE(block_plug);
-DEFINE_TRACE(block_unplug_io);
-DEFINE_TRACE(block_unplug_timer);
-DEFINE_TRACE(block_getrq);
-DEFINE_TRACE(block_sleeprq);
-DEFINE_TRACE(block_rq_requeue);
-DEFINE_TRACE(block_bio_backmerge);
-DEFINE_TRACE(block_bio_frontmerge);
-DEFINE_TRACE(block_bio_queue);
-DEFINE_TRACE(block_rq_complete);
-DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);

 static int __make_request(struct request_queue *q, struct bio *bio);

@@ -1277,7 +1269,7 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_bdev = bdev->bd_contains;

 		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev, bio->bi_sector,
+				    bdev->bd_dev,
 				    bio->bi_sector - p->start_sect);
 	}
 }
@@ -1446,8 +1438,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;

 		if (old_sector != -1)
-			trace_block_remap(q, bio, old_dev, bio->bi_sector,
-					    old_sector);
+			trace_block_remap(q, bio, old_dev, old_sector);

 		trace_block_bio_queue(q, bio);

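With the per-tracepoint DEFINE_TRACE() lines gone from blk-core.c, exactly one translation unit per event header is expected to define CREATE_TRACE_POINTS before including it; every other block file just includes <trace/events/block.h> and calls the generated trace_* hooks. A minimal sketch of the two sides (submit_one() is a made-up caller; the block names are the ones used above):

	/* the owning .c file -- instantiates the tracepoints */
	#define CREATE_TRACE_POINTS
	#include <trace/events/block.h>

	/* any other .c file -- declarations only */
	#include <trace/events/block.h>

	static void submit_one(struct request_queue *q, struct bio *bio)
	{
		trace_block_bio_queue(q, bio);	/* cheap test-and-skip when the event is off */
		generic_make_request(bio);
	}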
+ 6 - 1
block/blk-sysfs.c

@@ -383,16 +383,21 @@ struct kobj_type blk_queue_ktype = {
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
+	struct device *dev = disk_to_dev(disk);

 	struct request_queue *q = disk->queue;

 	if (WARN_ON(!q))
 		return -ENXIO;

+	ret = blk_trace_init_sysfs(dev);
+	if (ret)
+		return ret;
+
 	if (!q->request_fn)
 		return 0;

-	ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
+	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj),
 			  "%s", "queue");
 	if (ret < 0)
 		return ret;

+ 1 - 1
block/compat_ioctl.c

@@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
 	memcpy(&buts.name, &cbuts.name, 32);

 	mutex_lock(&bdev->bd_mutex);
-	ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts);
+	ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
 	mutex_unlock(&bdev->bd_mutex);
 	if (ret)
 		return ret;

+ 2 - 6
block/elevator.c

@@ -33,17 +33,16 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>

+#include <trace/events/block.h>
+
 #include "blk.h"

 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);

-DEFINE_TRACE(block_rq_abort);
-
 /*
  * Merge hash stuff.
  */
@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))

-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.

+ 3 - 5
drivers/md/dm.c

@@ -20,7 +20,8 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
+
+#include <trace/events/block.h>

 #define DM_MSG_PREFIX "core"

@@ -53,8 +54,6 @@ struct dm_target_io {
 	union map_info info;
 };

-DEFINE_TRACE(block_bio_complete);
-
 /*
  * For request-based dm.
  * One of these is allocated per request.
@@ -656,8 +655,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		/* the bio has been remapped so dispatch it */

 		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+				    tio->io->bio->bi_bdev->bd_dev, sector);

 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

+ 1 - 0
drivers/scsi/sg.c

@@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 		return blk_trace_setup(sdp->device->request_queue,
 				       sdp->disk->disk_name,
 				       MKDEV(SCSI_GENERIC_MAJOR, sdp->index),
+				       NULL,
 				       (char *)arg);
 	case BLKTRACESTART:
 		return blk_trace_startstop(sdp->device->request_queue, 1);

+ 1 - 2
fs/bio.c

@@ -26,10 +26,9 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */

-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>

 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio

+ 1 - 1
include/asm-generic/vmlinux.lds.h

@@ -63,7 +63,7 @@
 #define BRANCH_PROFILE()
 #endif

-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS()	VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
 			*(_ftrace_events)				\
 			VMLINUX_SYMBOL(__stop_ftrace_events) = .;

+ 33 - 12
include/linux/blktrace_api.h

@@ -116,9 +116,9 @@ struct blk_io_trace {
  * The remap event
  */
 struct blk_io_trace_remap {
-	__be32 device;
 	__be32 device_from;
-	__be64 sector;
+	__be32 device_to;
+	__be64 sector_from;
 };

 enum {
@@ -165,8 +165,9 @@ struct blk_trace {

 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern int do_blk_trace_setup(struct request_queue *q,
-	char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern int do_blk_trace_setup(struct request_queue *q, char *name,
+			      dev_t dev, struct block_device *bdev,
+			      struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);

 /**
@@ -193,22 +194,42 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
 				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			   struct block_device *bdev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
+extern int blk_trace_init_sysfs(struct device *dev);

 extern struct attribute_group blk_trace_attr_group;

 #else /* !CONFIG_BLK_DEV_IO_TRACE */
-#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
-#define blk_trace_shutdown(q)			do { } while (0)
-#define do_blk_trace_setup(q, name, dev, buts)	(-ENOTTY)
-#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
-#define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
-#define blk_trace_startstop(q, start)		(-ENOTTY)
-#define blk_trace_remove(q)			(-ENOTTY)
-#define blk_add_trace_msg(q, fmt, ...)		do { } while (0)
+# define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+# define blk_trace_shutdown(q)				do { } while (0)
+# define do_blk_trace_setup(q, name, dev, bdev, buts)	(-ENOTTY)
+# define blk_add_driver_data(q, rq, data, len)		do {} while (0)
+# define blk_trace_setup(q, name, dev, bdev, arg)	(-ENOTTY)
+# define blk_trace_startstop(q, start)			(-ENOTTY)
+# define blk_trace_remove(q)				(-ENOTTY)
+# define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
+static inline int blk_trace_init_sysfs(struct device *dev)
+{
+	return 0;
+}

 #endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
+
+static inline int blk_cmd_buf_len(struct request *rq)
+{
+	return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+}
+
+extern void blk_dump_cmd(char *buf, struct request *rq);
+extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
+
 #endif /* __KERNEL__ */
 #endif

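The cmd_len * 3 in blk_cmd_buf_len() budgets two hex digits plus a separator for every CDB byte of a packet-command request (the ": 1" case only needs room for an empty string on fs requests). A rough sketch of a formatter that fits that budget; this illustrates the sizing only and is not the in-tree blk_dump_cmd():

	static void sketch_dump_cmd(char *buf, const unsigned char *cmd,
				    unsigned short len)
	{
		int i, off = 0;

		for (i = 0; i < len; i++)
			off += sprintf(buf + off, "%s%02x", i ? " " : "", cmd[i]);
		/* worst case: 2 * len digits + (len - 1) spaces + '\0' == len * 3 */
	}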
+ 8 - 9
include/linux/ftrace.h

@@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);

 extern int skip_trace(unsigned long ip);

-extern void ftrace_release(void *start, unsigned long size);
-
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else
@@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled)

 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(struct module *mod,
-			       unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
-static inline void
-ftrace_init_module(struct module *mod,
-		   unsigned long *start, unsigned long *end) { }
 #endif

 /*
@@ -368,6 +361,7 @@ struct ftrace_ret_stack {
 	unsigned long ret;
 	unsigned long func;
 	unsigned long long calltime;
+	unsigned long long subtime;
 };

 /*
@@ -379,8 +373,6 @@ extern void return_to_handler(void);

 extern int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth);
-extern void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);

 /*
  * Sometimes we don't want to trace a function with the function
@@ -496,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)

 extern int ftrace_dump_on_oops;

+#ifdef CONFIG_PREEMPT
+#define INIT_TRACE_RECURSION		.trace_recursion = 0,
+#endif
+
 #endif /* CONFIG_TRACING */

+#ifndef INIT_TRACE_RECURSION
+#define INIT_TRACE_RECURSION
+#endif

 #ifdef CONFIG_HW_BRANCH_TRACER

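INIT_TRACE_RECURSION follows the usual pattern for optional designated initializers: defined to a field initializer when the field exists, defined to nothing otherwise, so init_task.h (next hunk) can list it unconditionally. The idiom in isolation, with placeholder names:

	#ifdef CONFIG_FOO
	# define INIT_FOO	.foo = 0,
	#else
	# define INIT_FOO	/* expands to nothing when the field is absent */
	#endif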
+ 172 - 0
include/linux/ftrace_event.h

@@ -0,0 +1,172 @@
+#ifndef _LINUX_FTRACE_EVENT_H
+#define _LINUX_FTRACE_EVENT_H
+
+#include <linux/trace_seq.h>
+#include <linux/ring_buffer.h>
+#include <linux/percpu.h>
+
+struct trace_array;
+struct tracer;
+struct dentry;
+
+DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
+struct trace_print_flags {
+	unsigned long		mask;
+	const char		*name;
+};
+
+const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+				   unsigned long flags,
+				   const struct trace_print_flags *flag_array);
+
+const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+				     const struct trace_print_flags *symbol_array);
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	unsigned short		type;
+	unsigned char		flags;
+	unsigned char		preempt_count;
+	int			pid;
+	int			tgid;
+};
+
+#define FTRACE_MAX_EVENT						\
+	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	void			*private;
+	int			cpu_file;
+	struct mutex		mutex;
+	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+	unsigned long		iter_flags;
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	int			cpu;
+	u64			ts;
+
+	loff_t			pos;
+	long			idx;
+
+	cpumask_var_t		started;
+};
+
+
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+					      int flags);
+struct trace_event {
+	struct hlist_node	node;
+	struct list_head	list;
+	int			type;
+	trace_print_func	trace;
+	trace_print_func	raw;
+	trace_print_func	hex;
+	trace_print_func	binary;
+};
+
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
+
+/* Return values for print_line callback */
+enum print_line_t {
+	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
+	TRACE_TYPE_HANDLED	= 1,
+	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
+	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
+};
+
+
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(int type, unsigned long len,
+				  unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
+
+void tracing_record_cmdline(struct task_struct *tsk);
+
+struct ftrace_event_call {
+	struct list_head	list;
+	char			*name;
+	char			*system;
+	struct dentry		*dir;
+	struct trace_event	*event;
+	int			enabled;
+	int			(*regfunc)(void);
+	void			(*unregfunc)(void);
+	int			id;
+	int			(*raw_init)(void);
+	int			(*show_format)(struct trace_seq *s);
+	int			(*define_fields)(void);
+	struct list_head	fields;
+	int			filter_active;
+	void			*filter;
+	void			*mod;
+
+#ifdef CONFIG_EVENT_PROFILE
+	atomic_t	profile_count;
+	int		(*profile_enable)(struct ftrace_event_call *);
+	void		(*profile_disable)(struct ftrace_event_call *);
+#endif
+};
+
+#define MAX_FILTER_PRED		32
+#define MAX_FILTER_STR_VAL	128
+
+extern int init_preds(struct ftrace_event_call *call);
+extern void destroy_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+					void *rec,
+					struct ring_buffer_event *event);
+
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+			      char *name, int offset, int size, int is_signed);
+
+#define is_signed_type(type)	(((type)(-1)) < 0)
+
+int trace_set_clr_event(const char *system, const char *event, int set);
+
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
+#define event_trace_printk(ip, fmt, args...)				\
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
+#define __common_field(type, item, is_signed)				\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item), is_signed);	\
+	if (ret)							\
+		return ret;
+
+#endif /* _LINUX_FTRACE_EVENT_H */
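Among the new entry points above, trace_set_clr_event() is the in-kernel counterpart of writing to the per-event enable files in debugfs. A minimal sketch of switching one event on from kernel code (the initcall name is made up; block_rq_issue is one of the events defined later in this series):

	static int __init my_event_init(void)
	{
		/* enable block:block_rq_issue; returns 0 or a negative errno */
		return trace_set_clr_event("block", "block_rq_issue", 1);
	}
	late_initcall(my_event_init);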

+ 1 - 0
include/linux/init_task.h

@@ -174,6 +174,7 @@ extern struct cred init_cred;
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
 }

+ 25 - 0
include/linux/kmemtrace.h

@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#ifndef _LINUX_KMEMTRACE_H
+#define _LINUX_KMEMTRACE_H
+
+#ifdef __KERNEL__
+
+#include <trace/events/kmem.h>
+
+#ifdef CONFIG_KMEMTRACE
+extern void kmemtrace_init(void);
+#else
+static inline void kmemtrace_init(void)
+{
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_KMEMTRACE_H */
+

+ 4 - 3
include/linux/mm.h

@@ -19,6 +19,7 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct rlimit;

 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1317,8 +1318,8 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);

-extern void *alloc_locked_buffer(size_t size);
-extern void free_locked_buffer(void *buffer, size_t size);
-extern void release_locked_buffer(void *buffer, size_t size);
+extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+				 size_t size);
+extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */

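account_locked_memory() and refund_locked_memory() split what alloc_locked_buffer() used to do: the caller charges the RLIMIT_MEMLOCK budget first and allocates separately, refunding on failure or on free. A hedged sketch of the intended calling pattern (the rlimit lookup and the "non-zero means failure" convention are assumptions, not copied from the in-tree caller):

	static void *sketch_alloc_locked(size_t size)
	{
		void *buf;

		if (account_locked_memory(current->mm,
					  current->signal->rlim + RLIMIT_MEMLOCK,
					  size))
			return NULL;

		buf = kzalloc(size, GFP_KERNEL);
		if (!buf)
			refund_locked_memory(current->mm, size);

		return buf;
	}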
+ 2 - 0
include/linux/mmiotrace.h

@@ -30,6 +30,8 @@ extern unsigned int kmmio_count;

 extern int register_kmmio_probe(struct kmmio_probe *p);
 extern void unregister_kmmio_probe(struct kmmio_probe *p);
+extern int kmmio_init(void);
+extern void kmmio_cleanup(void);

 #ifdef CONFIG_MMIOTRACE
 /* kmmio is active by some kmmio_probes? */

+ 8 - 0
include/linux/module.h

@@ -337,6 +337,14 @@ struct module
 	const char **trace_bprintk_fmt_start;
 	unsigned int num_trace_bprintk_fmt;
 #endif
+#ifdef CONFIG_EVENT_TRACING
+	struct ftrace_event_call *trace_events;
+	unsigned int num_trace_events;
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	unsigned long *ftrace_callsites;
+	unsigned int num_ftrace_callsites;
+#endif

 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */

+ 0 - 10
include/linux/ptrace.h

@@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void exit_ptrace(struct task_struct *tracer);
-extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
@@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_untrace(task)		do { } while (0)
 #endif

-#ifndef arch_ptrace_fork
-/*
- * Do machine-specific work to initialize a new task.
- *
- * This is called from copy_process().
- */
-#define arch_ptrace_fork(child, clone_flags)	do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);

+ 58 - 8
include/linux/ring_buffer.h

@@ -11,7 +11,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-	u32		type:2, len:3, time_delta:27;
+	u32		type_len:5, time_delta:27;
 	u32		array[];
 };

@@ -24,7 +24,8 @@ struct ring_buffer_event {
  *				  size is variable depending on how much
  *				  padding is needed
  *				 If time_delta is non zero:
- *				  everything else same as RINGBUF_TYPE_DATA
+ *				  array[0] holds the actual length
+ *				  size = 4 + length (bytes)
  *
  * @RINGBUF_TYPE_TIME_EXTEND:	Extend the time delta
  *				 array[0] = time delta (28 .. 59)
@@ -35,22 +36,23 @@ struct ring_buffer_event {
  *				 array[1..2] = tv_sec
  *				 size = 16 bytes
  *
- * @RINGBUF_TYPE_DATA:		Data record
- *				 If len is zero:
+ * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
+ *				Data record
+ *				 If type_len is zero:
  *				  array[0] holds the actual length
  *				  array[1..(length+3)/4] holds data
- *				  size = 4 + 4 + length (bytes)
+ *				  size = 4 + length (bytes)
  *				 else
- *				  length = len << 2
+ *				  length = type_len << 2
  *				  array[0..(length+3)/4-1] holds data
 *				  size = 4 + length (bytes)
 */
 enum ring_buffer_type {
+	RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
 	RINGBUF_TYPE_PADDING,
 	RINGBUF_TYPE_TIME_EXTEND,
 	/* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
 	RINGBUF_TYPE_TIME_STAMP,
-	RINGBUF_TYPE_DATA,
 };

 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
@@ -68,13 +70,54 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 	return event->time_delta;
 }

+/*
+ * ring_buffer_event_discard can discard any event in the ring buffer.
+ *   it is up to the caller to protect against a reader from
+ *   consuming it or a writer from wrapping and replacing it.
+ *
+ * No external protection is needed if this is called before
+ * the event is commited. But in that case it would be better to
+ * use ring_buffer_discard_commit.
+ *
+ * Note, if an event that has not been committed is discarded
+ * with ring_buffer_event_discard, it must still be committed.
+ */
 void ring_buffer_event_discard(struct ring_buffer_event *event);

+/*
+ * ring_buffer_discard_commit will remove an event that has not
+ *   ben committed yet. If this is used, then ring_buffer_unlock_commit
+ *   must not be called on the discarded event. This function
+ *   will try to remove the event from the ring buffer completely
+ *   if another event has not been written after it.
+ *
+ * Example use:
+ *
+ *  if (some_condition)
+ *    ring_buffer_discard_commit(buffer, event);
+ *  else
+ *    ring_buffer_unlock_commit(buffer, event);
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+
 /*
  * size is in bytes for each per CPU buffer.
  */
 struct ring_buffer *
-ring_buffer_alloc(unsigned long size, unsigned flags);
+__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc(size, flags)			\
+({							\
+	static struct lock_class_key __key;		\
+	__ring_buffer_alloc((size), (flags), &__key);	\
+})
+
 void ring_buffer_free(struct ring_buffer *buffer);

 int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
@@ -122,6 +165,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer);
 unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);

 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
 void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
@@ -137,6 +182,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
 int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
 			  size_t len, int cpu, int full);

+struct trace_seq;
+
+int ring_buffer_print_entry_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_seq *s);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };

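Together with the per-caller lock class keyed into __ring_buffer_alloc(), the discard path above gives a writer a way to back out of a reservation it no longer wants. A minimal sketch of the write side, assuming the reserve/commit entry points declared elsewhere in this header (sketch_write() is illustrative, error handling trimmed):

	static void sketch_write(struct ring_buffer *buffer, u32 value, int keep)
	{
		struct ring_buffer_event *event;
		u32 *payload;

		event = ring_buffer_lock_reserve(buffer, sizeof(*payload));
		if (!event)
			return;

		payload = ring_buffer_event_data(event);
		*payload = value;

		if (keep)
			ring_buffer_unlock_commit(buffer, event);
		else
			/* reservation never published: try to reclaim it entirely */
			ring_buffer_discard_commit(buffer, event);
	}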
+ 7 - 10
include/linux/sched.h

@@ -97,8 +97,8 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
-struct bts_tracer;
 struct fs_struct;
+struct bts_context;

 /*
  * List of flags we want to share for kernel threads,
@@ -1230,18 +1230,11 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;

-#ifdef CONFIG_X86_PTRACE_BTS
 	/*
 	 * This is the tracer handle for the ptrace BTS extension.
 	 * This field actually belongs to the ptracer task.
 	 */
-	struct bts_tracer *bts;
-	/*
-	 * The buffer to hold the BTS data.
-	 */
-	void *bts_buffer;
-	size_t bts_size;
-#endif /* CONFIG_X86_PTRACE_BTS */
+	struct bts_context *bts;

 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
@@ -1449,7 +1442,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
-#endif
+	/* bitmask of trace recursion */
+	unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
 };

 /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2022,8 +2017,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);

 #ifdef CONFIG_SMP
+extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
+static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {

+ 1 - 1
include/linux/slab_def.h

@@ -14,7 +14,7 @@
 #include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>

 /* Size description struct for general caches. */
 struct cache_sizes {

+ 1 - 1
include/linux/slub_def.h

@@ -10,7 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>

 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */

+ 92 - 0
include/linux/trace_seq.h

@@ -0,0 +1,92 @@
+#ifndef _LINUX_TRACE_SEQ_H
+#define _LINUX_TRACE_SEQ_H
+
+#include <linux/fs.h>
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE.
+ */
+
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+	unsigned int		readpos;
+};
+
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+/*
+ * Currently only defined when tracing is enabled.
+ */
+#ifdef CONFIG_TRACING
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+	__attribute__ ((format (printf, 2, 0)));
+extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+
+#else /* CONFIG_TRACING */
+static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	return 0;
+}
+static inline int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+	return 0;
+}
+
+static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+}
+static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt)
+{
+	return 0;
+}
+static inline int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	return 0;
+}
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	return 0;
+}
+static inline int
+trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
+{
+	return 0;
+}
+static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				       size_t len)
+{
+	return 0;
+}
+static inline void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+	return NULL;
+}
+static inline int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+	return 0;
+}
+#endif /* CONFIG_TRACING */
+
+#endif /* _LINUX_TRACE_SEQ_H */
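A trace_seq is just a page-sized accumulation buffer, which is why ftrace keeps one per cpu rather than on the stack. A small sketch of building a line and handing it to a seq_file, using only the helpers declared here (sketch_show() is a made-up ->show callback):

	static int sketch_show(struct seq_file *m, void *unused)
	{
		struct trace_seq *s;

		s = kmalloc(sizeof(*s), GFP_KERNEL);	/* ~PAGE_SIZE, keep it off the stack */
		if (!s)
			return -ENOMEM;

		trace_seq_init(s);
		trace_seq_printf(s, "entries: %lu\n", 0UL);
		trace_seq_puts(s, "done\n");

		trace_print_seq(m, s);
		kfree(s);
		return 0;
	}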

+ 5 - 3
include/linux/tracepoint.h

@@ -31,6 +31,8 @@ struct tracepoint {
 					 * Keep in sync with vmlinux.lds.h.
 					 */

+#ifndef DECLARE_TRACE
+
 #define TP_PROTO(args...)	args
 #define TP_ARGS(args...)		args

@@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end)
 { }
 #endif /* CONFIG_TRACEPOINTS */
+#endif /* DECLARE_TRACE */

 /*
  * Connect a probe to a tracepoint.
@@ -154,10 +157,8 @@ static inline void tracepoint_synchronize_unregister(void)
 }

 #define PARAMS(args...) args
-#define TRACE_FORMAT(name, proto, args, fmt)		\
-	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-

+#ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
  *
@@ -262,5 +263,6 @@ static inline void tracepoint_synchronize_unregister(void)

 #define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif

 #endif

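Wrapping the generic definitions in #ifndef DECLARE_TRACE / #ifndef TRACE_EVENT is what lets the event macros be redefined when a header is re-read through define_trace.h. For an ordinary kernel user nothing changes: probes still attach through the generated register_trace_<name>() calls, and in this era the probe takes the TP_PROTO arguments directly. A sketch against one of the block tracepoints from this series (probe and init names are made up):

	static void my_rq_issue_probe(struct request_queue *q, struct request *rq)
	{
		/* runs every time the block_rq_issue tracepoint fires */
	}

	static int __init my_probe_init(void)
	{
		return register_trace_block_rq_issue(my_rq_issue_probe);
	}

	static void my_probe_exit(void)
	{
		unregister_trace_block_rq_issue(my_rq_issue_probe);
		tracepoint_synchronize_unregister();
	}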
+ 0 - 76
include/trace/block.h

@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include <linux/blkdev.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(block_rq_abort,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
-	TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-	      TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
-	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from, sector_t to),
-	      TP_ARGS(q, bio, dev, from, to));
-
-#endif

+ 75 - 0
include/trace/define_trace.h

@@ -0,0 +1,75 @@
+/*
+ * Trace files that want to automate creationg of all tracepoints defined
+ * in their file should include this file. The following are macros that the
+ * trace file may define:
+ *
+ * TRACE_SYSTEM defines the system the tracepoint is for
+ *
+ * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h
+ *     This macro may be defined to tell define_trace.h what file to include.
+ *     Note, leave off the ".h".
+ *
+ * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace
+ *     then this macro can define the path to use. Note, the path is relative to
+ *     define_trace.h, not the file including it. Full path names for out of tree
+ *     modules must be used.
+ */
+
+#ifdef CREATE_TRACE_POINTS
+
+/* Prevent recursion */
+#undef CREATE_TRACE_POINTS
+
+#include <linux/stringify.h>
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
+	DEFINE_TRACE(name)
+
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)	\
+	DEFINE_TRACE(name)
+
+#undef TRACE_INCLUDE
+#undef __TRACE_INCLUDE
+
+#ifndef TRACE_INCLUDE_FILE
+# define TRACE_INCLUDE_FILE TRACE_SYSTEM
+# define UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifndef TRACE_INCLUDE_PATH
+# define __TRACE_INCLUDE(system) <trace/events/system.h>
+# define UNDEF_TRACE_INCLUDE_PATH
+#else
+# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
+#endif
+
+# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system)
+
+/* Let the trace headers be reread */
+#define TRACE_HEADER_MULTI_READ
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#ifdef CONFIG_EVENT_TRACING
+#include <trace/ftrace.h>
+#endif
+
+#undef TRACE_HEADER_MULTI_READ
+
+/* Only undef what we defined in this file */
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_FILE
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifdef UNDEF_TRACE_INCLUDE_PATH
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_PATH
+#endif
+
+/* We may be processing more files */
+#define CREATE_TRACE_POINTS
+
+#endif /* CREATE_TRACE_POINTS */
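The header comment above is the whole contract: an event header defines TRACE_SYSTEM and its TRACE_EVENTs, then pulls in <trace/define_trace.h> outside the include guard; the single .c file that defines CREATE_TRACE_POINTS before including it (as blk-core.c does earlier in this series) gets the tracepoint definitions and ftrace glue expanded. A trimmed-down sketch for a hypothetical "widget" subsystem, modelled on the block header that follows:

	/* include/trace/events/widget.h -- illustrative only */
	#if !defined(_TRACE_WIDGET_H) || defined(TRACE_HEADER_MULTI_READ)
	#define _TRACE_WIDGET_H

	#include <linux/tracepoint.h>

	#undef TRACE_SYSTEM
	#define TRACE_SYSTEM widget

	TRACE_EVENT(widget_poke,

		TP_PROTO(int id, int value),

		TP_ARGS(id, value),

		TP_STRUCT__entry(
			__field(	int,	id	)
			__field(	int,	value	)
		),

		TP_fast_assign(
			__entry->id	= id;
			__entry->value	= value;
		),

		TP_printk("widget %d poked with %d", __entry->id, __entry->value)
	);

	#endif /* _TRACE_WIDGET_H */

	/* This part must be outside protection */
	#include <trace/define_trace.h>

Call sites then simply invoke trace_widget_poke(id, value) wherever the event should fire.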

+ 498 - 0
include/trace/events/block.h

@@ -0,0 +1,498 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,         comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,		comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors	   = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned,	nr_sector	)
+		__field( int,		error		)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			      bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			    bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio,
+		 unsigned int new_sector),
+
+	TP_ARGS(q, bio, new_sector),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev				)
+		__field( sector_t,	sector				)
+		__field( sector_t,	new_sector			)
+		__array( char,		rwbs,		6		)
+		__array( char,		comm,		TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->new_sector	= new_sector;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu / %llu [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  (unsigned long long)__entry->new_sector,
+		  __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		 sector_t from),
+
+	TP_ARGS(q, bio, dev, from),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__field( dev_t,		old_dev		)
+		__field( sector_t,	old_sector	)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		__entry->old_dev	= dev;
+		__entry->old_sector	= from;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector,
+		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+		  (unsigned long long)__entry->old_sector)
+);
+
+#endif /* _TRACE_BLOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
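These block tracepoints are normally consumed through the event files under /sys/kernel/debug/tracing/events/block/, but they can also be hooked from C. A minimal sketch follows, assuming the register_trace_<event>() helpers that TRACE_EVENT() generates; the module and probe names (bio_probe_init, my_bio_queue_probe) are invented for illustration and are not part of this merge.

/*
 * Illustrative sketch only: attach a probe to the block_bio_queue
 * tracepoint defined above.  The probe signature must match
 * TP_PROTO(struct request_queue *q, struct bio *bio).
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <trace/events/block.h>

static void my_bio_queue_probe(struct request_queue *q, struct bio *bio)
{
	/* Called from the I/O submission path: keep it short and atomic-safe. */
	pr_debug("queued bio: sector %llu, %u bytes\n",
		 (unsigned long long)bio->bi_sector, bio->bi_size);
}

static int __init bio_probe_init(void)
{
	/* register_trace_block_bio_queue() is generated by TRACE_EVENT(). */
	return register_trace_block_bio_queue(my_bio_queue_probe);
}

static void __exit bio_probe_exit(void)
{
	unregister_trace_block_bio_queue(my_bio_queue_probe);
	tracepoint_synchronize_unregister();
}

module_init(bio_probe_init);
module_exit(bio_probe_exit);
MODULE_LICENSE("GPL");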

+ 145 - 0
include/trace/events/irq.h

@@ -0,0 +1,145 @@
+#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/interrupt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq
+
+#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
+#define show_softirq_name(val)			\
+	__print_symbolic(val,			\
+			 softirq_name(HI),	\
+			 softirq_name(TIMER),	\
+			 softirq_name(NET_TX),	\
+			 softirq_name(NET_RX),	\
+			 softirq_name(BLOCK),	\
+			 softirq_name(TASKLET),	\
+			 softirq_name(SCHED),	\
+			 softirq_name(HRTIMER),	\
+			 softirq_name(RCU))
+
+/**
+ * irq_handler_entry - called immediately before the irq action handler
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ *
+ * The struct irqaction pointed to by @action contains various
+ * information about the handler, including the device name,
+ * @action->name, and the device id, @action->dev_id. When used in
+ * conjunction with the irq_handler_exit tracepoint, we can figure
+ * out irq handler latencies.
+ */
+TRACE_EVENT(irq_handler_entry,
+
+	TP_PROTO(int irq, struct irqaction *action),
+
+	TP_ARGS(irq, action),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq		)
+		__string(	name,	action->name	)
+	),
+
+	TP_fast_assign(
+		__entry->irq = irq;
+		__assign_str(name, action->name);
+	),
+
+	TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
+);
+
+/**
+ * irq_handler_exit - called immediately after the irq action handler returns
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ * @ret: return value
+ *
+ * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding
+ * @action->handler successfully handled this irq. Otherwise, the irq might be
+ * a shared irq line, or the irq was not handled successfully. Can be used in
+ * conjunction with the irq_handler_entry to understand irq handler latencies.
+ */
+TRACE_EVENT(irq_handler_exit,
+
+	TP_PROTO(int irq, struct irqaction *action, int ret),
+
+	TP_ARGS(irq, action, ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq	)
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->irq	= irq;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("irq=%d return=%s",
+		  __entry->irq, __entry->ret ? "handled" : "unhandled")
+);
+
+/**
+ * softirq_entry - called immediately before the softirq handler
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * which has a pointer to the action handler that is called. By subtracting
+ * the @vec pointer from the @h pointer, we can determine the softirq
+ * number. Also, when used in combination with the softirq_exit tracepoint
+ * we can determine the softirq latency.
+ */
+TRACE_EVENT(softirq_entry,
+
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+	TP_ARGS(h, vec),
+
+	TP_STRUCT__entry(
+		__field(	int,	vec			)
+	),
+
+	TP_fast_assign(
+		__entry->vec = (int)(h - vec);
+	),
+
+	TP_printk("softirq=%d action=%s", __entry->vec,
+		  show_softirq_name(__entry->vec))
+);
+
+/**
+ * softirq_exit - called immediately after the softirq handler returns
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * that has handled the softirq. By subtracting the @vec pointer from
+ * the @h pointer, we can determine the softirq number. Also, when used in
+ * combination with the softirq_entry tracepoint we can determine the softirq
+ * latency.
+ */
+TRACE_EVENT(softirq_exit,
+
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+	TP_ARGS(h, vec),
+
+	TP_STRUCT__entry(
+		__field(	int,	vec			)
+	),
+
+	TP_fast_assign(
+		__entry->vec = (int)(h - vec);
+	),
+
+	TP_printk("softirq=%d action=%s", __entry->vec,
+		  show_softirq_name(__entry->vec))
+);
+
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
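Following the kernel-doc above, entry/exit pairs can be turned into per-handler latency numbers. A rough sketch, with invented probe names and registration via register_trace_irq_handler_entry()/_exit() omitted for brevity; per-CPU storage suffices because both probes run back to back on the same CPU with interrupts disabled.

/* Illustrative sketch only; not part of this merge. */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/percpu.h>
#include <trace/events/irq.h>

static DEFINE_PER_CPU(ktime_t, irq_entry_stamp);

static void my_irq_entry_probe(int irq, struct irqaction *action)
{
	/* Timestamp taken just before the action handler runs. */
	__get_cpu_var(irq_entry_stamp) = ktime_get();
}

static void my_irq_exit_probe(int irq, struct irqaction *action, int ret)
{
	s64 ns = ktime_to_ns(ktime_sub(ktime_get(),
				       __get_cpu_var(irq_entry_stamp)));

	pr_debug("irq %d (%s): %lld ns, %s\n", irq, action->name, ns,
		 ret ? "handled" : "unhandled");
}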

+ 231 - 0
include/trace/events/kmem.h

@@ -0,0 +1,231 @@
+#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KMEM_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
+
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the leftover flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we will get:
+ *
+ *  GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+#define show_gfp_flags(flags)						\
+	(flags) ? __print_flags(flags, "|",				\
+	{(unsigned long)GFP_HIGHUSER_MOVABLE,	"GFP_HIGHUSER_MOVABLE"}, \
+	{(unsigned long)GFP_HIGHUSER,		"GFP_HIGHUSER"},	\
+	{(unsigned long)GFP_USER,		"GFP_USER"},		\
+	{(unsigned long)GFP_TEMPORARY,		"GFP_TEMPORARY"},	\
+	{(unsigned long)GFP_KERNEL,		"GFP_KERNEL"},		\
+	{(unsigned long)GFP_NOFS,		"GFP_NOFS"},		\
+	{(unsigned long)GFP_ATOMIC,		"GFP_ATOMIC"},		\
+	{(unsigned long)GFP_NOIO,		"GFP_NOIO"},		\
+	{(unsigned long)__GFP_HIGH,		"GFP_HIGH"},		\
+	{(unsigned long)__GFP_WAIT,		"GFP_WAIT"},		\
+	{(unsigned long)__GFP_IO,		"GFP_IO"},		\
+	{(unsigned long)__GFP_COLD,		"GFP_COLD"},		\
+	{(unsigned long)__GFP_NOWARN,		"GFP_NOWARN"},		\
+	{(unsigned long)__GFP_REPEAT,		"GFP_REPEAT"},		\
+	{(unsigned long)__GFP_NOFAIL,		"GFP_NOFAIL"},		\
+	{(unsigned long)__GFP_NORETRY,		"GFP_NORETRY"},		\
+	{(unsigned long)__GFP_COMP,		"GFP_COMP"},		\
+	{(unsigned long)__GFP_ZERO,		"GFP_ZERO"},		\
+	{(unsigned long)__GFP_NOMEMALLOC,	"GFP_NOMEMALLOC"},	\
+	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
+	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
+	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
+	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"}		\
+	) : "GFP_NOWAIT"
+
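A few illustrative decodings, following from the table above and the greedy matching done by __print_flags() (each listed mask is matched in order and its bits removed):

/*
 * Examples (illustrative only):
 *
 *   show_gfp_flags(GFP_KERNEL | __GFP_ZERO)  ->  "GFP_KERNEL|GFP_ZERO"
 *   show_gfp_flags(GFP_ATOMIC)               ->  "GFP_ATOMIC"
 *   show_gfp_flags(0)                        ->  "GFP_NOWAIT"
 *
 * GFP_USER is GFP_KERNEL plus __GFP_HARDWALL, so it must be listed
 * first; otherwise a GFP_USER allocation would decode to the less
 * precise "GFP_KERNEL|GFP_HARDWALL".
 */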
+TRACE_EVENT(kmalloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		show_gfp_flags(__entry->gfp_flags))
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		show_gfp_flags(__entry->gfp_flags))
+);
+
+TRACE_EVENT(kmalloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		show_gfp_flags(__entry->gfp_flags),
+		__entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		show_gfp_flags(__entry->gfp_flags),
+		__entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+#endif /* _TRACE_KMEM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
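Because both the requested and the actually allocated size are recorded, these events can expose slab over-allocation directly. A hypothetical probe (names invented, not part of this merge) that logs slack might look like:

/* Illustrative sketch; hooked up with register_trace_kmalloc(). */
#include <linux/kernel.h>
#include <trace/events/kmem.h>

static void my_kmalloc_probe(unsigned long call_site, const void *ptr,
			     size_t bytes_req, size_t bytes_alloc,
			     gfp_t gfp_flags)
{
	if (bytes_alloc > bytes_req)
		pr_debug("kmalloc from %pS: %zu requested, %zu allocated "
			 "(%zu wasted)\n", (void *)call_site, bytes_req,
			 bytes_alloc, bytes_alloc - bytes_req);
}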

+ 96 - 0
include/trace/events/lockdep.h

@@ -0,0 +1,96 @@
+#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCKDEP_H
+
+#include <linux/lockdep.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lockdep
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_EVENT(lock_acquire,
+
+	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+		int trylock, int read, int check,
+		struct lockdep_map *next_lock, unsigned long ip),
+
+	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, flags)
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
+		  (__entry->flags & 2) ? "read " : "",
+		  __get_str(name))
+);
+
+TRACE_EVENT(lock_release,
+
+	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+
+	TP_ARGS(lock, nested, ip),
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_EVENT(lock_contended,
+
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+	TP_ARGS(lock, ip),
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
+
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
+
+#endif
+#endif
+
+#endif /* _TRACE_LOCKDEP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
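The waittime handling in lock_acquired above splits nanoseconds into whole microseconds plus a remainder with do_div(); a worked example, with made-up numbers:

/*
 * Worked example of the waittime split (values are illustrative):
 *
 *   waittime      = 12345678 ns
 *   wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);  - waittime becomes 12345
 *   wait_usec     = 12345
 *   wait_nsec_rem = 678
 *
 * which the "%s (%lu.%03lu us)" format prints as e.g. "my_lock (12345.678 us)".
 */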

+ 19 - 10
include/trace/sched_event_types.h → include/trace/events/sched.h

@@ -1,9 +1,8 @@
+#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCHED_H
 
-/* use <trace/sched.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM sched
@@ -157,6 +156,7 @@ TRACE_EVENT(sched_switch,
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	prev_pid			)
 		__field(	int,	prev_prio			)
+		__field(	long,	prev_state			)
 		__array(	char,	next_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	next_pid			)
 		__field(	int,	next_prio			)
@@ -166,13 +166,19 @@ TRACE_EVENT(sched_switch,
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
 		__entry->prev_prio	= prev->prio;
+		__entry->prev_state	= prev->state;
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;
 	),
 
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+	TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->prev_state ?
+		  __print_flags(__entry->prev_state, "|",
+				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
+				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
+				{ 128, "W" }) : "R",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
 
@@ -181,9 +187,9 @@ TRACE_EVENT(sched_switch,
  */
 TRACE_EVENT(sched_migrate_task,
 
-	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+	TP_PROTO(struct task_struct *p, int dest_cpu),
 
-	TP_ARGS(p, orig_cpu, dest_cpu),
+	TP_ARGS(p, dest_cpu),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -197,7 +203,7 @@ TRACE_EVENT(sched_migrate_task,
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
-		__entry->orig_cpu	= orig_cpu;
+		__entry->orig_cpu	= task_cpu(p);
 		__entry->dest_cpu	= dest_cpu;
 	),
 
@@ -334,4 +340,7 @@ TRACE_EVENT(sched_signal_send,
 		  __entry->sig, __entry->comm, __entry->pid)
 );
 
 
-#undef TRACE_SYSTEM
+#endif /* _TRACE_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
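The trailing define_trace.h include only produces real tracepoint definitions when CREATE_TRACE_POINTS is defined, which exactly one .c file per subsystem does (kernel/sched.c and kernel/lockdep.c later in this series); every other user just gets the trace_*() declarations. The pattern, in sketch form:

/* In exactly one compilation unit (cf. kernel/sched.c below): */
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/* In every other user, just: */
#include <trace/events/sched.h>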

+ 40 - 0
include/trace/events/skb.h

@@ -0,0 +1,40 @@
+#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SKB_H
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
+
+/*
+ * Tracepoint for freeing an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+	TP_PROTO(struct sk_buff *skb, void *location),
+
+	TP_ARGS(skb, location),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned short,	protocol	)
+		__field(	void *,		location	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
+		__entry->location = location;
+	),
+
+	TP_printk("skbaddr=%p protocol=%u location=%p",
+		__entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#endif /* _TRACE_SKB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

+ 100 - 0
include/trace/events/workqueue.h

@@ -0,0 +1,100 @@
+#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WORKQUEUE_H
+
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM workqueue
+
+TRACE_EVENT(workqueue_insertion,
+
+	TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+	TP_ARGS(wq_thread, work),
+
+	TP_STRUCT__entry(
+		__array(char,		thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,		thread_pid)
+		__field(work_func_t,	func)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->func		= work->func;
+	),
+
+	TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+		__entry->thread_pid, __entry->func)
+);
+
+TRACE_EVENT(workqueue_execution,
+
+	TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+	TP_ARGS(wq_thread, work),
+
+	TP_STRUCT__entry(
+		__array(char,		thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,		thread_pid)
+		__field(work_func_t,	func)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->func		= work->func;
+	),
+
+	TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+		__entry->thread_pid, __entry->func)
+);
+
+/* Trace the creation of one workqueue thread on a cpu */
+TRACE_EVENT(workqueue_creation,
+
+	TP_PROTO(struct task_struct *wq_thread, int cpu),
+
+	TP_ARGS(wq_thread, cpu),
+
+	TP_STRUCT__entry(
+		__array(char,	thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,	thread_pid)
+		__field(int,	cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->cpu		= cpu;
+	),
+
+	TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm,
+		__entry->thread_pid, __entry->cpu)
+);
+
+TRACE_EVENT(workqueue_destruction,
+
+	TP_PROTO(struct task_struct *wq_thread),
+
+	TP_ARGS(wq_thread),
+
+	TP_STRUCT__entry(
+		__array(char,	thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,	thread_pid)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+	),
+
+	TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid)
+);
+
+#endif /* _TRACE_WORKQUEUE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

+ 591 - 0
include/trace/ftrace.h

@@ -0,0 +1,591 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *	struct trace_entry		ent;
+ *	<type>				<item>;
+ *	<type2>				<item2>[<len>];
+ *	[...]
+ * };
+ *
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#include <linux/ftrace_event.h>
+
+#undef __field
+#define __field(type, item)		type	item;
+
+#undef __array
+#define __array(type, item, len)	type	item[len];
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
+	struct ftrace_raw_##name {				\
+		struct trace_entry	ent;			\
+		tstruct						\
+		char			__data[0];		\
+	};							\
+	static struct ftrace_event_call event_##name
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
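As a concrete illustration (not generated output from this patch), the irq_handler_exit event defined earlier would expand at this stage to roughly:

/* Illustrative stage-1 expansion for irq_handler_exit: */
struct ftrace_raw_irq_handler_exit {
	struct trace_entry	ent;
	int			irq;
	int			ret;
	char			__data[0];
};
static struct ftrace_event_call event_irq_handler_exit;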
+
+
+/*
+ * Stage 2 of the trace events.
+ *
+ * Include the following:
+ *
+ * struct ftrace_data_offsets_<call> {
+ *	int				<item1>;
+ *	int				<item2>;
+ *	[...]
+ * };
+ *
+ * The __dynamic_array() macro will create each int <item>; this is
+ * used to record the offset of each array from the beginning of the event.
+ */
+
+#undef __field
+#define __field(type, item);
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)	int item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+	struct ftrace_data_offsets_##call {				\
+		tstruct;						\
+	};
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Set up the format that is shown for the trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
+			       "offset:%u;\tsize:%u;\n",		       \
+			       (unsigned int)offsetof(typeof(field),	       \
+					__data_loc_##item),		       \
+			       (unsigned int)sizeof(field.__data_loc_##item)); \
+	if (!ret)							       \
+		return 0;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef __entry
+#define __entry REC
+
+#undef __print_symbolic
+#undef __get_dynamic_array
+#undef __get_str
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field __attribute__((unused));		\
+	int ret = 0;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 3 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *	struct trace_seq *s = &iter->seq;
+ *	struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *	struct trace_entry *entry;
+ *	struct trace_seq *p;
+ *	int ret;
+ *
+ *	entry = iter->ent;
+ *
+ *	if (entry->type != event_<call>.id) {
+ *		WARN_ON_ONCE(1);
+ *		return TRACE_TYPE_UNHANDLED;
+ *	}
+ *
+ *	field = (typeof(field))entry;
+ *
+ *	p = get_cpu_var(ftrace_event_seq);
+ *	trace_seq_init(p);
+ *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	put_cpu();
+ *	if (!ret)
+ *		return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *	return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef __entry
+#define __entry field
+
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
+
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)	\
+		((void *)__entry + __entry->__data_loc_##field)
+
+#undef __get_str
+#define __get_str(field) (char *)__get_dynamic_array(field)
+
+#undef __print_flags
+#define __print_flags(flag, delim, flag_array...)			\
+	({								\
+		static const struct trace_print_flags flags[] =		\
+			{ flag_array, { -1, NULL }};			\
+		ftrace_print_flags_seq(p, delim, flag, flags);		\
+	})
+
+#undef __print_symbolic
+#define __print_symbolic(value, symbol_array...)			\
+	({								\
+		static const struct trace_print_flags symbols[] =	\
+			{ symbol_array, { -1, NULL }};			\
+		ftrace_print_symbols_seq(p, value, symbols);		\
+	})
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+enum print_line_t							\
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+{									\
+	struct trace_seq *s = &iter->seq;				\
+	struct ftrace_raw_##call *field;				\
+	struct trace_entry *entry;					\
+	struct trace_seq *p;						\
+	int ret;							\
+									\
+	entry = iter->ent;						\
+									\
+	if (entry->type != event_##call.id) {				\
+		WARN_ON_ONCE(1);					\
+		return TRACE_TYPE_UNHANDLED;				\
+	}								\
+									\
+	field = (typeof(field))entry;					\
+									\
+	p = &get_cpu_var(ftrace_event_seq);				\
+	trace_seq_init(p);						\
+	ret = trace_seq_printf(s, #call ": " print);			\
+	put_cpu();							\
+	if (!ret)							\
+		return TRACE_TYPE_PARTIAL_LINE;				\
+									\
+	return TRACE_TYPE_HANDLED;					\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), is_signed_type(type));	\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), 0);		\
+	if (ret)							\
+		return ret;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
+				offsetof(typeof(field), __data_loc_##item),    \
+				 sizeof(field.__data_loc_##item), 0);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_raw_##call field;					\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	int ret;							\
+									\
+	__common_field(int, type, 1);					\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * remember the offset of each array from the beginning of the event.
+ */
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				\
+	__data_offsets->item = __data_size +				\
+			       offsetof(typeof(*entry), __data);	\
+	__data_size += (len) * sizeof(type);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)       \
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+static inline int ftrace_get_offsets_##call(				\
+	struct ftrace_data_offsets_##call *__data_offsets, proto)       \
+{									\
+	int __data_size = 0;						\
+	struct ftrace_raw_##call __maybe_unused *entry;			\
+									\
+	tstruct;							\
+									\
+	return __data_size;						\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 4 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * static void ftrace_event_<call>(proto)
+ * {
+ *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
+ * }
+ *
+ * static int ftrace_reg_event_<call>(void)
+ * {
+ *	int ret;
+ *
+ *	ret = register_trace_<call>(ftrace_event_<call>);
+ *	if (ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to <call>");
+ *	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *	unregister_trace_<call>(ftrace_event_<call>);
+ * }
+ *
+ *
+ * For those macros defined with TRACE_EVENT:
+ *
+ * static struct ftrace_event_call event_<call>;
+ *
+ * static void ftrace_raw_event_<call>(proto)
+ * {
+ *	struct ring_buffer_event *event;
+ *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ *	unsigned long irq_flags;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	event = trace_current_buffer_lock_reserve(event_<call>.id,
+ *				  sizeof(struct ftrace_raw_<call>),
+ *				  irq_flags, pc);
+ *	if (!event)
+ *		return;
+ *	entry	= ring_buffer_event_data(event);
+ *
+ *	<assign>;  <-- Here we assign the entries by the __field and
+ *			__array macros.
+ *
+ *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * }
+ *
+ * static int ftrace_raw_reg_event_<call>(void)
+ * {
+ *	int ret;
+ *
+ *	ret = register_trace_<call>(ftrace_raw_event_<call>);
+ *	if (ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to <call>");
+ *	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *	unregister_trace_<call>(ftrace_raw_event_<call>);
+ * }
+ *
+ * static struct trace_event ftrace_event_type_<call> = {
+ *	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ * };
+ *
+ * static int ftrace_raw_init_event_<call>(void)
+ * {
+ *	int id;
+ *
+ *	id = register_ftrace_event(&ftrace_event_type_<call>);
+ *	if (!id)
+ *		return -ENODEV;
+ *	event_<call>.id = id;
+ *	return 0;
+ * }
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ *	.name			= "<call>",
+ *	.system			= "<system>",
+ *	.raw_init		= ftrace_raw_init_event_<call>,
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.show_format		= ftrace_format_<call>,
+ * }
+ *
+ */
+
+#undef TP_FMT
+#define TP_FMT(fmt, args...)	fmt "\n", ##args
+
+#ifdef CONFIG_EVENT_PROFILE
+#define _TRACE_PROFILE(call, proto, args)				\
+static void ftrace_profile_##call(proto)				\
+{									\
+	extern void perf_tpcounter_event(int);				\
+	perf_tpcounter_event(event_##call.id);				\
+}									\
+									\
+static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+{									\
+	int ret = 0;							\
+									\
+	if (!atomic_inc_return(&event_call->profile_count))		\
+		ret = register_trace_##call(ftrace_profile_##call);	\
+									\
+	return ret;							\
+}									\
+									\
+static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+{									\
+	if (atomic_add_negative(-1, &event_call->profile_count))	\
+		unregister_trace_##call(ftrace_profile_##call);		\
+}
+
+#define _TRACE_PROFILE_INIT(call)					\
+	.profile_count = ATOMIC_INIT(-1),				\
+	.profile_enable = ftrace_profile_enable_##call,			\
+	.profile_disable = ftrace_profile_disable_##call,
+
+#else
+#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE_INIT(call)
+#endif
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				\
+	__entry->__data_loc_##item = __data_offsets.item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)       	\
+
+#undef __assign_str
+#define __assign_str(dst, src)						\
+	strcpy(__get_str(dst), src);
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static void ftrace_raw_event_##call(proto)				\
+{									\
+	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct ring_buffer_event *event;				\
+	struct ftrace_raw_##call *entry;				\
+	unsigned long irq_flags;					\
+	int __data_size;						\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+									\
+	event = trace_current_buffer_lock_reserve(event_##call.id,	\
+				 sizeof(*entry) + __data_size,		\
+				 irq_flags, pc);			\
+	if (!event)							\
+		return;							\
+	entry	= ring_buffer_event_data(event);			\
+									\
+									\
+	tstruct								\
+									\
+	{ assign; }							\
+									\
+	if (!filter_current_check_discard(event_call, entry, event))	\
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
+}									\
+									\
+static int ftrace_raw_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_raw_event_##call);		\
+	if (ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call "\n");			\
+	return ret;							\
+}									\
+									\
+static void ftrace_raw_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_raw_event_##call);		\
+}									\
+									\
+static struct trace_event ftrace_event_type_##call = {			\
+	.trace			= ftrace_raw_output_##call,		\
+};									\
+									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(&ftrace_event_type_##call);		\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
+	return 0;							\
+}									\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.event			= &ftrace_event_type_##call,		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
+	.regfunc		= ftrace_raw_reg_event_##call,		\
+	.unregfunc		= ftrace_raw_unreg_event_##call,	\
+	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
+	_TRACE_PROFILE_INIT(call)					\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef _TRACE_PROFILE
+#undef _TRACE_PROFILE_INIT
+

+ 0 - 9
include/trace/irq.h

@@ -1,9 +0,0 @@
-#ifndef _TRACE_IRQ_H
-#define _TRACE_IRQ_H
-
-#include <linux/interrupt.h>
-#include <linux/tracepoint.h>
-
-#include <trace/irq_event_types.h>
-
-#endif

+ 0 - 55
include/trace/irq_event_types.h

@@ -1,55 +0,0 @@
-
-/* use <trace/irq.h> instead */
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM irq
-
-/*
- * Tracepoint for entry of interrupt handler:
- */
-TRACE_FORMAT(irq_handler_entry,
-	TP_PROTO(int irq, struct irqaction *action),
-	TP_ARGS(irq, action),
-	TP_FMT("irq=%d handler=%s", irq, action->name)
-	);
-
-/*
- * Tracepoint for return of an interrupt handler:
- */
-TRACE_EVENT(irq_handler_exit,
-
-	TP_PROTO(int irq, struct irqaction *action, int ret),
-
-	TP_ARGS(irq, action, ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	irq	)
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->irq	= irq;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("irq=%d return=%s",
-		  __entry->irq, __entry->ret ? "handled" : "unhandled")
-);
-
-TRACE_FORMAT(softirq_entry,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-TRACE_FORMAT(softirq_exit,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-#undef TRACE_SYSTEM

+ 0 - 63
include/trace/kmemtrace.h

@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <linux/tracepoint.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-DECLARE_TRACE(kmalloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmem_cache_alloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmalloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kmem_cache_alloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kfree,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
-DECLARE_TRACE(kmem_cache_free,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-

+ 0 - 9
include/trace/lockdep.h

@@ -1,9 +0,0 @@
-#ifndef _TRACE_LOCKDEP_H
-#define _TRACE_LOCKDEP_H
-
-#include <linux/lockdep.h>
-#include <linux/tracepoint.h>
-
-#include <trace/lockdep_event_types.h>
-
-#endif

+ 0 - 44
include/trace/lockdep_event_types.h

@@ -1,44 +0,0 @@
-
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_FORMAT(lock_acquire,
-	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-		int trylock, int read, int check,
-		struct lockdep_map *next_lock, unsigned long ip),
-	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TP_FMT("%s%s%s", trylock ? "try " : "",
-		read ? "read " : "", lock->name)
-	);
-
-TRACE_FORMAT(lock_release,
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-	TP_ARGS(lock, nested, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_FORMAT(lock_contended,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-TRACE_FORMAT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-#endif
-#endif
-
-#undef TRACE_SYSTEM

+ 0 - 9
include/trace/sched.h

@@ -1,9 +0,0 @@
-#ifndef _TRACE_SCHED_H
-#define _TRACE_SCHED_H
-
-#include <linux/sched.h>
-#include <linux/tracepoint.h>
-
-#include <trace/sched_event_types.h>
-
-#endif

+ 0 - 11
include/trace/skb.h

@@ -1,11 +0,0 @@
-#ifndef _TRACE_SKB_H_
-#define _TRACE_SKB_H_
-
-#include <linux/skbuff.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(kfree_skb,
-	TP_PROTO(struct sk_buff *skb, void *location),
-	TP_ARGS(skb, location));
-
-#endif

+ 0 - 5
include/trace/trace_event_types.h

@@ -1,5 +0,0 @@
-/* trace/<type>_event_types.h here */
-
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
-#include <trace/lockdep_event_types.h>

+ 0 - 5
include/trace/trace_events.h

@@ -1,5 +0,0 @@
-/* trace/<type>.h here */
-
-#include <trace/sched.h>
-#include <trace/irq.h>
-#include <trace/lockdep.h>

+ 0 - 25
include/trace/workqueue.h

@@ -1,25 +0,0 @@
-#ifndef __TRACE_WORKQUEUE_H
-#define __TRACE_WORKQUEUE_H
-
-#include <linux/tracepoint.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-DECLARE_TRACE(workqueue_insertion,
-	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TP_ARGS(wq_thread, work));
-
-DECLARE_TRACE(workqueue_execution,
-	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TP_ARGS(wq_thread, work));
-
-/* Trace the creation of one workqueue thread on a cpu */
-DECLARE_TRACE(workqueue_creation,
-	   TP_PROTO(struct task_struct *wq_thread, int cpu),
-	   TP_ARGS(wq_thread, cpu));
-
-DECLARE_TRACE(workqueue_destruction,
-	   TP_PROTO(struct task_struct *wq_thread),
-	   TP_ARGS(wq_thread));
-
-#endif /* __TRACE_WORKQUEUE_H */

+ 1 - 1
init/main.c

@@ -64,6 +64,7 @@
 #include <linux/idr.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/kmemtrace.h>
 #include <trace/boot.h>
 
 #include <asm/io.h>
@@ -71,7 +72,6 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
-#include <trace/kmemtrace.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>

+ 1 - 0
kernel/Makefile

@@ -93,6 +93,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
 

+ 1 - 5
kernel/exit.c

@@ -48,7 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -56,10 +56,6 @@
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)

+ 4 - 5
kernel/fork.c

@@ -61,7 +61,6 @@
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
-#include <trace/sched.h>
 #include <linux/magic.h>
 
 #include <asm/pgtable.h>
@@ -71,6 +70,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/sched.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -83,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
-DEFINE_TRACE(sched_process_fork);
-
 int nr_processes(void)
 {
 	int cpu;
@@ -1089,8 +1088,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-	if (unlikely(current->ptrace))
-		ptrace_fork(p, clone_flags);
+
+	p->bts = NULL;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);

+ 1 - 4
kernel/irq/handle.c

@@ -18,8 +18,8 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <trace/irq.h>
 #include <linux/bootmem.h>
+#include <trace/events/irq.h>
 
 #include "internals.h"
 
@@ -355,9 +355,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 	       "but no thread function available.", irq, action->name);
 }
 
-DEFINE_TRACE(irq_handler_entry);
-DEFINE_TRACE(irq_handler_exit);
-
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number

+ 1 - 4
kernel/kthread.c

@@ -13,7 +13,7 @@
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #define KTHREAD_NICE_LEVEL (-5)
 
@@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */

+ 5 - 11
kernel/lockdep.c

@@ -42,12 +42,14 @@
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
-#include <trace/lockdep.h>
 
 #include <asm/sections.h>
 
 #include "lockdep_internals.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/lockdep.h>
+
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
-DEFINE_TRACE(lock_acquire);
-
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-DEFINE_TRACE(lock_release);
-
 void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
 {
@@ -3105,6 +3103,8 @@ found_it:
 		hlock->holdtime_stamp = now;
 	}
 
+	trace_lock_acquired(lock, ip, waittime);
+
 	stats = get_lock_stats(hlock_class(hlock));
 	if (waittime) {
 		if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
 	lock->ip = ip;
 }
 
-DEFINE_TRACE(lock_contended);
-
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-DEFINE_TRACE(lock_acquired);
-
 void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
-	trace_lock_acquired(lock, ip);
-
 	if (unlikely(!lock_stat))
 		return;
 
 

+ 14 - 12
kernel/module.c

@@ -18,6 +18,7 @@
  */
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
@@ -1489,9 +1490,6 @@ static void free_module(struct module *mod)
 	/* Free any allocated parameters. */
 	destroy_params(mod->kp, mod->num_kp);
 
-	/* release any pointers to mcount in this module */
-	ftrace_release(mod->module_core, mod->core_size);
-
 	/* This may be NULL, but that's OK */
 	module_free(mod, mod->module_init);
 	kfree(mod->args);
@@ -1892,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int num_mcount;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-	unsigned long *mseg;
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2172,7 +2168,19 @@ static noinline struct module *load_module(void __user *umod,
 					sizeof(*mod->tracepoints),
 					&mod->num_tracepoints);
 #endif
-
+#ifdef CONFIG_EVENT_TRACING
+	mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+					 "_ftrace_events",
+					 sizeof(*mod->trace_events),
+					 &mod->num_trace_events);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	/* sechdrs[0].sh_size is always zero */
+	mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+					     "__mcount_loc",
+					     sizeof(*mod->ftrace_callsites),
+					     &mod->num_ftrace_callsites);
+#endif
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod,
 			dynamic_debug_setup(debug, num_debug);
 	}
 
-	/* sechdrs[0].sh_size is always zero */
-	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
-			    sizeof(*mseg), &num_mcount);
-	ftrace_init_module(mod, mseg, mseg + num_mcount);
-
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
 		goto cleanup;
@@ -2302,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod,
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
-	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)

+ 0 - 10
kernel/ptrace.c

@@ -24,16 +24,6 @@
 #include <linux/uaccess.h>
 
 
-/*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-	arch_ptrace_fork(child, clone_flags);
-}
-
 /*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.

+ 47 - 8
kernel/sched.c

@@ -72,13 +72,15 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1964,7 +1960,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 
 	clock_offset = old_rq->clock - new_rq->clock;
 	clock_offset = old_rq->clock - new_rq->clock;
 
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
 	if (p->se.wait_start)
@@ -2020,6 +2016,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	return 1;
 	return 1;
 }
 }
 
 
+/*
+ * wait_task_context_switch -	wait for a thread to complete at least one
+ *				context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw	= p->nvcsw;
+	nivcsw	= p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
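The conversion pattern behind the sched.c hunk above (and the signal.c and softirq.c hunks that follow) is worth spelling out: the per-site DEFINE_TRACE() declarations are dropped, each event is described once with TRACE_EVENT() in a header under include/trace/events/, and exactly one compilation unit (sched.c for the sched events, softirq.c for the irq events) defines CREATE_TRACE_POINTS before including that header so the tracepoint instances are emitted there. A minimal sketch of that pattern, using a hypothetical foo_bar event rather than anything from this commit:

/* include/trace/events/foo.h -- hypothetical example of the new layout */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM foo

#if !defined(_TRACE_FOO_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FOO_H

#include <linux/tracepoint.h>

TRACE_EVENT(foo_bar,

	TP_PROTO(int value),

	TP_ARGS(value),

	TP_STRUCT__entry(
		__field(int, value)
	),

	TP_fast_assign(
		__entry->value = value;
	),

	TP_printk("value=%d", __entry->value)
);

#endif /* _TRACE_FOO_H */

/* This part must be outside the multi-read protection */
#include <trace/define_trace.h>

/* kernel/foo.c -- the one file that instantiates the tracepoints */
#define CREATE_TRACE_POINTS
#include <trace/events/foo.h>

void foo_do_work(int value)
{
	trace_foo_bar(value);	/* every other caller just invokes the tracepoint */
}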

+ 1 - 3
kernel/signal.c

@@ -27,7 +27,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/param.h>
 #include <asm/uaccess.h>
@@ -41,8 +41,6 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-DEFINE_TRACE(sched_signal_send);
-
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;

+ 3 - 4
kernel/softirq.c

@@ -24,7 +24,9 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
-#include <trace/irq.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq.h>
 
 #include <asm/irq.h>
 /*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-DEFINE_TRACE(softirq_entry);
-DEFINE_TRACE(softirq_exit);
-
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;

+ 110 - 33
kernel/trace/Kconfig

@@ -48,6 +48,21 @@ config FTRACE_NMI_ENTER
        depends on HAVE_FTRACE_NMI_ENTER
        depends on HAVE_FTRACE_NMI_ENTER
        default y
        default y
 
 
+config EVENT_TRACING
+	select CONTEXT_SWITCH_TRACER
+	bool
+
+config CONTEXT_SWITCH_TRACER
+	select MARKERS
+	bool
+
+# All tracer options should select GENERIC_TRACER. Options that are enabled
+# by all tracers (the context switch and event tracers) select TRACING instead.
+# This allows those options to appear when no other tracer is selected, but
+# keeps them hidden when something else selects them. We need the two options
+# GENERIC_TRACER and TRACING to accomplish this hiding of the automatic
+# options without creating circular dependencies.
+
 config TRACING
 config TRACING
 	bool
 	bool
 	select DEBUG_FS
 	select DEBUG_FS
@@ -56,6 +71,11 @@ config TRACING
 	select TRACEPOINTS
 	select TRACEPOINTS
 	select NOP_TRACER
 	select NOP_TRACER
 	select BINARY_PRINTF
 	select BINARY_PRINTF
+	select EVENT_TRACING
+
+config GENERIC_TRACER
+	bool
+	select TRACING
 
 
 #
 #
 # Minimum requirements an architecture has to meet for us to
 # Minimum requirements an architecture has to meet for us to
@@ -73,14 +93,20 @@ config TRACING_SUPPORT
 
 
 if TRACING_SUPPORT
 if TRACING_SUPPORT
 
 
-menu "Tracers"
+menuconfig FTRACE
+	bool "Tracers"
+	default y if DEBUG_KERNEL
+	help
+	 Enable the kernel tracing infrastructure.
+
+if FTRACE
 
 
 config FUNCTION_TRACER
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
 	bool "Kernel Function Tracer"
 	depends on HAVE_FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
 	select FRAME_POINTER
 	select FRAME_POINTER
 	select KALLSYMS
 	select KALLSYMS
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	help
 	  Enable the kernel to trace every kernel function. This is done
 	  Enable the kernel to trace every kernel function. This is done
@@ -104,13 +130,14 @@ config FUNCTION_GRAPH_TRACER
 	  the return value. This is done by setting the current return 
 	  the return value. This is done by setting the current return 
 	  address on the current task structure into a stack of calls.
 	  address on the current task structure into a stack of calls.
 
 
+
 config IRQSOFF_TRACER
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	bool "Interrupts-off Latency Tracer"
 	default n
 	default n
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
 	depends on GENERIC_TIME
 	select TRACE_IRQFLAGS
 	select TRACE_IRQFLAGS
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	select TRACER_MAX_TRACE
 	help
 	help
 	  This option measures the time spent in irqs-off critical
 	  This option measures the time spent in irqs-off critical
@@ -131,7 +158,7 @@ config PREEMPT_TRACER
 	default n
 	default n
 	depends on GENERIC_TIME
 	depends on GENERIC_TIME
 	depends on PREEMPT
 	depends on PREEMPT
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	select TRACER_MAX_TRACE
 	help
 	help
 	  This option measures the time spent in preemption off critical
 	  This option measures the time spent in preemption off critical
@@ -150,7 +177,7 @@ config PREEMPT_TRACER
 config SYSPROF_TRACER
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	bool "Sysprof Tracer"
 	depends on X86
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
 	  This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,40 +185,33 @@ config SYSPROF_TRACER
 
 
 config SCHED_TRACER
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	bool "Scheduling Latency Tracer"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
 	select TRACER_MAX_TRACE
 	help
 	help
 	  This tracer tracks the latency of the highest priority task
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 	  to be scheduled in, starting from the point it has woken up.
 
 
-config CONTEXT_SWITCH_TRACER
-	bool "Trace process context switches"
-	select TRACING
-	select MARKERS
-	help
-	  This tracer gets called from the context switch and records
-	  all switching of tasks.
-
-config EVENT_TRACER
-	bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+	bool "Trace process context switches and events"
+	depends on !GENERIC_TRACER
 	select TRACING
 	select TRACING
 	help
 	help
 	  This tracer hooks to various trace points in the kernel
 	  This tracer hooks to various trace points in the kernel
 	  allowing the user to pick and choose which trace point they
 	  allowing the user to pick and choose which trace point they
-	  want to trace.
+	  want to trace. It also includes the sched_switch tracer plugin.
 
 
 config FTRACE_SYSCALLS
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
 	depends on HAVE_FTRACE_SYSCALLS
-	select TRACING
+	select GENERIC_TRACER
 	select KALLSYMS
 	select KALLSYMS
 	help
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 	  Basic tracer to catch the syscall entry and exit events.
 
 
 config BOOT_TRACER
 config BOOT_TRACER
 	bool "Trace boot initcalls"
 	bool "Trace boot initcalls"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	help
 	  This tracer helps developers to optimize boot times: it records
 	  This tracer helps developers to optimize boot times: it records
@@ -207,8 +227,36 @@ config BOOT_TRACER
 	  to enable this on bootup.
 	  to enable this on bootup.
 
 
 config TRACE_BRANCH_PROFILING
 config TRACE_BRANCH_PROFILING
+	bool
+	select GENERIC_TRACER
+
+choice
+	prompt "Branch Profiling"
+	default BRANCH_PROFILE_NONE
+	help
+	 Branch profiling is a software profiler. It will add hooks into
+	 the C conditionals to test which path a branch takes.
+
+	 The likely/unlikely profiler only looks at the conditions that
+	 are annotated with a likely or unlikely macro.
+
+	 The "all branch" profiler will profile every if statement in the
+	 kernel. This profiler also enables the likely/unlikely profiler.
+
+	 Either of the above profilers adds a bit of overhead to the system.
+	 If unsure, choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+	bool "No branch profiling"
+	help
+	 No branch profiling. Branch profiling adds a bit of overhead.
+	 Only enable it if you want to analyse the branching behavior.
+	 Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
 	bool "Trace likely/unlikely profiler"
 	bool "Trace likely/unlikely profiler"
-	select TRACING
+	select TRACE_BRANCH_PROFILING
 	help
 	help
 	  This tracer profiles all the likely and unlikely macros
 	  in the kernel. It will display the results in:
@@ -218,11 +266,9 @@ config TRACE_BRANCH_PROFILING
 	  Note: this will add a significant overhead, only turn this
 	  Note: this will add a significant overhead, only turn this
 	  on if you need to profile the system's use of these macros.
 	  on if you need to profile the system's use of these macros.
 
 
-	  Say N if unsure.
-
 config PROFILE_ALL_BRANCHES
 config PROFILE_ALL_BRANCHES
 	bool "Profile all if conditionals"
 	bool "Profile all if conditionals"
-	depends on TRACE_BRANCH_PROFILING
+	select TRACE_BRANCH_PROFILING
 	help
 	help
 	  This tracer profiles all branch conditions. Every if ()
 	  This tracer profiles all branch conditions. Every if ()
 	  taken in the kernel is recorded whether it hit or miss.
 	  taken in the kernel is recorded whether it hit or miss.
@@ -230,11 +276,12 @@ config PROFILE_ALL_BRANCHES
 
 
 	  /debugfs/tracing/profile_branch
 	  /debugfs/tracing/profile_branch
 
 
+	  This option also enables the likely/unlikely profiler.
+
 	  This configuration, when enabled, will impose a great overhead
 	  This configuration, when enabled, will impose a great overhead
 	  on the system. This should only be enabled when the system
 	  on the system. This should only be enabled when the system
 	  is to be analyzed
 	  is to be analyzed
-
-	  Say N if unsure.
+endchoice
 
 
 config TRACING_BRANCHES
 config TRACING_BRANCHES
 	bool
 	bool
@@ -261,7 +308,7 @@ config BRANCH_TRACER
 config POWER_TRACER
 config POWER_TRACER
 	bool "Trace power consumption behavior"
 	bool "Trace power consumption behavior"
 	depends on X86
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	help
 	help
 	  This tracer helps developers to analyze and optimize the kernels
 	  This tracer helps developers to analyze and optimize the kernels
 	  power management decisions, specifically the C-state and P-state
 	  power management decisions, specifically the C-state and P-state
@@ -295,14 +342,14 @@ config STACK_TRACER
 config HW_BRANCH_TRACER
 config HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
 	bool "Trace hw branches"
 	bool "Trace hw branches"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	help
 	  This tracer records all branches on the system in a circular
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 	  buffer giving access to the last N branches for each cpu.
 
 
 config KMEMTRACE
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	bool "Trace SLAB allocations"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +369,7 @@ config KMEMTRACE
 
 
 config WORKQUEUE_TRACER
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
 	bool "Trace workqueues"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	help
 	  The workqueue tracer provides some statistical information
           about each cpu workqueue thread such as the number of the
           about each cpu workqueue thread such as the number of the
@@ -338,7 +385,7 @@ config BLK_DEV_IO_TRACE
 	select RELAY
 	select RELAY
 	select DEBUG_FS
 	select DEBUG_FS
 	select TRACEPOINTS
 	select TRACEPOINTS
-	select TRACING
+	select GENERIC_TRACER
 	select STACKTRACE
 	select STACKTRACE
 	help
 	help
 	  Say Y here if you want to be able to trace the block layer actions
 	  Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +422,20 @@ config DYNAMIC_FTRACE
 	 were made. If so, it runs stop_machine (stops all CPUS)
 	 were made. If so, it runs stop_machine (stops all CPUS)
 	 and modifies the code to jump over the call to ftrace.
 	 and modifies the code to jump over the call to ftrace.
 
 
+config FUNCTION_PROFILER
+	bool "Kernel function profiler"
+	depends on FUNCTION_TRACER
+	default n
+	help
+	 This option enables the kernel function profiler. A file is created
+	 in debugfs called function_profile_enabled which defaults to zero.
+	 When a 1 is echoed into this file profiling begins, and when a
+	 zero is entered, profiling stops. A file in the trace_stats
+	 directory called functions shows the list of functions that
+	 have been hit and their counters.
+
+	 If in doubt, say N
+
 config FTRACE_MCOUNT_RECORD
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	def_bool y
 	depends on DYNAMIC_FTRACE
 	depends on DYNAMIC_FTRACE
@@ -385,7 +446,7 @@ config FTRACE_SELFTEST
 
 
 config FTRACE_STARTUP_TEST
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
 	bool "Perform a startup test on ftrace"
-	depends on TRACING
+	depends on GENERIC_TRACER
 	select FTRACE_SELFTEST
 	select FTRACE_SELFTEST
 	help
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +457,7 @@ config FTRACE_STARTUP_TEST
 config MMIOTRACE
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
-	select TRACING
+	select GENERIC_TRACER
 	help
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  debugging and reverse engineering. It is called from the ioremap
 	  debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +477,23 @@ config MMIOTRACE_TEST
 
 
 	  Say N, unless you absolutely know what you are doing.
 	  Say N, unless you absolutely know what you are doing.
 
 
-endmenu
+config RING_BUFFER_BENCHMARK
+	tristate "Ring buffer benchmark stress tester"
+	depends on RING_BUFFER
+	help
+	  This option creates a test to stress the ring buffer and benchmark it.
+	  It creates its own ring buffer such that it will not interfere with
+	  any other users of the ring buffer (such as ftrace). It then creates
+	  a producer and consumer that will run for 10 seconds and sleep for
+	  10 seconds. Each interval it will print out the number of events
+	  it recorded and give a rough estimate of how long each iteration took.
+
+	  It does not disable interrupts or raise its priority, so it may be
+	  affected by processes that are running.
+
+	  If unsure, say N
+
+endif # FTRACE
 
 
 endif # TRACING_SUPPORT
 endif # TRACING_SUPPORT
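As a usage illustration for the FUNCTION_PROFILER entry added above: the help text describes a debugfs toggle plus per-cpu statistics files, and a minimal userspace sketch of that workflow could look like the following. The paths are assumptions (debugfs mounted at /sys/kernel/debug, per-cpu stat files named function0, function1, ... under trace_stat/, as registered by the ftrace.c hunk further down), not something this commit documents verbatim.

/* Hypothetical userspace helper, not part of the kernel tree. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	const char *ctl  = "/sys/kernel/debug/tracing/function_profile_enabled";
	const char *stat = "/sys/kernel/debug/tracing/trace_stat/function0";
	char line[256];
	FILE *f;

	write_str(ctl, "1");	/* echo 1: start profiling */
	sleep(10);		/* let the workload of interest run */
	write_str(ctl, "0");	/* echo 0: stop profiling */

	f = fopen(stat, "r");	/* dump the cpu0 hit counters */
	if (!f) {
		perror(stat);
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return 0;
}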
 
 

+ 14 - 6
kernel/trace/Makefile

@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 endif
 
 
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
 
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 
 
 libftrace-y := ftrace.o
 libftrace-y := ftrace.o

+ 229 - 43
kernel/trace/blktrace.c

@@ -23,10 +23,14 @@
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 #include "trace_output.h"
 
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 static unsigned int blktrace_seq __read_mostly = 1;
 
 
 static struct trace_array *blk_tr;
 static struct trace_array *blk_tr;
@@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 {
 {
 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 		return 1;
 		return 1;
-	if (sector < bt->start_lba || sector > bt->end_lba)
+	if (sector && (sector < bt->start_lba || sector > bt->end_lba))
 		return 1;
 		return 1;
 	if (bt->pid && pid != bt->pid)
 	if (bt->pid && pid != bt->pid)
 		return 1;
 		return 1;
@@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(rw, DISCARD);
 	what |= MASK_TC_BIT(rw, DISCARD);
 
 
 	pid = tsk->pid;
 	pid = tsk->pid;
-	if (unlikely(act_log_check(bt, what, sector, pid)))
+	if (act_log_check(bt, what, sector, pid))
 		return;
 		return;
 	cpu = raw_smp_processor_id();
 	cpu = raw_smp_processor_id();
 
 
@@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt)
 {
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
 	debugfs_remove(bt->dropped_file);
+	debugfs_remove(bt->dir);
 	relay_close(bt->rchan);
 	relay_close(bt->rchan);
 	free_percpu(bt->sequence);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	free_percpu(bt->msg_data);
@@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
 	.remove_buf_file	= blk_remove_buf_file_callback,
 	.remove_buf_file	= blk_remove_buf_file_callback,
 };
 };
 
 
+static void blk_trace_setup_lba(struct blk_trace *bt,
+				struct block_device *bdev)
+{
+	struct hd_struct *part = NULL;
+
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else {
+		bt->start_lba = 0;
+		bt->end_lba = -1ULL;
+	}
+}
+
 /*
 /*
  * Setup everything required to start tracing
  * Setup everything required to start tracing
  */
  */
 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-			struct blk_user_trace_setup *buts)
+		       struct block_device *bdev,
+		       struct blk_user_trace_setup *buts)
 {
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	struct dentry *dir = NULL;
@@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 		bt->act_mask = (u16) -1;
 
 
-	bt->start_lba = buts->start_lba;
-	bt->end_lba = buts->end_lba;
-	if (!bt->end_lba)
-		bt->end_lba = -1ULL;
+	blk_trace_setup_lba(bt, bdev);
+
+	/* overwrite with user settings */
+	if (buts->start_lba)
+		bt->start_lba = buts->start_lba;
+	if (buts->end_lba)
+		bt->end_lba = buts->end_lba;
 
 
 	bt->pid = buts->pid;
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 	bt->trace_state = Blktrace_setup;
@@ -505,6 +531,7 @@ err:
 }
 }
 
 
 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
 		    char __user *arg)
 		    char __user *arg)
 {
 {
 	struct blk_user_trace_setup buts;
 	struct blk_user_trace_setup buts;
@@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (ret)
 	if (ret)
 		return -EFAULT;
 		return -EFAULT;
 
 
-	ret = do_blk_trace_setup(q, name, dev, &buts);
+	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
@@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	switch (cmd) {
 	case BLKTRACESETUP:
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 		break;
 	case BLKTRACESTART:
 	case BLKTRACESTART:
 		start = 1;
 		start = 1;
@@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @bio:	the source bio
  * @bio:	the source bio
  * @dev:	target device
  * @dev:	target device
  * @from:	source sector
  * @from:	source sector
- * @to:		target sector
  *
  *
  * Description:
  * Description:
  *     Device mapper or raid target sometimes need to split a bio because
  *     Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  *
  **/
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
+				       dev_t dev, sector_t from)
 {
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
 	struct blk_io_trace_remap r;
@@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	if (likely(!bt))
 	if (likely(!bt))
 		return;
 		return;
 
 
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
+	r.device_from = cpu_to_be32(dev);
+	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector_from = cpu_to_be64(from);
 
 
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+			sizeof(r), &r);
 }
 }
 
 
 /**
 /**
@@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
 	return te_blk_io_trace(ent) + 1;
 	return te_blk_io_trace(ent) + 1;
 }
 }
 
 
+static inline u32 t_action(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes;
+}
+
 static inline u32 t_sec(const struct trace_entry *ent)
 static inline u32 t_sec(const struct trace_entry *ent)
 {
 {
 	return te_blk_io_trace(ent)->bytes >> 9;
 	return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
 			  struct blk_io_trace_remap *r)
 			  struct blk_io_trace_remap *r)
 {
 {
 	const struct blk_io_trace_remap *__r = pdu_start(ent);
 	const struct blk_io_trace_remap *__r = pdu_start(ent);
-	__u64 sector = __r->sector;
+	__u64 sector_from = __r->sector_from;
 
 
-	r->device = be32_to_cpu(__r->device);
 	r->device_from = be32_to_cpu(__r->device_from);
 	r->device_from = be32_to_cpu(__r->device_from);
-	r->sector = be64_to_cpu(sector);
+	r->device_to   = be32_to_cpu(__r->device_to);
+	r->sector_from = be64_to_cpu(sector_from);
 }
 }
 
 
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 }
 
 
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const unsigned char *pdu_buf;
+	int pdu_len;
+	int i, end, ret;
+
+	pdu_buf = pdu_start(ent);
+	pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+	if (!pdu_len)
+		return 1;
+
+	/* find the last non-zero byte that needs to be printed */
+	for (end = pdu_len - 1; end >= 0; end--)
+		if (pdu_buf[end])
+			break;
+	end++;
+
+	if (!trace_seq_putc(s, '('))
+		return 0;
+
+	for (i = 0; i < pdu_len; i++) {
+
+		ret = trace_seq_printf(s, "%s%02x",
+				       i == 0 ? "" : " ", pdu_buf[i]);
+		if (!ret)
+			return ret;
+
+		/*
+		 * stop when the rest is just zeroes and indicate so
+		 * with a ".." appended
+		 */
+		if (i == end && end != pdu_len - 1)
+			return trace_seq_puts(s, " ..) ");
+	}
+
+	return trace_seq_puts(s, ") ");
+}
+
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
 {
 	char cmd[TASK_COMM_LEN];
 	char cmd[TASK_COMM_LEN];
 
 
 	trace_find_cmdline(ent->pid, cmd);
 	trace_find_cmdline(ent->pid, cmd);
 
 
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%s]\n",
-					t_sector(ent), t_sec(ent), cmd);
-	return trace_seq_printf(s, "[%s]\n", cmd);
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+		if (!ret)
+			return 0;
+		ret = blk_log_dump_pdu(s, ent);
+		if (!ret)
+			return 0;
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%s]\n",
+						t_sector(ent), t_sec(ent), cmd);
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	}
 }
 }
 
 
 static int blk_log_with_error(struct trace_seq *s,
 static int blk_log_with_error(struct trace_seq *s,
 			      const struct trace_entry *ent)
 			      const struct trace_entry *ent)
 {
 {
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-					t_sec(ent), t_error(ent));
-	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = blk_log_dump_pdu(s, ent);
+		if (ret)
+			return trace_seq_printf(s, "[%d]\n", t_error(ent));
+		return 0;
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%d]\n",
+						t_sector(ent),
+						t_sec(ent), t_error(ent));
+		return trace_seq_printf(s, "%llu [%d]\n",
+					t_sector(ent), t_error(ent));
+	}
 }
 }
 
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 {
 {
-	struct blk_io_trace_remap r = { .device = 0, };
+	struct blk_io_trace_remap r = { .device_from = 0, };
 
 
 	get_pdu_remap(ent, &r);
 	get_pdu_remap(ent, &r);
 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-			       t_sector(ent),
-			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
-			       (unsigned long long)r.sector);
+				t_sector(ent), t_sec(ent),
+				MAJOR(r.device_from), MINOR(r.device_from),
+				(unsigned long long)r.sector_from);
 }
 }
 
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 static void blk_tracer_start(struct trace_array *tr)
 static void blk_tracer_start(struct trace_array *tr)
 {
 {
 	blk_tracer_enabled = true;
 	blk_tracer_enabled = true;
-	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 }
 
 
 static int blk_tracer_init(struct trace_array *tr)
 static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 {
 	blk_tracer_enabled = false;
 	blk_tracer_enabled = false;
-	trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 }
 
 
 static void blk_tracer_reset(struct trace_array *tr)
 static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	}
 	}
 
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+		ret = trace_seq_printf(s, "Unknown action %x\n", what);
 	else {
 	else {
 		ret = log_action(iter, what2act[what].act[long_act]);
 		ret = log_action(iter, what2act[what].act[long_act]);
 		if (ret)
 		if (ret)
@@ -1195,9 +1292,6 @@ out:
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 					       int flags)
 					       int flags)
 {
 {
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	return print_one_line(iter, false);
 	return print_one_line(iter, false);
 }
 }
 
 
@@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 	return print_one_line(iter, true);
 	return print_one_line(iter, true);
 }
 }
 
 
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+	/* don't output context-info for blk_classic output */
+	if (bit == TRACE_BLK_OPT_CLASSIC) {
+		if (set)
+			trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+		else
+			trace_flags |= TRACE_ITER_CONTEXT_INFO;
+	}
+	return 0;
+}
+
 static struct tracer blk_tracer __read_mostly = {
 static struct tracer blk_tracer __read_mostly = {
 	.name		= "blk",
 	.name		= "blk",
 	.init		= blk_tracer_init,
 	.init		= blk_tracer_init,
@@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = {
 	.print_header	= blk_tracer_print_header,
 	.print_header	= blk_tracer_print_header,
 	.print_line	= blk_tracer_print_line,
 	.print_line	= blk_tracer_print_line,
 	.flags		= &blk_tracer_flags,
 	.flags		= &blk_tracer_flags,
+	.set_flag	= blk_tracer_set_flag,
 };
 };
 
 
 static struct trace_event trace_blk_event = {
 static struct trace_event trace_blk_event = {
@@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
 /*
 /*
  * Setup everything required to start tracing
  * Setup everything required to start tracing
  */
  */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+				 struct block_device *bdev)
 {
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	struct blk_trace *old_bt, *bt = NULL;
 	int ret = -ENOMEM;
 	int ret = -ENOMEM;
@@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	if (!bt->msg_data)
 	if (!bt->msg_data)
 		goto free_bt;
 		goto free_bt;
 
 
-	bt->dev = dev;
+	bt->dev = bdev->bd_dev;
 	bt->act_mask = (u16)-1;
 	bt->act_mask = (u16)-1;
-	bt->end_lba = -1ULL;
+
+	blk_trace_setup_lba(bt, bdev);
 
 
 	old_bt = xchg(&q->blk_trace, bt);
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
 	if (old_bt != NULL) {
@@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 
 	if (attr == &dev_attr_enable) {
 	if (attr == &dev_attr_enable) {
 		if (value)
 		if (value)
-			ret = blk_trace_setup_queue(q, bdev->bd_dev);
+			ret = blk_trace_setup_queue(q, bdev);
 		else
 		else
 			ret = blk_trace_remove_queue(q);
 			ret = blk_trace_remove_queue(q);
 		goto out_unlock_bdev;
 		goto out_unlock_bdev;
@@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 
 	ret = 0;
 	ret = 0;
 	if (q->blk_trace == NULL)
 	if (q->blk_trace == NULL)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+		ret = blk_trace_setup_queue(q, bdev);
 
 
 	if (ret == 0) {
 	if (ret == 0) {
 		if (attr == &dev_attr_act_mask)
 		if (attr == &dev_attr_act_mask)
@@ -1548,3 +1657,80 @@ out:
 	return ret ? ret : count;
 	return ret ? ret : count;
 }
 }
 
 
+int blk_trace_init_sysfs(struct device *dev)
+{
+	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+	int i, end;
+	int len = rq->cmd_len;
+	unsigned char *cmd = rq->cmd;
+
+	if (!blk_pc_request(rq)) {
+		buf[0] = '\0';
+		return;
+	}
+
+	for (end = len - 1; end >= 0; end--)
+		if (cmd[end])
+			break;
+	end++;
+
+	for (i = 0; i < len; i++) {
+		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+		if (i == end && end != len - 1) {
+			sprintf(buf, " ..");
+			break;
+		}
+	}
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+	int i = 0;
+
+	if (rw & WRITE)
+		rwbs[i++] = 'W';
+	else if (rw & 1 << BIO_RW_DISCARD)
+		rwbs[i++] = 'D';
+	else if (bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (rw & 1 << BIO_RW_AHEAD)
+		rwbs[i++] = 'A';
+	if (rw & 1 << BIO_RW_BARRIER)
+		rwbs[i++] = 'B';
+	if (rw & 1 << BIO_RW_SYNCIO)
+		rwbs[i++] = 'S';
+	if (rw & 1 << BIO_RW_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+	int rw = rq->cmd_flags & 0x03;
+	int bytes;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq))
+		bytes = rq->data_len;
+	else
+		bytes = rq->hard_nr_sectors << 9;
+
+	blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
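A small worked example may help here (illustration only, not part of the commit): given blk_fill_rwbs() as defined above, a 4 KiB write submitted with the barrier and sync bits set decodes as follows. The flag names are the same WRITE/BIO_RW_* constants the function already tests.

	char rwbs[8];

	/* write + barrier + sync, 4096 bytes */
	blk_fill_rwbs(rwbs, WRITE | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_SYNCIO),
		      4096);
	/* rwbs now holds "WBS": 'W' write, 'B' barrier, 'S' sync */

	/* a zero-byte request with no direction bits would yield "N" */
	blk_fill_rwbs(rwbs, 0, 0);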
+

+ 0 - 14
kernel/trace/events.c

@@ -1,14 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-

+ 705 - 67
kernel/trace/ftrace.c

@@ -29,11 +29,13 @@
 #include <linux/list.h>
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/hash.h>
 
 
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 
 #include <asm/ftrace.h>
 #include <asm/ftrace.h>
+#include <asm/setup.h>
 
 
-#include "trace.h"
+#include "trace_output.h"
+#include "trace_stat.h"
 
 
 #define FTRACE_WARN_ON(cond)			\
 #define FTRACE_WARN_ON(cond)			\
 	do {					\
 	do {					\
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
 
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
 {
-	.func = ftrace_stub,
+	.func		= ftrace_stub,
 };
 };
 
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void)
 #endif
 #endif
 }
 }
 
 
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+	struct hlist_node		node;
+	unsigned long			ip;
+	unsigned long			counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	unsigned long long		time;
+#endif
+};
+
+struct ftrace_profile_page {
+	struct ftrace_profile_page	*next;
+	unsigned long			index;
+	struct ftrace_profile		records[];
+};
+
+struct ftrace_profile_stat {
+	atomic_t			disabled;
+	struct hlist_head		*hash;
+	struct ftrace_profile_page	*pages;
+	struct ftrace_profile_page	*start;
+	struct tracer_stat		stat;
+};
+
+#define PROFILE_RECORDS_SIZE						\
+	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
+
+#define PROFILES_PER_PAGE					\
+	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
+
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static void *
+function_stat_next(void *v, int idx)
+{
+	struct ftrace_profile *rec = v;
+	struct ftrace_profile_page *pg;
+
+	pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+	rec++;
+	if ((void *)rec >= (void *)&pg->records[pg->index]) {
+		pg = pg->next;
+		if (!pg)
+			return NULL;
+		rec = &pg->records[0];
+		if (!rec->counter)
+			goto again;
+	}
+
+	return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+	struct ftrace_profile_stat *stat =
+		container_of(trace, struct ftrace_profile_stat, stat);
+
+	if (!stat || !stat->start)
+		return NULL;
+
+	return function_stat_next(&stat->start->records[0], 0);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
+
+	if (a->time < b->time)
+		return -1;
+	if (a->time > b->time)
+		return 1;
+	else
+		return 0;
+}
+#else
+/* not function graph compares against hits */
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
+
+	if (a->counter < b->counter)
+		return -1;
+	if (a->counter > b->counter)
+		return 1;
+	else
+		return 0;
+}
+#endif
+
+static int function_stat_headers(struct seq_file *m)
+{
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "  Function                               "
+		   "Hit    Time            Avg\n"
+		      "  --------                               "
+		   "---    ----            ---\n");
+#else
+	seq_printf(m, "  Function                               Hit\n"
+		      "  --------                               ---\n");
+#endif
+	return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_profile *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	static DEFINE_MUTEX(mutex);
+	static struct trace_seq s;
+	unsigned long long avg;
+#endif
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "    ");
+	avg = rec->time;
+	do_div(avg, rec->counter);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+	trace_seq_puts(&s, "    ");
+	trace_print_graph_duration(avg, &s);
+	trace_print_seq(m, &s);
+	mutex_unlock(&mutex);
+#endif
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+{
+	struct ftrace_profile_page *pg;
+
+	pg = stat->pages = stat->start;
+
+	while (pg) {
+		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+		pg->index = 0;
+		pg = pg->next;
+	}
+
+	memset(stat->hash, 0,
+	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
+
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+{
+	struct ftrace_profile_page *pg;
+	int functions;
+	int pages;
+	int i;
+
+	/* If we already allocated, do nothing */
+	if (stat->pages)
+		return 0;
+
+	stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!stat->pages)
+		return -ENOMEM;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	functions = ftrace_update_tot_cnt;
+#else
+	/*
+	 * We do not know the number of functions that exist because
+	 * dynamic tracing is what counts them. With past experience
+	 * we have around 20K functions. That should be more than enough.
+	 * It is highly unlikely we will execute every function in
+	 * the kernel.
+	 */
+	functions = 20000;
+#endif
+
+	pg = stat->start = stat->pages;
+
+	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+	for (i = 0; i < pages; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!pg->next)
+			goto out_free;
+		pg = pg->next;
+	}
+
+	return 0;
+
+ out_free:
+	pg = stat->start;
+	while (pg) {
+		unsigned long tmp = (unsigned long)pg;
+
+		pg = pg->next;
+		free_page(tmp);
+	}
+
+	free_page((unsigned long)stat->pages);
+	stat->pages = NULL;
+	stat->start = NULL;
+
+	return -ENOMEM;
+}
+
+static int ftrace_profile_init_cpu(int cpu)
+{
+	struct ftrace_profile_stat *stat;
+	int size;
+
+	stat = &per_cpu(ftrace_profile_stats, cpu);
+
+	if (stat->hash) {
+		/* If the profile is already created, simply reset it */
+		ftrace_profile_reset(stat);
+		return 0;
+	}
+
+	/*
+	 * We are profiling all functions, but usually only a few thousand
+	 * functions are hit. We'll make a hash of 1024 items.
+	 */
+	size = FTRACE_PROFILE_HASH_SIZE;
+
+	stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+	if (!stat->hash)
+		return -ENOMEM;
+
+	if (!ftrace_profile_bits) {
+		size--;
+
+		for (; size; size >>= 1)
+			ftrace_profile_bits++;
+	}
+
+	/* Preallocate the function profiling pages */
+	if (ftrace_profile_pages_init(stat) < 0) {
+		kfree(stat->hash);
+		stat->hash = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int ftrace_profile_init(void)
+{
+	int cpu;
+	int ret = 0;
+
+	for_each_online_cpu(cpu) {
+		ret = ftrace_profile_init_cpu(cpu);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/* interrupts must be disabled */
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+	struct ftrace_profile *rec;
+	struct hlist_head *hhd;
+	struct hlist_node *n;
+	unsigned long key;
+
+	key = hash_long(ip, ftrace_profile_bits);
+	hhd = &stat->hash[key];
+
+	if (hlist_empty(hhd))
+		return NULL;
+
+	hlist_for_each_entry_rcu(rec, n, hhd, node) {
+		if (rec->ip == ip)
+			return rec;
+	}
+
+	return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+			       struct ftrace_profile *rec)
+{
+	unsigned long key;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
+}
+
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
+static struct ftrace_profile *
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+	struct ftrace_profile *rec = NULL;
+
+	/* prevent recursion (from NMIs) */
+	if (atomic_inc_return(&stat->disabled) != 1)
+		goto out;
+
+	/*
+	 * Try to find the function again since an NMI
+	 * could have added it
+	 */
+	rec = ftrace_find_profiled_func(stat, ip);
+	if (rec)
+		goto out;
+
+	if (stat->pages->index == PROFILES_PER_PAGE) {
+		if (!stat->pages->next)
+			goto out;
+		stat->pages = stat->pages->next;
+	}
+
+	rec = &stat->pages->records[stat->pages->index++];
+	rec->ip = ip;
+	ftrace_add_profile(stat, rec);
+
+ out:
+	atomic_dec(&stat->disabled);
+
+	return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_profile_stat *stat;
+	struct ftrace_profile *rec;
+	unsigned long flags;
+
+	if (!ftrace_profile_enabled)
+		return;
+
+	local_irq_save(flags);
+
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash || !ftrace_profile_enabled)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, ip);
+	if (!rec) {
+		rec = ftrace_profile_alloc(stat, ip);
+		if (!rec)
+			goto out;
+	}
+
+	rec->counter++;
+ out:
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+	function_profile_call(trace->func, 0);
+	return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct ftrace_profile_stat *stat;
+	unsigned long long calltime;
+	struct ftrace_profile *rec;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash || !ftrace_profile_enabled)
+		goto out;
+
+	calltime = trace->rettime - trace->calltime;
+
+	if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+		int index;
+
+		index = trace->depth;
+
+		/* Append this call time to the parent time to subtract */
+		if (index)
+			current->ret_stack[index - 1].subtime += calltime;
+
+		if (current->ret_stack[index].subtime < calltime)
+			calltime -= current->ret_stack[index].subtime;
+		else
+			calltime = 0;
+	}
+
+	rec = ftrace_find_profiled_func(stat, trace->func);
+	if (rec)
+		rec->time += calltime;
+
+ out:
+	local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_graph(&profile_graph_return,
+				     &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_graph();
+}
+#else
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+	.func		= function_profile_call,
+};
+
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];		/* big enough to hold a number */
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&ftrace_profile_lock);
+	if (ftrace_profile_enabled ^ val) {
+		if (val) {
+			ret = ftrace_profile_init();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
+
+			ret = register_ftrace_profiler();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
+			ftrace_profile_enabled = 1;
+		} else {
+			ftrace_profile_enabled = 0;
+			/*
+			 * unregister_ftrace_profiler calls stop_machine
+			 * so this acts like a synchronize_sched.
+			 */
+			unregister_ftrace_profiler();
+		}
+	}
+ out:
+	mutex_unlock(&ftrace_profile_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];		/* big enough to hold a number */
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations ftrace_profile_fops = {
+	.open		= tracing_open_generic,
+	.read		= ftrace_profile_read,
+	.write		= ftrace_profile_write,
+};
+
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+	.name		= "functions",
+	.stat_start	= function_stat_start,
+	.stat_next	= function_stat_next,
+	.stat_cmp	= function_stat_cmp,
+	.stat_headers	= function_stat_headers,
+	.stat_show	= function_stat_show
+};
+
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+	struct ftrace_profile_stat *stat;
+	struct dentry *entry;
+	char *name;
+	int ret;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		stat = &per_cpu(ftrace_profile_stats, cpu);
+
+		/* allocate enough for function name + cpu number */
+		name = kmalloc(32, GFP_KERNEL);
+		if (!name) {
+			/*
+			 * The files created are permanent; if something happens
+			 * we still do not free memory.
+			 */
+			kfree(stat);
+			WARN(1,
+			     "Could not allocate stat file for cpu %d\n",
+			     cpu);
+			return;
+		}
+		stat->stat = function_stats;
+		snprintf(name, 32, "function%d", cpu);
+		stat->stat.name = name;
+		ret = register_stat_tracer(&stat->stat);
+		if (ret) {
+			WARN(1,
+			     "Could not register function stat for cpu %d\n",
+			     cpu);
+			kfree(name);
+			return;
+		}
+	}
+
+	entry = debugfs_create_file("function_profile_enabled", 0644,
+				    d_tracer, NULL, &ftrace_profile_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'function_profile_enabled' entry\n");
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
 /* set when tracing only a pid */
 /* set when tracing only a pid */
 struct pid *ftrace_pid_trace;
 struct pid *ftrace_pid_trace;
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
@@ -261,7 +837,6 @@ struct ftrace_func_probe {
 	struct rcu_head		rcu;
 	struct rcu_head		rcu;
 };
 };
 
 
-
 enum {
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
 	rec->flags |= FTRACE_FL_FREE;
 	rec->flags |= FTRACE_FL_FREE;
 }
 }
 
 
-void ftrace_release(void *start, unsigned long size)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
-	unsigned long s = (unsigned long)start;
-	unsigned long e = s + size;
-
-	if (ftrace_disabled || !start)
-		return;
-
-	mutex_lock(&ftrace_lock);
-	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e)) {
-			/*
-			 * rec->ip is changed in ftrace_free_rec()
-			 * It should not between s and e if record was freed.
-			 */
-			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-			ftrace_free_rec(rec);
-		}
-	} while_for_each_ftrace_rec();
-	mutex_unlock(&ftrace_lock);
-}
-
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
 {
 	struct dyn_ftrace *rec;
 	struct dyn_ftrace *rec;
@@ -1408,7 +1959,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 
 
 static struct ftrace_ops trace_probe_ops __read_mostly =
 static struct ftrace_ops trace_probe_ops __read_mostly =
 {
 {
-	.func = function_trace_probe_call,
+	.func		= function_trace_probe_call,
 };
 };
 
 
 static int ftrace_probe_registered;
 static int ftrace_probe_registered;
@@ -1823,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 	ftrace_set_regex(buf, len, reset, 0);
 	ftrace_set_regex(buf, len, reset, 0);
 }
 }
 
 
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+	char *func;
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		ftrace_set_regex(func, strlen(func), 0, enable);
+	}
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+	if (ftrace_filter_buf[0])
+		set_ftrace_early_filter(ftrace_filter_buf, 1);
+	if (ftrace_notrace_buf[0])
+		set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
 static int
 static int
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
 {
@@ -2128,38 +2718,23 @@ static const struct file_operations ftrace_graph_fops = {
 
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("available_filter_functions", 0444,
-				    d_tracer, NULL, &ftrace_avail_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'available_filter_functions' entry\n");
+	trace_create_file("available_filter_functions", 0444,
+			d_tracer, NULL, &ftrace_avail_fops);
 
-	entry = debugfs_create_file("failures", 0444,
-				    d_tracer, NULL, &ftrace_failures_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'failures' entry\n");
+	trace_create_file("failures", 0444,
+			d_tracer, NULL, &ftrace_failures_fops);
 
-	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
-				    NULL, &ftrace_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_filter' entry\n");
+	trace_create_file("set_ftrace_filter", 0644, d_tracer,
+			NULL, &ftrace_filter_fops);
 
-	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+	trace_create_file("set_ftrace_notrace", 0644, d_tracer,
 				    NULL, &ftrace_notrace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_notrace' entry\n");
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+	trace_create_file("set_graph_function", 0444, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 	return 0;
@@ -2197,14 +2772,72 @@ static int ftrace_convert_nops(struct module *mod,
 	return 0;
 }
 
-void ftrace_init_module(struct module *mod,
-			unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long s = (unsigned long)start;
+	unsigned long e = (unsigned long)end;
+
+	if (ftrace_disabled || !start || start == end)
+		return;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+		if ((rec->ip >= s) && (rec->ip < e)) {
+			/*
+			 * rec->ip is changed in ftrace_free_rec()
+			 * It should not between s and e if record was freed.
+			 */
+			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+			ftrace_free_rec(rec);
+		}
+	} while_for_each_ftrace_rec();
+	mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+			       unsigned long *start, unsigned long *end)
 {
 	if (ftrace_disabled || start == end)
 		return;
 	ftrace_convert_nops(mod, start, end);
 }
 
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ftrace_init_module(mod, mod->ftrace_callsites,
+				   mod->ftrace_callsites +
+				   mod->num_ftrace_callsites);
+		break;
+	case MODULE_STATE_GOING:
+		ftrace_release(mod->ftrace_callsites,
+			       mod->ftrace_callsites +
+			       mod->num_ftrace_callsites);
+		break;
+	}
+
+	return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+	.notifier_call = ftrace_module_notify,
+	.priority = 0,
+};
+
 extern unsigned long __start_mcount_loc[];
 extern unsigned long __stop_mcount_loc[];
 
@@ -2236,6 +2869,12 @@ void __init ftrace_init(void)
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
+	ret = register_module_notifier(&ftrace_module_nb);
+	if (ret)
+		pr_warning("Failed to register trace ftrace module notifier\n");
+
+	set_ftrace_early_filters();
+
 	return;
  failed:
 	ftrace_disabled = 1;
@@ -2417,7 +3056,6 @@ static const struct file_operations ftrace_pid_fops = {
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -2425,11 +3063,11 @@ static __init int ftrace_init_debugfs(void)
 
 	ftrace_init_dyn_debugfs(d_tracer);
 
-	entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
-				    NULL, &ftrace_pid_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_pid' entry\n");
+	trace_create_file("set_ftrace_pid", 0644, d_tracer,
+			    NULL, &ftrace_pid_fops);
+
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 fs_initcall(ftrace_init_debugfs);
@@ -2538,7 +3176,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
 static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2690,7 +3328,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	mutex_lock(&ftrace_lock);
 
 	/* we currently allow only one tracer registered at a time */
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -2698,10 +3336,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
 
-	atomic_inc(&ftrace_graph_active);
+	ftrace_graph_active++;
 	ret = start_graph_tracing();
 	if (ret) {
-		atomic_dec(&ftrace_graph_active);
+		ftrace_graph_active--;
 		goto out;
 	}
 
@@ -2719,10 +3357,10 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_lock);
 
-	if (!unlikely(atomic_read(&ftrace_graph_active)))
+	if (unlikely(!ftrace_graph_active))
 		goto out;
 
-	atomic_dec(&ftrace_graph_active);
+	ftrace_graph_active--;
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,7 +3374,7 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);

+ 7 - 3
kernel/trace/kmemtrace.c

@@ -12,7 +12,7 @@
 #include <linux/dcache.h>
 #include <linux/fs.h>
 
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 				   gfp_t gfp_flags,
 				   int node)
 {
+	struct ftrace_event_call *call = &event_kmem_alloc;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_alloc_entry *entry;
 	struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 				  unsigned long call_site,
 				  const void *ptr)
 {
+	struct ftrace_event_call *call = &event_kmem_free;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_free_entry *entry;
 	struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }

File diff content is too large to display
+ 461 - 215
kernel/trace/ring_buffer.c


+ 416 - 0
kernel/trace/ring_buffer_benchmark.c

@@ -0,0 +1,416 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+	u64		ts;
+	local_t		commit;
+	char		data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME	10
+#define SLEEP_TIME	10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
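+/* warn once and set kill_test so the producer/consumer loops stop */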
+#define KILL_TEST()				\
+	do {					\
+		if (!kill_test) {		\
+			kill_test = 1;		\
+			WARN_ON(1);		\
+		}				\
+	} while (0)
+
+enum event_status {
+	EVENT_FOUND,
+	EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+	struct ring_buffer_event *event;
+	int *entry;
+	u64 ts;
+
+	event = ring_buffer_consume(buffer, cpu, &ts);
+	if (!event)
+		return EVENT_DROPPED;
+
+	entry = ring_buffer_event_data(event);
+	if (*entry != cpu) {
+		KILL_TEST();
+		return EVENT_DROPPED;
+	}
+
+	read++;
+	return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+	struct ring_buffer_event *event;
+	struct rb_page *rpage;
+	unsigned long commit;
+	void *bpage;
+	int *entry;
+	int ret;
+	int inc;
+	int i;
+
+	bpage = ring_buffer_alloc_read_page(buffer);
+	if (!bpage)
+		return EVENT_DROPPED;
+
+	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+	if (ret >= 0) {
+		rpage = bpage;
+		commit = local_read(&rpage->commit);
+		for (i = 0; i < commit && !kill_test; i += inc) {
+
+			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+				KILL_TEST();
+				break;
+			}
+
+			inc = -1;
+			event = (void *)&rpage->data[i];
+			switch (event->type_len) {
+			case RINGBUF_TYPE_PADDING:
+				/* We don't expect any padding */
+				KILL_TEST();
+				break;
+			case RINGBUF_TYPE_TIME_EXTEND:
+				inc = 8;
+				break;
+			case 0:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				if (!event->array[0]) {
+					KILL_TEST();
+					break;
+				}
+				inc = event->array[0];
+				break;
+			default:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				inc = ((event->type_len + 1) * 4);
+			}
+			if (kill_test)
+				break;
+
+			if (inc <= 0) {
+				KILL_TEST();
+				break;
+			}
+		}
+	}
+	ring_buffer_free_read_page(buffer, bpage);
+
+	if (ret < 0)
+		return EVENT_DROPPED;
+	return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+	/* toggle between reading pages and events */
+	read_events ^= 1;
+
+	read = 0;
+	while (!reader_finish && !kill_test) {
+		int found;
+
+		do {
+			int cpu;
+
+			found = 0;
+			for_each_online_cpu(cpu) {
+				enum event_status stat;
+
+				if (read_events)
+					stat = read_event(cpu);
+				else
+					stat = read_page(cpu);
+
+				if (kill_test)
+					break;
+				if (stat == EVENT_FOUND)
+					found = 1;
+			}
+		} while (found && !kill_test);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (reader_finish)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	reader_finish = 0;
+	complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+	struct timeval start_tv;
+	struct timeval end_tv;
+	unsigned long long time;
+	unsigned long long entries;
+	unsigned long long overruns;
+	unsigned long missed = 0;
+	unsigned long hit = 0;
+	unsigned long avg;
+	int cnt = 0;
+
+	/*
+	 * Hammer the buffer for 10 secs (this may
+	 * make the system stall)
+	 */
+	pr_info("Starting ring buffer hammer\n");
+	do_gettimeofday(&start_tv);
+	do {
+		struct ring_buffer_event *event;
+		int *entry;
+
+		event = ring_buffer_lock_reserve(buffer, 10);
+		if (!event) {
+			missed++;
+		} else {
+			hit++;
+			entry = ring_buffer_event_data(event);
+			*entry = smp_processor_id();
+			ring_buffer_unlock_commit(buffer, event);
+		}
+		do_gettimeofday(&end_tv);
+
+		cnt++;
+		if (consumer && !(cnt % wakeup_interval))
+			wake_up_process(consumer);
+
+#ifndef CONFIG_PREEMPT
+		/*
+		 * If we are a non preempt kernel, the 10 second run will
+		 * stop everything while it runs. Instead, we will call
+		 * cond_resched and also add any time that was lost by a
+		 * reschedule.
+		 *
+		 * Do a cond resched at the same frequency we would wake up
+		 * the reader.
+		 */
+		if (cnt % wakeup_interval)
+			cond_resched();
+#endif
+
+	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+	pr_info("End ring buffer hammer\n");
+
+	if (consumer) {
+		/* Init both completions here to avoid races */
+		init_completion(&read_start);
+		init_completion(&read_done);
+		/* the completions must be visible before the finish var */
+		smp_wmb();
+		reader_finish = 1;
+		/* finish var visible before waking up the consumer */
+		smp_wmb();
+		wake_up_process(consumer);
+		wait_for_completion(&read_done);
+	}
+
+	time = end_tv.tv_sec - start_tv.tv_sec;
+	time *= USEC_PER_SEC;
+	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+	entries = ring_buffer_entries(buffer);
+	overruns = ring_buffer_overruns(buffer);
+
+	if (kill_test)
+		pr_info("ERROR!\n");
+	pr_info("Time:     %lld (usecs)\n", time);
+	pr_info("Overruns: %lld\n", overruns);
+	if (disable_reader)
+		pr_info("Read:     (reader disabled)\n");
+	else
+		pr_info("Read:     %ld  (by %s)\n", read,
+			read_events ? "events" : "pages");
+	pr_info("Entries:  %lld\n", entries);
+	pr_info("Total:    %lld\n", entries + overruns + read);
+	pr_info("Missed:   %ld\n", missed);
+	pr_info("Hit:      %ld\n", hit);
+
+	/* Convert time from usecs to millisecs */
+	do_div(time, USEC_PER_MSEC);
+	if (time)
+		hit /= (long)time;
+	else
+		pr_info("TIME IS ZERO??\n");
+
+	pr_info("Entries per millisec: %ld\n", hit);
+
+	if (hit) {
+		/* Calculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / hit;
+		pr_info("%ld ns per entry\n", avg);
+	}
+
+	if (missed) {
+		if (time)
+			missed /= (long)time;
+
+		pr_info("Total iterations per millisec: %ld\n", hit + missed);
+
+		/* it is possible that hit + missed will overflow and be zero */
+		if (!(hit + missed)) {
+			pr_info("hit + missed overflowed and totalled zero!\n");
+			hit--; /* make it non zero */
+		}
+
+		/* Calculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / (hit + missed);
+		pr_info("%ld ns per entry\n", avg);
+	}
+}
+
+static void wait_to_die(void)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+	while (!kthread_should_stop() && !kill_test) {
+		complete(&read_start);
+
+		ring_buffer_consumer();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop() || kill_test)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+	init_completion(&read_start);
+
+	while (!kthread_should_stop() && !kill_test) {
+		ring_buffer_reset(buffer);
+
+		if (consumer) {
+			smp_wmb();
+			wake_up_process(consumer);
+			wait_for_completion(&read_start);
+		}
+
+		ring_buffer_producer();
+
+		pr_info("Sleeping for 10 secs\n");
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ * SLEEP_TIME);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+	int ret;
+
+	/* make a one meg buffer in overwrite mode */
+	buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (!disable_reader) {
+		consumer = kthread_create(ring_buffer_consumer_thread,
+					  NULL, "rb_consumer");
+		ret = PTR_ERR(consumer);
+		if (IS_ERR(consumer))
+			goto out_fail;
+	}
+
+	producer = kthread_run(ring_buffer_producer_thread,
+			       NULL, "rb_producer");
+	ret = PTR_ERR(producer);
+
+	if (IS_ERR(producer))
+		goto out_kill;
+
+	return 0;
+
+ out_kill:
+	if (consumer)
+		kthread_stop(consumer);
+
+ out_fail:
+	ring_buffer_free(buffer);
+	return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+	kthread_stop(producer);
+	if (consumer)
+		kthread_stop(consumer);
+	ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");

+ 253 - 130
kernel/trace/trace.c

@@ -171,6 +171,13 @@ static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+				 struct ring_buffer_event *event)
+{
+	return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
+
 cycle_t ftrace_now(int cpu)
 {
 	u64 ts;
@@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+	TRACE_ITER_GRAPH_TIME;
 
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
 	"latency-format",
 	"global-clock",
 	"sleep-time",
+	"graph-time",
 	NULL
 };
 
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 	return cnt;
 	return cnt;
 }
 }
 
 
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
-	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
-	s->buffer[len] = 0;
-	seq_puts(m, s->buffer);
-
-	trace_seq_init(s);
-}
-
 /**
 /**
  * update_max_tr - snapshot all trace buffers from global_trace to max_tr
  * update_max_tr - snapshot all trace buffers from global_trace to max_tr
  * @tr: tracer
  * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 		tracing_reset(tr, cpu);
 }
 }
 
 
+void tracing_reset_current(int cpu)
+{
+	tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+	tracing_reset_online_cpus(&global_trace);
+}
+
 #define SAVED_CMDLINES 128
 #define SAVED_CMDLINES 128
 #define NO_CMDLINE_MAP UINT_MAX
 #define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
 		return;
 		return;
 	}
 	}
 
 
+	preempt_disable();
 	__raw_spin_lock(&trace_cmdline_lock);
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
 	map = map_pid_to_cmdline[pid];
 	if (map != NO_CMDLINE_MAP)
 	if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
 		strcpy(comm, "<...>");
 		strcpy(comm, "<...>");
 
 
 	__raw_spin_unlock(&trace_cmdline_lock);
 	__raw_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
 }
 }
 
 
 void tracing_record_cmdline(struct task_struct *tsk)
 void tracing_record_cmdline(struct task_struct *tsk)
@@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 }
 
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long len,
 						    unsigned long flags, int pc)
 						    unsigned long flags, int pc)
 {
 {
@@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 }
 }
 
 
 struct ring_buffer_event *
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc)
 				  unsigned long flags, int pc)
 {
 {
 	return trace_buffer_lock_reserve(&global_trace,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 					 type, len, flags, pc);
 }
 }
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 					unsigned long flags, int pc)
 {
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
 }
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
 
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 					unsigned long flags, int pc)
 {
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+	ring_buffer_discard_commit(global_trace.buffer, event);
 }
 }
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
 
 
 void
 void
 trace_function(struct trace_array *tr,
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 	       int pc)
 {
 {
+	struct ftrace_event_call *call = &event_function;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
 	struct ftrace_entry *entry;
 
 
@@ -921,7 +941,9 @@ trace_function(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry	= ring_buffer_event_data(event);
 	entry->ip			= ip;
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 	entry->parent_ip		= parent_ip;
-	ring_buffer_unlock_commit(tr->buffer, event);
+
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 }
 }
 
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 				unsigned long flags,
 				unsigned long flags,
 				int pc)
 				int pc)
 {
 {
+	struct ftrace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent_entry *entry;
 	struct ftrace_graph_ent_entry *entry;
 
 
@@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
 	entry->graph_ent			= *trace;
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 
 
 	return 1;
 	return 1;
 }
 }
@@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr,
 				unsigned long flags,
 				unsigned long flags,
 				int pc)
 				int pc)
 {
 {
+	struct ftrace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ret_entry *entry;
 	struct ftrace_graph_ret_entry *entry;
 
 
@@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
 	entry->ret				= *trace;
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 }
 #endif
 #endif
 
 
@@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 				 int skip, int pc)
 				 int skip, int pc)
 {
 {
 #ifdef CONFIG_STACKTRACE
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_entry *entry;
 	struct stack_trace trace;
 	struct stack_trace trace;
@@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 	trace.entries		= entry->caller;
 
 
 	save_stack_trace(&trace);
 	save_stack_trace(&trace);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 #endif
 }
 }
 
 
@@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 				   unsigned long flags, int pc)
 				   unsigned long flags, int pc)
 {
 {
 #ifdef CONFIG_STACKTRACE
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
 	struct stack_trace trace;
@@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 	trace.entries		= entry->caller;
 
 
 	save_stack_trace_user(&trace);
 	save_stack_trace_user(&trace);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 #endif
 }
 }
 
 
@@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
 			   unsigned long flags, int pc)
 {
 {
+	struct ftrace_event_call *call = &event_context_switch;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 	struct ctx_switch_entry *entry;
 
 
@@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
 	entry->next_cpu	= task_cpu(next);
-	trace_buffer_unlock_commit(tr, event, flags, pc);
+
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 }
 
 
 void
 void
@@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
 			   unsigned long flags, int pc)
 {
 {
+	struct ftrace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 	struct ctx_switch_entry *entry;
 
 
@@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 	entry->next_cpu			= task_cpu(wakee);
 
 
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 }
 }
@@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static u32 trace_buf[TRACE_BUF_SIZE];
 	static u32 trace_buf[TRACE_BUF_SIZE];
 
 
+	struct ftrace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
 	struct trace_array_cpu *data;
 	struct bprint_entry *entry;
 	struct bprint_entry *entry;
 	unsigned long flags;
 	unsigned long flags;
+	int disable;
 	int resched;
 	int resched;
 	int cpu, len = 0, size, pc;
 	int cpu, len = 0, size, pc;
 
 
@@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	data = tr->data[cpu];
 
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 		goto out;
 
 
 	/* Lockdep uses trace_printk for lock tracing */
 	/* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 	entry->fmt			= fmt;
 
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 
 out_unlock:
 out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
 	__raw_spin_unlock(&trace_buf_lock);
 	local_irq_restore(flags);
 	local_irq_restore(flags);
 
 
 out:
 out:
+	atomic_dec_return(&data->disabled);
 	ftrace_preempt_enable(resched);
 	ftrace_preempt_enable(resched);
 	unpause_graph_tracing();
 	unpause_graph_tracing();
 
 
@@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 	static char trace_buf[TRACE_BUF_SIZE];
 
 
+	struct ftrace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
 	struct trace_array_cpu *data;
 	int cpu, len = 0, size, pc;
 	int cpu, len = 0, size, pc;
 	struct print_entry *entry;
 	struct print_entry *entry;
 	unsigned long irq_flags;
 	unsigned long irq_flags;
+	int disable;
 
 
 	if (tracing_disabled || tracing_selftest_running)
 	if (tracing_disabled || tracing_selftest_running)
 		return 0;
 		return 0;
@@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	data = tr->data[cpu];
 
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 		goto out;
 
 
 	pause_graph_tracing();
 	pause_graph_tracing();
@@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 
 	memcpy(&entry->buf, trace_buf, len);
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
 	entry->buf[len] = 0;
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 
  out_unlock:
  out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
 	__raw_spin_unlock(&trace_buf_lock);
 	raw_local_irq_restore(irq_flags);
 	raw_local_irq_restore(irq_flags);
 	unpause_graph_tracing();
 	unpause_graph_tracing();
  out:
  out:
+	atomic_dec_return(&data->disabled);
 	preempt_enable_notrace();
 	preempt_enable_notrace();
 
 
 	return len;
 	return len;
@@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		p = s_next(m, p, &l);
 		p = s_next(m, p, &l);
 	}
 	}
 
 
+	trace_event_read_lock();
 	return p;
 	return p;
 }
 }
 
 
 static void s_stop(struct seq_file *m, void *p)
 static void s_stop(struct seq_file *m, void *p)
 {
 {
 	atomic_dec(&trace_record_cmdline_disabled);
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_event_read_unlock();
 }
 }
 
 
 static void print_lat_help_header(struct seq_file *m)
 static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 	return 1;
 }
 }
 
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 {
 	enum print_line_t ret;
 	enum print_line_t ret;
@@ -2396,6 +2444,56 @@ static const struct file_operations tracing_readme_fops = {
 	.read		= tracing_readme_read,
 	.read		= tracing_readme_read,
 };
 };
 
 
+static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+				size_t cnt, loff_t *ppos)
+{
+	char *buf_comm;
+	char *file_buf;
+	char *buf;
+	int len = 0;
+	int pid;
+	int i;
+
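+	/* room for one "pid comm\n" line per saved cmdline entry */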
+	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+	if (!file_buf)
+		return -ENOMEM;
+
+	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+	if (!buf_comm) {
+		kfree(file_buf);
+		return -ENOMEM;
+	}
+
+	buf = file_buf;
+
+	for (i = 0; i < SAVED_CMDLINES; i++) {
+		int r;
+
+		pid = map_cmdline_to_pid[i];
+		if (pid == -1 || pid == NO_CMDLINE_MAP)
+			continue;
+
+		trace_find_cmdline(pid, buf_comm);
+		r = sprintf(buf, "%d %s\n", pid, buf_comm);
+		buf += r;
+		len += r;
+	}
+
+	len = simple_read_from_buffer(ubuf, cnt, ppos,
+				      file_buf, len);
+
+	kfree(file_buf);
+	kfree(buf_comm);
+
+	return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_saved_cmdlines_read,
+};
+
 static ssize_t
 static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 		  size_t cnt, loff_t *ppos)
@@ -2728,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	/* trace pipe does not show start of buffer */
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 	cpumask_setall(iter->started);
 
 
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
 	iter->cpu_file = cpu_file;
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	iter->tr = &global_trace;
 	mutex_init(&iter->mutex);
 	mutex_init(&iter->mutex);
@@ -2915,6 +3016,7 @@ waitagain:
 	       offsetof(struct trace_iterator, seq));
 	       offsetof(struct trace_iterator, seq));
 	iter->pos = -1;
 	iter->pos = -1;
 
 
+	trace_event_read_lock();
 	while (find_next_entry_inc(iter) != NULL) {
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		enum print_line_t ret;
 		int len = iter->seq.len;
 		int len = iter->seq.len;
@@ -2931,6 +3033,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 		if (iter->seq.len >= cnt)
 			break;
 			break;
 	}
 	}
+	trace_event_read_unlock();
 
 
 	/* Now copy what we have to the user */
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3053,6 +3156,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		goto out_err;
 		goto out_err;
 	}
 	}
 
 
+	trace_event_read_lock();
+
 	/* Fill as many pages as possible. */
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3180,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 		trace_seq_init(&iter->seq);
 	}
 	}
 
 
+	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 	mutex_unlock(&iter->mutex);
 
 
 	spd.nr_pages = i;
 	spd.nr_pages = i;
@@ -3425,7 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 		.spd_release	= buffer_spd_release,
 	};
 	};
 	struct buffer_ref *ref;
 	struct buffer_ref *ref;
-	int size, i;
+	int entries, size, i;
 	size_t ret;
 	size_t ret;
 
 
 	if (*ppos & (PAGE_SIZE - 1)) {
 	if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3546,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 		len &= PAGE_MASK;
 	}
 	}
 
 
-	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		struct page *page;
 		int r;
 		int r;
 
 
@@ -3457,7 +3565,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		}
 		}
 
 
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
-					  len, info->cpu, 0);
+					  len, info->cpu, 1);
 		if (r < 0) {
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer,
 			ring_buffer_free_read_page(ref->buffer,
 						   ref->page);
 						   ref->page);
@@ -3481,6 +3589,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].private = (unsigned long)ref;
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
 		*ppos += PAGE_SIZE;
+
+		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 	}
 	}
 
 
 	spd.nr_pages = i;
 	spd.nr_pages = i;
@@ -3508,6 +3618,45 @@ static const struct file_operations tracing_buffers_fops = {
 	.llseek		= no_llseek,
 	.llseek		= no_llseek,
 };
 };
 
 
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	unsigned long cpu = (unsigned long)filp->private_data;
+	struct trace_array *tr = &global_trace;
+	struct trace_seq *s;
+	unsigned long cnt;
+
+	s = kmalloc(sizeof(*s), GFP_ATOMIC);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "entries: %ld\n", cnt);
+
+	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_stats_read,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3746,7 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 static void tracing_init_debugfs_percpu(long cpu)
 {
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
 	struct dentry *d_percpu = tracing_dentry_percpu();
-	struct dentry *entry, *d_cpu;
+	struct dentry *d_cpu;
 	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
 	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
 	char cpu_dir[7];
 	char cpu_dir[7];
 
 
@@ -3612,21 +3761,18 @@ static void tracing_init_debugfs_percpu(long cpu)
 	}
 	}
 
 
 	/* per cpu trace_pipe */
 	/* per cpu trace_pipe */
-	entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
-				(void *) cpu, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+	trace_create_file("trace_pipe", 0444, d_cpu,
+			(void *) cpu, &tracing_pipe_fops);
 
 
 	/* per cpu trace */
 	/* per cpu trace */
-	entry = debugfs_create_file("trace", 0644, d_cpu,
-				(void *) cpu, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
+	trace_create_file("trace", 0644, d_cpu,
+			(void *) cpu, &tracing_fops);
+
+	trace_create_file("trace_pipe_raw", 0444, d_cpu,
+			(void *) cpu, &tracing_buffers_fops);
 
 
-	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
-				    (void *) cpu, &tracing_buffers_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+	trace_create_file("stats", 0444, d_cpu,
+			(void *) cpu, &tracing_stats_fops);
 }
 }
 
 
 #ifdef CONFIG_FTRACE_SELFTEST
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3782,6 +3928,22 @@ static const struct file_operations trace_options_core_fops = {
 	.write = trace_options_core_write,
 	.write = trace_options_core_write,
 };
 };
 
 
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file(name, mode, parent, data, fops);
+	if (!ret)
+		pr_warning("Could not create debugfs '%s' entry\n", name);
+
+	return ret;
+}
+
+
 static struct dentry *trace_options_init_dentry(void)
 static struct dentry *trace_options_init_dentry(void)
 {
 {
 	struct dentry *d_tracer;
 	struct dentry *d_tracer;
@@ -3809,7 +3971,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
 			 struct tracer_opt *opt)
 			 struct tracer_opt *opt)
 {
 {
 	struct dentry *t_options;
 	struct dentry *t_options;
-	struct dentry *entry;
 
 
 	t_options = trace_options_init_dentry();
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 	if (!t_options)
@@ -3818,11 +3979,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
 	topt->flags = flags;
 	topt->flags = flags;
 	topt->opt = opt;
 	topt->opt = opt;
 
 
-	entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
 				    &trace_options_fops);
 				    &trace_options_fops);
 
 
-	topt->entry = entry;
-
 }
 }
 
 
 static struct trace_option_dentry *
 static struct trace_option_dentry *
@@ -3877,123 +4036,84 @@ static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 create_trace_option_core_file(const char *option, long index)
 {
 {
 	struct dentry *t_options;
 	struct dentry *t_options;
-	struct dentry *entry;
 
 
 	t_options = trace_options_init_dentry();
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 	if (!t_options)
 		return NULL;
 		return NULL;
 
 
-	entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+	return trace_create_file(option, 0644, t_options, (void *)index,
 				    &trace_options_core_fops);
 				    &trace_options_core_fops);
-
-	return entry;
 }
 }
 
 
 static __init void create_trace_options_dir(void)
 static __init void create_trace_options_dir(void)
 {
 {
 	struct dentry *t_options;
 	struct dentry *t_options;
-	struct dentry *entry;
 	int i;
 	int i;
 
 
 	t_options = trace_options_init_dentry();
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 	if (!t_options)
 		return;
 		return;
 
 
-	for (i = 0; trace_options[i]; i++) {
-		entry = create_trace_option_core_file(trace_options[i], i);
-		if (!entry)
-			pr_warning("Could not create debugfs %s entry\n",
-				   trace_options[i]);
-	}
+	for (i = 0; trace_options[i]; i++)
+		create_trace_option_core_file(trace_options[i], i);
 }
 }
 
 
 static __init int tracer_init_debugfs(void)
 static __init int tracer_init_debugfs(void)
 {
 {
 	struct dentry *d_tracer;
 	struct dentry *d_tracer;
-	struct dentry *entry;
 	int cpu;
 	int cpu;
 
 
 	d_tracer = tracing_init_dentry();
 	d_tracer = tracing_init_dentry();
 
 
-	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
-				    &global_trace, &tracing_ctrl_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+	trace_create_file("tracing_enabled", 0644, d_tracer,
+			&global_trace, &tracing_ctrl_fops);
 
 
-	entry = debugfs_create_file("trace_options", 0644, d_tracer,
-				    NULL, &tracing_iter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_options' entry\n");
+	trace_create_file("trace_options", 0644, d_tracer,
+			NULL, &tracing_iter_fops);
 
 
-	create_trace_options_dir();
+	trace_create_file("tracing_cpumask", 0644, d_tracer,
+			NULL, &tracing_cpumask_fops);
+
+	trace_create_file("trace", 0644, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
 
 
-	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
-				    NULL, &tracing_cpumask_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
-	entry = debugfs_create_file("trace", 0644, d_tracer,
-				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
-
-	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
-				    &global_trace, &show_traces_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
-	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
-				    &global_trace, &set_tracer_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
-	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
-				    &tracing_max_latency,
-				    &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_max_latency' entry\n");
-
-	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
-				    &tracing_thresh, &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_thresh' entry\n");
-	entry = debugfs_create_file("README", 0644, d_tracer,
-				    NULL, &tracing_readme_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'README' entry\n");
-
-	entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+	trace_create_file("available_tracers", 0444, d_tracer,
+			&global_trace, &show_traces_fops);
+
+	trace_create_file("current_tracer", 0644, d_tracer,
+			&global_trace, &set_tracer_fops);
+
+	trace_create_file("tracing_max_latency", 0644, d_tracer,
+			&tracing_max_latency, &tracing_max_lat_fops);
+
+	trace_create_file("tracing_thresh", 0644, d_tracer,
+			&tracing_thresh, &tracing_max_lat_fops);
+
+	trace_create_file("README", 0444, d_tracer,
+			NULL, &tracing_readme_fops);
+
+	trace_create_file("trace_pipe", 0444, d_tracer,
 			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
 			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_pipe' entry\n");
-
-	entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
-				    &global_trace, &tracing_entries_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'buffer_size_kb' entry\n");
-
-	entry = debugfs_create_file("trace_marker", 0220, d_tracer,
-				    NULL, &tracing_mark_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_marker' entry\n");
+
+	trace_create_file("buffer_size_kb", 0644, d_tracer,
+			&global_trace, &tracing_entries_fops);
+
+	trace_create_file("trace_marker", 0220, d_tracer,
+			NULL, &tracing_mark_fops);
+
+	trace_create_file("saved_cmdlines", 0444, d_tracer,
+			NULL, &tracing_saved_cmdlines_fops);
 
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
-	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-				    &ftrace_update_tot_cnt,
-				    &tracing_dyn_info_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'dyn_ftrace_total_info' entry\n");
+	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
 #endif
 #ifdef CONFIG_SYSPROF_TRACER
 #ifdef CONFIG_SYSPROF_TRACER
 	init_tracer_sysprof_debugfs(d_tracer);
 	init_tracer_sysprof_debugfs(d_tracer);
 #endif
 #endif
 
 
+	create_trace_options_dir();
+
 	for_each_tracing_cpu(cpu)
 	for_each_tracing_cpu(cpu)
 		tracing_init_debugfs_percpu(cpu);
 		tracing_init_debugfs_percpu(cpu);
 
 
@@ -4064,7 +4184,8 @@ trace_printk_seq(struct trace_seq *s)
 
 
 static void __ftrace_dump(bool disable_tracing)
 static void __ftrace_dump(bool disable_tracing)
 {
 {
-	static DEFINE_SPINLOCK(ftrace_dump_lock);
+	static raw_spinlock_t ftrace_dump_lock =
+		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	/* use static because iter can be a bit big for the stack */
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
 	static struct trace_iterator iter;
 	unsigned int old_userobj;
 	unsigned int old_userobj;
@@ -4073,7 +4194,8 @@ static void __ftrace_dump(bool disable_tracing)
 	int cnt = 0, cpu;
 	int cnt = 0, cpu;
 
 
 	/* only one dump */
 	/* only one dump */
-	spin_lock_irqsave(&ftrace_dump_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&ftrace_dump_lock);
 	if (dump_ran)
 	if (dump_ran)
 		goto out;
 		goto out;
 
 
@@ -4145,7 +4267,8 @@ static void __ftrace_dump(bool disable_tracing)
 	}
 	}
 
 
  out:
  out:
-	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+	__raw_spin_unlock(&ftrace_dump_lock);
+	local_irq_restore(flags);
 }
 }
 
 
 /* By default: disable tracing after the dump */
 /* By default: disable tracing after the dump */

+ 98 - 145
kernel/trace/trace.h

@@ -9,9 +9,12 @@
 #include <linux/mmiotrace.h>
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <trace/power.h>
 #include <trace/power.h>
 
 
+#include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
+
 enum trace_type {
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 	__TRACE_FIRST_TYPE = 0,
 
 
@@ -41,20 +44,6 @@ enum trace_type {
 	__TRACE_LAST_TYPE,
 	__TRACE_LAST_TYPE,
 };
 };
 
 
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-	unsigned char		type;
-	unsigned char		flags;
-	unsigned char		preempt_count;
-	int			pid;
-	int			tgid;
-};
-
 /*
 /*
  * Function trace entry - function address and parent function addres:
  * Function trace entry - function address and parent function addres:
  */
  */
@@ -263,8 +252,6 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 	char			comm[TASK_COMM_LEN];
 };
 };
 
 
-struct trace_iterator;
-
 /*
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
  * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
 		__ftrace_bad_type();					\
 		__ftrace_bad_type();					\
 	} while (0)
 	} while (0)
 
 
-/* Return values for print_line callback */
-enum print_line_t {
-	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
-	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
-	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
-};
-
-
 /*
 /*
  * An option specific to a tracer. This is a boolean value.
  * An option specific to a tracer. This is a boolean value.
  * The bit is the bit index that sets its value on the
  * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
 	struct tracer_stat	*stats;
 	struct tracer_stat	*stats;
 };
 };
 
 
-struct trace_seq {
-	unsigned char		buffer[PAGE_SIZE];
-	unsigned int		len;
-	unsigned int		readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
-	s->len = 0;
-	s->readpos = 0;
-}
-
 
 
 #define TRACE_PIPE_ALL_CPU	-1
 #define TRACE_PIPE_ALL_CPU	-1
 
 
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-	struct trace_array	*tr;
-	struct tracer		*trace;
-	void			*private;
-	int			cpu_file;
-	struct mutex		mutex;
-	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
-
-	/* The below is zeroed out in pipe_read */
-	struct trace_seq	seq;
-	struct trace_entry	*ent;
-	int			cpu;
-	u64			ts;
-
-	unsigned long		iter_flags;
-	loff_t			pos;
-	long			idx;
-
-	cpumask_var_t		started;
-};
-
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
 void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops);
+
 struct dentry *tracing_init_dentry(void);
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
 
 struct ring_buffer_event;
 struct ring_buffer_event;
 
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long len,
 						    unsigned long flags,
 						    unsigned long flags,
 						    int pc);
 						    int pc);
@@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 				struct ring_buffer_event *event,
 				struct ring_buffer_event *event,
 				unsigned long flags, int pc);
 				unsigned long flags, int pc);
 
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
-				  unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 						struct trace_array_cpu *data);
 
 
@@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct task_struct *prev,
 				struct task_struct *prev,
 				struct task_struct *next,
 				struct task_struct *next,
 				unsigned long flags, int pc);
 				unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
 
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
 void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct task_struct *wakee,
 				struct task_struct *wakee,
@@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
 					 struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+					      struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 
 extern void *head_page(struct trace_array_cpu *data);
 extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +554,8 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
 /* TODO: make this variable */
@@ -644,7 +587,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
 	return 1;
 	return 1;
 }
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_DYNAMIC_FTRACE */
-
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 print_graph_function(struct trace_iterator *iter)
@@ -692,6 +634,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 	TRACE_ITER_SLEEP_TIME		= 0x100000,
 	TRACE_ITER_SLEEP_TIME		= 0x100000,
+	TRACE_ITER_GRAPH_TIME		= 0x200000,
 };
 };
 
 
 /*
 /*
@@ -790,103 +733,113 @@ struct ftrace_event_field {
 	char			*type;
 	char			*type;
 	int			offset;
 	int			offset;
 	int			size;
 	int			size;
+	int			is_signed;
 };
 };
 
 
-struct ftrace_event_call {
-	char			*name;
-	char			*system;
-	struct dentry		*dir;
-	int			enabled;
-	int			(*regfunc)(void);
-	void			(*unregfunc)(void);
-	int			id;
-	int			(*raw_init)(void);
-	int			(*show_format)(struct trace_seq *s);
-	int			(*define_fields)(void);
-	struct list_head	fields;
+struct event_filter {
+	int			n_preds;
 	struct filter_pred	**preds;
 	struct filter_pred	**preds;
-
-#ifdef CONFIG_EVENT_PROFILE
-	atomic_t	profile_count;
-	int		(*profile_enable)(struct ftrace_event_call *);
-	void		(*profile_disable)(struct ftrace_event_call *);
-#endif
+	char			*filter_string;
 };
 };
 
 
 struct event_subsystem {
 struct event_subsystem {
 	struct list_head	list;
 	struct list_head	list;
 	const char		*name;
 	const char		*name;
 	struct dentry		*entry;
 	struct dentry		*entry;
-	struct filter_pred	**preds;
+	void			*filter;
 };
 };
 
 
-#define events_for_each(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
-#define MAX_FILTER_PRED 8
-
 struct filter_pred;
 struct filter_pred;
 
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+				 int val1, int val2);
 
 
 struct filter_pred {
 struct filter_pred {
 	filter_pred_fn_t fn;
 	filter_pred_fn_t fn;
 	u64 val;
 	u64 val;
-	char *str_val;
+	char str_val[MAX_FILTER_STR_VAL];
 	int str_len;
 	int str_len;
 	char *field_name;
 	char *field_name;
 	int offset;
 	int offset;
 	int not;
 	int not;
-	int or;
-	int compound;
-	int clear;
+	int op;
+	int pop_n;
 };
 };
 
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size);
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void print_event_filter(struct ftrace_event_call *call,
 			       struct trace_seq *s);
 			       struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
-			   struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
-				     struct filter_pred *pred);
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
+extern int apply_event_filter(struct ftrace_event_call *call,
+			      char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+					char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
+					 struct trace_seq *s);
+
+static inline int
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer *buffer,
+		     struct ring_buffer_event *event)
+{
+	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+		ring_buffer_discard_commit(buffer, event);
+		return 1;
+	}
+
+	return 0;
+}
+
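This helper is meant to pair with trace_buffer_lock_reserve()/ring_buffer_unlock_commit(); the trace_branch.c hunk further down adopts exactly this pattern. A sketch using the branch entry, where call is the &event_branch produced by the externs at the end of this header:

	struct ring_buffer_event *event;
	struct trace_branch *entry;

	event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
					  sizeof(*entry), flags, pc);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	/* ... fill in *entry ... */

	/* commit only if the entry passes the event's active filter */
	if (!filter_check_discard(call, entry, tr->buffer, event))
		ring_buffer_unlock_commit(tr->buffer, event);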
+#define DEFINE_COMPARISON_PRED(type)					\
+static int filter_pred_##type(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	type *addr = (type *)(event + pred->offset);			\
+	type val = (type)pred->val;					\
+	int match = 0;							\
+									\
+	switch (pred->op) {						\
+	case OP_LT:							\
+		match = (*addr < val);					\
+		break;							\
+	case OP_LE:							\
+		match = (*addr <= val);					\
+		break;							\
+	case OP_GT:							\
+		match = (*addr > val);					\
+		break;							\
+	case OP_GE:							\
+		match = (*addr >= val);					\
+		break;							\
+	default:							\
+		break;							\
+	}								\
+									\
+	return match;							\
+}
+
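For reference, DEFINE_COMPARISON_PRED(u32) (one of the instances stamped out in trace_events_filter.c below) expands to roughly:

static int filter_pred_u32(struct filter_pred *pred, void *event,
			   int val1, int val2)
{
	u32 *addr = (u32 *)(event + pred->offset);
	u32 val = (u32)pred->val;
	int match = 0;

	switch (pred->op) {
	case OP_LT:	match = (*addr < val);	break;
	case OP_LE:	match = (*addr <= val);	break;
	case OP_GT:	match = (*addr > val);	break;
	case OP_GE:	match = (*addr >= val);	break;
	default:	break;
	}

	return match;
}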
+#define DEFINE_EQUALITY_PRED(size)					\
+static int filter_pred_##size(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	u##size *addr = (u##size *)(event + pred->offset);		\
+	u##size val = (u##size)pred->val;				\
+	int match;							\
+									\
+	match = (val == *addr) ^ pred->not;				\
+									\
+	return match;							\
+}
+
+extern struct mutex event_mutex;
+extern struct list_head ftrace_events;
 
 
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
 
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)				\
-do {									\
-	__trace_printk_check_format(fmt, ##args);			\
-	tracing_record_cmdline(current);				\
-	if (__builtin_constant_p(fmt)) {				\
-		static const char *trace_printk_fmt			\
-		  __attribute__((section("__trace_printk_fmt"))) =	\
-			__builtin_constant_p(fmt) ? fmt : NULL;		\
-									\
-		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
-	} else								\
-		__trace_printk(ip, fmt, ##args);			\
-} while (0)
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+	extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
+#include "trace_event_types.h"
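With these definitions in place, re-including trace_event_types.h turns every event description into an extern declaration; for the branch event defined there this boils down to:

	extern struct ftrace_event_call event_branch;

which is what lets trace_branch.c below take &event_branch for filter_check_discard().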
 
 
 #endif /* _LINUX_KERNEL_TRACE_H */
 #endif /* _LINUX_KERNEL_TRACE_H */

+ 3 - 2
kernel/trace/trace_boot.c

@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
 #include <linux/kallsyms.h>
+#include <linux/time.h>
 
 
 #include "trace.h"
 #include "trace.h"
 #include "trace_output.h"
 #include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	trace_assign_type(field, entry);
 	call = &field->boot_call;
 	call = &field->boot_call;
 	ts = iter->ts;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
 	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
 			(unsigned long)ts, nsec_rem, call->func, call->caller);
 			(unsigned long)ts, nsec_rem, call->func, call->caller);
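The only functional change in this file is spelling out 1000000000 as NSEC_PER_SEC (hence the new <linux/time.h> include). As a reminder of the do_div() semantics these lines rely on (divide in place, return the remainder):

	unsigned long long ts = 5123456789ULL;	/* 5.123456789 s */
	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
	/* now ts == 5 and nsec_rem == 123456789, printed as "[    5.123456789]" */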
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	trace_assign_type(field, entry);
 	init_ret = &field->boot_ret;
 	init_ret = &field->boot_ret;
 	ts = iter->ts;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
 	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
 			"returned %d after %llu msecs\n",
 			"returned %d after %llu msecs\n",

+ 5 - 3
kernel/trace/trace_branch.c

@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
 static void
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
 {
+	struct ftrace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
 	struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->line = f->line;
 	entry->correct = val == expect;
 	entry->correct = val == expect;
 
 
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 
  out:
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
 	atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
 	return 0;
 	return 0;
 }
 }
 
 
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
 {
 {
 	return __start_annotated_branch_profile;
 	return __start_annotated_branch_profile;
 }
 }
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
 	return 0;
 	return 0;
 }
 }
 
 
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
 {
 {
 	return __start_branch_profile;
 	return __start_branch_profile;
 }
 }

+ 16 - 8
kernel/trace/trace_event_profile.c

@@ -10,22 +10,30 @@
 int ftrace_profile_enable(int event_id)
 int ftrace_profile_enable(int event_id)
 {
 {
 	struct ftrace_event_call *event;
 	struct ftrace_event_call *event;
+	int ret = -EINVAL;
 
 
-	for_each_event(event) {
-		if (event->id == event_id)
-			return event->profile_enable(event);
+	mutex_lock(&event_mutex);
+	list_for_each_entry(event, &ftrace_events, list) {
+		if (event->id == event_id) {
+			ret = event->profile_enable(event);
+			break;
+		}
 	}
 	}
+	mutex_unlock(&event_mutex);
 
 
-	return -EINVAL;
+	return ret;
 }
 }
 
 
 void ftrace_profile_disable(int event_id)
 void ftrace_profile_disable(int event_id)
 {
 {
 	struct ftrace_event_call *event;
 	struct ftrace_event_call *event;
 
 
-	for_each_event(event) {
-		if (event->id == event_id)
-			return event->profile_disable(event);
+	mutex_lock(&event_mutex);
+	list_for_each_entry(event, &ftrace_events, list) {
+		if (event->id == event_id) {
+			event->profile_disable(event);
+			break;
+		}
 	}
 	}
+	mutex_unlock(&event_mutex);
 }
 }
-

+ 7 - 5
kernel/trace/trace_event_types.h

@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
 	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 );
 
 
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
 	TRACE_STRUCT(
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, arg1, arg1)
 		TRACE_FIELD(unsigned long, arg1, arg1)
 		TRACE_FIELD(unsigned long, arg2, arg2)
 		TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 	TRACE_STRUCT(
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned int, line, line)
 		TRACE_FIELD(unsigned int, line, line)
-		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
-		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+				    TRACE_FUNC_SIZE+1, func)
+		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+				    TRACE_FUNC_SIZE+1, file)
 		TRACE_FIELD(char, correct, correct)
 		TRACE_FIELD(char, correct, correct)
 	),
 	),
 	TP_RAW_FMT("%u:%s:%s (%u)")
 	TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 	TRACE_STRUCT(
 	TRACE_STRUCT(
-		TRACE_FIELD(ktime_t, state_data.stamp, stamp)
-		TRACE_FIELD(ktime_t, state_data.end, end)
+		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.state, state)
 		TRACE_FIELD(int, state_data.state, state)
 	),
 	),

+ 700 - 139
kernel/trace/trace_events.c

@@ -8,19 +8,25 @@
  *
  *
  */
  */
 
 
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
 #include <linux/debugfs.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/ctype.h>
+#include <linux/delay.h>
 
 
 #include "trace_output.h"
 #include "trace_output.h"
 
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
 
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
+
+LIST_HEAD(ftrace_events);
 
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
 int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size)
+		       char *name, int offset, int size, int is_signed)
 {
 {
 	struct ftrace_event_field *field;
 	struct ftrace_event_field *field;
 
 
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
 
 	field->offset = offset;
 	field->offset = offset;
 	field->size = size;
 	field->size = size;
+	field->is_signed = is_signed;
 	list_add(&field->link, &call->fields);
 	list_add(&field->link, &call->fields);
 
 
 	return 0;
 	return 0;
@@ -51,47 +58,94 @@ err:
 
 
 	return -ENOMEM;
 	return -ENOMEM;
 }
 }
+EXPORT_SYMBOL_GPL(trace_define_field);
 
 
-static void ftrace_clear_events(void)
-{
-	struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+#ifdef CONFIG_MODULES
 
 
-	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+	struct ftrace_event_field *field, *next;
 
 
-		if (call->enabled) {
-			call->enabled = 0;
-			call->unregfunc();
-		}
-		call++;
+	list_for_each_entry_safe(field, next, &call->fields, link) {
+		list_del(&field->link);
+		kfree(field->type);
+		kfree(field->name);
+		kfree(field);
 	}
 	}
 }
 }
 
 
+#endif /* CONFIG_MODULES */
+
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 					int enable)
 {
 {
-
 	switch (enable) {
 	switch (enable) {
 	case 0:
 	case 0:
 		if (call->enabled) {
 		if (call->enabled) {
 			call->enabled = 0;
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 			call->unregfunc();
 		}
 		}
 		break;
 		break;
 	case 1:
 	case 1:
 		if (!call->enabled) {
 		if (!call->enabled) {
 			call->enabled = 1;
 			call->enabled = 1;
+			tracing_start_cmdline_record();
 			call->regfunc();
 			call->regfunc();
 		}
 		}
 		break;
 		break;
 	}
 	}
 }
 }
 
 
+static void ftrace_clear_events(void)
+{
+	struct ftrace_event_call *call;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		ftrace_event_enable_disable(call, 0);
+	}
+	mutex_unlock(&event_mutex);
+}
+
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+				  const char *event, int set)
+{
+	struct ftrace_event_call *call;
+	int ret = -EINVAL;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (match &&
+		    strcmp(match, call->name) != 0 &&
+		    strcmp(match, call->system) != 0)
+			continue;
+
+		if (sub && strcmp(sub, call->system) != 0)
+			continue;
+
+		if (event && strcmp(event, call->name) != 0)
+			continue;
+
+		ftrace_event_enable_disable(call, set);
+
+		ret = 0;
+	}
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
 static int ftrace_set_clr_event(char *buf, int set)
 static int ftrace_set_clr_event(char *buf, int set)
 {
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
 	char *event = NULL, *sub = NULL, *match;
 	char *event = NULL, *sub = NULL, *match;
-	int ret = -EINVAL;
 
 
 	/*
 	/*
 	 * The buf format can be <subsystem>:<event-name>
 	 * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 			event = NULL;
 	}
 	}
 
 
-	mutex_lock(&event_mutex);
-	for_each_event(call) {
-
-		if (!call->name || !call->regfunc)
-			continue;
-
-		if (match &&
-		    strcmp(match, call->name) != 0 &&
-		    strcmp(match, call->system) != 0)
-			continue;
-
-		if (sub && strcmp(sub, call->system) != 0)
-			continue;
-
-		if (event && strcmp(event, call->name) != 0)
-			continue;
-
-		ftrace_event_enable_disable(call, set);
-
-		ret = 0;
-	}
-	mutex_unlock(&event_mutex);
+	return __ftrace_set_clr_event(match, sub, event, set);
+}
 
 
-	return ret;
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+	return __ftrace_set_clr_event(NULL, system, event, set);
 }
 }
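A hypothetical in-kernel caller of the new export (the subsystem and event names here are purely illustrative):

	/* enable every event in the "sched" subsystem */
	ret = trace_set_clr_event("sched", NULL, 1);

	/* later, disable a single event again */
	trace_set_clr_event("sched", "sched_switch", 0);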
 
 
 /* 128 should be much more than enough */
 /* 128 should be much more than enough */
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next = call;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 
 	(*pos)++;
 	(*pos)++;
 
 
 	for (;;) {
 	for (;;) {
-		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		if (list == &ftrace_events)
 			return NULL;
 			return NULL;
 
 
+		call = list_entry(list, struct ftrace_event_call, list);
+
 		/*
 		/*
 		 * The ftrace subsystem is for showing formats only.
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
 		 * They can not be enabled or disabled via the event files.
@@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		if (call->regfunc)
 		if (call->regfunc)
 			break;
 			break;
 
 
-		call++;
-		next = call;
+		list = list->next;
 	}
 	}
 
 
-	m->private = ++next;
+	m->private = list->next;
 
 
 	return call;
 	return call;
 }
 }
 
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return t_next(m, NULL, pos);
 	return t_next(m, NULL, pos);
 }
 }
 
 
 static void *
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 
 	(*pos)++;
 	(*pos)++;
 
 
  retry:
  retry:
-	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+	if (list == &ftrace_events)
 		return NULL;
 		return NULL;
 
 
+	call = list_entry(list, struct ftrace_event_call, list);
+
 	if (!call->enabled) {
 	if (!call->enabled) {
-		call++;
+		list = list->next;
 		goto retry;
 		goto retry;
 	}
 	}
 
 
-	next = call;
-	m->private = ++next;
+	m->private = list->next;
 
 
 	return call;
 	return call;
 }
 }
 
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return s_next(m, NULL, pos);
 	return s_next(m, NULL, pos);
 }
 }
 
 
@@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v)
 
 
 static void t_stop(struct seq_file *m, void *p)
 static void t_stop(struct seq_file *m, void *p)
 {
 {
+	mutex_unlock(&event_mutex);
 }
 }
 
 
 static int
 static int
 ftrace_event_seq_open(struct inode *inode, struct file *file)
 ftrace_event_seq_open(struct inode *inode, struct file *file)
 {
 {
-	int ret;
 	const struct seq_operations *seq_ops;
 	const struct seq_operations *seq_ops;
 
 
 	if ((file->f_mode & FMODE_WRITE) &&
 	if ((file->f_mode & FMODE_WRITE) &&
@@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 		ftrace_clear_events();
 		ftrace_clear_events();
 
 
 	seq_ops = inode->i_private;
 	seq_ops = inode->i_private;
-	ret = seq_open(file, seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-
-		m->private = __start_ftrace_events;
-	}
-	return ret;
+	return seq_open(file, seq_ops);
 }
 }
 
 
 static ssize_t
 static ssize_t
@@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 	return cnt;
 }
 }
 
 
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	const char set_to_char[4] = { '?', '0', '1', 'X' };
+	const char *system = filp->private_data;
+	struct ftrace_event_call *call;
+	char buf[2];
+	int set = 0;
+	int ret;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (system && strcmp(call->system, system) != 0)
+			continue;
+
+		/*
+		 * We need to find out if all the events are set
+		 * or if all events are cleared, or if we have
+		 * a mixture.
+		 */
+		set |= (1 << !!call->enabled);
+
+		/*
+		 * If we have a mixture, no need to look further.
+		 */
+		if (set == 3)
+			break;
+	}
+	mutex_unlock(&event_mutex);
+
+	buf[0] = set_to_char[set];
+	buf[1] = '\n';
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+	return ret;
+}
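The summary character returned for a subsystem's "enable" file follows directly from set |= (1 << !!call->enabled) and set_to_char[]:

	/*
	 * no matching events     -> set == 0 -> '?'
	 * all matching disabled  -> set == 1 -> '0'
	 * all matching enabled   -> set == 2 -> '1'
	 * a mixture of the two   -> set == 3 -> 'X'
	 */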
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		    loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	unsigned long val;
+	char buf[64];
+	ssize_t ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	ret = __ftrace_set_clr_event(NULL, system, NULL, val);
+	if (ret)
+		goto out;
+
+	ret = cnt;
+
+out:
+	*ppos += cnt;
+
+	return ret;
+}
+
+extern char *__bad_type_size(void);
+
 #undef FIELD
 #undef FIELD
 #define FIELD(type, name)						\
 #define FIELD(type, name)						\
+	sizeof(type) != sizeof(field.name) ? __bad_type_size() :	\
 	#type, "common_" #name, offsetof(typeof(field), name),		\
 	#type, "common_" #name, offsetof(typeof(field), name),		\
 		sizeof(field.name)
 		sizeof(field.name)
 
 
@@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				"\n",
-				FIELD(unsigned char, type),
+				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
 				FIELD(int, pid),
@@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 
 	trace_seq_init(s);
 	trace_seq_init(s);
 
 
-	filter_print_preds(call->preds, s);
+	print_event_filter(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 
 	kfree(s);
 	kfree(s);
@@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 		   loff_t *ppos)
 {
 {
 	struct ftrace_event_call *call = filp->private_data;
 	struct ftrace_event_call *call = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 	int err;
 
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_free_preds(call);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
 	}
+	buf[cnt] = '\0';
 
 
-	err = filter_add_pred(call, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_event_filter(call, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
 		return err;
-	}
 
 
 	*ppos += cnt;
 	*ppos += cnt;
 
 
@@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 
 	trace_seq_init(s);
 	trace_seq_init(s);
 
 
-	filter_print_preds(system->preds, s);
+	print_subsystem_event_filter(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 
 	kfree(s);
 	kfree(s);
@@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 		       loff_t *ppos)
 {
 {
 	struct event_subsystem *system = filp->private_data;
 	struct event_subsystem *system = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 	int err;
 
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_free_subsystem_preds(system);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
 	}
+	buf[cnt] = '\0';
 
 
-	err = filter_add_subsystem_pred(system, pred);
-	if (err < 0) {
-		filter_free_subsystem_preds(system);
-		filter_free_pred(pred);
+	err = apply_subsystem_event_filter(system, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
 		return err;
-	}
 
 
 	*ppos += cnt;
 	*ppos += cnt;
 
 
 	return cnt;
 	return cnt;
 }
 }
 
 
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	func(s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.start = t_start,
 	.next = t_next,
 	.next = t_next,
@@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 	.write = subsystem_filter_write,
 };
 };
 
 
+static const struct file_operations ftrace_system_enable_fops = {
+	.open = tracing_open_generic,
+	.read = system_enable_read,
+	.write = system_enable_write,
+};
+
+static const struct file_operations ftrace_show_header_fops = {
+	.open = tracing_open_generic,
+	.read = show_header,
+};
+
 static struct dentry *event_trace_events_dir(void)
 static struct dentry *event_trace_events_dir(void)
 {
 {
 	static struct dentry *d_tracer;
 	static struct dentry *d_tracer;
@@ -684,6 +829,7 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 {
 	struct event_subsystem *system;
 	struct event_subsystem *system;
+	struct dentry *entry;
 
 
 	/* First see if we did not already create this dir */
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 		return d_events;
 		return d_events;
 	}
 	}
 
 
-	system->name = name;
+	system->name = kstrdup(name, GFP_KERNEL);
+	if (!system->name) {
+		debugfs_remove(system->entry);
+		kfree(system);
+		return d_events;
+	}
+
 	list_add(&system->list, &event_subsystems);
 	list_add(&system->list, &event_subsystems);
 
 
-	system->preds = NULL;
+	system->filter = NULL;
+
+	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+	if (!system->filter) {
+		pr_warning("Could not allocate filter for subsystem "
+			   "'%s'\n", name);
+		return system->entry;
+	}
+
+	entry = debugfs_create_file("filter", 0644, system->entry, system,
+				    &ftrace_subsystem_filter_fops);
+	if (!entry) {
+		kfree(system->filter);
+		system->filter = NULL;
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", name);
+	}
+
+	entry = trace_create_file("enable", 0644, system->entry,
+				  (void *)system->name,
+				  &ftrace_system_enable_fops);
 
 
 	return system->entry;
 	return system->entry;
 }
 }
 
 
 static int
 static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+		 const struct file_operations *id,
+		 const struct file_operations *enable,
+		 const struct file_operations *filter,
+		 const struct file_operations *format)
 {
 {
 	struct dentry *entry;
 	struct dentry *entry;
 	int ret;
 	int ret;
@@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	 * If the trace point header did not define TRACE_SYSTEM
 	 * If the trace point header did not define TRACE_SYSTEM
 	 * then the system would be called "TRACE_SYSTEM".
 	 * then the system would be called "TRACE_SYSTEM".
 	 */
 	 */
-	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+	if (strcmp(call->system, TRACE_SYSTEM) != 0)
 		d_events = event_subsystem_dir(call->system, d_events);
 		d_events = event_subsystem_dir(call->system, d_events);
 
 
 	if (call->raw_init) {
 	if (call->raw_init) {
@@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return -1;
 		return -1;
 	}
 	}
 
 
-	if (call->regfunc) {
-		entry = debugfs_create_file("enable", 0644, call->dir, call,
-					    &ftrace_enable_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/enable' entry\n", call->name);
-	}
+	if (call->regfunc)
+		entry = trace_create_file("enable", 0644, call->dir, call,
+					  enable);
 
 
-	if (call->id) {
-		entry = debugfs_create_file("id", 0444, call->dir, call,
-				&ftrace_event_id_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs '%s/id' entry\n",
-					call->name);
-	}
+	if (call->id)
+		entry = trace_create_file("id", 0444, call->dir, call,
+					  id);
 
 
 	if (call->define_fields) {
 	if (call->define_fields) {
 		ret = call->define_fields();
 		ret = call->define_fields();
@@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   " events/%s\n", call->name);
 				   " events/%s\n", call->name);
 			return ret;
 			return ret;
 		}
 		}
-		entry = debugfs_create_file("filter", 0644, call->dir, call,
-					    &ftrace_event_filter_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/filter' entry\n", call->name);
+		entry = trace_create_file("filter", 0644, call->dir, call,
+					  filter);
 	}
 	}
 
 
 	/* A trace may not want to export its format */
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 	if (!call->show_format)
 		return 0;
 		return 0;
 
 
-	entry = debugfs_create_file("format", 0444, call->dir, call,
-				    &ftrace_event_format_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/format' entry\n", call->name);
+	entry = trace_create_file("format", 0444, call->dir, call,
+				  format);
+
+	return 0;
+}
+
+#define for_each_event(event, start, end)			\
+	for (event = start;					\
+	     (unsigned long)event < (unsigned long)end;		\
+	     event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+	struct list_head		list;
+	struct module			*mod;
+	struct file_operations		id;
+	struct file_operations		enable;
+	struct file_operations		format;
+	struct file_operations		filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops;
+
+	/*
+	 * This is a bit of a PITA. To allow for correct reference
+	 * counting, modules must "own" their file_operations.
+	 * To do this, we allocate the file operations that will be
+	 * used in the event directory.
+	 */
+
+	file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+	if (!file_ops)
+		return NULL;
+
+	file_ops->mod = mod;
+
+	file_ops->id = ftrace_event_id_fops;
+	file_ops->id.owner = mod;
+
+	file_ops->enable = ftrace_enable_fops;
+	file_ops->enable.owner = mod;
+
+	file_ops->filter = ftrace_event_filter_fops;
+	file_ops->filter.owner = mod;
+
+	file_ops->format = ftrace_event_format_fops;
+	file_ops->format.owner = mod;
+
+	list_add(&file_ops->list, &ftrace_module_file_list);
+
+	return file_ops;
+}
+
+static void trace_module_add_events(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops = NULL;
+	struct ftrace_event_call *call, *start, *end;
+	struct dentry *d_events;
+
+	start = mod->trace_events;
+	end = mod->trace_events + mod->num_trace_events;
+
+	if (start == end)
+		return;
+
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return;
+
+	for_each_event(call, start, end) {
+		/* The linker may leave blanks */
+		if (!call->name)
+			continue;
+
+		/*
+		 * This module has events, create file ops for this module
+		 * if not already done.
+		 */
+		if (!file_ops) {
+			file_ops = trace_create_file_ops(mod);
+			if (!file_ops)
+				return;
+		}
+		call->mod = mod;
+		list_add(&call->list, &ftrace_events);
+		event_create_dir(call, d_events,
+				 &file_ops->id, &file_ops->enable,
+				 &file_ops->filter, &file_ops->format);
+	}
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops;
+	struct ftrace_event_call *call, *p;
+	bool found = false;
+
+	down_write(&trace_event_mutex);
+	list_for_each_entry_safe(call, p, &ftrace_events, list) {
+		if (call->mod == mod) {
+			found = true;
+			ftrace_event_enable_disable(call, 0);
+			if (call->event)
+				__unregister_ftrace_event(call->event);
+			debugfs_remove_recursive(call->dir);
+			list_del(&call->list);
+			trace_destroy_fields(call);
+			destroy_preds(call);
+		}
+	}
+
+	/* Now free the file_operations */
+	list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+		if (file_ops->mod == mod)
+			break;
+	}
+	if (&file_ops->list != &ftrace_module_file_list) {
+		list_del(&file_ops->list);
+		kfree(file_ops);
+	}
+
+	/*
+	 * It is safest to reset the ring buffer if the module being unloaded
+	 * registered any events.
+	 */
+	if (found)
+		tracing_reset_current_online_cpus();
+	up_write(&trace_event_mutex);
+}
+
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	mutex_lock(&event_mutex);
+	switch (val) {
+	case MODULE_STATE_COMING:
+		trace_module_add_events(mod);
+		break;
+	case MODULE_STATE_GOING:
+		trace_module_remove_events(mod);
+		break;
+	}
+	mutex_unlock(&event_mutex);
 
 
 	return 0;
 	return 0;
 }
 }
+#else
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block trace_module_nb = {
+	.notifier_call = trace_module_notify,
+	.priority = 0,
+};
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
 
 
 static __init int event_trace_init(void)
 static __init int event_trace_init(void)
 {
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *entry;
 	struct dentry *d_events;
 	struct dentry *d_events;
+	int ret;
 
 
 	d_tracer = tracing_init_dentry();
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
 	if (!d_tracer)
@@ -816,13 +1147,243 @@ static __init int event_trace_init(void)
 	if (!d_events)
 	if (!d_events)
 		return 0;
 		return 0;
 
 
-	for_each_event(call) {
+	/* ring buffer internal formats */
+	trace_create_file("header_page", 0444, d_events,
+			  ring_buffer_print_page_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("header_event", 0444, d_events,
+			  ring_buffer_print_entry_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("enable", 0644, d_events,
+			  NULL, &ftrace_system_enable_fops);
+
+	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		/* The linker may leave blanks */
 		if (!call->name)
 		if (!call->name)
 			continue;
 			continue;
-		event_create_dir(call, d_events);
+		list_add(&call->list, &ftrace_events);
+		event_create_dir(call, d_events, &ftrace_event_id_fops,
+				 &ftrace_enable_fops, &ftrace_event_filter_fops,
+				 &ftrace_event_format_fops);
 	}
 	}
 
 
+	ret = register_module_notifier(&trace_module_nb);
+	if (ret)
+		pr_warning("Failed to register trace events module notifier\n");
+
 	return 0;
 	return 0;
 }
 }
 fs_initcall(event_trace_init);
 fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+	spin_lock(&test_spinlock);
+	spin_lock_irq(&test_spinlock_irq);
+	udelay(1);
+	spin_unlock_irq(&test_spinlock_irq);
+	spin_unlock(&test_spinlock);
+
+	mutex_lock(&test_mutex);
+	msleep(1);
+	mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+	void *test_malloc;
+
+	test_malloc = kmalloc(1234, GFP_KERNEL);
+	if (!test_malloc)
+		pr_info("failed to kmalloc\n");
+
+	schedule_on_each_cpu(test_work);
+
+	kfree(test_malloc);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop())
+		schedule();
+
+	return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+	struct task_struct *test_thread;
+
+	test_thread = kthread_run(event_test_thread, NULL, "test-events");
+	msleep(1);
+	kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we test each trace point individually,
+ * then each subsystem as a group, and finally all trace points together.
+ */
+static __init void event_trace_self_tests(void)
+{
+	struct ftrace_event_call *call;
+	struct event_subsystem *system;
+	int ret;
+
+	pr_info("Running tests on trace events:\n");
+
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		/* Only test those that have a regfunc */
+		if (!call->regfunc)
+			continue;
+
+		pr_info("Testing event %s: ", call->name);
+
+		/*
+		 * If an event is already enabled, someone is using
+		 * it and the self test should not be on.
+		 */
+		if (call->enabled) {
+			pr_warning("Enabled event during self test!\n");
+			WARN_ON_ONCE(1);
+			continue;
+		}
+
+		ftrace_event_enable_disable(call, 1);
+		event_test_stuff();
+		ftrace_event_enable_disable(call, 0);
+
+		pr_cont("OK\n");
+	}
+
+	/* Now test at the sub system level */
+
+	pr_info("Running tests on trace event systems:\n");
+
+	list_for_each_entry(system, &event_subsystems, list) {
+
+		/* the ftrace system is special, skip it */
+		if (strcmp(system->name, "ftrace") == 0)
+			continue;
+
+		pr_info("Testing event system %s: ", system->name);
+
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+		if (WARN_ON_ONCE(ret)) {
+			pr_warning("error enabling system %s\n",
+				   system->name);
+			continue;
+		}
+
+		event_test_stuff();
+
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+		if (WARN_ON_ONCE(ret))
+			pr_warning("error disabling system %s\n",
+				   system->name);
+
+		pr_cont("OK\n");
+	}
+
+	/* Test with all events enabled */
+
+	pr_info("Running tests on all trace events:\n");
+	pr_info("Testing all events: ");
+
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error enabling all events\n");
+		return;
+	}
+
+	event_test_stuff();
+
+	/* reset sysname */
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error disabling all events\n");
+		return;
+	}
+
+	pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_entry *entry;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+	int pc;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+	if (disabled != 1)
+		goto out;
+
+	local_save_flags(flags);
+
+	event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+						  flags, pc);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->parent_ip		= parent_ip;
+
+	trace_nowake_buffer_unlock_commit(event, flags, pc);
+
+ out:
+	atomic_dec(&per_cpu(test_event_disable, cpu));
+	ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata  =
+{
+	.func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+	register_ftrace_function(&trace_ops);
+	pr_info("Running tests again, along with the function tracer\n");
+	event_trace_self_tests();
+	unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+
+	event_trace_self_tests();
+
+	event_trace_self_test_with_function();
+
+	return 0;
+}
+
+late_initcall(event_trace_self_tests_init);
+
+#endif

+ 963 - 241
kernel/trace/trace_events_filter.c

@@ -22,119 +22,297 @@
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/ctype.h>
+#include <linux/mutex.h>
 
 
 #include "trace.h"
 #include "trace.h"
 #include "trace_output.h"
 #include "trace_output.h"
 
 
-static int filter_pred_64(struct filter_pred *pred, void *event)
+static DEFINE_MUTEX(filter_mutex);
+
+enum filter_op_ids
+{
+	OP_OR,
+	OP_AND,
+	OP_NE,
+	OP_EQ,
+	OP_LT,
+	OP_LE,
+	OP_GT,
+	OP_GE,
+	OP_NONE,
+	OP_OPEN_PAREN,
+};
+
+struct filter_op {
+	int id;
+	char *string;
+	int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+	{ OP_OR, "||", 1 },
+	{ OP_AND, "&&", 2 },
+	{ OP_NE, "!=", 4 },
+	{ OP_EQ, "==", 4 },
+	{ OP_LT, "<", 5 },
+	{ OP_LE, "<=", 5 },
+	{ OP_GT, ">", 5 },
+	{ OP_GE, ">=", 5 },
+	{ OP_NONE, "OP_NONE", 0 },
+	{ OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+	FILT_ERR_NONE,
+	FILT_ERR_INVALID_OP,
+	FILT_ERR_UNBALANCED_PAREN,
+	FILT_ERR_TOO_MANY_OPERANDS,
+	FILT_ERR_OPERAND_TOO_LONG,
+	FILT_ERR_FIELD_NOT_FOUND,
+	FILT_ERR_ILLEGAL_FIELD_OP,
+	FILT_ERR_ILLEGAL_INTVAL,
+	FILT_ERR_BAD_SUBSYS_FILTER,
+	FILT_ERR_TOO_MANY_PREDS,
+	FILT_ERR_MISSING_FIELD,
+	FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+	"No error",
+	"Invalid operator",
+	"Unbalanced parens",
+	"Too many operands",
+	"Operand too long",
+	"Field not found",
+	"Illegal operation for field type",
+	"Illegal integer value",
+	"Couldn't find or set field in one of a subsystem's events",
+	"Too many terms in predicate expression",
+	"Missing field name and/or value",
+	"Meaningless filter expression",
+};
+
+struct opstack_op {
+	int op;
+	struct list_head list;
+};
+
+struct postfix_elt {
+	int op;
+	char *operand;
+	struct list_head list;
+};
+
+struct filter_parse_state {
+	struct filter_op *ops;
+	struct list_head opstack;
+	struct list_head postfix;
+	int lasterr;
+	int lasterr_pos;
+
+	struct {
+		char *string;
+		unsigned int cnt;
+		unsigned int tail;
+	} infix;
+
+	struct {
+		char string[MAX_FILTER_STR_VAL];
+		int pos;
+		unsigned int tail;
+	} operand;
+};
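To illustrate the machinery above: a filter expression is first converted from infix to a postfix list, which is then compiled into the filter_pred array. With hypothetical field names:

	/*
	 * infix:    pid == 28 && (comm == "bash" || prio < 100)
	 * postfix:  pid 28 == comm "bash" == prio 100 < || &&
	 *
	 * Each comparison becomes one filter_pred with its fn set; each
	 * boolean operator becomes a filter_pred with pop_n == 2, so that
	 * filter_match_preds() can evaluate it with a small value stack.
	 */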
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+			   void *event __attribute((unused)),
+			   int val1, int val2)
+{
+	return val1 && val2;
+}
+
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+			  void *event __attribute((unused)),
+			  int val1, int val2)
+{
+	return val1 || val2;
+}
+
+/* Filter predicate for fixed sized arrays of characters */
+static int filter_pred_string(struct filter_pred *pred, void *event,
+			      int val1, int val2)
 {
 {
-	u64 *addr = (u64 *)(event + pred->offset);
-	u64 val = (u64)pred->val;
-	int match;
+	char *addr = (char *)(event + pred->offset);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
 
 
-	match = (val == *addr) ^ pred->not;
+	match = (!cmp) ^ pred->not;
 
 
 	return match;
 	return match;
 }
 }
 
 
-static int filter_pred_32(struct filter_pred *pred, void *event)
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Each of these strings has a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * To find the string we first read that field, then add its
+ * value to the address of the entry, which gives us the
+ * address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+			      int val1, int val2)
 {
 {
-	u32 *addr = (u32 *)(event + pred->offset);
-	u32 val = (u32)pred->val;
-	int match;
+	int str_loc = *(int *)(event + pred->offset);
+	char *addr = (char *)(event + str_loc);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
 
 
-	match = (val == *addr) ^ pred->not;
+	match = (!cmp) ^ pred->not;
 
 
 	return match;
 	return match;
 }
 }
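A sketch of the record layout this assumes, for a hypothetical event with one dynamic string field:

	/*
	 * struct example_entry {
	 *	struct trace_entry	ent;
	 *	int			file;	<- holds the string's offset
	 *	...
	 *	char			buf[];	<- string data lives in here
	 * };
	 *
	 * str_loc = *(int *)(event + pred->offset) reads that offset, and
	 * event + str_loc is the start of the string to compare against.
	 */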
 
 
-static int filter_pred_16(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+			    int val1, int val2)
+{
+	return 0;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 {
-	u16 *addr = (u16 *)(event + pred->offset);
-	u16 val = (u16)pred->val;
-	int match;
+	struct event_filter *filter = call->filter;
+	int match, top = 0, val1 = 0, val2 = 0;
+	int stack[MAX_FILTER_PRED];
+	struct filter_pred *pred;
+	int i;
+
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
+		if (!pred->pop_n) {
+			match = pred->fn(pred, rec, val1, val2);
+			stack[top++] = match;
+			continue;
+		}
+		if (pred->pop_n > top) {
+			WARN_ON_ONCE(1);
+			return 0;
+		}
+		val1 = stack[--top];
+		val2 = stack[--top];
+		match = pred->fn(pred, rec, val1, val2);
+		stack[top++] = match;
+	}
 
 
-	match = (val == *addr) ^ pred->not;
+	return stack[--top];
+}
+EXPORT_SYMBOL_GPL(filter_match_preds);
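Worked example of the loop above for the hypothetical postfix program "pid == 28", "prio < 100", AND, on a record where only the first comparison holds:

	/*
	 * pred[0]: pid == 28   (pop_n == 0) -> push 1           stack: [1]
	 * pred[1]: prio < 100  (pop_n == 0) -> push 0           stack: [1, 0]
	 * pred[2]: AND         (pop_n == 2) -> pop 0, 1, push 0 stack: [0]
	 *
	 * The value left on the stack (0 here, i.e. discard) is returned.
	 */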
 
 
-	return match;
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
+{
+	ps->lasterr = err;
+	ps->lasterr_pos = pos;
 }
 }
 
 
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static void remove_filter_string(struct event_filter *filter)
 {
 {
-	u8 *addr = (u8 *)(event + pred->offset);
-	u8 val = (u8)pred->val;
-	int match;
+	kfree(filter->filter_string);
+	filter->filter_string = NULL;
+}
 
 
-	match = (val == *addr) ^ pred->not;
+static int replace_filter_string(struct event_filter *filter,
+				 char *filter_string)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+	if (!filter->filter_string)
+		return -ENOMEM;
 
 
-	return match;
+	return 0;
 }
 }
 
 
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int append_filter_string(struct event_filter *filter,
+				char *string)
 {
 {
-	char *addr = (char *)(event + pred->offset);
-	int cmp, match;
+	int newlen;
+	char *new_filter_string;
 
 
-	cmp = strncmp(addr, pred->str_val, pred->str_len);
+	BUG_ON(!filter->filter_string);
+	newlen = strlen(filter->filter_string) + strlen(string) + 1;
+	new_filter_string = kmalloc(newlen, GFP_KERNEL);
+	if (!new_filter_string)
+		return -ENOMEM;
 
 
-	match = (!cmp) ^ pred->not;
+	strcpy(new_filter_string, filter->filter_string);
+	strcat(new_filter_string, string);
+	kfree(filter->filter_string);
+	filter->filter_string = new_filter_string;
 
 
-	return match;
+	return 0;
 }
 }
 
 
-/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+static void append_filter_err(struct filter_parse_state *ps,
+			      struct event_filter *filter)
 {
 {
-	int i, matched, and_failed = 0;
-	struct filter_pred *pred;
+	int pos = ps->lasterr_pos;
+	char *buf, *pbuf;
 
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i]) {
-			pred = call->preds[i];
-			if (and_failed && !pred->or)
-				continue;
-			matched = pred->fn(pred, rec);
-			if (!matched && !pred->or) {
-				and_failed = 1;
-				continue;
-			} else if (matched && pred->or)
-				return 1;
-		} else
-			break;
-	}
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
+		return;
 
 
-	if (and_failed)
-		return 0;
+	append_filter_string(filter, "\n");
+	memset(buf, ' ', PAGE_SIZE);
+	if (pos > PAGE_SIZE - 128)
+		pos = 0;
+	buf[pos] = '^';
+	pbuf = &buf[pos] + 1;
 
 
-	return 1;
+	sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+	append_filter_string(filter, buf);
+	free_page((unsigned long) buf);
 }
 }
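Presumably (this is how the apply_*_filter() paths use the helper) a rejected filter leaves the error appended to the stored filter string, so a later read of the filter file would show something like:

	/*
	 * ((sig >= 10 && sig < 15) || dsig == 17
	 *                                       ^
	 * parse_error: Unbalanced parens
	 */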
 
 
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
 {
-	char *field_name;
-	struct filter_pred *pred;
-	int i;
+	struct event_filter *filter = call->filter;
 
 
-	if (!preds) {
+	mutex_lock(&filter_mutex);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
 		trace_seq_printf(s, "none\n");
 		trace_seq_printf(s, "none\n");
-		return;
-	}
+	mutex_unlock(&filter_mutex);
+}
 
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (preds[i]) {
-			pred = preds[i];
-			field_name = pred->field_name;
-			if (i)
-				trace_seq_printf(s, pred->or ? "|| " : "&& ");
-			trace_seq_printf(s, "%s ", field_name);
-			trace_seq_printf(s, pred->not ? "!= " : "== ");
-			if (pred->str_val)
-				trace_seq_printf(s, "%s\n", pred->str_val);
-			else
-				trace_seq_printf(s, "%llu\n", pred->val);
-		} else
-			break;
-	}
+void print_subsystem_event_filter(struct event_subsystem *system,
+				  struct trace_seq *s)
+{
+	struct event_filter *filter = system->filter;
+
+	mutex_lock(&filter_mutex);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
+	mutex_unlock(&filter_mutex);
 }
 }
 
 
 static struct ftrace_event_field *
 static struct ftrace_event_field *
@@ -150,284 +328,828 @@ find_event_field(struct ftrace_event_call *call, char *name)
 	return NULL;
 	return NULL;
 }
 }
 
 
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
 {
 {
 	if (!pred)
 	if (!pred)
 		return;
 		return;
 
 
 	kfree(pred->field_name);
 	kfree(pred->field_name);
-	kfree(pred->str_val);
 	kfree(pred);
 	kfree(pred);
 }
 }
 
 
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
 {
 {
-	int i;
+	kfree(pred->field_name);
+	pred->field_name = NULL;
+	pred->str_len = 0;
+}
 
 
-	if (call->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
-			filter_free_pred(call->preds[i]);
-		kfree(call->preds);
-		call->preds = NULL;
+static int filter_set_pred(struct filter_pred *dest,
+			   struct filter_pred *src,
+			   filter_pred_fn_t fn)
+{
+	*dest = *src;
+	if (src->field_name) {
+		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+		if (!dest->field_name)
+			return -ENOMEM;
 	}
 	}
+	dest->fn = fn;
+
+	return 0;
 }
 }
 
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_disable_preds(struct ftrace_event_call *call)
 {
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct event_filter *filter = call->filter;
 	int i;
 	int i;
 
 
-	if (system->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
-			filter_free_pred(system->preds[i]);
-		kfree(system->preds);
-		system->preds = NULL;
-	}
+	call->filter_active = 0;
+	filter->n_preds = 0;
 
 
-	events_for_each(call) {
-		if (!call->name || !call->regfunc)
-			continue;
+	for (i = 0; i < MAX_FILTER_PRED; i++)
+		filter->preds[i]->fn = filter_pred_none;
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+	struct event_filter *filter = call->filter;
+	int i;
 
 
-		if (!strcmp(call->system, system->name))
-			filter_free_preds(call);
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
 	}
 	}
+	kfree(filter->preds);
+	kfree(filter);
+	call->filter = NULL;
 }
 }
 
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+int init_preds(struct ftrace_event_call *call)
 {
 {
+	struct event_filter *filter;
+	struct filter_pred *pred;
 	int i;
 	int i;
 
 
-	if (call->preds && !pred->compound)
-		filter_free_preds(call);
+	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!call->filter)
+		return -ENOMEM;
 
 
-	if (!call->preds) {
-		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-				      GFP_KERNEL);
-		if (!call->preds)
-			return -ENOMEM;
-	}
+	call->filter_active = 0;
+	filter->n_preds = 0;
+
+	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!filter->preds)
+		goto oom;
 
 
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!call->preds[i]) {
-			call->preds[i] = pred;
-			return 0;
+		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+		if (!pred)
+			goto oom;
+		pred->fn = filter_pred_none;
+		filter->preds[i] = pred;
+	}
+
+	return 0;
+
+oom:
+	destroy_preds(call);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(init_preds);
+
+static void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	struct event_filter *filter = system->filter;
+	struct ftrace_event_call *call;
+	int i;
+
+	if (filter->n_preds) {
+		for (i = 0; i < filter->n_preds; i++)
+			filter_free_pred(filter->preds[i]);
+		kfree(filter->preds);
+		filter->preds = NULL;
+		filter->n_preds = 0;
+	}
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!call->define_fields)
+			continue;
+
+		if (!strcmp(call->system, system->name)) {
+			filter_disable_preds(call);
+			remove_filter_string(call->filter);
 		}
 	}
+	mutex_unlock(&event_mutex);
+}
+
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+			      struct ftrace_event_call *call,
+			      struct filter_pred *pred,
+			      filter_pred_fn_t fn)
+{
+	struct event_filter *filter = call->filter;
+	int idx, err;
+
+	if (filter->n_preds == MAX_FILTER_PRED) {
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+		return -ENOSPC;
+	}
+
+	idx = filter->n_preds;
+	filter_clear_pred(filter->preds[idx]);
+	err = filter_set_pred(filter->preds[idx], pred, fn);
+	if (err)
+		return err;
 
 
-	return -ENOSPC;
+	filter->n_preds++;
+	call->filter_active = 1;
+
+	return 0;
 }

+enum {
+	FILTER_STATIC_STRING = 1,
+	FILTER_DYN_STRING
+};
+
 static int is_string_field(const char *type)
 {
+	if (strstr(type, "__data_loc") && strstr(type, "char"))
+		return FILTER_DYN_STRING;
+
 	if (strchr(type, '[') && strstr(type, "char"))
-		return 1;
+		return FILTER_STATIC_STRING;
 
 
 	return 0;
 }

-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
 {
-	struct ftrace_event_field *field;
-
-	field = find_event_field(call, pred->field_name);
-	if (!field)
-		return -EINVAL;
+	if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+		return 0;
 
 
-	pred->offset = field->offset;
+	return 1;
+}
 
 
-	if (is_string_field(field->type)) {
-		if (!pred->str_val)
-			return -EINVAL;
-		pred->fn = filter_pred_string;
-		pred->str_len = field->size;
-		return __filter_add_pred(call, pred);
-	} else {
-		if (pred->str_val)
-			return -EINVAL;
-	}
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+					     int field_is_signed)
+{
+	filter_pred_fn_t fn = NULL;
 
 
-	switch (field->size) {
+	switch (field_size) {
 	case 8:
-		pred->fn = filter_pred_64;
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_64;
+		else if (field_is_signed)
+			fn = filter_pred_s64;
+		else
+			fn = filter_pred_u64;
 		break;
 		break;
 	case 4:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_32;
+		else if (field_is_signed)
+			fn = filter_pred_s32;
+		else
+			fn = filter_pred_u32;
 		break;
 		break;
 	case 2:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_16;
+		else if (field_is_signed)
+			fn = filter_pred_s16;
+		else
+			fn = filter_pred_u16;
 		break;
 		break;
 	case 1:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_8;
+		else if (field_is_signed)
+			fn = filter_pred_s8;
+		else
+			fn = filter_pred_u8;
 		break;
 		break;
-	default:
-		return -EINVAL;
 	}

-	return __filter_add_pred(call, pred);
+	return fn;
 }

-static struct filter_pred *copy_pred(struct filter_pred *pred)
+static int filter_add_pred(struct filter_parse_state *ps,
+			   struct ftrace_event_call *call,
+			   struct filter_pred *pred)
 {
-	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
-	if (!new_pred)
-		return NULL;
+	struct ftrace_event_field *field;
+	filter_pred_fn_t fn;
+	unsigned long long val;
+	int string_type;
+
+	pred->fn = filter_pred_none;
+
+	if (pred->op == OP_AND) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+	} else if (pred->op == OP_OR) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+	}
+
+	field = find_event_field(call, pred->field_name);
+	if (!field) {
+		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
+		return -EINVAL;
+	}
 
 
-	memcpy(new_pred, pred, sizeof(*pred));
+	pred->offset = field->offset;
 
 
-	if (pred->field_name) {
-		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-		if (!new_pred->field_name) {
-			kfree(new_pred);
-			return NULL;
-		}
+	if (!is_legal_op(field, pred->op)) {
+		parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+		return -EINVAL;
 	}

-	if (pred->str_val) {
-		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-		if (!new_pred->str_val) {
-			filter_free_pred(new_pred);
-			return NULL;
+	string_type = is_string_field(field->type);
+	if (string_type) {
+		if (string_type == FILTER_STATIC_STRING)
+			fn = filter_pred_string;
+		else
+			fn = filter_pred_strloc;
+		pred->str_len = field->size;
+		if (pred->op == OP_NE)
+			pred->not = 1;
+		return filter_add_pred_fn(ps, call, pred, fn);
+	} else {
+		if (strict_strtoull(pred->str_val, 0, &val)) {
+			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
+			return -EINVAL;
 		}
+		pred->val = val;
+	}
+
+	fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+	if (!fn) {
+		parse_error(ps, FILT_ERR_INVALID_OP, 0);
+		return -EINVAL;
 	}

-	return new_pred;
+	if (pred->op == OP_NE)
+		pred->not = 1;
+
+	return filter_add_pred_fn(ps, call, pred, fn);
 }

-int filter_add_subsystem_pred(struct event_subsystem *system,
-			      struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+				     struct event_subsystem *system,
+				     struct filter_pred *pred,
+				     char *filter_string)
 {
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
-	struct filter_pred *event_pred;
-	int i;
-
-	if (system->preds && !pred->compound)
-		filter_free_subsystem_preds(system);
+	struct event_filter *filter = system->filter;
+	struct ftrace_event_call *call;
+	int err = 0;
 
 
-	if (!system->preds) {
-		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+	if (!filter->preds) {
+		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds)
+
+		if (!filter->preds)
 			return -ENOMEM;
 	}

-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!system->preds[i]) {
-			system->preds[i] = pred;
-			break;
-		}
+	if (filter->n_preds == MAX_FILTER_PRED) {
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+		return -ENOSPC;
 	}
 	}

-		return -ENOSPC;
+	filter->preds[filter->n_preds] = pred;
+	filter->n_preds++;
 
 
-	events_for_each(call) {
-		int err;
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
 
 
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;

 		if (strcmp(call->system, system->name))
 			continue;

-		if (!find_event_field(call, pred->field_name))
-			continue;
+		err = filter_add_pred(ps, call, pred);
+		if (err) {
+			mutex_unlock(&event_mutex);
+			filter_free_subsystem_preds(system);
+			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+			goto out;
+		}
+		replace_filter_string(call->filter, filter_string);
+	}
+	mutex_unlock(&event_mutex);
+out:
+	return err;
+}
 
 
-		event_pred = copy_pred(pred);
-		if (!event_pred)
-			goto oom;
+static void parse_init(struct filter_parse_state *ps,
+		       struct filter_op *ops,
+		       char *infix_string)
+{
+	memset(ps, '\0', sizeof(*ps));
 
 
-		err = filter_add_pred(call, event_pred);
-		if (err)
-			filter_free_pred(event_pred);
-		if (err == -ENOMEM)
-			goto oom;
+	ps->infix.string = infix_string;
+	ps->infix.cnt = strlen(infix_string);
+	ps->ops = ops;
+
+	INIT_LIST_HEAD(&ps->opstack);
+	INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+
+	return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+	if (ps->infix.tail == strlen(ps->infix.string))
+		return 0;
+
+	return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+	ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+				      int a, int b)
+{
+	return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+	int i;
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (ps->ops[i].string[0] == c)
+			return 1;
 	}

 	return 0;
+}

-oom:
-	system->preds[i] = NULL;
-	return -ENOMEM;
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+	char nextc = infix_peek(ps);
+	char opstr[3];
+	int i;
+
+	opstr[0] = firstc;
+	opstr[1] = nextc;
+	opstr[2] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string)) {
+			infix_advance(ps);
+			return ps->ops[i].id;
+		}
+	}
+
+	opstr[1] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string))
+			return ps->ops[i].id;
+	}
+
+	return OP_NONE;
 }
 }

+static inline void clear_operand_string(struct filter_parse_state *ps)
 {
 {
-	int tok_n = 0;
+	memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+	ps->operand.tail = 0;
+}
 
 
-	/* field ==/!= number, or/and field ==/!= number, number */
-	while ((tok = strsep(pbuf, " \n"))) {
-		if (tok_n == 0) {
-			if (!strcmp(tok, "0")) {
-				pred->clear = 1;
-				return 0;
-			} else if (!strcmp(tok, "&&")) {
-				pred->or = 0;
-				pred->compound = 1;
-			} else if (!strcmp(tok, "||")) {
-				pred->or = 1;
-				pred->compound = 1;
-			} else
-				pred->field_name = tok;
-			tok_n = 1;
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+	if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
+		return -EINVAL;
+
+	ps->operand.string[ps->operand.tail++] = c;
+
+	return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+	struct opstack_op *opstack_op;
+
+	opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+	if (!opstack_op)
+		return -ENOMEM;
+
+	opstack_op->op = op;
+	list_add(&opstack_op->list, &ps->opstack);
+
+	return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+	return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+	return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+	int op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+	op = opstack_op->op;
+	list_del(&opstack_op->list);
+
+	kfree(opstack_op);
+
+	return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+	while (!filter_opstack_empty(ps))
+		filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+	return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = OP_NONE;
+	elt->operand = kstrdup(operand, GFP_KERNEL);
+	if (!elt->operand) {
+		kfree(elt);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = op;
+	elt->operand = NULL;
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+	struct postfix_elt *elt;
+
+	while (!list_empty(&ps->postfix)) {
+		elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+		kfree(elt->operand);
+		list_del(&elt->list);
+	}
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+	int in_string = 0;
+	int op, top_op;
+	char ch;
+
+	while ((ch = infix_next(ps))) {
+		if (ch == '"') {
+			in_string ^= 1;
 			continue;
 		}
-		if (tok_n == 1) {
-			if (!pred->field_name)
-				pred->field_name = tok;
-			else if (!strcmp(tok, "!="))
-				pred->not = 1;
-			else if (!strcmp(tok, "=="))
-				pred->not = 0;
-			else {
-				pred->field_name = NULL;
+
+		if (in_string)
+			goto parse_operand;
+
+		if (isspace(ch))
+			continue;
+
+		if (is_op_char(ps, ch)) {
+			op = infix_get_op(ps, ch);
+			if (op == OP_NONE) {
+				parse_error(ps, FILT_ERR_INVALID_OP, 0);
 				return -EINVAL;
 				return -EINVAL;
 			}
+
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			while (!filter_opstack_empty(ps)) {
+				top_op = filter_opstack_top(ps);
+				if (!is_precedence_lower(ps, top_op, op)) {
+					top_op = filter_opstack_pop(ps);
+					postfix_append_op(ps, top_op);
+					continue;
+				}
+				break;
+			}
+
+			filter_opstack_push(ps, op);
 			continue;
 		}
-		if (tok_n == 2) {
-			if (pred->compound) {
-				if (!strcmp(tok, "!="))
-					pred->not = 1;
-				else if (!strcmp(tok, "=="))
-					pred->not = 0;
-				else {
-					pred->field_name = NULL;
-					return -EINVAL;
-				}
-			} else {
-				val_str = tok;
-				break; /* done */
+
+		if (ch == '(') {
+			filter_opstack_push(ps, OP_OPEN_PAREN);
+			continue;
+		}
+
+		if (ch == ')') {
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			top_op = filter_opstack_pop(ps);
+			while (top_op != OP_NONE) {
+				if (top_op == OP_OPEN_PAREN)
+					break;
+				postfix_append_op(ps, top_op);
+				top_op = filter_opstack_pop(ps);
+			}
+			if (top_op == OP_NONE) {
+				parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+				return -EINVAL;
 			}
 			}
-			tok_n = 3;
 			continue;
 		}
-			val_str = tok;
-			break; /* done */
+parse_operand:
+		if (append_operand_char(ps, ch)) {
+			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+			return -EINVAL;
 		}
 	}

-	if (!val_str) {
-		pred->field_name = NULL;
-		return -EINVAL;
+	if (strlen(curr_operand(ps)))
+		postfix_append_operand(ps, curr_operand(ps));
+
+	while (!filter_opstack_empty(ps)) {
+		top_op = filter_opstack_pop(ps);
+		if (top_op == OP_NONE)
+			break;
+		if (top_op == OP_OPEN_PAREN) {
+			parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+			return -EINVAL;
+		}
+		postfix_append_op(ps, top_op);
 	}

-	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-	if (!pred->field_name)
-		return -ENOMEM;
+	return 0;
+}
 
 
-	pred->val = simple_strtoull(val_str, &tmp, 0);
-	if (tmp == val_str) {
-		pred->str_val = kstrdup(val_str, GFP_KERNEL);
-		if (!pred->str_val)
-			return -ENOMEM;
-	} else if (*tmp != '\0')
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->field_name = kstrdup(operand1, GFP_KERNEL);
+	if (!pred->field_name) {
+		kfree(pred);
+		return NULL;
+	}
+
+	strcpy(pred->str_val, operand2);
+	pred->str_len = strlen(operand2);
+
+	pred->op = op;
+
+	return pred;
+}
+
+static struct filter_pred *create_logical_pred(int op)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->op = op;
+
+	return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+	int n_normal_preds = 0, n_logical_preds = 0;
+	struct postfix_elt *elt;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE)
+			continue;
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			n_logical_preds++;
+			continue;
+		}
+		n_normal_preds++;
+	}
+
+	if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+		parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
 		return -EINVAL;
 		return -EINVAL;
 
 
 	return 0;
 	return 0;
 }
 
+static int replace_preds(struct event_subsystem *system,
+			 struct ftrace_event_call *call,
+			 struct filter_parse_state *ps,
+			 char *filter_string)
+{
+	char *operand1 = NULL, *operand2 = NULL;
+	struct filter_pred *pred;
+	struct postfix_elt *elt;
+	int err;
+
+	err = check_preds(ps);
+	if (err)
+		return err;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE) {
+			if (!operand1)
+				operand1 = elt->operand;
+			else if (!operand2)
+				operand2 = elt->operand;
+			else {
+				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			pred = create_logical_pred(elt->op);
+			if (call) {
+				err = filter_add_pred(ps, call, pred);
+				filter_free_pred(pred);
+			} else
+				err = filter_add_subsystem_pred(ps, system,
+							pred, filter_string);
+			if (err)
+				return err;
+
+			operand1 = operand2 = NULL;
+			continue;
+		}
+
+		if (!operand1 || !operand2) {
+			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+			return -EINVAL;
+		}
+
+		pred = create_pred(elt->op, operand1, operand2);
+		if (call) {
+			err = filter_add_pred(ps, call, pred);
+			filter_free_pred(pred);
+		} else
+			err = filter_add_subsystem_pred(ps, system, pred,
+							filter_string);
+		if (err)
+			return err;
+
+		operand1 = operand2 = NULL;
+	}
+
+	return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_disable_preds(call);
+		remove_filter_string(call->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	err = -ENOMEM;
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		goto out_unlock;
+
+	filter_disable_preds(call);
+	replace_filter_string(call->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, call->filter);
+		goto out;
+	}
+
+	err = replace_preds(NULL, call, ps, filter_string);
+	if (err)
+		append_filter_err(ps, call->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+out_unlock:
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
+
+int apply_subsystem_event_filter(struct event_subsystem *system,
+				 char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_free_subsystem_preds(system);
+		remove_filter_string(system->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	err = -ENOMEM;
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		goto out_unlock;
+
+	filter_free_subsystem_preds(system);
+	replace_filter_string(system->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, system->filter);
+		goto out;
+	}
+
+	err = replace_preds(system, NULL, ps, filter_string);
+	if (err)
+		append_filter_err(ps, system->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+out_unlock:
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
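
For readers tracing through the new parser: filter_parse() above is a small shunting-yard pass that turns the infix filter string into a postfix list; replace_preds() then replays that list, turning each operand pair plus comparison operator into a predicate and each && / || into a stack predicate (pop_n = 2). Below is a minimal, standalone user-space sketch of the same infix-to-postfix step. It is not kernel code, and the token names and precedence values are invented for the example.

#include <stdio.h>

enum { T_OPERAND, T_OP, T_LPAREN, T_RPAREN };

struct token {
	int type;
	const char *text;	/* operand text or operator string */
	int prec;		/* operator precedence, higher binds tighter */
};

int main(void)
{
	/* "pid == 0 || (prio != 120 && cpu == 1)", already tokenized */
	struct token in[] = {
		{ T_OPERAND, "pid" }, { T_OP, "==", 4 }, { T_OPERAND, "0" },
		{ T_OP, "||", 1 }, { T_LPAREN, "(" },
		{ T_OPERAND, "prio" }, { T_OP, "!=", 4 }, { T_OPERAND, "120" },
		{ T_OP, "&&", 2 }, { T_OPERAND, "cpu" }, { T_OP, "==", 4 },
		{ T_OPERAND, "1" }, { T_RPAREN, ")" },
	};
	struct token stack[16];
	int top = 0, i;

	for (i = 0; i < (int)(sizeof(in) / sizeof(in[0])); i++) {
		struct token t = in[i];

		if (t.type == T_OPERAND) {
			printf("%s ", t.text);	/* operands go straight out */
		} else if (t.type == T_LPAREN) {
			stack[top++] = t;
		} else if (t.type == T_RPAREN) {
			while (top && stack[top - 1].type != T_LPAREN)
				printf("%s ", stack[--top].text);
			if (top)
				top--;		/* drop the '(' */
		} else {
			/* flush operators of equal or higher precedence */
			while (top && stack[top - 1].type == T_OP &&
			       stack[top - 1].prec >= t.prec)
				printf("%s ", stack[--top].text);
			stack[top++] = t;
		}
	}
	while (top)
		printf("%s ", stack[--top].text);
	printf("\n");	/* pid 0 == prio 120 != cpu 1 == && || */
	return 0;
}

Fed the token stream for "pid == 0 || (prio != 120 && cpu == 1)", it prints "pid 0 == prio 120 != cpu 1 == && ||", which is the postfix shape the predicate stack then evaluates.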
 
 

+ 0 - 39
kernel/trace/trace_events_stage_1.h

@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- *	struct trace_entry		ent;
- *	<type>				<item>;
- *	<type2>				<item2>[<len>];
- *	[...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len)	type	item[len];
-
-#undef __field
-#define __field(type, item)		type	item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
-	struct ftrace_raw_##name {				\
-		struct trace_entry	ent;			\
-		tstruct						\
-	};							\
-	static struct ftrace_event_call event_##name
-
-#include <trace/trace_event_types.h>

+ 0 - 176
kernel/trace/trace_events_stage_2.h

@@ -1,176 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- *	struct trace_seq *s = &iter->seq;
- *	struct ftrace_raw_<call> *field; <-- defined in stage 1
- *	struct trace_entry *entry;
- *	int ret;
- *
- *	entry = iter->ent;
- *
- *	if (entry->type != event_<call>.id) {
- *		WARN_ON_ONCE(1);
- *		return TRACE_TYPE_UNHANDLED;
- *	}
- *
- *	field = (typeof(field))entry;
- *
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
- *	if (!ret)
- *		return TRACE_TYPE_PARTIAL_LINE;
- *
- *	return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-enum print_line_t							\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
-{									\
-	struct trace_seq *s = &iter->seq;				\
-	struct ftrace_raw_##call *field;				\
-	struct trace_entry *entry;					\
-	int ret;							\
-									\
-	entry = iter->ent;						\
-									\
-	if (entry->type != event_##call.id) {				\
-		WARN_ON_ONCE(1);					\
-		return TRACE_TYPE_UNHANDLED;				\
-	}								\
-									\
-	field = (typeof(field))entry;					\
-									\
-	ret = trace_seq_printf(s, #call ": " print);			\
-	if (!ret)							\
-		return TRACE_TYPE_PARTIAL_LINE;				\
-									\
-	return TRACE_TYPE_HANDLED;					\
-}
-	
-#include <trace/trace_event_types.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " offset:%u; size:%u;\n",
- *			       offsetof(struct ftrace_raw_##call, item),
- *			       sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __array
-#define __array(type, item, len)						\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_event_types.h>
-
-#undef __field
-#define __field(type, item)						\
-	ret = trace_define_field(event_call, #type, #item,		\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef __array
-#define __array(type, item, len)					\
-	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-int									\
-ftrace_define_fields_##call(void)					\
-{									\
-	struct ftrace_raw_##call field;					\
-	struct ftrace_event_call *event_call = &event_##call;		\
-	int ret;							\
-									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
-									\
-	tstruct;							\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_event_types.h>

+ 0 - 281
kernel/trace/trace_events_stage_3.h

@@ -1,281 +0,0 @@
-/*
- * Stage 3 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * static void ftrace_event_<call>(proto)
- * {
- *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(void)
- * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to  <call>");
- *	return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *	unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- * }
- *
- *
- * For those macros defined with TRACE_EVENT:
- *
- * static struct ftrace_event_call event_<call>;
- *
- * static void ftrace_raw_event_<call>(proto)
- * {
- *	struct ring_buffer_event *event;
- *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
- *	unsigned long irq_flags;
- *	int pc;
- *
- *	local_save_flags(irq_flags);
- *	pc = preempt_count();
- *
- *	event = trace_current_buffer_lock_reserve(event_<call>.id,
- *				  sizeof(struct ftrace_raw_<call>),
- *				  irq_flags, pc);
- *	if (!event)
- *		return;
- *	entry	= ring_buffer_event_data(event);
- *
- *	<assign>;  <-- Here we assign the entries by the __field and
- *			__array macros.
- *
- *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(void)
- * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_raw_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to <call>");
- *	return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *	unregister_trace_<call>(ftrace_raw_event_<call>);
- * }
- *
- * static struct trace_event ftrace_event_type_<call> = {
- *	.trace			= ftrace_raw_output_<call>, <-- stage 2
- * };
- *
- * static int ftrace_raw_init_event_<call>(void)
- * {
- *	int id;
- *
- *	id = register_ftrace_event(&ftrace_event_type_<call>);
- *	if (!id)
- *		return -ENODEV;
- *	event_<call>.id = id;
- *	return 0;
- * }
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.system			= "<system>",
- *	.raw_init		= ftrace_raw_init_event_<call>,
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- *	.show_format		= ftrace_format_<call>,
- * }
- *
- */
-
-#undef TP_FMT
-#define TP_FMT(fmt, args...)	fmt "\n", ##args
-
-#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args)				\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int);				\
-	perf_tpcounter_event(event_##call.id);				\
-}									\
-									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
-{									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&call->profile_count))			\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
-}									\
-									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
-{									\
-	if (atomic_add_negative(-1, &call->profile_count))		\
-		unregister_trace_##call(ftrace_profile_##call);		\
-}
-
-#define _TRACE_PROFILE_INIT(call)					\
-	.profile_count = ATOMIC_INIT(-1),				\
-	.profile_enable = ftrace_profile_enable_##call,			\
-	.profile_disable = ftrace_profile_disable_##call,
-
-#else
-#define _TRACE_PROFILE(call, proto, args)
-#define _TRACE_PROFILE_INIT(call)
-#endif
-
-#define _TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, #call ": " fmt);			\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static int ftrace_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(NULL);				\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	return 0;							\
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_init_event_##call,		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
-#undef __entry
-#define __entry entry
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static void ftrace_raw_event_##call(proto)				\
-{									\
-	struct ftrace_event_call *call = &event_##call;			\
-	struct ring_buffer_event *event;				\
-	struct ftrace_raw_##call *entry;				\
-	unsigned long irq_flags;					\
-	int pc;								\
-									\
-	local_save_flags(irq_flags);					\
-	pc = preempt_count();						\
-									\
-	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call),	\
-				  irq_flags, pc);			\
-	if (!event)							\
-		return;							\
-	entry	= ring_buffer_event_data(event);			\
-									\
-	assign;								\
-									\
-	if (call->preds && !filter_match_preds(call, entry))		\
-		ring_buffer_event_discard(event);			\
-									\
-	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
-									\
-}									\
-									\
-static int ftrace_raw_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_raw_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_raw_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_raw_event_##call);		\
-}									\
-									\
-static struct trace_event ftrace_event_type_##call = {			\
-	.trace			= ftrace_raw_output_##call,		\
-};									\
-									\
-static int ftrace_raw_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(&ftrace_event_type_##call);		\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	INIT_LIST_HEAD(&event_##call.fields);				\
-	return 0;							\
-}									\
-									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_raw_init_event_##call,		\
-	.regfunc		= ftrace_raw_reg_event_##call,		\
-	.unregfunc		= ftrace_raw_unreg_event_##call,	\
-	.show_format		= ftrace_format_##call,			\
-	.define_fields		= ftrace_define_fields_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
-#include <trace/trace_event_types.h>
-
-#undef _TRACE_PROFILE
-#undef _TRACE_PROFILE_INIT
-
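
The three trace_events_stage_*.h files removed above all worked the same way: re-include the event description header with TRACE_EVENT()/TRACE_FORMAT() redefined for each pass, so one description generates the raw struct, the output/format callbacks and the registration boilerplate; that job now moves into the new common trace headers (include/trace/define_trace.h and friends). A tiny standalone illustration of the multi-pass macro idea, compressed into one file with an invented MY_EVENTS list instead of a re-included header:

#include <stdio.h>

/* The shared "event description"; the kernel keeps this in a header that
 * gets re-included, here it is a plain macro list for brevity. */
#define MY_EVENTS		\
	MY_EVENT(open, 1)	\
	MY_EVENT(close, 2)

/* Pass 1: generate an enum of event ids. */
#define MY_EVENT(name, id) EV_##name = id,
enum { MY_EVENTS };
#undef MY_EVENT

/* Pass 2: generate a name table from the very same description. */
#define MY_EVENT(name, id) [id] = #name,
static const char *ev_names[] = { MY_EVENTS };
#undef MY_EVENT

int main(void)
{
	printf("%d -> %s\n", EV_open, ev_names[EV_open]);
	printf("%d -> %s\n", EV_close, ev_names[EV_close]);
	return 0;
}

Each pass reuses the single description to emit a different artifact, which is the trick the stage headers performed three times over <trace/trace_event_types.h>.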

+ 107 - 3
kernel/trace/trace_export.c

@@ -19,8 +19,12 @@
 #undef TRACE_STRUCT
 #define TRACE_STRUCT(args...) args

+extern void __bad_type_size(void);
+
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)					\
+	if (sizeof(type) != sizeof(field.item))				\
+		__bad_type_size();					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
 
 
 
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\
 	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
 	if (!ret)							\
 		return 0;

+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
 
 
 #undef TP_RAW_FMT
 #define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }

+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct args field;						\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
 #include "trace_event_types.h"
 #include "trace_event_types.h"
 
 
 #undef TRACE_ZERO_CHAR
 #undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;

+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
+
 #undef TP_CMD
 #define TP_CMD(cmd...)	cmd

@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_ENTRY	entry

 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
 	cmd;

 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int ftrace_define_fields_##call(void);					\
+static int ftrace_raw_init_event_##call(void);				\
+									\
+struct ftrace_event_call __used						\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.id			= proto,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
+	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
+};									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
+	return 0;							\
+}									\
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
 									\
-static struct ftrace_event_call __used					\
+struct ftrace_event_call __used						\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.show_format		= ftrace_format_##call,			\
+};
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), is_signed_type(type));	\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), 0);		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), is_signed);	\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct args field;						\
+	int ret;							\
+									\
+	__common_field(unsigned char, type, 0);				\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
 }
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)
+
 #include "trace_event_types.h"

+ 18 - 7
kernel/trace/trace_functions_graph.c

@@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
+	current->ret_stack[index].subtime = 0;
 	*depth = index;

 	return 0;
 }

 /* Retrieve a function return address to the trace stack on thread info.*/
-void
+static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 {
 	int index;
@@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 	trace->calltime = current->ret_stack[index].calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = index;
-	barrier();
-	current->curr_ret_stack--;
-
 }

 /*
@@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void)
 	ftrace_pop_return_trace(&trace, &ret);
 	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
+	barrier();
+	current->curr_ret_stack--;

 	if (unlikely(!ret)) {
 		ftrace_graph_stop();
@@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 	return TRACE_TYPE_HANDLED;
 }

-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
 	/* log10(ULONG_MAX) + '\0' */
@@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_print_graph_duration(duration, s);
+	if (ret != TRACE_TYPE_HANDLED)
+		return ret;
 
 
 	ret = trace_seq_printf(s, "|  ");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;

+	return TRACE_TYPE_HANDLED;
 }

 /* Case of a leaf function on its call entry */

+ 105 - 98
kernel/trace/trace_hw_branches.c

@@ -1,10 +1,9 @@
 /*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  */
-#include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
@@ -15,110 +14,119 @@
 
 
 #include <asm/ds.h>

-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"


-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)

-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);

 #define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())

-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;


-/*
- * Start tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
 {
-	if (this_tracer)
-		ds_release_bts(this_tracer);
-
-	this_tracer =
-		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-			       /* ovfl = */ NULL, /* th = */ (size_t)-1,
-			       BTS_KERNEL);
-	if (IS_ERR(this_tracer)) {
-		this_tracer = NULL;
-		return;
-	}
+	per_cpu(tracer, cpu) =
+		ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
+
+	if (IS_ERR(per_cpu(tracer, cpu)))
+		per_cpu(tracer, cpu) = NULL;
 }
 }

+static int bts_trace_init(struct trace_array *tr)
 {
 {
+	int cpu;
+
+	hw_branch_trace = tr;
+	trace_hw_branches_enabled = 0;
 
 
-	on_each_cpu(bts_trace_start_cpu, NULL, 1);
-	trace_hw_branches_enabled = 1;
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		bts_trace_init_cpu(cpu);
 
 
-	spin_unlock(&bts_tracer_lock);
+		if (likely(per_cpu(tracer, cpu)))
+			trace_hw_branches_enabled = 1;
+	}
+	trace_hw_branches_suspended = 0;
+	put_online_cpus();
+
+	/* If we could not enable tracing on a single cpu, we fail. */
+	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
 }

-/*
- * Stop tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_reset(struct trace_array *tr)
 {
-	if (this_tracer) {
-		ds_release_bts(this_tracer);
-		this_tracer = NULL;
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
 	}
+	trace_hw_branches_enabled = 0;
+	trace_hw_branches_suspended = 0;
+	put_online_cpus();
 }

-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_start(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
 
-	trace_hw_branches_enabled = 0;
-	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
+	trace_hw_branches_suspended = 0;
+	put_online_cpus();
+}
 
 
-	spin_unlock(&bts_tracer_lock);
+static void bts_trace_stop(struct trace_array *tr)
+{
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
+	trace_hw_branches_suspended = 1;
+	put_online_cpus();
 }

 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 				     unsigned long action, void *hcpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-
-	spin_lock(&bts_tracer_lock);
-
-	if (!trace_hw_branches_enabled)
-		goto out;
+	int cpu = (long)hcpu;
 
 
 	switch (action) {
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
+		/* The notification is sent with interrupts enabled. */
+		if (trace_hw_branches_enabled) {
+			bts_trace_init_cpu(cpu);
+
+			if (trace_hw_branches_suspended &&
+			    likely(per_cpu(tracer, cpu)))
+				ds_suspend_bts(per_cpu(tracer, cpu));
+		}
 		break;
+
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
-		break;
+		/* The notification is sent with interrupts enabled. */
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
 	}

- out:
-	spin_unlock(&bts_tracer_lock);
 	return NOTIFY_DONE;
 }

@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };

-static int bts_trace_init(struct trace_array *tr)
-{
-	hw_branch_trace = tr;
-
-	bts_trace_start(tr);
-
-	return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	bts_trace_stop(tr);
-}
-
 static void bts_trace_print_header(struct seq_file *m)
 static void bts_trace_print_header(struct seq_file *m)
 {
 	seq_puts(m, "# CPU#        TO  <-  FROM\n");
 
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 {
+	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *seq = &iter->seq;
 	struct hw_branch_entry *it;
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;

 	trace_assign_type(it, entry);
 
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 
 
 void trace_hw_branch(u64 from, u64 to)
 {
+	struct ftrace_event_call *call = &event_hw_branch;
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);

  out:
 	atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
  */
 static void trace_bts_cpu(void *arg)
 {
-	struct trace_array *tr = (struct trace_array *) arg;
+	struct trace_array *tr = (struct trace_array *)arg;
 	const struct bts_trace *trace;
 	unsigned char *at;

@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
 	if (unlikely(!this_tracer))
 		return;

-	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
-		goto out;
+		return;

 	for (at = trace->ds.top; (void *)at < trace->ds.end;
 	     at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     at += trace->ds.size)
 		trace_bts_at(trace, at);
-
-out:
-	ds_resume_bts(this_tracer);
 }

 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
 
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
+	/*
+	 * We need to collect the trace on the respective cpu since ftrace
+	 * implicitly adds the record for the current cpu.
+	 * Once that is more flexible, we could collect the data from any cpu.
+	 */
 	on_each_cpu(trace_bts_cpu, iter->tr, 1);

-	spin_unlock(&bts_tracer_lock);
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
+	put_online_cpus();
 }

 static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 
 void trace_hw_branch_oops(void)
 {
-	spin_lock(&bts_tracer_lock);
-
-	trace_bts_cpu(hw_branch_trace);
-
-	spin_unlock(&bts_tracer_lock);
+	if (this_tracer) {
+		ds_suspend_bts_noirq(this_tracer);
+		trace_bts_cpu(hw_branch_trace);
+		ds_resume_bts_noirq(this_tracer);
+	}
 }

 struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
 	.start		= bts_trace_start,
 	.stop		= bts_trace_stop,
 	.open		= trace_bts_prepare,
-	.close		= trace_bts_close
+	.close		= trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
 };
 };

 __init static int init_bts_trace(void)
Some files were not shown because too many files changed in this diff