
Merge branch 'tracing/core-v3' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing into tracing/urgent

Ingo Molnar, 16 years ago
parent commit be4bdbfbae

include/linux/ftrace_event.h  (+8 -2)

@@ -4,6 +4,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/trace_seq.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 struct trace_array;
 struct tracer;
@@ -130,10 +131,15 @@ struct ftrace_event_call {
 	void			*data;
 
 	atomic_t		profile_count;
-	int			(*profile_enable)(struct ftrace_event_call *);
-	void			(*profile_disable)(struct ftrace_event_call *);
+	int			(*profile_enable)(void);
+	void			(*profile_disable)(void);
 };
 
+#define FTRACE_MAX_PROFILE_SIZE	2048
+
+extern char			*trace_profile_buf;
+extern char			*trace_profile_buf_nmi;
+
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 

include/linux/syscalls.h  (+8 -16)

@@ -100,33 +100,25 @@ struct perf_counter_attr;
 
 #ifdef CONFIG_EVENT_PROFILE
 #define TRACE_SYS_ENTER_PROFILE(sname)					       \
-static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call)  \
+static int prof_sysenter_enable_##sname(void)				       \
 {									       \
-	int ret = 0;							       \
-	if (!atomic_inc_return(&event_enter_##sname.profile_count))	       \
-		ret = reg_prof_syscall_enter("sys"#sname);		       \
-	return ret;							       \
+	return reg_prof_syscall_enter("sys"#sname);			       \
 }									       \
 									       \
-static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
+static void prof_sysenter_disable_##sname(void)				       \
 {									       \
-	if (atomic_add_negative(-1, &event_enter_##sname.profile_count))       \
-		unreg_prof_syscall_enter("sys"#sname);			       \
+	unreg_prof_syscall_enter("sys"#sname);				       \
 }
 
 #define TRACE_SYS_EXIT_PROFILE(sname)					       \
-static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call)   \
+static int prof_sysexit_enable_##sname(void)				       \
 {									       \
-	int ret = 0;							       \
-	if (!atomic_inc_return(&event_exit_##sname.profile_count))	       \
-		ret = reg_prof_syscall_exit("sys"#sname);		       \
-	return ret;							       \
+	return reg_prof_syscall_exit("sys"#sname);			       \
 }									       \
 									       \
-static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
+static void prof_sysexit_disable_##sname(void)				       \
 {                                                                              \
-	if (atomic_add_negative(-1, &event_exit_##sname.profile_count))	       \
-		unreg_prof_syscall_exit("sys"#sname);			       \
+	unreg_prof_syscall_exit("sys"#sname);				       \
 }
 
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)				       \
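
With the per-event refcounting gone, the generated per-syscall helpers become
pure registration shims. For illustration (SYSCALL_DEFINEx passes names with
a leading underscore, so sname is e.g. _read), TRACE_SYS_ENTER_PROFILE(_read)
would now expand roughly to:

static int prof_sysenter_enable__read(void)
{
	return reg_prof_syscall_enter("sys_read");
}

static void prof_sysenter_disable__read(void)
{
	unreg_prof_syscall_enter("sys_read");
}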

include/trace/ftrace.h  (+63 -48)

@@ -382,20 +382,14 @@ static inline int ftrace_get_offsets_##call(				\
  *
  * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
- * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
+ * static int ftrace_profile_enable_<call>(void)
  * {
- * 	int ret = 0;
- *
- * 	if (!atomic_inc_return(&event_call->profile_count))
- * 		ret = register_trace_<call>(ftrace_profile_<call>);
- *
- * 	return ret;
+ * 	return register_trace_<call>(ftrace_profile_<call>);
  * }
  *
- * static void ftrace_profile_disable_<call>(struct ftrace_event_call *event_call)
+ * static void ftrace_profile_disable_<call>(void)
  * {
- * 	if (atomic_add_negative(-1, &event->call->profile_count))
- * 		unregister_trace_<call>(ftrace_profile_<call>);
+ * 	unregister_trace_<call>(ftrace_profile_<call>);
  * }
  *
  */
@@ -405,20 +399,14 @@ static inline int ftrace_get_offsets_##call(				\
 									\
 static void ftrace_profile_##call(proto);				\
 									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+static int ftrace_profile_enable_##call(void)				\
 {									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&event_call->profile_count))		\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
+	return register_trace_##call(ftrace_profile_##call);		\
 }									\
 									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+static void ftrace_profile_disable_##call(void)				\
 {									\
-	if (atomic_add_negative(-1, &event_call->profile_count))	\
-		unregister_trace_##call(ftrace_profile_##call);		\
+	unregister_trace_##call(ftrace_profile_##call);			\
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -660,11 +648,12 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	struct ftrace_raw_##call *entry;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
+ *	struct trace_entry *ent;
  *	int __entry_size;
  *	int __data_size;
+ *	int __cpu
  *	int pc;
  *
- *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
  *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
@@ -675,25 +664,34 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *			     sizeof(u64));
  *	__entry_size -= sizeof(u32);
  *
- *	do {
- *		char raw_data[__entry_size]; <- allocate our sample in the stack
- *		struct trace_entry *ent;
+ *	// Protect the non-NMI buffer
+ *	// This also protects the RCU read side
+ *	local_irq_save(irq_flags);
+ *	__cpu = smp_processor_id();
+ *
+ *	if (in_nmi())
+ *		raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *	else
+ *		raw_data = rcu_dereference(trace_profile_buf);
+ *
+ *	if (!raw_data)
+ *		goto end;
  *
- *		zero dead bytes from alignment to avoid stack leak to userspace:
+ *	raw_data = per_cpu_ptr(raw_data, __cpu);
  *
- *		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
- *		entry = (struct ftrace_raw_<call> *)raw_data;
- *		ent = &entry->ent;
- *		tracing_generic_entry_update(ent, irq_flags, pc);
- *		ent->type = event_call->id;
+ *	// zero dead bytes from alignment to avoid stack leak to userspace:
+ *	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
+ *	entry = (struct ftrace_raw_<call> *)raw_data;
+ *	ent = &entry->ent;
+ *	tracing_generic_entry_update(ent, irq_flags, pc);
+ *	ent->type = event_call->id;
  *
- *		<tstruct> <- do some jobs with dynamic arrays
+ *	<tstruct> <- handle the dynamic arrays
  *
- *		<assign>  <- affect our values
+ *	<assign>  <- assign our values
  *
- *		perf_tpcounter_event(event_call->id, __addr, __count, entry,
- *			     __entry_size);  <- submit them to perf counter
- *	} while (0);
+ *	perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *		     __entry_size);  <- submit them to perf counter
  *
  * }
  */
@@ -716,11 +714,13 @@ static void ftrace_profile_##call(proto)				\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
+	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *raw_data;							\
+	int __cpu;							\
 	int pc;								\
 									\
-	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
@@ -728,23 +728,38 @@ static void ftrace_profile_##call(proto)				\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	do {								\
-		char raw_data[__entry_size];				\
-		struct trace_entry *ent;				\
+	if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE,		\
+		      "profile buffer not large enough"))		\
+		return;							\
+									\
+	local_irq_save(irq_flags);					\
+	__cpu = smp_processor_id();					\
 									\
-		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;	\
-		entry = (struct ftrace_raw_##call *)raw_data;		\
-		ent = &entry->ent;					\
-		tracing_generic_entry_update(ent, irq_flags, pc);	\
-		ent->type = event_call->id;				\
+	if (in_nmi())							\
+		raw_data = rcu_dereference(trace_profile_buf_nmi);	\
+	else								\
+		raw_data = rcu_dereference(trace_profile_buf);		\
 									\
-		tstruct							\
+	if (!raw_data)							\
+		goto end;						\
 									\
-		{ assign; }						\
+	raw_data = per_cpu_ptr(raw_data, __cpu);			\
 									\
-		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
+	entry = (struct ftrace_raw_##call *)raw_data;			\
+	ent = &entry->ent;						\
+	tracing_generic_entry_update(ent, irq_flags, pc);		\
+	ent->type = event_call->id;					\
+									\
+	tstruct								\
+									\
+	{ assign; }							\
+									\
+	perf_tpcounter_event(event_call->id, __addr, __count, entry,	\
 			     __entry_size);				\
-	} while (0);							\
+									\
+end:									\
+	local_irq_restore(irq_flags);					\
 									\
 }
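
Stripped of the macro plumbing, every probe now follows the same
buffer-borrowing protocol, sketched below (my_probe and the record-building
step are placeholders; the globals are the ones declared in ftrace_event.h).
Disabling interrupts pins the task to the CPU whose buffer slot it borrows
and doubles as the RCU-sched read side that the disable path waits out with
synchronize_sched(); NMIs cannot be masked, hence the second buffer:

static void my_probe(void)
{
	unsigned long flags;
	char *raw_data;
	int cpu;

	/* irqs off: stay on this cpu and open the rcu-sched read side */
	local_irq_save(flags);
	cpu = smp_processor_id();

	/* an NMI can still fire in here, so it gets its own buffer */
	if (in_nmi())
		raw_data = rcu_dereference(trace_profile_buf_nmi);
	else
		raw_data = rcu_dereference(trace_profile_buf);

	/* NULL means profiling was just disabled and the buffers are gone */
	if (!raw_data)
		goto end;

	raw_data = per_cpu_ptr(raw_data, cpu);

	/* ... build the record in raw_data and hand it to perf ... */
end:
	local_irq_restore(flags);
}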
 

kernel/trace/trace_event_profile.c  (+77 -2)

@@ -8,6 +8,54 @@
 #include <linux/module.h>
 #include "trace.h"
 
+/*
+ * alloc_percpu() takes a type rather than a size, so define a
+ * dummy type that matches the desired buffer size.
+ */
+typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
+
+char		*trace_profile_buf;
+char		*trace_profile_buf_nmi;
+
+/* Count the events in use (per event id, not per instance) */
+static int	total_profile_count;
+
+static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+{
+	char *buf;
+	int ret = -ENOMEM;
+
+	if (atomic_inc_return(&event->profile_count))
+		return 0;
+
+	if (!total_profile_count++) {
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf;
+
+		rcu_assign_pointer(trace_profile_buf, buf);
+
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf_nmi;
+
+		rcu_assign_pointer(trace_profile_buf_nmi, buf);
+	}
+
+	ret = event->profile_enable();
+	if (!ret)
+		return 0;
+
+	kfree(trace_profile_buf_nmi);
+fail_buf_nmi:
+	kfree(trace_profile_buf);
+fail_buf:
+	total_profile_count--;
+	atomic_dec(&event->profile_count);
+
+	return ret;
+}
+
 int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
@@ -17,7 +65,7 @@ int ftrace_profile_enable(int event_id)
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id && event->profile_enable &&
 		    try_module_get(event->mod)) {
-			ret = event->profile_enable(event);
+			ret = ftrace_profile_enable_event(event);
 			break;
 		}
 	}
@@ -26,6 +74,33 @@ int ftrace_profile_enable(int event_id)
 	return ret;
 }
 
+static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+{
+	char *buf, *nmi_buf;
+
+	if (!atomic_add_negative(-1, &event->profile_count))
+		return;
+
+	event->profile_disable();
+
+	if (!--total_profile_count) {
+		buf = trace_profile_buf;
+		rcu_assign_pointer(trace_profile_buf, NULL);
+
+		nmi_buf = trace_profile_buf_nmi;
+		rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+
+		/*
+	 * Ensure all events currently profiling have finished
+	 * before releasing the buffers.
+		 */
+		synchronize_sched();
+
+		free_percpu(buf);
+		free_percpu(nmi_buf);
+	}
+}
+
 void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
@@ -33,7 +108,7 @@ void ftrace_profile_disable(int event_id)
 	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id) {
-			event->profile_disable(event);
+			ftrace_profile_disable_event(event);
 			module_put(event->mod);
 			break;
 		}
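
The first event to enable profiling allocates both per-cpu buffers; the last
one to leave unpublishes the pointers, waits for in-flight probes with
synchronize_sched(), then frees. (One wrinkle worth flagging: the failure
path in ftrace_profile_enable_event() frees the percpu allocations with
kfree(), where free_percpu() would be the API matching alloc_percpu().) A
compressed userspace sketch of the counting, with the RCU machinery elided:

#include <stdio.h>
#include <stdlib.h>

static char *profile_buf;		/* stands in for both per-cpu buffers */
static int total_profile_count;		/* events currently being profiled */

static int profile_enable_event(void)
{
	if (!total_profile_count++) {		/* first user allocates */
		profile_buf = malloc(2048);
		if (!profile_buf) {
			total_profile_count--;
			return -1;
		}
	}
	return 0;
}

static void profile_disable_event(void)
{
	if (!--total_profile_count) {		/* last user frees */
		char *buf = profile_buf;

		profile_buf = NULL;
		/* the kernel runs synchronize_sched() before freeing */
		free(buf);
	}
}

int main(void)
{
	profile_enable_event();		/* allocates */
	profile_enable_event();		/* only bumps the count */
	profile_disable_event();	/* one user left, buffer stays */
	profile_disable_event();	/* frees */
	printf("count=%d buf=%p\n", total_profile_count, (void *)profile_buf);
	return 0;
}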

kernel/trace/trace_syscalls.c  (+78 -19)

@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
 
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
-	struct syscall_trace_enter *rec;
 	struct syscall_metadata *sys_data;
+	struct syscall_trace_enter *rec;
+	unsigned long flags;
+	char *raw_data;
 	int syscall_nr;
 	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	do {
-		char raw_data[size];
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		      "profile buffer not large enough"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
 
-		/* zero the dead bytes from align to not leak stack to user */
-		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
 
-		rec = (struct syscall_trace_enter *) raw_data;
-		tracing_generic_entry_update(&rec->ent, 0, 0);
-		rec->ent.type = sys_data->enter_id;
-		rec->nr = syscall_nr;
-		syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-				       (unsigned long *)&rec->args);
-		perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
-	} while(0);
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_enter *) raw_data;
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->enter_id;
+	rec->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			       (unsigned long *)&rec->args);
+	perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
 static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_exit rec;
+	struct syscall_trace_exit *rec;
+	unsigned long flags;
 	int syscall_nr;
+	char *raw_data;
+	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	tracing_generic_entry_update(&rec.ent, 0, 0);
-	rec.ent.type = sys_data->exit_id;
-	rec.nr = syscall_nr;
-	rec.ret = syscall_get_return_value(current, regs);
+	/* We can probably do that at build time */
+	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
 
-	perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+	/*
+	 * Impossible today, but be paranoid about the future:
+	 * how could this check be moved out of runtime?
+	 */
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		"exit event has grown above profile buffer size"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_exit *)raw_data;
+
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->exit_id;
+	rec->nr = syscall_nr;
+	rec->ret = syscall_get_return_value(current, regs);
+
+	perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_exit(char *name)
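
The exit record has a fixed layout, so the question raised in the comment
above ("how could this check be moved out of runtime?") has a natural
answer. A hedged sketch, not part of this patch, of a compile-time variant
that could sit at the top of prof_syscall_exit():

	BUILD_BUG_ON(ALIGN(sizeof(struct syscall_trace_exit) + sizeof(u32),
			   sizeof(u64)) - sizeof(u32) >
		     FTRACE_MAX_PROFILE_SIZE);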