
Merge branch 'tip/tracing/core/devel' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into tracing/ftrace

Conflicts:
	kernel/trace/trace_hw_branches.c
Ingo Molnar 16 years ago
parent
commit
44b0635481

+ 1 - 0
arch/x86/Kconfig

@@ -34,6 +34,7 @@ config X86
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK

+ 8 - 27
arch/x86/kernel/ftrace.c

@@ -82,7 +82,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  * are the same as what exists.
  */
 
-static atomic_t in_nmi = ATOMIC_INIT(0);
+static atomic_t nmi_running = ATOMIC_INIT(0);
 static int mod_code_status;		/* holds return value of text write */
 static int mod_code_write;		/* set when NMI should do the write */
 static void *mod_code_ip;		/* holds the IP to write to */
@@ -115,8 +115,8 @@ static void ftrace_mod_code(void)
 
 void ftrace_nmi_enter(void)
 {
-	atomic_inc(&in_nmi);
-	/* Must have in_nmi seen before reading write flag */
+	atomic_inc(&nmi_running);
+	/* Must have nmi_running seen before reading write flag */
 	smp_mb();
 	if (mod_code_write) {
 		ftrace_mod_code();
@@ -126,19 +126,19 @@ void ftrace_nmi_enter(void)
 
 void ftrace_nmi_exit(void)
 {
-	/* Finish all executions before clearing in_nmi */
+	/* Finish all executions before clearing nmi_running */
 	smp_wmb();
-	atomic_dec(&in_nmi);
+	atomic_dec(&nmi_running);
 }
 
 static void wait_for_nmi(void)
 {
-	if (!atomic_read(&in_nmi))
+	if (!atomic_read(&nmi_running))
 		return;
 
 	do {
 		cpu_relax();
-	} while(atomic_read(&in_nmi));
+	} while (atomic_read(&nmi_running));
 
 	nmi_wait_count++;
 }
@@ -367,25 +367,6 @@ int ftrace_disable_ftrace_graph_caller(void)
 	return ftrace_mod_jmp(ip, old_offset, new_offset);
 }
 
-#else /* CONFIG_DYNAMIC_FTRACE */
-
-/*
- * These functions are picked from those used on
- * this page for dynamic ftrace. They have been
- * simplified to ignore all traces in NMI context.
- */
-static atomic_t in_nmi;
-
-void ftrace_nmi_enter(void)
-{
-	atomic_inc(&in_nmi);
-}
-
-void ftrace_nmi_exit(void)
-{
-	atomic_dec(&in_nmi);
-}
-
 #endif /* !CONFIG_DYNAMIC_FTRACE */
 
 /* Add a function return address to the trace stack on thread info.*/
@@ -475,7 +456,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 				&return_to_handler;
 
 	/* Nmi's are currently unsupported */
-	if (unlikely(atomic_read(&in_nmi)))
+	if (unlikely(in_nmi()))
 		return;
 
 	if (unlikely(atomic_read(&current->tracing_graph_pause)))
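
For context: the renamed nmi_running counter pairs with the modify side of
the protocol, do_ftrace_mod_code() in the same file (not part of these
hunks). A minimal sketch of that writer side, with the barrier comments
compressed; this is a reconstruction, not the verbatim kernel function:

	/*
	 * Sketch: publish the target address and new bytes, raise
	 * mod_code_write so any in-flight NMI performs the write on our
	 * behalf, then wait until no NMI is running before touching the
	 * text ourselves.
	 */
	static int do_ftrace_mod_code(unsigned long ip, void *new_code)
	{
		mod_code_ip = (void *)ip;
		mod_code_newcode = new_code;

		/* Buffers must be visible before NMIs see the write flag */
		smp_wmb();
		mod_code_write = 1;

		/* Flag must be visible before we wait on NMIs */
		smp_mb();
		wait_for_nmi();

		ftrace_mod_code();

		/* The write must complete before clearing the flag */
		smp_wmb();
		mod_code_write = 0;

		/* Make sure no late NMI is still mid-write */
		smp_mb();
		wait_for_nmi();

		return mod_code_status;
	}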

+ 1 - 1
include/linux/ftrace.h

@@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
 #endif
 
 /**
- * ftrace_make_nop - convert code into top
+ * ftrace_make_nop - convert code into nop
  * @mod: module structure if called by module load initialization
  * @rec: the mcount call site record
  * @addr: the address that the call site should be calling

+ 1 - 1
include/linux/ftrace_irq.h

@@ -2,7 +2,7 @@
 #define _LINUX_FTRACE_IRQ_H
 
 
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+#ifdef CONFIG_FTRACE_NMI_ENTER
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
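
The hunk's context cuts off at the #else; the branch below it keeps the
empty inline stubs, so the net effect of the new guard is roughly:

	/* Sketch of the whole guard in <linux/ftrace_irq.h> after this change */
	#ifdef CONFIG_FTRACE_NMI_ENTER
	extern void ftrace_nmi_enter(void);
	extern void ftrace_nmi_exit(void);
	#else
	static inline void ftrace_nmi_enter(void) { }
	static inline void ftrace_nmi_exit(void) { }
	#endif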

+ 15 - 0
include/linux/hardirq.h

@@ -61,6 +61,12 @@
 #error PREEMPT_ACTIVE is too low!
 #endif
 
+#define NMI_OFFSET	(PREEMPT_ACTIVE << 1)
+
+#if NMI_OFFSET >= 0x80000000
+#error PREEMPT_ACTIVE too high!
+#endif
+
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
 #define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
@@ -73,6 +79,11 @@
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 
+/*
+ * Are we in NMI context?
+ */
+#define in_nmi()	(preempt_count() & NMI_OFFSET)
+
 #if defined(CONFIG_PREEMPT)
 # define PREEMPT_INATOMIC_BASE kernel_locked()
 # define PREEMPT_CHECK_OFFSET 1
@@ -167,6 +178,8 @@ extern void irq_exit(void);
 #define nmi_enter()				\
 	do {					\
 		ftrace_nmi_enter();		\
+		BUG_ON(in_nmi());		\
+		add_preempt_count(NMI_OFFSET);	\
 		lockdep_off();			\
 		rcu_nmi_enter();		\
 		__irq_enter();			\
@@ -177,6 +190,8 @@ extern void irq_exit(void);
 		__irq_exit();			\
 		rcu_nmi_exit();			\
 		lockdep_on();			\
+		BUG_ON(!in_nmi());		\
+		sub_preempt_count(NMI_OFFSET);	\
 		ftrace_nmi_exit();		\
 	} while (0)
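
With NMI_OFFSET folded into preempt_count by nmi_enter()/nmi_exit(),
being in NMI context becomes queryable just like hardirq or softirq
context. A hypothetical caller (the function names are invented for
illustration):

	/* Pick a context-safe path before touching shared state */
	static void my_record_event(void)
	{
		if (in_nmi()) {
			/* no sleeping, no spinning on contended locks */
			record_lockless();
			return;
		}
		record_locked();
	}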
 

+ 9 - 0
include/linux/ring_buffer.h

@@ -121,9 +121,18 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 u64 ring_buffer_time_stamp(int cpu);
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
+/*
+ * The below functions are fine to use outside the tracing facility.
+ */
+#ifdef CONFIG_RING_BUFFER
 void tracing_on(void);
 void tracing_off(void);
 void tracing_off_permanent(void);
+#else
+static inline void tracing_on(void) { }
+static inline void tracing_off(void) { }
+static inline void tracing_off_permanent(void) { }
+#endif
 
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
 void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
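
The stubs let code outside the tracing facility call these functions
unconditionally, compiling to nothing when CONFIG_RING_BUFFER is off. A
hypothetical debugging use (the condition is invented for illustration):

	/* Freeze the trace the moment a suspect state is seen, so the
	 * events leading up to it survive in the buffer for inspection. */
	if (unlikely(suspect_condition)) {
		tracing_off();
		WARN_ON(1);
	}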

+ 8 - 0
kernel/trace/Kconfig

@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
 	bool
 
+config HAVE_FTRACE_NMI_ENTER
+	bool
+
 config HAVE_FUNCTION_TRACER
 	bool
 
@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
 config RING_BUFFER
 	bool
 
+config FTRACE_NMI_ENTER
+       bool
+       depends on HAVE_FTRACE_NMI_ENTER
+       default y
+
 config TRACING
 	bool
 	select DEBUG_FS

+ 3 - 3
kernel/trace/ftrace.c

@@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	 * it is not enabled then do nothing.
 	 *
 	 * If this record is not to be traced and
-	 * it is enabled then disabled it.
+	 * it is enabled then disable it.
 	 *
 	 */
 	if (rec->flags & FTRACE_FL_NOTRACE) {
@@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 		if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
 			return 0;
 
-		/* Record is not filtered and is not enabled do nothing */
+		/* Record is not filtered or enabled, do nothing */
 		if (!fl)
 			return 0;
 
@@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 		} else {
 
-			/* if record is not enabled do nothing */
+			/* if record is not enabled, do nothing */
 			if (!(rec->flags & FTRACE_FL_ENABLED))
 				return 0;
 

+ 29 - 2
kernel/trace/ring_buffer.c

@@ -4,9 +4,11 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
+#include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -982,6 +984,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 	unsigned long flags;
+	bool lock_taken = false;
 
 	commit_page = cpu_buffer->commit_page;
 	/* we just need to protect against interrupts */
@@ -995,7 +998,30 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		struct buffer_page *next_page = tail_page;
 
 		local_irq_save(flags);
-		__raw_spin_lock(&cpu_buffer->lock);
+		/*
+		 * Since the write to the buffer is still not
+		 * fully lockless, we must be careful with NMIs.
+		 * The locks in the writers are taken when a write
+		 * crosses to a new page. The locks protect against
+		 * races with the readers (this will soon be fixed
+		 * with a lockless solution).
+		 *
+		 * Because we can not protect against NMIs, and we
+		 * want to keep traces reentrant, we need to manage
+		 * what happens when we are in an NMI.
+		 *
+		 * NMIs can happen after we take the lock.
+		 * If we are in an NMI, only take the lock
+		 * if it is not already taken. Otherwise
+		 * simply fail.
+		 */
+		if (unlikely(in_nmi())) {
+			if (!__raw_spin_trylock(&cpu_buffer->lock))
+				goto out_unlock;
+		} else
+			__raw_spin_lock(&cpu_buffer->lock);
+
+		lock_taken = true;
 
 		rb_inc_page(cpu_buffer, &next_page);
 
@@ -1097,7 +1123,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
 
-	__raw_spin_unlock(&cpu_buffer->lock);
+	if (likely(lock_taken))
+		__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
 }
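
The rule the new comment describes is general: an NMI may have interrupted
this very CPU while it held cpu_buffer->lock, so spinning in the NMI would
deadlock against ourselves. Trylock-or-drop is the only safe shape. A
minimal sketch of the idiom outside the ring buffer, with a hypothetical
lock:

	/* NMI-safe lock acquisition: never spin in NMI context */
	if (in_nmi()) {
		if (!__raw_spin_trylock(&my_lock))
			return;		/* drop the event, do not wait */
	} else {
		__raw_spin_lock(&my_lock);
	}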

+ 1 - 1
kernel/trace/trace.c

@@ -1519,7 +1519,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
 		SEQ_PUT_FIELD_RET(s, entry->pid);
-		SEQ_PUT_FIELD_RET(s, entry->cpu);
+		SEQ_PUT_FIELD_RET(s, iter->cpu);
 		SEQ_PUT_FIELD_RET(s, iter->ts);
 	}
 

+ 3 - 4
kernel/trace/trace.h

@@ -45,7 +45,6 @@ enum trace_type {
  */
 struct trace_entry {
 	unsigned char		type;
-	unsigned char		cpu;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
@@ -625,12 +624,12 @@ extern struct tracer nop_trace;
  * preempt_enable (after a disable), a schedule might take place
  * causing an infinite recursion.
  *
- * To prevent this, we read the need_recshed flag before
+ * To prevent this, we read the need_resched flag before
  * disabling preemption. When we want to enable preemption we
  * check the flag, if it is set, then we call preempt_enable_no_resched.
  * Otherwise, we call preempt_enable.
  *
- * The rational for doing the above is that if need resched is set
+ * The rational for doing the above is that if need_resched is set
  * and we have yet to reschedule, we are either in an atomic location
  * (where we do not need to check for scheduling) or we are inside
  * the scheduler and do not want to resched.
@@ -651,7 +650,7 @@ static inline int ftrace_preempt_disable(void)
  *
  * This is a scheduler safe way to enable preemption and not miss
  * any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we were either inside an atomic or
+ * If resched is set, then we are either inside an atomic or
  * are inside the scheduler (we would have already scheduled
  * otherwise). In this case, we do not want to call normal
  * preempt_enable, but preempt_enable_no_resched instead.
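
A typical call site for the pair described above looks roughly like this
(sketch; the callback name and arguments are invented):

	static void my_trace_callback(unsigned long ip, unsigned long pip)
	{
		int resched;

		/* returns whether need_resched was already set */
		resched = ftrace_preempt_disable();

		/* ... record the event; safe in any context ... */

		/* skips the resched check if one was already pending */
		ftrace_preempt_enable(resched);
	}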

+ 3 - 2
kernel/trace/trace_hw_branches.c

@@ -158,7 +158,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	trace_assign_type(it, entry);
 
 	if (entry->type == TRACE_HW_BRANCHES) {
-		if (trace_seq_printf(seq, "%4d  ", entry->cpu) &&
+		if (trace_seq_printf(seq, "%4d  ", iter->cpu) &&
 		    seq_print_ip_sym(seq, it->to, symflags) &&
 		    trace_seq_printf(seq, "\t  <-  ") &&
 		    seq_print_ip_sym(seq, it->from, symflags) &&
@@ -193,7 +193,8 @@ void trace_hw_branch(u64 from, u64 to)
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	entry->ent.cpu = cpu;
+	tracing_generic_entry_update(&entry->ent, 0, from);
+	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
 	trace_buffer_unlock_commit(tr, event, 0, 0);
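
tracing_generic_entry_update() fills the common fields that the open-coded
cpu assignment left uninitialized before; the per-event cpu itself now
comes from the iterator rather than trace_entry. Roughly (a simplified
sketch of the helper in kernel/trace/trace.c, with the irqflags handling
elided):

	void tracing_generic_entry_update(struct trace_entry *entry,
					  unsigned long flags, int pc)
	{
		struct task_struct *tsk = current;

		entry->preempt_count = pc & 0xff;
		entry->pid	     = tsk ? tsk->pid : 0;
		entry->flags	     =
			((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
			((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0);
	}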

+ 3 - 3
kernel/trace/trace_output.c

@@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter)
 	unsigned long secs = (unsigned long)t;
 
 	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
-				comm, entry->pid, entry->cpu, secs, usec_rem);
+				comm, entry->pid, iter->cpu, secs, usec_rem);
 }
 
 int trace_print_lat_context(struct trace_iterator *iter)
@@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 		char *comm = trace_find_cmdline(entry->pid);
 		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
-				       entry->pid, entry->cpu, entry->flags,
+				       entry->pid, iter->cpu, entry->flags,
 				       entry->preempt_count, iter->idx,
 				       ns2usecs(iter->ts),
 				       abs_usecs / USEC_PER_MSEC,
@@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 				       rel_usecs / USEC_PER_MSEC,
 				       rel_usecs % USEC_PER_MSEC);
 	} else {
-		ret = lat_print_generic(s, entry, entry->cpu);
+		ret = lat_print_generic(s, entry, iter->cpu);
 		if (ret)
 			ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}