
Merge branch 'perfcounters-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perfcounters-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (574 commits)
  perf_counter: Turn off by default
  perf_counter: Add counter->id to the throttle event
  perf_counter: Better align code
  perf_counter: Rename L2 to LL cache
  perf_counter: Standardize event names
  perf_counter: Rename enums
  perf_counter tools: Clean up u64 usage
  perf_counter: Rename perf_counter_limit sysctl
  perf_counter: More paranoia settings
  perf_counter: powerpc: Implement generalized cache events for POWER processors
  perf_counters: powerpc: Add support for POWER7 processors
  perf_counter: Accurate period data
  perf_counter: Introduce struct for sample data
  perf_counter tools: Normalize data using per sample period data
  perf_counter: Annotate exit ctx recursion
  perf_counter tools: Propagate signals properly
  perf_counter tools: Small frequency related fixes
  perf_counter: More aggressive frequency adjustment
  perf_counter/x86: Fix the model number of Intel Core2 processors
  perf_counter, x86: Correct some event and umask values for Intel processors
  ...
Linus Torvalds 16 years ago
parent
commit
8a1ca8cedd
100 files changed with 19689 additions and 85 deletions
  1. MAINTAINERS (+10 -0)
  2. arch/powerpc/include/asm/hw_irq.h (+39 -0)
  3. arch/powerpc/include/asm/paca.h (+1 -0)
  4. arch/powerpc/include/asm/perf_counter.h (+98 -0)
  5. arch/powerpc/include/asm/reg.h (+2 -0)
  6. arch/powerpc/include/asm/systbl.h (+1 -1)
  7. arch/powerpc/include/asm/unistd.h (+1 -0)
  8. arch/powerpc/kernel/Makefile (+3 -0)
  9. arch/powerpc/kernel/asm-offsets.c (+1 -0)
  10. arch/powerpc/kernel/entry_64.S (+9 -0)
  11. arch/powerpc/kernel/irq.c (+5 -0)
  12. arch/powerpc/kernel/perf_counter.c (+1263 -0)
  13. arch/powerpc/kernel/power4-pmu.c (+598 -0)
  14. arch/powerpc/kernel/power5+-pmu.c (+671 -0)
  15. arch/powerpc/kernel/power5-pmu.c (+611 -0)
  16. arch/powerpc/kernel/power6-pmu.c (+532 -0)
  17. arch/powerpc/kernel/power7-pmu.c (+357 -0)
  18. arch/powerpc/kernel/ppc970-pmu.c (+482 -0)
  19. arch/powerpc/mm/fault.c (+9 -1)
  20. arch/powerpc/platforms/Kconfig.cputype (+1 -0)
  21. arch/x86/Kconfig (+1 -0)
  22. arch/x86/ia32/ia32entry.S (+2 -1)
  23. arch/x86/include/asm/atomic_32.h (+236 -0)
  24. arch/x86/include/asm/entry_arch.h (+1 -1)
  25. arch/x86/include/asm/hardirq.h (+2 -0)
  26. arch/x86/include/asm/hw_irq.h (+2 -0)
  27. arch/x86/include/asm/intel_arch_perfmon.h (+0 -31)
  28. arch/x86/include/asm/irq_vectors.h (+4 -4)
  29. arch/x86/include/asm/perf_counter.h (+100 -0)
  30. arch/x86/include/asm/unistd_32.h (+1 -0)
  31. arch/x86/include/asm/unistd_64.h (+2 -1)
  32. arch/x86/kernel/apic/apic.c (+3 -0)
  33. arch/x86/kernel/cpu/Makefile (+7 -5)
  34. arch/x86/kernel/cpu/common.c (+2 -0)
  35. arch/x86/kernel/cpu/perf_counter.c (+1704 -0)
  36. arch/x86/kernel/cpu/perfctr-watchdog.c (+2 -2)
  37. arch/x86/kernel/entry_64.S (+5 -0)
  38. arch/x86/kernel/irq.c (+10 -0)
  39. arch/x86/kernel/irqinit.c (+6 -9)
  40. arch/x86/kernel/signal.c (+0 -1)
  41. arch/x86/kernel/syscall_table_32.S (+1 -0)
  42. arch/x86/kernel/traps.c (+6 -6)
  43. arch/x86/mm/fault.c (+10 -2)
  44. arch/x86/oprofile/nmi_int.c (+4 -3)
  45. arch/x86/oprofile/op_model_ppro.c (+9 -1)
  46. arch/x86/vdso/vdso32-setup.c (+5 -1)
  47. arch/x86/vdso/vma.c (+5 -2)
  48. drivers/char/sysrq.c (+2 -0)
  49. fs/exec.c (+9 -0)
  50. include/asm-generic/atomic.h (+1 -1)
  51. include/linux/init_task.h (+10 -0)
  52. include/linux/kernel_stat.h (+5 -0)
  53. include/linux/perf_counter.h (+697 -0)
  54. include/linux/prctl.h (+3 -0)
  55. include/linux/sched.h (+20 -1)
  56. include/linux/syscalls.h (+5 -0)
  57. init/Kconfig (+34 -0)
  58. kernel/Makefile (+1 -0)
  59. kernel/exit.c (+12 -4)
  60. kernel/fork.c (+12 -0)
  61. kernel/mutex.c (+1 -1)
  62. kernel/perf_counter.c (+4260 -0)
  63. kernel/sched.c (+51 -6)
  64. kernel/sys.c (+7 -0)
  65. kernel/sys_ni.c (+3 -0)
  66. kernel/sysctl.c (+27 -0)
  67. kernel/timer.c (+3 -0)
  68. mm/mmap.c (+5 -0)
  69. mm/mprotect.c (+2 -0)
  70. tools/perf/.gitignore (+16 -0)
  71. tools/perf/Documentation/Makefile (+300 -0)
  72. tools/perf/Documentation/asciidoc.conf (+91 -0)
  73. tools/perf/Documentation/manpage-1.72.xsl (+14 -0)
  74. tools/perf/Documentation/manpage-base.xsl (+35 -0)
  75. tools/perf/Documentation/manpage-bold-literal.xsl (+17 -0)
  76. tools/perf/Documentation/manpage-normal.xsl (+13 -0)
  77. tools/perf/Documentation/manpage-suppress-sp.xsl (+21 -0)
  78. tools/perf/Documentation/perf-annotate.txt (+29 -0)
  79. tools/perf/Documentation/perf-help.txt (+38 -0)
  80. tools/perf/Documentation/perf-list.txt (+25 -0)
  81. tools/perf/Documentation/perf-record.txt (+42 -0)
  82. tools/perf/Documentation/perf-report.txt (+26 -0)
  83. tools/perf/Documentation/perf-stat.txt (+66 -0)
  84. tools/perf/Documentation/perf-top.txt (+39 -0)
  85. tools/perf/Documentation/perf.txt (+24 -0)
  86. tools/perf/Makefile (+929 -0)
  87. tools/perf/builtin-annotate.c (+1356 -0)
  88. tools/perf/builtin-help.c (+461 -0)
  89. tools/perf/builtin-list.c (+20 -0)
  90. tools/perf/builtin-record.c (+582 -0)
  91. tools/perf/builtin-report.c (+1316 -0)
  92. tools/perf/builtin-stat.c (+367 -0)
  93. tools/perf/builtin-top.c (+736 -0)
  94. tools/perf/builtin.h (+26 -0)
  95. tools/perf/command-list.txt (+10 -0)
  96. tools/perf/design.txt (+442 -0)
  97. tools/perf/perf.c (+428 -0)
  98. tools/perf/perf.h (+67 -0)
  99. tools/perf/util/PERF-VERSION-GEN (+42 -0)
  100. tools/perf/util/abspath.c (+117 -0)

+ 10 - 0
MAINTAINERS

@@ -4403,6 +4403,16 @@ S:	Maintained
 F:	include/linux/delayacct.h
 F:	kernel/delayacct.c
 
+PERFORMANCE COUNTER SUBSYSTEM
+P:	Peter Zijlstra
+M:	a.p.zijlstra@chello.nl
+P:	Paul Mackerras
+M:	paulus@samba.org
+P:	Ingo Molnar
+M:	mingo@elte.hu
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+
 PERSONALITY HANDLING
 P:	Christoph Hellwig
 M:	hch@infradead.org

+ 39 - 0
arch/powerpc/include/asm/hw_irq.h

@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
  */
 struct irq_chip;
 
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long test_perf_counter_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, perf_counter_pending)));
+	return x;
+}
+
+static inline void set_perf_counter_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (1),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+static inline void clear_perf_counter_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (0),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long test_perf_counter_pending(void)
+{
+	return 0;
+}
+
+static inline void set_perf_counter_pending(void) {}
+static inline void clear_perf_counter_pending(void) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
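
For reference, register r13 holds the current CPU's paca pointer on 64-bit powerpc, so the lbz/stb sequences above are single-instruction accesses to paca->perf_counter_pending. A C sketch of what they express (illustration only, not part of the patch; local_paca is the kernel's register-r13 alias for the current paca, and the real code uses inline asm rather than this form):

	/* Sketch only -- roughly what the asm helpers above do. */
	static inline unsigned long test_perf_counter_pending_sketch(void)
	{
		return local_paca->perf_counter_pending;	/* lbz from r13 */
	}

	static inline void set_perf_counter_pending_sketch(void)
	{
		local_paca->perf_counter_pending = 1;		/* stb of 1 */
	}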

+ 1 - 0
arch/powerpc/include/asm/paca.h

@@ -99,6 +99,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
+	u8 perf_counter_pending;	/* PM interrupt while soft-disabled */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */

+ 98 - 0
arch/powerpc/include/asm/perf_counter.h

@@ -0,0 +1,98 @@
+/*
+ * Performance counter support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+
+#define MAX_HWCOUNTERS		8
+#define MAX_EVENT_ALTERNATIVES	8
+#define MAX_LIMITED_HWCOUNTERS	2
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+	int	n_counter;
+	int	max_alternatives;
+	u64	add_fields;
+	u64	test_adder;
+	int	(*compute_mmcr)(u64 events[], int n_ev,
+				unsigned int hwc[], u64 mmcr[]);
+	int	(*get_constraint)(u64 event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(u64 event, unsigned int flags,
+				    u64 alt[]);
+	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	(*limited_pmc_event)(u64 event);
+	u32	flags;
+	int	n_generic;
+	int	*generic_events;
+	int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+			       [PERF_COUNT_HW_CACHE_OP_MAX]
+			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
+};
+
+extern struct power_pmu *ppmu;
+
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */
+
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
+
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
+/*
+ * The power_pmu.get_constraint function returns a 64-bit value and
+ * a 64-bit mask that express the constraints between this event and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
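
To make the "add field" arithmetic above concrete, here is a minimal standalone sketch (not part of the patch) of the accumulate-and-test step performed by power_check_constraints() in arch/powerpc/kernel/perf_counter.c below. Assumptions: a k = 3 bit field at bit 0 limits one class to N = 2 events, so add_fields has 1 in the field's LSB, the mask is the field's MSB (0x4), and test_adder holds 2^(k-1) - 1 - N = 1; the per-event select-field check is omitted here.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t addf = 0x1;	/* add_fields: LSB of the field */
		const uint64_t mask = 0x4;	/* MSB of the 3-bit field */
		const uint64_t tadd = 0x1;	/* 2^(k-1) - 1 - N = 4 - 1 - 2 */
		const uint64_t ev   = 0x1;	/* each event contributes 1 */
		uint64_t value = 0;

		for (int i = 1; i <= 3; i++) {
			/* same accumulate step as power_check_constraints() */
			uint64_t nv = (value | ev) + (value & ev & addf);

			/* MSB of the field flips once the count exceeds N */
			if (((nv + tadd) ^ value) & mask) {
				printf("event %d: rejected\n", i);
				break;
			}
			printf("event %d: accepted\n", i);
			value = nv;
		}
		return 0;	/* prints: accepted, accepted, rejected */
	}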

+ 2 - 0
arch/powerpc/include/asm/reg.h

@@ -492,11 +492,13 @@
 #define   MMCR0_FCHV	0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1	798
 #define SPRN_MMCRA	0x312
+#define   MMCRA_SDSYNC	0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SIHV	0x10000000UL /* state of MSR HV when SIAR set */
 #define   MMCRA_SIPR	0x08000000UL /* state of MSR PR when SIAR set */
 #define   MMCRA_SLOT	0x07000000UL /* SLOT bits (37-39) */
 #define   MMCRA_SLOT_SHIFT	24
 #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
+#define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL	/* SDAR/SIAR synced */
 #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL
 #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL
 #define   POWER6_MMCRA_THRM	0x00000020UL

+ 1 - 1
arch/powerpc/include/asm/systbl.h

@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
-SYSCALL(ni_syscall)
+SYSCALL_SPU(perf_counter_open)
 COMPAT_SYS_SPU(preadv)
 COMPAT_SYS_SPU(pwritev)

+ 1 - 0
arch/powerpc/include/asm/unistd.h

@@ -341,6 +341,7 @@
 #define __NR_dup3		316
 #define __NR_pipe2		317
 #define __NR_inotify_init1	318
+#define __NR_perf_counter_open	319
 #define __NR_preadv		320
 #define __NR_pwritev		321
 

+ 3 - 0
arch/powerpc/kernel/Makefile

@@ -94,6 +94,9 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
+				   power5-pmu.o power5+-pmu.o power6-pmu.o \
+				   power7-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 

+ 1 - 0
arch/powerpc/kernel/asm-offsets.c

@@ -131,6 +131,7 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
 	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));

+ 9 - 0
arch/powerpc/kernel/entry_64.S

@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
 	TRACE_AND_RESTORE_IRQ(r5);
 
+#ifdef CONFIG_PERF_COUNTERS
+	/* check paca->perf_counter_pending if we're enabling ints */
+	lbz	r3,PACAPERFPEND(r13)
+	and.	r3,r3,r5
+	beq	27f
+	bl	.perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
 	/* extract EE bit and use it to restore paca->hard_enabled */
 	ld	r3,_MSR(r1)
 	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */

+ 5 - 0
arch/powerpc/kernel/irq.c

@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
+	if (test_perf_counter_pending()) {
+		clear_perf_counter_pending();
+		perf_counter_do_pending();
+	}
+
 	/*
 	 * if (get_paca()->hard_enabled) return;
 	 * But again we need to take care that gcc gets hard_enabled directly

+ 1263 - 0
arch/powerpc/kernel/perf_counter.c

@@ -0,0 +1,1263 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/ptrace.h>
+
+struct cpu_hw_counters {
+	int n_counters;
+	int n_percpu;
+	int disabled;
+	int n_added;
+	int n_limited;
+	u8  pmcs_enabled;
+	struct perf_counter *counter[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
+	unsigned int flags[MAX_HWCOUNTERS];
+	u64 mmcr[3];
+	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+struct power_pmu *ppmu;
+
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
+static void perf_counter_interrupt(struct pt_regs *regs);
+
+void perf_counter_print_debug(void)
+{
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+	unsigned long val;
+
+	switch (idx) {
+	case 1:
+		val = mfspr(SPRN_PMC1);
+		break;
+	case 2:
+		val = mfspr(SPRN_PMC2);
+		break;
+	case 3:
+		val = mfspr(SPRN_PMC3);
+		break;
+	case 4:
+		val = mfspr(SPRN_PMC4);
+		break;
+	case 5:
+		val = mfspr(SPRN_PMC5);
+		break;
+	case 6:
+		val = mfspr(SPRN_PMC6);
+		break;
+	case 7:
+		val = mfspr(SPRN_PMC7);
+		break;
+	case 8:
+		val = mfspr(SPRN_PMC8);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+		val = 0;
+	}
+	return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+	switch (idx) {
+	case 1:
+		mtspr(SPRN_PMC1, val);
+		break;
+	case 2:
+		mtspr(SPRN_PMC2, val);
+		break;
+	case 3:
+		mtspr(SPRN_PMC3, val);
+		break;
+	case 4:
+		mtspr(SPRN_PMC4, val);
+		break;
+	case 5:
+		mtspr(SPRN_PMC5, val);
+		break;
+	case 6:
+		mtspr(SPRN_PMC6, val);
+		break;
+	case 7:
+		mtspr(SPRN_PMC7, val);
+		break;
+	case 8:
+		mtspr(SPRN_PMC8, val);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+	}
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ */
+static int power_check_constraints(u64 event[], unsigned int cflags[],
+				   int n_ev)
+{
+	u64 mask, value, nv;
+	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
+	int i, j;
+	u64 addf = ppmu->add_fields;
+	u64 tadd = ppmu->test_adder;
+
+	if (n_ev > ppmu->n_counter)
+		return -1;
+
+	/* First see if the events will go on as-is */
+	for (i = 0; i < n_ev; ++i) {
+		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+		    && !ppmu->limited_pmc_event(event[i])) {
+			ppmu->get_alternatives(event[i], cflags[i],
+					       alternatives[i]);
+			event[i] = alternatives[i][0];
+		}
+		if (ppmu->get_constraint(event[i], &amasks[i][0],
+					 &avalues[i][0]))
+			return -1;
+	}
+	value = mask = 0;
+	for (i = 0; i < n_ev; ++i) {
+		nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+		if ((((nv + tadd) ^ value) & mask) != 0 ||
+		    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+			break;
+		value = nv;
+		mask |= amasks[i][0];
+	}
+	if (i == n_ev)
+		return 0;	/* all OK */
+
+	/* doesn't work, gather alternatives... */
+	if (!ppmu->get_alternatives)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		choice[i] = 0;
+		n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+						  alternatives[i]);
+		for (j = 1; j < n_alt[i]; ++j)
+			ppmu->get_constraint(alternatives[i][j],
+					     &amasks[i][j], &avalues[i][j]);
+	}
+
+	/* enumerate all possibilities and see if any will work */
+	i = 0;
+	j = -1;
+	value = mask = nv = 0;
+	while (i < n_ev) {
+		if (j >= 0) {
+			/* we're backtracking, restore context */
+			value = svalues[i];
+			mask = smasks[i];
+			j = choice[i];
+		}
+		/*
+		 * See if any alternative k for event i,
+		 * where k > j, will satisfy the constraints.
+		 */
+		while (++j < n_alt[i]) {
+			nv = (value | avalues[i][j]) +
+				(value & avalues[i][j] & addf);
+			if ((((nv + tadd) ^ value) & mask) == 0 &&
+			    (((nv + tadd) ^ avalues[i][j])
+			     & amasks[i][j]) == 0)
+				break;
+		}
+		if (j >= n_alt[i]) {
+			/*
+			 * No feasible alternative, backtrack
+			 * to event i-1 and continue enumerating its
+			 * alternatives from where we got up to.
+			 */
+			if (--i < 0)
+				return -1;
+		} else {
+			/*
+			 * Found a feasible alternative for event i,
+			 * remember where we got up to with this event,
+			 * go on to the next event, and start with
+			 * the first alternative for it.
+			 */
+			choice[i] = j;
+			svalues[i] = value;
+			smasks[i] = mask;
+			value = nv;
+			mask |= amasks[i][j];
+			++i;
+			j = -1;
+		}
+	}
+
+	/* OK, we have a feasible combination, tell the caller the solution */
+	for (i = 0; i < n_ev; ++i)
+		event[i] = alternatives[i][choice[i]];
+	return 0;
+}
+
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+			  int n_prev, int n_new)
+{
+	int eu = 0, ek = 0, eh = 0;
+	int i, n, first;
+	struct perf_counter *counter;
+
+	n = n_prev + n_new;
+	if (n <= 1)
+		return 0;
+
+	first = 1;
+	for (i = 0; i < n; ++i) {
+		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+			continue;
+		}
+		counter = ctrs[i];
+		if (first) {
+			eu = counter->attr.exclude_user;
+			ek = counter->attr.exclude_kernel;
+			eh = counter->attr.exclude_hv;
+			first = 0;
+		} else if (counter->attr.exclude_user != eu ||
+			   counter->attr.exclude_kernel != ek ||
+			   counter->attr.exclude_hv != eh) {
+			return -EAGAIN;
+		}
+	}
+
+	if (eu || ek || eh)
+		for (i = 0; i < n; ++i)
+			if (cflags[i] & PPMU_LIMITED_PMC_OK)
+				cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
+	return 0;
+}
+
+static void power_pmu_read(struct perf_counter *counter)
+{
+	long val, delta, prev;
+
+	if (!counter->hw.idx)
+		return;
+	/*
+	 * Performance monitor interrupts come even when interrupts
+	 * are soft-disabled, as long as interrupts are hard-enabled.
+	 * Therefore we treat them like NMIs.
+	 */
+	do {
+		prev = atomic64_read(&counter->hw.prev_count);
+		barrier();
+		val = read_pmc(counter->hw.idx);
+	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
+
+	/* The counters are only 32 bits wide */
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &counter->hw.period_left);
+}
+
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+		&& (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+				    unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val, prev, delta;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		if (!counter->hw.idx)
+			continue;
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		prev = atomic64_read(&counter->hw.prev_count);
+		counter->hw.idx = 0;
+		delta = (val - prev) & 0xfffffffful;
+		atomic64_add(delta, &counter->count);
+	}
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+				  unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		counter->hw.idx = cpuhw->limited_hwidx[i];
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		atomic64_set(&counter->hw.prev_count, val);
+		perf_counter_update_userpage(counter);
+	}
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters.  We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+	unsigned long pmc5, pmc6;
+
+	if (!cpuhw->n_limited) {
+		mtspr(SPRN_MMCR0, mmcr0);
+		return;
+	}
+
+	/*
+	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 * To ensure we don't get a performance monitor interrupt
+	 * between writing MMCR0 and freezing/thawing the limited
+	 * counters, we first write MMCR0 with the counter overflow
+	 * interrupt enable bits turned off.
+	 */
+	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+		     : "=&r" (pmc5), "=&r" (pmc6)
+		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+		       "i" (SPRN_MMCR0),
+		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+	if (mmcr0 & MMCR0_FC)
+		freeze_limited_counters(cpuhw, pmc5, pmc6);
+	else
+		thaw_limited_counters(cpuhw, pmc5, pmc6);
+
+	/*
+	 * Write the full MMCR0 including the counter overflow interrupt
+	 * enable bits, if necessary.
+	 */
+	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+		mtspr(SPRN_MMCR0, mmcr0);
+}
+
+/*
+ * Disable all counters to prevent PMU interrupts and to allow
+ * counters to be added or removed.
+ */
+void hw_perf_disable(void)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+
+	ret = cpuhw->disabled;
+	if (!ret) {
+		cpuhw->disabled = 1;
+		cpuhw->n_added = 0;
+
+		/*
+		 * Check if we ever enabled the PMU on this cpu.
+		 */
+		if (!cpuhw->pmcs_enabled) {
+			if (ppc_md.enable_pmcs)
+				ppc_md.enable_pmcs();
+			cpuhw->pmcs_enabled = 1;
+		}
+
+		/*
+		 * Disable instruction sampling if it was enabled
+		 */
+		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+			mtspr(SPRN_MMCRA,
+			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+			mb();
+		}
+
+		/*
+		 * Set the 'freeze counters' bit.
+		 * The barrier is to make sure the mtspr has been
+		 * executed and the PMU has frozen the counters
+		 * before we return.
+		 */
+		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		mb();
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_enable(void)
+{
+	struct perf_counter *counter;
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	long i;
+	unsigned long val;
+	s64 left;
+	unsigned int hwc_index[MAX_HWCOUNTERS];
+	int n_lim;
+	int idx;
+
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	if (!cpuhw->disabled) {
+		local_irq_restore(flags);
+		return;
+	}
+	cpuhw->disabled = 0;
+
+	/*
+	 * If we didn't change anything, or only removed counters,
+	 * no need to recalculate MMCR* settings and reset the PMCs.
+	 * Just reenable the PMU with the current MMCR* settings
+	 * (possibly updated for removal of counters).
+	 */
+	if (!cpuhw->n_added) {
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+		if (cpuhw->n_counters == 0)
+			get_lppaca()->pmcregs_in_use = 0;
+		goto out_enable;
+	}
+
+	/*
+	 * Compute MMCR* values for the new set of counters
+	 */
+	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+			       cpuhw->mmcr)) {
+		/* shouldn't ever get here */
+		printk(KERN_ERR "oops compute_mmcr failed\n");
+		goto out;
+	}
+
+	/*
+	 * Add in MMCR0 freeze bits corresponding to the
+	 * attr.exclude_* bits for the first counter.
+	 * We have already checked that all counters have the
+	 * same values for these bits as the first counter.
+	 */
+	counter = cpuhw->counter[0];
+	if (counter->attr.exclude_user)
+		cpuhw->mmcr[0] |= MMCR0_FCP;
+	if (counter->attr.exclude_kernel)
+		cpuhw->mmcr[0] |= freeze_counters_kernel;
+	if (counter->attr.exclude_hv)
+		cpuhw->mmcr[0] |= MMCR0_FCHV;
+
+	/*
+	 * Write the new configuration to MMCR* with the freeze
+	 * bit set and set the hardware counters to their initial values.
+	 * Then unfreeze the counters.
+	 */
+	get_lppaca()->pmcregs_in_use = 1;
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+				| MMCR0_FC);
+
+	/*
+	 * Read off any pre-existing counters that need to move
+	 * to another PMC.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+			power_pmu_read(counter);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+		}
+	}
+
+	/*
+	 * Initialize the PMCs for all the new and moved counters.
+	 */
+	cpuhw->n_limited = n_lim = 0;
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx)
+			continue;
+		idx = hwc_index[i] + 1;
+		if (is_limited_pmc(idx)) {
+			cpuhw->limited_counter[n_lim] = counter;
+			cpuhw->limited_hwidx[n_lim] = idx;
+			++n_lim;
+			continue;
+		}
+		val = 0;
+		if (counter->hw.sample_period) {
+			left = atomic64_read(&counter->hw.period_left);
+			if (left < 0x80000000L)
+				val = 0x80000000L - left;
+		}
+		atomic64_set(&counter->hw.prev_count, val);
+		counter->hw.idx = idx;
+		write_pmc(idx, val);
+		perf_counter_update_userpage(counter);
+	}
+	cpuhw->n_limited = n_lim;
+	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+	mb();
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+	/*
+	 * Enable instruction sampling if necessary
+	 */
+	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+		mb();
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	}
+
+ out:
+	local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_counter *group, int max_count,
+			  struct perf_counter *ctrs[], u64 *events,
+			  unsigned int *flags)
+{
+	int n = 0;
+	struct perf_counter *counter;
+
+	if (!is_software_counter(group)) {
+		if (n >= max_count)
+			return -1;
+		ctrs[n] = group;
+		flags[n] = group->hw.counter_base;
+		events[n++] = group->hw.config;
+	}
+	list_for_each_entry(counter, &group->sibling_list, list_entry) {
+		if (!is_software_counter(counter) &&
+		    counter->state != PERF_COUNTER_STATE_OFF) {
+			if (n >= max_count)
+				return -1;
+			ctrs[n] = counter;
+			flags[n] = counter->hw.counter_base;
+			events[n++] = counter->hw.config;
+		}
+	}
+	return n;
+}
+
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;
+	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
+	if (is_software_counter(counter))
+		counter->pmu->enable(counter);
+}
+
+/*
+ * Called to enable a whole group of counters.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i, n, n0;
+	struct perf_counter *sub;
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	n = collect_events(group_leader, ppmu->n_counter - n0,
+			   &cpuhw->counter[n0], &cpuhw->events[n0],
+			   &cpuhw->flags[n0]);
+	if (n < 0)
+		return -EAGAIN;
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
+		return -EAGAIN;
+	i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+	if (i < 0)
+		return -EAGAIN;
+	cpuhw->n_counters = n0 + n;
+	cpuhw->n_added += n;
+
+	/*
+	 * OK, this group can go on; update counter states etc.,
+	 * and enable any software counters
+	 */
+	for (i = n0; i < n0 + n; ++i)
+		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	cpuctx->active_oncpu += n;
+	n = 1;
+	counter_sched_in(group_leader, cpu);
+	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+		if (sub->state != PERF_COUNTER_STATE_OFF) {
+			counter_sched_in(sub, cpu);
+			++n;
+		}
+	}
+	ctx->nr_active += n;
+
+	return 1;
+}
+
+/*
+ * Add a counter to the PMU.
+ * If all counters are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_enable to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_pmu_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	int n0;
+	int ret = -EAGAIN;
+
+	local_irq_save(flags);
+	perf_disable();
+
+	/*
+	 * Add the counter to the list (if there is room)
+	 * and check whether the total set is still feasible.
+	 */
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	if (n0 >= ppmu->n_counter)
+		goto out;
+	cpuhw->counter[n0] = counter;
+	cpuhw->events[n0] = counter->hw.config;
+	cpuhw->flags[n0] = counter->hw.counter_base;
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
+		goto out;
+	if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
+		goto out;
+
+	counter->hw.config = cpuhw->events[n0];
+	++cpuhw->n_counters;
+	++cpuhw->n_added;
+
+	ret = 0;
+ out:
+	perf_enable();
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Remove a counter from the PMU.
+ */
+static void power_pmu_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	perf_disable();
+
+	power_pmu_read(counter);
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		if (counter == cpuhw->counter[i]) {
+			while (++i < cpuhw->n_counters)
+				cpuhw->counter[i-1] = cpuhw->counter[i];
+			--cpuhw->n_counters;
+			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+			if (counter->hw.idx) {
+				write_pmc(counter->hw.idx, 0);
+				counter->hw.idx = 0;
+			}
+			perf_counter_update_userpage(counter);
+			break;
+		}
+	}
+	for (i = 0; i < cpuhw->n_limited; ++i)
+		if (counter == cpuhw->limited_counter[i])
+			break;
+	if (i < cpuhw->n_limited) {
+		while (++i < cpuhw->n_limited) {
+			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+		}
+		--cpuhw->n_limited;
+	}
+	if (cpuhw->n_counters == 0) {
+		/* disable exceptions if no counters are running */
+		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+	}
+
+	perf_enable();
+	local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+	s64 val, left;
+	unsigned long flags;
+
+	if (!counter->hw.idx || !counter->hw.sample_period)
+		return;
+	local_irq_save(flags);
+	perf_disable();
+	power_pmu_read(counter);
+	left = counter->hw.sample_period;
+	counter->hw.last_period = left;
+	val = 0;
+	if (left < 0x80000000L)
+		val = 0x80000000L - left;
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
+	perf_enable();
+	local_irq_restore(flags);
+}
+
+struct pmu power_pmu = {
+	.enable		= power_pmu_enable,
+	.disable	= power_pmu_disable,
+	.read		= power_pmu_read,
+	.unthrottle	= power_pmu_unthrottle,
+};
+
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
+				 unsigned int flags)
+{
+	int n;
+	u64 alt[MAX_EVENT_ALTERNATIVES];
+
+	if (counter->attr.exclude_user
+	    || counter->attr.exclude_kernel
+	    || counter->attr.exclude_hv
+	    || counter->attr.sample_period)
+		return 0;
+
+	if (ppmu->limited_pmc_event(ev))
+		return 1;
+
+	/*
+	 * The requested event isn't on a limited PMC already;
+	 * see if any alternative code goes on a limited PMC.
+	 */
+	if (!ppmu->get_alternatives)
+		return 0;
+
+	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+	n = ppmu->get_alternatives(ev, flags, alt);
+
+	return n > 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
+{
+	u64 alt[MAX_EVENT_ALTERNATIVES];
+	int n;
+
+	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (!n)
+		return 0;
+	return alt[0];
+}
+
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (!atomic_add_unless(&num_counters, -1, 1)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_dec_return(&num_counters) == 0)
+			release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
+/*
+ * Translate a generic cache event config to a raw event code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+	unsigned long type, op, result;
+	int ev;
+
+	if (!ppmu->cache_events)
+		return -EINVAL;
+
+	/* unpack config */
+	type = config & 0xff;
+	op = (config >> 8) & 0xff;
+	result = (config >> 16) & 0xff;
+
+	if (type >= PERF_COUNT_HW_CACHE_MAX ||
+	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ev = (*ppmu->cache_events)[type][op][result];
+	if (ev == 0)
+		return -EOPNOTSUPP;
+	if (ev == -1)
+		return -EINVAL;
+	*eventp = ev;
+	return 0;
+}
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+	u64 ev;
+	unsigned long flags;
+	struct perf_counter *ctrs[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
+	unsigned int cflags[MAX_HWCOUNTERS];
+	int n;
+	int err;
+
+	if (!ppmu)
+		return ERR_PTR(-ENXIO);
+	switch (counter->attr.type) {
+	case PERF_TYPE_HARDWARE:
+		ev = counter->attr.config;
+		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+			return ERR_PTR(-EOPNOTSUPP);
+		ev = ppmu->generic_events[ev];
+		break;
+	case PERF_TYPE_HW_CACHE:
+		err = hw_perf_cache_event(counter->attr.config, &ev);
+		if (err)
+			return ERR_PTR(err);
+		break;
+	case PERF_TYPE_RAW:
+		ev = counter->attr.config;
+		break;
+	}
+	counter->hw.config_base = ev;
+	counter->hw.idx = 0;
+
+	/*
+	 * If we are not running on a hypervisor, force the
+	 * exclude_hv bit to 0 so that we don't care what
+	 * the user set it to.
+	 */
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		counter->attr.exclude_hv = 0;
+
+	/*
+	 * If this is a per-task counter, then we can use
+	 * PM_RUN_* events interchangeably with their non RUN_*
+	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+	 * XXX we should check if the task is an idle task.
+	 */
+	flags = 0;
+	if (counter->ctx->task)
+		flags |= PPMU_ONLY_COUNT_RUN;
+
+	/*
+	 * If this machine has limited counters, check whether this
+	 * event could go on a limited counter.
+	 */
+	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+		if (can_go_on_limited_pmc(counter, ev, flags)) {
+			flags |= PPMU_LIMITED_PMC_OK;
+		} else if (ppmu->limited_pmc_event(ev)) {
+			/*
+			 * The requested event is on a limited PMC,
+			 * but we can't use a limited PMC; see if any
+			 * alternative goes on a normal PMC.
+			 */
+			ev = normal_pmc_alternative(ev, flags);
+			if (!ev)
+				return ERR_PTR(-EINVAL);
+		}
+	}
+
+	/*
+	 * If this is in a group, check if it can go on with all the
+	 * other hardware counters in the group.  We assume the counter
+	 * hasn't been linked into its leader's sibling list at this point.
+	 */
+	n = 0;
+	if (counter->group_leader != counter) {
+		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+				   ctrs, events, cflags);
+		if (n < 0)
+			return ERR_PTR(-EINVAL);
+	}
+	events[n] = ev;
+	ctrs[n] = counter;
+	cflags[n] = flags;
+	if (check_excludes(ctrs, cflags, n, 1))
+		return ERR_PTR(-EINVAL);
+	if (power_check_constraints(events, cflags, n + 1))
+		return ERR_PTR(-EINVAL);
+
+	counter->hw.config = events[n];
+	counter->hw.counter_base = cflags[n];
+	counter->hw.last_period = counter->hw.sample_period;
+	atomic64_set(&counter->hw.period_left, counter->hw.last_period);
+
+	/*
+	 * See if we need to reserve the PMU.
+	 * If no counters are currently in use, then we have to take a
+	 * mutex to ensure that we don't race with another task doing
+	 * reserve_pmc_hardware or release_pmc_hardware.
+	 */
+	err = 0;
+	if (!atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 &&
+		    reserve_pmc_hardware(perf_counter_interrupt))
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	counter->destroy = hw_perf_counter_destroy;
+
+	if (err)
+		return ERR_PTR(err);
+	return &power_pmu;
+}
+
+/*
+ * A counter has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+			       struct pt_regs *regs, int nmi)
+{
+	u64 period = counter->hw.sample_period;
+	s64 prev, delta, left;
+	int record = 0;
+	u64 addr, mmcra, sdsync;
+
+	/* we don't have to worry about interrupts here */
+	prev = atomic64_read(&counter->hw.prev_count);
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+
+	/*
+	 * See if the total period for this counter has expired,
+	 * and update for the next period.
+	 */
+	val = 0;
+	left = atomic64_read(&counter->hw.period_left) - delta;
+	if (period) {
+		if (left <= 0) {
+			left += period;
+			if (left <= 0)
+				left = period;
+			record = 1;
+		}
+		if (left < 0x80000000L)
+			val = 0x80000000L - left;
+	}
+
+	/*
+	 * Finally record data if requested.
+	 */
+	if (record) {
+		struct perf_sample_data data = {
+			.regs	= regs,
+			.addr	= 0,
+			.period	= counter->hw.last_period,
+		};
+
+		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
+			/*
+			 * The user wants a data address recorded.
+			 * If we're not doing instruction sampling,
+			 * give them the SDAR (sampled data address).
+			 * If we are doing instruction sampling, then only
+			 * give them the SDAR if it corresponds to the
+			 * instruction pointed to by SIAR; this is indicated
+			 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+			 */
+			mmcra = regs->dsisr;
+			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+				data.addr = mfspr(SPRN_SDAR);
+		}
+		if (perf_counter_overflow(counter, nmi, &data)) {
+			/*
+			 * Interrupts are coming too fast - throttle them
+			 * by setting the counter to 0, so it will be
+			 * at least 2^30 cycles until the next interrupt
+			 * (assuming each counter counts at most 2 counts
+			 * per cycle).
+			 */
+			val = 0;
+			left = ~0ULL >> 1;
+		}
+	}
+
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+
+	if (TRAP(regs) != 0xf00) {
+		/* not a PMU interrupt */
+		return user_mode(regs) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+
+	mmcra = regs->dsisr;
+	if (ppmu->flags & PPMU_ALT_SIPR) {
+		if (mmcra & POWER6_MMCRA_SIHV)
+			return PERF_EVENT_MISC_HYPERVISOR;
+		return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+	if (mmcra & MMCRA_SIHV)
+		return PERF_EVENT_MISC_HYPERVISOR;
+	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+	unsigned long ip;
+	unsigned long slot;
+
+	if (TRAP(regs) != 0xf00)
+		return regs->nip;	/* not a PMU interrupt */
+
+	ip = mfspr(SPRN_SIAR);
+	mmcra = regs->dsisr;
+	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+		slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+		if (slot > 1)
+			ip += 4 * (slot - 1);
+	}
+	return ip;
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+	long val;
+	int found = 0;
+	int nmi;
+
+	if (cpuhw->n_limited)
+		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+					mfspr(SPRN_PMC6));
+
+	/*
+	 * Overload regs->dsisr to store MMCRA so we only need to read it once.
+	 */
+	regs->dsisr = mfspr(SPRN_MMCRA);
+
+	/*
+	 * If interrupts were soft-disabled when this PMU interrupt
+	 * occurred, treat it as an NMI.
+	 */
+	nmi = !regs->softe;
+	if (nmi)
+		nmi_enter();
+	else
+		irq_enter();
+
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
+			continue;
+		val = read_pmc(counter->hw.idx);
+		if ((int)val < 0) {
+			/* counter has overflowed */
+			found = 1;
+			record_and_restart(counter, val, regs, nmi);
+		}
+	}
+
+	/*
+	 * In case we didn't find and reset the counter that caused
+	 * the interrupt, scan all counters and reset any that are
+	 * negative, to avoid getting continual interrupts.
+	 * Any that we processed in the previous loop will not be negative.
+	 */
+	if (!found) {
+		for (i = 0; i < ppmu->n_counter; ++i) {
+			if (is_limited_pmc(i + 1))
+				continue;
+			val = read_pmc(i + 1);
+			if ((int)val < 0)
+				write_pmc(i + 1, 0);
+		}
+	}
+
+	/*
+	 * Reset MMCR0 to its normal value.  This will set PMXE and
+	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+	 * and thus allow interrupts to occur again.
+	 * XXX might want to use MSR.PM to keep the counters frozen until
+	 * we get back out of this interrupt.
+	 */
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+	if (nmi)
+		nmi_exit();
+	else
+		irq_exit();
+}
+
+void hw_perf_counter_setup(int cpu)
+{
+	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
+
+	memset(cpuhw, 0, sizeof(*cpuhw));
+	cpuhw->mmcr[0] = MMCR0_FC;
+}
+
+extern struct power_pmu power4_pmu;
+extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
+extern struct power_pmu power6_pmu;
+extern struct power_pmu power7_pmu;
+
+static int init_perf_counters(void)
+{
+	unsigned long pvr;
+
+	/* XXX should get this from cputable */
+	pvr = mfspr(SPRN_PVR);
+	switch (PVR_VER(pvr)) {
+	case PV_POWER4:
+	case PV_POWER4p:
+		ppmu = &power4_pmu;
+		break;
+	case PV_970:
+	case PV_970FX:
+	case PV_970MP:
+		ppmu = &ppc970_pmu;
+		break;
+	case PV_POWER5:
+		ppmu = &power5_pmu;
+		break;
+	case PV_POWER5p:
+		ppmu = &power5p_pmu;
+		break;
+	case 0x3e:
+		ppmu = &power6_pmu;
+		break;
+	case 0x3f:
+		ppmu = &power7_pmu;
+		break;
+	}
+
+	/*
+	 * Use FCHV to ignore kernel events if MSR.HV is set.
+	 */
+	if (mfmsr() & MSR_HV)
+		freeze_counters_kernel = MMCR0_FCHV;
+
+	return 0;
+}
+
+arch_initcall(init_perf_counters);
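
Two details of the file above lend themselves to standalone illustration (sketches, not part of the patch). First, power_pmu_read() and record_and_restart() compute the delta as (val - prev) & 0xffffffff because the PMCs are only 32 bits wide; the masking yields the correct count even when the counter wraps between reads:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t prev  = 0xfffffff0;	/* previous snapshot, near wrap */
		uint64_t val   = 0x00000010;	/* next reading, after the wrap */
		uint64_t delta = (val - prev) & 0xfffffffful;

		printf("delta = %llu\n", (unsigned long long)delta);	/* 32 */
		return 0;
	}

Second, hw_perf_cache_event() unpacks the generic cache-event encoding carried in attr.config for PERF_TYPE_HW_CACHE counters: cache type in bits 0-7, operation in bits 8-15, result in bits 16-23. Packing is the mirror image; the helper below is hypothetical, shown only to make the layout explicit:

	#include <stdint.h>

	/* Hypothetical inverse of hw_perf_cache_event()'s unpacking. */
	static uint64_t cache_config(unsigned type, unsigned op, unsigned result)
	{
		return type | (op << 8) | ((uint64_t)result << 16);
	}

	/* L1-data read misses: type 0 (L1D), op 0 (READ), result 1 (MISS),
	 * so cache_config(0, 0, 1) == 0x10000. */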

+ 598 - 0
arch/powerpc/kernel/power4-pmu.c

@@ -0,0 +1,598 @@
+/*
+ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER4
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_LOWER_SH	6
+#define PM_LOWER_MSK	1
+#define PM_LOWER_MSKS	0x40
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	7
+
+/*
+ * Unit code values
+ */
+#define PM_FPU		1
+#define PM_ISU1		2
+#define PM_IFU		3
+#define PM_IDU0		4
+#define PM_ISU1_ALT	6
+#define PM_ISU2		7
+#define PM_IFU_ALT	8
+#define PM_LSU0		9
+#define PM_LSU1		0xc
+#define PM_GPS		0xf
+
+/*
+ * Bits in MMCR0 for POWER4
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for POWER4
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTC0SEL_SH	61
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTC1SEL_SH	58
+#define MMCR1_TTM2SEL_SH	56
+#define MMCR1_TTC2SEL_SH	55
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTC3SEL_SH	52
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_DEBUG0SEL_SH	43
+#define MMCR1_DEBUG1SEL_SH	42
+#define MMCR1_DEBUG2SEL_SH	41
+#define MMCR1_DEBUG3SEL_SH	40
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2	/* note bit 0 is in MMCRA for GP */
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+#define MMCRA_PMC8SEL0_SH	17	/* PMC8SEL bit 0 for GP */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ * 	  \SMPL	        ||\TTC3SEL
+ * 		        |\TTC_IFU_SEL
+ * 		        \TTM2SEL0
+ *
+ * SMPL - SAMPLE_ENABLE constraint
+ *     56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
+ *
+ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
+ *     55: UC1 error 0x0080_0000_0000_0000
+ *     54: FPU events needed 0x0040_0000_0000_0000
+ *     53: ISU1 events needed 0x0020_0000_0000_0000
+ *     52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
+ *
+ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
+ *     51: UC2 error 0x0008_0000_0000_0000
+ *     50: FPU events needed 0x0004_0000_0000_0000
+ *     49: IFU events needed 0x0002_0000_0000_0000
+ *     48: LSU0 events needed 0x0001_0000_0000_0000
+ *
+ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
+ *     47: UC3 error 0x8000_0000_0000
+ *     46: LSU0 events needed 0x4000_0000_0000
+ *     45: IFU events needed 0x2000_0000_0000
+ *     44: IDU0|ISU2 events needed 0x1000_0000_0000
+ *     43: ISU1 events needed 0x0800_0000_0000
+ *
+ * TTM2SEL0
+ *     42: 0 = IDU0 events needed
+ *     	   1 = ISU2 events needed 0x0400_0000_0000
+ *
+ * TTC_IFU_SEL
+ *     41: 0 = IFU.U events needed
+ *     	   1 = IFU.L events needed 0x0200_0000_0000
+ *
+ * TTC3SEL
+ *     40: 0 = LSU1.U events needed
+ *     	   1 = LSU1.L events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *     	   1 = FPU
+ * 	   2 = ISU1
+ * 	   3 = IFU
+ * 	   4 = IDU0
+ * 	   7 = ISU2
+ * 	   9 = LSU0
+ * 	   c = LSU1
+ * 	   f = GPS
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P8
+ *     15: P8 error 0x8000
+ *     14-15: Count of events needing PMC8
+ *
+ * P1..P7
+ *     0-13: Count of events needing PMC1..PMC7
+ *
+ * Note: this doesn't allow events using IFU.U to be combined with events
+ * using IFU.L, though that is feasible (using TTM0 and TTM2).  However
+ * there are no listed events for IFU.L (they are debug events not
+ * verified for performance monitoring) so this shouldn't cause a
+ * problem.
+ */
+
+static struct unitinfo {
+	u64	value, mask;
+	int	unit;
+	int	lowerbit;
+} p4_unitinfo[16] = {
+	[PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
+	[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_ISU1_ALT] =
+		    { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IFU_ALT] =
+		    { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
+	[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
+	[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
+	[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
+	[PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+};
+
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3),			/* PMC3: PM_MRK_ST_CMPL_INT */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4),			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p4_marked_instr_event(u64 event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 6)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_LSU1:
+		if (event & PM_LOWER_MSKS)
+			mask = 1 << 28;		/* byte 7 bit 4 */
+		else
+			mask = 6 << 24;		/* byte 3 bits 1 and 2 */
+		break;
+	case PM_LSU0:
+		/* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
+		mask = 0x083dff00;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, lower, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	if (unit) {
+		lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
+
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+
+		if (!p4_unitinfo[unit].unit)
+			return -1;
+		mask  |= p4_unitinfo[unit].mask;
+		value |= p4_unitinfo[unit].value;
+		sh = p4_unitinfo[unit].lowerbit;
+		if (sh > 1)
+			value |= (u64)lower << sh;
+		else if (lower != sh)
+			return -1;
+		unit = p4_unitinfo[unit].unit;
+
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+
+	/* Marked instruction events need sample_enable set */
+	if (p4_marked_instr_event(event)) {
+		mask  |= 1ull << 56;
+		value |= 1ull << 56;
+	}
+
+	/* PMCSEL=6 decode events on byte 2 need sample_enable clear */
+	if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
+		mask  |= 1ull << 56;
+
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
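+
+/*
+ * Worked example of the mask/value encoding (a sketch; the generic
+ * layer sums the per-PMC count fields and checks the masked error
+ * bits): an event tied to PMC3 has sh = 4 and contributes mask 0x20,
+ * value 0x10.  A second event tied to PMC3 adds another 0x10; the
+ * sum carries into the 0x20 bit covered by the mask, so the pair is
+ * rejected.
+ */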
+
+static unsigned int ppc_inst_cmpl[] = {
+	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
+};
+
+static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, na;
+
+	alt[0] = event;
+	na = 1;
+
+	/* 2 possibilities for PM_GRP_DISP_REJECT */
+	if (event == 0x8003 || event == 0x0224) {
+		alt[1] = event ^ (0x8003 ^ 0x0224);
+		return 2;
+	}
+
+	/* 2 possibilities for PM_ST_MISS_L1 */
+	if (event == 0x0c13 || event == 0x0c23) {
+		alt[1] = event ^ (0x0c13 ^ 0x0c23);
+		return 2;
+	}
+
+	/* several possibilities for PM_INST_CMPL */
+	for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
+		if (event == ppc_inst_cmpl[i]) {
+			for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
+				if (j != i)
+					alt[na++] = ppc_inst_cmpl[j];
+			break;
+		}
+	}
+
+	return na;
+}
+
+static int p4_compute_mmcr(u64 event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel, lower;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned int unitlower = 0;
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
+		if (unit) {
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (unit == 6 || unit == 8)
+				/* map alt ISU1/IFU codes: 6->2, 8->3 */
+				unit = (unit >> 1) - 1;
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			lower <<= unit;
+			if (unituse[unit] && lower != (unitlower & lower))
+				return -1;
+			unituse[unit] = 1;
+			unitlower |= lower;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,9 on TTM2.
+	 * Each TTMx can only select one unit, but since
+	 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
+	 * we have some choices.
+	 */
+	if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
+		unituse[6] = 1;		/* Move 2 to 6 */
+		unituse[2] = 0;
+	}
+	if (unituse[3] & (unituse[1] | unituse[2])) {
+		unituse[8] = 1;		/* Move 3 to 8 */
+		unituse[3] = 0;
+		unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
+	}
+	/* Check only one unit per TTMx */
+	if (unituse[1] + unituse[2] + unituse[3] > 1 ||
+	    unituse[4] + unituse[6] + unituse[7] > 1 ||
+	    unituse[8] + unituse[9] > 1 ||
+	    (unituse[5] | unituse[10] | unituse[11] |
+	     unituse[13] | unituse[14]))
+		return -1;
+
+	/* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */
+	mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
+	mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
+	mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+
+	/* Set TTCxSEL fields. */
+	if (unitlower & 0xe)
+		mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
+	if (unitlower & 0xf0)
+		mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
+	if (unitlower & 0xf00)
+		mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
+	if (unitlower & 0x7000)
+		mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
+
+	/* Set byte lane select fields. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == 0xf) {
+			/* special case for GPS */
+			mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
+		} else {
+			if (!unituse[unit])
+				ttm = unit - 1;		/* 2->1, 3->2 */
+			else
+				ttm = unit >> 2;
+			mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+		}
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or 00xxx direct event (off or cycles) */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+			else if (psel == 6 && byte == 3)
+				/* seem to need to set sample_enable here */
+				mmcra |= MMCRA_SAMPLE_ENABLE;
+			psel |= 8;
+		}
+		if (pmc <= 1)
+			mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
+		else
+			mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+		if (pmc == 7)	/* PMC8 */
+			mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
+		hwc[i] = pmc;
+		if (p4_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+	}
+
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/*
+	 * Setting the PMCxSEL field to 0 disables PMC x.
+	 * (Note that pmc is 0-based here, not 1-based.)
+	 */
+	if (pmc <= 1) {
+		mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
+	} else {
+		mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
+		if (pmc == 7)
+			mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
+	}
+}
+
+static int p4_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x1001,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8c10, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c10, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x330,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x331,  /* PM_BR_MPRED_CR */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8c10,		0x3c10	},
+		[C(OP_WRITE)] = {	0x7c10,		0xc13	},
+		[C(OP_PREFETCH)] = {	0xc35,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0xc34,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x904	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x900	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x330,		0x331	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
+struct power_pmu power4_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 5,
+	.add_fields = 0x0000001100005555ull,
+	.test_adder = 0x0011083300000000ull,
+	.compute_mmcr = p4_compute_mmcr,
+	.get_constraint = p4_get_constraint,
+	.get_alternatives = p4_get_alternatives,
+	.disable_pmc = p4_disable_pmc,
+	.n_generic = ARRAY_SIZE(p4_generic_events),
+	.generic_events = p4_generic_events,
+	.cache_events = &power4_cache_events,
+};

+ 671 - 0
arch/powerpc/kernel/power5+-pmu.c

@@ -0,0 +1,671 @@
+/*
+ * Performance counter support for POWER5+/++ (not POWER5) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5+
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *             [  ><><>< ><> <><>[  >  <  ><  ><  ><  ><><><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC    B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-43: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * T0 - TTM0 constraint
+ *     36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     33: UC3 error 0x02_0000_0000
+ *     32: FPU|IFU|ISU1 events needed 0x01_0000_0000
+ *     31: ISU0 events needed 0x00_8000_0000
+ *     30: IDU|GRS events needed 0x00_4000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
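+
+/*
+ * Worked example (illustrative): event 0x1c10a8 (LD_REF_L1, used for
+ * PERF_COUNT_HW_CACHE_REFERENCES below) decodes as PMC 1, unit 0xc
+ * (PM_LSU1), byte 1, PMCSEL 0x28, so power5p_get_constraint() sets
+ * P1 (mask 0x2, value 0x1), the B1 field (mask 0xf << 20,
+ * value 0xc << 20) and the NC field (mask 0x0008_0000_0000_0000,
+ * value 0x0001_0000_0000_0000).
+ */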
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0x3200000000ull, 0x0100000000ull },
+	[PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull },
+	[PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull },
+	[PM_IFU] =   { 0x3200000000ull, 0x2100000000ull },
+	[PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull },
+	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
+};
+
+static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
+			return -1;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static int power5p_limited_pmc_event(u64 event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x100c0,  0x40001f },			/* PM_GCT_FULL_CYC */
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x230e2,  0x323087 },			/* PM_BR_PRED_CR */
+	{ 0x230e3,  0x223087, 0x3230a0 },	/* PM_BR_PRED_TA */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
+	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
+	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
+	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
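+ * The event_alternatives[] table above must be kept sorted on its
+ * first element for the early break below to be correct.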
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.  This also handles
+ * alternative PCMSEL values for add events.
+ */
+static s64 find_alternative_bdecode(u64 event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+
+	/* new decode alternatives for power5+ */
+	if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
+		return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
+	if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
+		return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
+
+	/* alternative add event encodings */
+	if (pp == 0x10 || pp == 0x28)
+		return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
+			(altpmc << PM_PMC_SH);
+
+	return -1;
+}
+
+static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	int nlim;
+	s64 ae;
+
+	alt[0] = event;
+	nalt = 1;
+	nlim = power5p_limited_pmc_event(event);
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+			nlim += power5p_limited_pmc_event(ae);
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 3 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0xf:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x600005:	/* PM_RUN_CYC */
+				alt[j++] = 0xf;
+				break;
+			case 0x100009:	/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x100009;	/* PM_INST_CMPL */
+				alt[j++] = 0x200009;
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
+	return nalt;
+}
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
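+
+/*
+ * Example of the encoding above: entry 05 is 0x1c, i.e. bits 2-4 set,
+ * so PMCSEL 0x05 is a marked event when counted on PMC2, PMC3 or PMC4
+ * (the PM_MRK_BRU_FIN / PM_MRK_INST_FIN / PM_MRK_CRU_FIN group).
+ */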
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5p_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x48) == 0x40) {
+		bit = psel & 7;
+	} else if (psel == 0x28) {
+		bit = pmc - 1;
+	} else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
+		bit = 4;
+	}
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
+		mask = 0x5f11c000;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int power5p_compute_mmcr(u64 event[], int n_ev,
+				unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 6)
+		return -1;
+
+	/* First pass to count resource use */
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else if (pmc <= 4) {
+			/* Direct event */
+			--pmc;
+			if (isbus && (byte & 2) &&
+			    (psel == 8 || psel == 0x10 || psel == 0x28))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if (power5p_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
+			/* select alternate byte lane */
+			psel |= 0x10;
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5p_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x1c10a8, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x1c10a8,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x2c10a8,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	-1		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0xc20e4,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
+struct power_pmu power5p_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000000000055ull,
+	.test_adder = 0x3000040000000ull,
+	.compute_mmcr = power5p_compute_mmcr,
+	.get_constraint = power5p_get_constraint,
+	.get_alternatives = power5p_get_alternatives,
+	.disable_pmc = power5p_disable_pmc,
+	.limited_pmc_event = power5p_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6,
+	.n_generic = ARRAY_SIZE(power5p_generic_events),
+	.generic_events = power5p_generic_events,
+	.cache_events = &power5p_cache_events,
+};

+ 611 - 0
arch/powerpc/kernel/power5-pmu.c

@@ -0,0 +1,611 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5 (not POWER5++)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *         <><>[  ><><>< ><> [  >[ >[ ><  ><  ><  ><  ><><><><><><>
+ *         T0T1 NC G0G1G2 G3  UC PS1PS2 B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * T0 - TTM0 constraint
+ *     54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-43: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     37: UC3 error 0x20_0000_0000
+ *     36: FPU|IFU|ISU1 events needed 0x10_0000_0000
+ *     35: ISU0 events needed 0x08_0000_0000
+ *     34: IDU|GRS events needed 0x04_0000_0000
+ *
+ * PS1
+ *     33: PS1 error 0x2_0000_0000
+ *     31-32: count of events needing PMC1/2 0x1_8000_0000
+ *
+ * PS2
+ *     30: PS2 error 0x4000_0000
+ *     28-29: count of events needing PMC3/4 0x3000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P1..P6
+ *     0-11: Count of events needing PMC1..PMC6
+ */
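+
+/*
+ * Sketch of how the PS1 field works (assuming the generic layer sums
+ * these fields and tests the error bits after adding test_adder):
+ * each event needing PMC1/2 adds 0x0_8000_0000 at bit 31.  With two
+ * such events plus the test_adder contribution the sum is
+ * 0x1_8000_0000 and bit 33 stays clear; a third event carries into
+ * the PS1 error bit (bit 33) and the group is rejected.
+ */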
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull },
+	[PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull },
+	[PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull },
+	[PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull },
+	[PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull },
+	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
+};
+
+static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc <= 4)
+			grp = (pmc - 1) >> 1;
+		else if (event != 0x500009 && event != 0x600005)
+			return -1;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2; bytes 1 and 3 on PMC3/4.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2 field */
+		mask  |= 0x200000000ull;
+		value |= 0x080000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4 field */
+		mask  |= 0x40000000ull;
+		value |= 0x10000000ull;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
+	{ 0x100009, 0x200009, 0x500009 },	/* PM_INST_CMPL */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.
+ */
+static s64 find_alternative_bdecode(u64 event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+	return -1;
+}
+
+static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	s64 ae;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x58) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
+		mask = 0x5f00c0aa;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int power5_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 6)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2 vs 3/4 use */
+			if (pmc <= 4)
+				++pmc_grp_use[(pmc - 1) >> 1];
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (isbus) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 2) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else if (pmc <= 4) {
+			/* Direct event */
+			--pmc;
+			if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if (power5_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4c1090, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x4c1090,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x3c1090,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x3c309b	},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x2c4090,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
+struct power_pmu power5_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000090000555ull,
+	.test_adder = 0x3000490000000ull,
+	.compute_mmcr = power5_compute_mmcr,
+	.get_constraint = power5_get_constraint,
+	.get_alternatives = power5_get_alternatives,
+	.disable_pmc = power5_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5_generic_events),
+	.generic_events = power5_generic_events,
+	.cache_events = &power5_cache_events,
+};

+ 532 - 0
arch/powerpc/kernel/power6-pmu.c

@@ -0,0 +1,532 @@
+/*
+ * Performance counter support for POWER6 processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER6
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0x7
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* Unit the event comes from (TTMxSEL encoding) */
+#define PM_UNIT_MSK	0xf
+#define PM_UNIT_MSKS	(PM_UNIT_MSK << PM_UNIT_SH)
+#define PM_LLAV		0x8000	/* Load lookahead match value */
+#define PM_LLA		0x4000	/* Load lookahead match enable */
+#define PM_BYTE_SH	12	/* Byte of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_SUBUNIT_SH	8	/* Subunit event comes from (NEST_SEL enc.) */
+#define PM_SUBUNIT_MSK	7
+#define PM_SUBUNIT_MSKS	(PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
+#define PM_PMCSEL_MSK	0xff	/* PMCxSEL value */
+#define PM_BUSEVENT_MSK	0xf3700
+
+/*
+ * Bits in MMCR1 for POWER6
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTMSEL_SH(n)	(MMCR1_TTM0SEL_SH - (n) * 4)
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_TTMSEL(m, n)	(((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
+#define MMCR1_NESTSEL_SH	45
+#define MMCR1_NESTSEL_MSK	0x7
+#define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
+#define MMCR1_PMC1_LLA		((u64)1 << 44)
+#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39)
+#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35)
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value >> 1.
+ * Bottom 4 bits are a map of which PMCs are interesting,
+ * top 4 bits say what sort of event:
+ *   0 = direct marked event,
+ *   1 = byte decode event,
+ *   4 = add/and event (PMC1 -> bits 0 & 4),
+ *   5 = add/and event (PMC1 -> bits 1 & 5),
+ *   6 = add/and event (PMC1 -> bits 2 & 6),
+ *   7 = add/and event (PMC1 -> bits 3 & 7).
+ */
+static unsigned char direct_event_is_marked[0x60 >> 1] = {
+	0,	/* 00 */
+	0,	/* 02 */
+	0,	/* 04 */
+	0x07,	/* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0x04,	/* 08 PM_MRK_DFU_FIN */
+	0x06,	/* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
+	0,	/* 0c */
+	0,	/* 0e */
+	0x02,	/* 10 PM_MRK_INST_DISP */
+	0x08,	/* 12 PM_MRK_LSU_DERAT_MISS */
+	0,	/* 14 */
+	0,	/* 16 */
+	0x0c,	/* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
+	0x0f,	/* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x01,	/* 1c PM_MRK_INST_ISSUED */
+	0,	/* 1e */
+	0,	/* 20 */
+	0,	/* 22 */
+	0,	/* 24 */
+	0,	/* 26 */
+	0x15,	/* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
+	0,	/* 2a */
+	0,	/* 2c */
+	0,	/* 2e */
+	0x4f,	/* 30 */
+	0x7f,	/* 32 */
+	0x4f,	/* 34 */
+	0x5f,	/* 36 */
+	0x6f,	/* 38 */
+	0x4f,	/* 3a */
+	0,	/* 3c */
+	0x08,	/* 3e PM_MRK_INST_TIMEO */
+	0x1f,	/* 40 */
+	0x1f,	/* 42 */
+	0x1f,	/* 44 */
+	0x1f,	/* 46 */
+	0x1f,	/* 48 */
+	0x1f,	/* 4a */
+	0x1f,	/* 4c */
+	0x1f,	/* 4e */
+	0,	/* 50 */
+	0x05,	/* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
+	0x1c,	/* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
+	0x02,	/* 56 PM_MRK_LD_MISS_L1 */
+	0,	/* 58 */
+	0,	/* 5a */
+	0,	/* 5c */
+	0,	/* 5e */
+};
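+
+/*
+ * Example of the encoding above: entry "06" is 0x07 (bottom bits 0-2
+ * set, top nibble 0), so PMCSEL 0x06/0x07 is a direct marked event on
+ * PMC1, PMC2 or PMC3.  Entry "32" is 0x7f: the top nibble 7 makes it
+ * an add/and event, so on PMC1 the event-bus bit tested against
+ * marked_bus_events[] is 7 ^ (1 - 1) = 7.
+ */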
+
+/*
+ * Masks showing for each unit which bits are marked events.
+ * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
+ */
+static u32 marked_bus_events[16] = {
+	0x01000000,	/* direct events set 1: byte 3 bit 0 */
+	0x00010000,	/* direct events set 2: byte 2 bit 0 */
+	0, 0, 0, 0,	/* IDU, IFU, nest: nothing */
+	0x00000088,	/* VMX set 1: byte 0 bits 3, 7 */
+	0x000000c0,	/* VMX set 2: byte 0 bits 4-7 */
+	0x04010000,	/* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
+	0xff010000u,	/* LSU set 2: byte 2 bit 0, all of byte 3 */
+	0,		/* LSU set 3 */
+	0x00000010,	/* VMX set 3: byte 0 bit 4 */
+	0,		/* BFP set 1 */
+	0x00000022,	/* BFP set 2: byte 0 bits 1, 5 */
+	0, 0
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power6_marked_instr_event(u64 event)
+{
+	int pmc, psel, ptype;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = (event & PM_PMCSEL_MSK) >> 1;	/* drop edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		ptype = direct_event_is_marked[psel];
+		if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
+			return 0;
+		ptype >>= 4;
+		if (ptype == 0)
+			return 1;
+		if (ptype == 1)
+			bit = 0;
+		else
+			bit = ptype ^ (pmc - 1);
+	} else if ((psel & 0x48) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = marked_bus_events[unit];
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/*
+ * Assign PMC numbers and compute MMCR1 value for a set of events
+ */
+static int p6_compute_mmcr(u64 event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	int i;
+	unsigned int pmc, ev, b, u, s, psel;
+	unsigned int ttmset = 0;
+	unsigned int pmc_inuse = 0;
+
+	if (n_ev > 6)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;	/* collision! */
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+	for (i = 0; i < n_ev; ++i) {
+		ev = event[i];
+		pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			--pmc;
+		} else {
+			/* can go on any PMC; find a free one */
+			for (pmc = 0; pmc < 4; ++pmc)
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		}
+		hwc[i] = pmc;
+		psel = ev & PM_PMCSEL_MSK;
+		if (ev & PM_BUSEVENT_MSK) {
+			/* this event uses the event bus */
+			b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
+			u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
+			/* check for conflict on this byte of event bus */
+			if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
+				return -1;
+			mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+			ttmset |= 1 << b;
+			if (u == 5) {
+				/* Nest events have a further mux */
+				s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+				if ((ttmset & 0x10) &&
+				    MMCR1_NESTSEL(mmcr1) != s)
+					return -1;
+				ttmset |= 0x10;
+				mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+			}
+			if (0x30 <= psel && psel <= 0x3d) {
+				/* these need the PMCx_ADDR_SEL bits */
+				if (b >= 2)
+					mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
+			}
+			/* bus select values are different for PMC3/4 */
+			if (pmc >= 2 && (psel & 0x90) == 0x80)
+				psel ^= 0x20;
+		}
+		if (ev & PM_LLA) {
+			mmcr1 |= MMCR1_PMC1_LLA >> pmc;
+			if (ev & PM_LLAV)
+				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
+		}
+		if (power6_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		if (pmc < 4)
+			mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+	}
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0xe)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ *
+ *	0-1	add field: number of uses of PMC1 (max 1)
+ *	2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
+ *	12-15	add field: number of uses of PMC1-4 (max 4)
+ *	16-19	select field: unit on byte 0 of event bus
+ *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ *	32-34	select field: nest (subunit) event selector
+ */
+static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, sh, subunit;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		sh = byte * 4 + (16 - PM_UNIT_SH);
+		mask |= PM_UNIT_MSKS << sh;
+		value |= (u64)(event & PM_UNIT_MSKS) << sh;
+		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
+			subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+			mask  |= (u64)PM_SUBUNIT_MSK << 32;
+			value |= (u64)subunit << 32;
+		}
+	}
+	if (pmc <= 4) {
+		mask  |= 0x8000;	/* add field for count of PMC1-4 uses */
+		value |= 0x1000;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
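+
+/*
+ * Worked example (illustrative): a nest (unit 5) bus event on byte 2
+ * puts its unit select in bits 24-27 of the constraint and its
+ * NEST_SEL value in bits 32-34, so two nest events can coexist only
+ * if they agree on the subunit selector.
+ */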
+
+static int p6_limited_pmc_event(u64 event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
+#define MAX_ALT	4	/* at most 4 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
+	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
+	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
+	{ 0x10000a, 0x2000f4, 0x600005 },	/* PM_RUN_CYC */
+	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
+	{ 0x10000e, 0x400010 },			/* PM_PURR */
+	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
+	{ 0x10001a, 0x200010 },			/* PM_MRK_INST_DISP */
+	{ 0x100026, 0x3000f8 },			/* PM_TB_BIT_TRANS */
+	{ 0x100054, 0x2000f0 },			/* PM_ST_FIN */
+	{ 0x100056, 0x2000fc },			/* PM_L1_ICACHE_MISS */
+	{ 0x1000f0, 0x40000a },			/* PM_INST_IMC_MATCH_CMPL */
+	{ 0x1000f8, 0x200008 },			/* PM_GCT_EMPTY_CYC */
+	{ 0x1000fc, 0x400006 },			/* PM_LSU_DERAT_MISS_CYC */
+	{ 0x20000e, 0x400007 },			/* PM_LSU_DERAT_MISS */
+	{ 0x200012, 0x300012 },			/* PM_INST_DISP */
+	{ 0x2000f2, 0x3000f2 },			/* PM_INST_DISP */
+	{ 0x2000f8, 0x300010 },			/* PM_EXT_INT */
+	{ 0x2000fe, 0x300056 },			/* PM_DATA_FROM_L2MISS */
+	{ 0x2d0030, 0x30001a },			/* PM_MRK_FPU_FIN */
+	{ 0x30000a, 0x400018 },			/* PM_MRK_INST_FIN */
+	{ 0x3000f6, 0x40000e },			/* PM_L1_DCACHE_RELOAD_VALID */
+	{ 0x3000fe, 0x400056 },			/* PM_DATA_FROM_L3MISS */
+};
+
+/*
+ * This could be made more efficient with a binary search on
+ * a presorted list, if necessary
+ */
+static int find_alternatives_list(u64 event)
+{
+	int i, j;
+	unsigned int alt;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			return -1;
+		for (j = 0; j < MAX_ALT; ++j) {
+			alt = event_alternatives[i][j];
+			if (!alt || event < alt)
+				break;
+			if (event == alt)
+				return i;
+		}
+	}
+	return -1;
+}
+
+static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nlim;
+	unsigned int psel, pmc;
+	unsigned int nalt = 1;
+	u64 aevent;
+
+	alt[0] = event;
+	nlim = p6_limited_pmc_event(event);
+
+	/* check the alternatives table */
+	i = find_alternatives_list(event);
+	if (i >= 0) {
+		/* copy out alternatives from list */
+		for (j = 0; j < MAX_ALT; ++j) {
+			aevent = event_alternatives[i][j];
+			if (!aevent)
+				break;
+			if (aevent != event)
+				alt[nalt++] = aevent;
+			nlim += p6_limited_pmc_event(aevent);
+		}
+
+	} else {
+		/* Check for alternative ways of computing sum events */
+		/* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
+		psel = event & (PM_PMCSEL_MSK & ~1);	/* ignore edge bit */
+		pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc && (psel == 0x32 || psel == 0x34))
+			alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
+				((5 - pmc) << PM_PMC_SH);
+
+		/* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
+		if (pmc && (psel == 0x38 || psel == 0x3a))
+			alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
+				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
+	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC,
+		 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 4 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x10000a:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;	/* PM_CYC */
+				break;
+			case 2:		/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 2;		/* PM_INST_CMPL */
+				break;
+			case 0x10000e:	/* PM_PURR */
+				alt[j++] = 0x4000f4;	/* PM_RUN_PURR */
+				break;
+			case 0x4000f4:	/* PM_RUN_PURR */
+				alt[j++] = 0x10000e;	/* PM_PURR */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
+	return nalt;
+}
+
+static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/* Set PMCxSEL to 0 to disable PMCx */
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power6_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x280030, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x30000c, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x410a0,  /* BR_PRED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x400052, /* BR_MPRED */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x80082,	0x80080		},
+		[C(OP_WRITE)] = {	0x80086,	0x80088		},
+		[C(OP_PREFETCH)] = {	0x810a4,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x100056 	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0x4008c,	0		},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x150730,	0x250532	},
+		[C(OP_WRITE)] = {	0x250432,	0x150432	},
+		[C(OP_PREFETCH)] = {	0x810a6,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x20000e	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x420ce		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x430e6,	0x400052	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
+struct power_pmu power6_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x1555,
+	.test_adder = 0x3000,
+	.compute_mmcr = p6_compute_mmcr,
+	.get_constraint = p6_get_constraint,
+	.get_alternatives = p6_get_alternatives,
+	.disable_pmc = p6_disable_pmc,
+	.limited_pmc_event = p6_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
+	.n_generic = ARRAY_SIZE(power6_generic_events),
+	.generic_events = power6_generic_events,
+	.cache_events = &power6_cache_events,
+};
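
A note on the lookup above: the comment over find_alternatives_list() suggests a binary search on a presorted list. A minimal sketch of that idea, assuming a flattened (code, row) index built once from event_alternatives[] and kept sorted by code; the index construction and all names here are illustrative, not part of the patch:

	struct alt_entry {
		u64	code;	/* one event code from event_alternatives[] */
		int	row;	/* the row that code appears in */
	};

	/* sorted by .code; filled in once at PMU init (not shown) */
	static struct alt_entry alt_index[ARRAY_SIZE(event_alternatives) * MAX_ALT];
	static int alt_index_len;

	static int find_alternatives_list_bsearch(u64 event)
	{
		int lo = 0, hi = alt_index_len - 1;

		while (lo <= hi) {
			int mid = lo + (hi - lo) / 2;

			if (alt_index[mid].code == event)
				return alt_index[mid].row;
			if (alt_index[mid].code < event)
				lo = mid + 1;
			else
				hi = mid - 1;
		}
		return -1;	/* same "not found" convention as the linear scan */
	}

With a table this small the linear scan is already cheap, which is presumably why the patch keeps it.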

+ 357 - 0
arch/powerpc/kernel/power7-pmu.c

@@ -0,0 +1,357 @@
+/*
+ * Performance counter support for POWER7 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER7
+ */
+#define PM_PMC_SH	16	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	12	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_COMBINE_SH	11	/* Combined event bit */
+#define PM_COMBINE_MSK	1
+#define PM_COMBINE_MSKS	0x800
+#define PM_L2SEL_SH	8	/* L2 event select */
+#define PM_L2SEL_MSK	7
+#define PM_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCR1 for POWER7
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTM1SEL_SH	56
+#define MMCR1_TTM2SEL_SH	52
+#define MMCR1_TTM3SEL_SH	48
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_L2SEL_SH		45
+#define MMCR1_L2SEL_MSK		7
+#define MMCR1_PMC1_COMBINE_SH	35
+#define MMCR1_PMC2_COMBINE_SH	34
+#define MMCR1_PMC3_COMBINE_SH	33
+#define MMCR1_PMC4_COMBINE_SH	32
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMC2SEL_SH	16
+#define MMCR1_PMC3SEL_SH	8
+#define MMCR1_PMC4SEL_SH	0
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                                                 [  ><><><><><><>
+ *                                                  NC P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     15: NC error 0x8000
+ *     12-14: number of events needing PMC1-4 0x7000
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
+
+static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, sh;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
+			return -1;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000;
+		value |= 0x1000;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
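+/*
+ * Worked example of the encoding above (a sketch of the arithmetic;
+ * the actual check lives in perf_counter.c): an event pinned to PMC3
+ * gets mask 0x8020 and value 0x1010, and every PMC1-4 event adds
+ * 0x1000 into the NC field (bits 12-14).  The checker sums the value
+ * words and adds test_adder (0x3000, set below): four events give
+ * 0x4000 + 0x3000 = 0x7000, which is fine, while a fifth gives
+ * 0x5000 + 0x3000 = 0x8000 and trips the NC error bit (bit 15).
+ * Two events pinned to the same PMC sum to 2 << sh, landing exactly
+ * on that PMC's mask ("error") bit.
+ */
+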
+#define MAX_ALT	2	/* at most 2 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x200f2, 0x300f2 },		/* PM_INST_DISP */
+	{ 0x200f4, 0x600f4 },		/* PM_RUN_CYC */
+	{ 0x400fa, 0x500fa },		/* PM_RUN_INST_CMPL */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static s64 find_alternative_decode(u64 event)
+{
+	int pmc, psel;
+
+	/* this only handles the 4x decode events */
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
+		return event - (1 << PM_PMC_SH) + 8;
+	if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
+		return event + (1 << PM_PMC_SH) - 8;
+	return -1;
+}
+
+static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	s64 ae;
+
+	alt[0] = event;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_decode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600f4;	/* PM_RUN_CYC */
+				break;
+			case 0x600f4:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;
+				break;
+			case 0x2:	/* PM_PPC_CMPL */
+				alt[j++] = 0x500fa;	/* PM_RUN_INST_CMPL */
+				break;
+			case 0x500fa:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x2;	/* PM_PPC_CMPL */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	return nalt;
+}
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power7_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int unit;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	psel = event & PM_PMCSEL_MSK & ~1;	/* trim off edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	switch (psel >> 4) {
+	case 2:
+		return pmc == 2 || pmc == 4;
+	case 3:
+		if (psel == 0x3c)
+			return pmc == 1;
+		if (psel == 0x3e)
+			return pmc != 2;
+		return 1;
+	case 4:
+	case 5:
+		return unit == 0xd;
+	case 6:
+		if (psel == 0x64)
+			return pmc >= 3;
+	case 8:
+		return unit == 0xd;
+	}
+	return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, combine, l2sel, psel;
+	unsigned int pmc_inuse = 0;
+	int i;
+
+	/* First pass to count resource use */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+
+	/* Second pass: assign PMCs, set all MMCR1 fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+		l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct or decoded event */
+			--pmc;
+		}
+		if (pmc <= 3) {
+			mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+			mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+			if (unit == 6)	/* L2 events */
+				mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+		}
+		if (power7_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880,	/* LD_REF_L1_LSU */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x400f0,		/* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x10068,	/* BRU_FIN */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x400f6,	/* BR_MPRED */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x400f0,	0xc880	},
+		[C(OP_WRITE)] = {	0,		0x300f0	},
+		[C(OP_PREFETCH)] = {	0xd8b8,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x200fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x408a,		0	},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x6080,		0x6084	},
+		[C(OP_WRITE)] = {	0x6082,		0x6086	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x300fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x400fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x10068,	0x400f6	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
+struct power_pmu power7_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT + 1,
+	.add_fields = 0x1555ull,
+	.test_adder = 0x3000ull,
+	.compute_mmcr = power7_compute_mmcr,
+	.get_constraint = power7_get_constraint,
+	.get_alternatives = power7_get_alternatives,
+	.disable_pmc = power7_disable_pmc,
+	.n_generic = ARRAY_SIZE(power7_generic_events),
+	.generic_events = power7_generic_events,
+	.cache_events = &power7_cache_events,
+};
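
To make the field encoding above concrete, here is a hedged sketch (not part of the patch; the function name and printk are illustrative) that unpacks a POWER7 event code with the PM_* shifts and masks defined at the top of the file:

	static void power7_decode_event(u64 event)
	{
		unsigned int pmc     = (event >> PM_PMC_SH) & PM_PMC_MSK;
		unsigned int unit    = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
		unsigned int combine = (event >> PM_COMBINE_SH) & PM_COMBINE_MSK;
		unsigned int l2sel   = (event >> PM_L2SEL_SH) & PM_L2SEL_MSK;
		unsigned int psel    = event & PM_PMCSEL_MSK;

		/* 0x600f4 (PM_RUN_CYC) decodes to pmc=6, unit=0,
		 * combine=0, l2sel=0, psel=0xf4 */
		printk(KERN_DEBUG "pmc=%u unit=%u combine=%u l2sel=%u psel=%#x\n",
		       pmc, unit, combine, l2sel, psel);
	}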

+ 482 - 0
arch/powerpc/kernel/ppc970-pmu.c

@@ -0,0 +1,482 @@
+/*
+ * Performance counter support for PPC970-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for PPC970
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_SPCSEL_SH	6
+#define PM_SPCSEL_MSK	3
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	0xf
+
+/* Values in PM_UNIT field */
+#define PM_NONE		0
+#define PM_FPU		1
+#define PM_VPU		2
+#define PM_ISU		3
+#define PM_IFU		4
+#define PM_IDU		5
+#define PM_STS		6
+#define PM_LSU0		7
+#define PM_LSU1U	8
+#define PM_LSU1L	9
+#define PM_LASTUNIT	9
+
+/*
+ * Bits in MMCR0 for PPC970
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for PPC970
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *               <><><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *               SPT0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * SP - SPCSEL constraint
+ *     48-49: SPCSEL value 0x3_0000_0000_0000
+ *
+ * T0 - TTM0 constraint
+ *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
+ *     43: UC3 error 0x0800_0000_0000
+ *     42: FPU|IFU|VPU events needed 0x0400_0000_0000
+ *     41: ISU events needed 0x0200_0000_0000
+ *     40: IDU|STS events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P1
+ *     15: P1 error 0x8000
+ *     14-15: Count of events needing PMC1
+ *
+ * P2..P8
+ *     0-13: Count of events needing PMC2..PMC8
+ */
+
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3) | (1<<5),	/* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4)			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(u64 event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 7 || psel == 13)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_VPU:
+		mask = 0x4c;		/* byte 0 bits 2,3,6 */
+		break;
+	case PM_LSU0:
+		/* byte 2 bits 0,2,3,4,6; all of byte 1 */
+		mask = 0x085dff00;
+		break;
+	case PM_LSU1L:
+		mask = 0x50 << 24;	/* byte 3 bits 4,6 */
+		break;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
+	[PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
+	[PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
+	[PM_IFU] =   { 0xc80000000000ull, 0x840000000000ull },
+	[PM_IDU] =   { 0x380000000000ull, 0x010000000000ull },
+	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh, spcsel;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit) {
+		if (unit > PM_LASTUNIT)
+			return -1;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+	spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+	if (spcsel) {
+		mask  |= 3ull << 48;
+		value |= (u64)spcsel << 48;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	alt[0] = event;
+
+	/* 2 alternatives for LSU empty */
+	if (event == 0x2002 || event == 0x3002) {
+		alt[1] = event ^ 0x1000;
+		return 2;
+	}
+
+	return 1;
+}
+
+static int p970_compute_mmcr(u64 event[], int n_ev,
+			     unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
+	unsigned char ttmuse[2];
+	unsigned char pmcsel[8];
+	int i;
+	int spcsel;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (unit) {
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
+		unitmap[PM_ISU] = 2 | 4;	/* move ISU to TTM1 */
+	/* Set TTM[01]SEL fields. */
+	ttmuse[0] = ttmuse[1] = 0;
+	for (i = PM_FPU; i <= PM_STS; ++i) {
+		if (!unituse[i])
+			continue;
+		ttm = unitmap[i];
+		++ttmuse[(ttm >> 2) & 1];
+		mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+	}
+	/* Check only one unit per TTMx */
+	if (ttmuse[0] > 1 || ttmuse[1] > 1)
+		return -1;
+
+	/* Set byte lane select fields and TTM3SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit <= PM_STS)
+			ttm = (unitmap[unit] >> 2) & 1;
+		else if (unit == PM_LSU0)
+			ttm = 2;
+		else {
+			ttm = 3;
+			if (unit == PM_LSU1L && byte >= 2)
+				mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	memset(pmcsel, 0x8, sizeof(pmcsel));	/* 8 means don't count */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			else
+				psel |= 8;
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+		}
+		pmcsel[pmc] = psel;
+		hwc[i] = pmc;
+		spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+		mmcr1 |= spcsel;
+		if (p970_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+	}
+	for (pmc = 0; pmc < 2; ++pmc)
+		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
+	for (; pmc < 8; ++pmc)
+		mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	int shift, i;
+
+	if (pmc <= 1) {
+		shift = MMCR0_PMC1SEL_SH - 7 * pmc;
+		i = 0;
+	} else {
+		shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
+		i = 1;
+	}
+	/*
+	 * Setting the PMCxSEL field to 0x08 disables PMC x.
+	 */
+	mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
+}
+
+static int ppc970_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 1,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8810, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3810, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x431,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x327,  /* PM_GRP_BR_MPRED */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8810,		0x3810	},
+		[C(OP_WRITE)] = {	0x7810,		0x813	},
+		[C(OP_PREFETCH)] = {	0x731,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0x733,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x704	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x700	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x431,		0x327	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
+struct power_pmu ppc970_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 2,
+	.add_fields = 0x001100005555ull,
+	.test_adder = 0x013300000000ull,
+	.compute_mmcr = p970_compute_mmcr,
+	.get_constraint = p970_get_constraint,
+	.get_alternatives = p970_get_alternatives,
+	.disable_pmc = p970_disable_pmc,
+	.n_generic = ARRAY_SIZE(ppc970_generic_events),
+	.generic_events = ppc970_generic_events,
+	.cache_events = &ppc970_cache_events,
+};
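
p970_compute_mmcr() splits the select fields across two registers: PMC1/2 live in MMCR0 and PMC3-8 in MMCR1, which is why p970_disable_pmc() picks mmcr[0] or mmcr[1]. A small sketch (illustrative, not in the patch) that reads a select field back out with the same shifts:

	static unsigned int p970_read_pmcsel(u64 mmcr[], unsigned int pmc)
	{
		if (pmc <= 1)	/* PMC1/2 are in MMCR0 */
			return (mmcr[0] >> (MMCR0_PMC1SEL_SH - 7 * pmc))
				& MMCR_PMCSEL_MSK;
		/* PMC3..PMC8 are in MMCR1, 5 bits apart */
		return (mmcr[1] >> (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)))
			& MMCR_PMCSEL_MSK;
	}

A result of 0x8 means "don't count", matching the memset in p970_compute_mmcr() and the disable value written above.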

+ 9 - 1
arch/powerpc/mm/fault.c

@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/perf_counter.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -316,8 +321,11 @@ good_area:
 			preempt_enable();
 		}
 #endif
-	} else
+	} else {
 		current->min_flt++;
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
+	}
 	up_read(&mm->mmap_sem);
 	return 0;
 

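The hook arguments above follow perf_swcounter_event()'s convention as used throughout this merge: the software event id, the count to add (1), an in-NMI flag (0, since page faults are ordinary exceptions), the interrupted registers, and the faulting address. A minimal restatement of the pattern, with the surrounding handler assumed rather than taken from the patch:

	/* count one minor fault against the current context */
	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
			     regs, address);
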
+ 1 - 0
arch/powerpc/platforms/Kconfig.cputype

@@ -1,6 +1,7 @@
 config PPC64
 	bool "64-bit kernel"
 	default n
+	select HAVE_PERF_COUNTERS
 	help
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.

+ 1 - 0
arch/x86/Kconfig

@@ -739,6 +739,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y

+ 2 - 1
arch/x86/ia32/ia32entry.S

@@ -825,10 +825,11 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd4
 	.quad sys_eventfd2
 	.quad sys_epoll_create1
-	.quad sys_dup3			/* 330 */
+	.quad sys_dup3				/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
+	.quad sys_perf_counter_open
 ia32_syscall_end:

+ 236 - 0
arch/x86/include/asm/atomic_32.h

@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+/* A 64-bit atomic type */
+
+typedef struct {
+	unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val)	{ (val) }
+
+/**
+ * __atomic64_read - read atomic64 variable
+ * @ptr: pointer of type atomic64_t
+ *
+ * Reads the value of @ptr as a plain (possibly torn) 64-bit load.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr)		((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+	asm volatile(
+
+		LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+		     :		"=A" (old)
+
+		     : [ptr]	"D" (ptr),
+				"A" (old),
+				"b" (ll_low(new)),
+				"c" (ll_high(new))
+
+		     : "memory");
+
+	return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+		 unsigned long long new_val)
+{
+	return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
+{
+	unsigned long long old_val;
+
+	do {
+		old_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	atomic64_xchg(ptr, new_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+	unsigned long long curr_val;
+
+	do {
+		curr_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+	return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val, new_val;
+
+	do {
+		old_val = __atomic64_read(ptr);
+		new_val = old_val + delta;
+
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+	return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+	return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+	return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+	return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+	atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+	atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+	long long old_val = atomic64_add_return(delta, ptr);
+
+	return old_val < 0;
+}
+
 #include <asm-generic/atomic.h>
 #endif /* _ASM_X86_ATOMIC_32_H */
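
All of the arithmetic helpers above reduce to the cmpxchg8b() retry loop. As a usage illustration, a hedged sketch of an add-unless operation built the same way (the name and semantics mirror the 32-bit atomic_add_unless(); this is not part of the patch):

	static inline int atomic64_add_unless_sketch(atomic64_t *ptr,
			unsigned long long a, unsigned long long u)
	{
		unsigned long long old_val;

		for (;;) {
			old_val = atomic64_read(ptr);
			if (old_val == u)
				return 0;
			if (atomic64_cmpxchg(ptr, old_val, old_val + a) == old_val)
				return 1;
		}
	}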

+ 1 - 1
arch/x86/include/asm/entry_arch.h

@@ -49,7 +49,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_MCE_P4THERMAL

+ 2 - 0
arch/x86/include/asm/hardirq.h

@@ -13,6 +13,8 @@ typedef struct {
 	unsigned int irq_spurious_count;
 #endif
 	unsigned int generic_irqs;	/* arch dependent */
+	unsigned int apic_perf_irqs;
+	unsigned int apic_pending_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;

+ 2 - 0
arch/x86/include/asm/hw_irq.h

@@ -29,6 +29,8 @@
 extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
+extern void perf_pending_interrupt(void);
+
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);

+ 0 - 31
arch/x86/include/asm/intel_arch_perfmon.h

@@ -1,31 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0		0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1		0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0		0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1		0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE	(1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT	(1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS	(1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR	(1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-	(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-union cpuid10_eax {
-	struct {
-		unsigned int version_id:8;
-		unsigned int num_counters:8;
-		unsigned int bit_width:8;
-		unsigned int mask_length:8;
-	} split;
-	unsigned int full;
-};
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */

+ 4 - 4
arch/x86/include/asm/irq_vectors.h

@@ -108,14 +108,14 @@
 #define LOCAL_TIMER_VECTOR		0xef
 
 /*
- * Performance monitoring interrupt vector:
+ * Generic system vector for platform specific use
  */
-#define LOCAL_PERF_VECTOR		0xee
+#define GENERIC_INTERRUPT_VECTOR	0xed
 
 /*
- * Generic system vector for platform specific use
+ * Performance monitoring pending work vector:
  */
-#define GENERIC_INTERRUPT_VECTOR	0xed
+#define LOCAL_PENDING_VECTOR		0xec
 
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we

+ 100 - 0
arch/x86/include/asm/perf_counter.h

@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_FIXED					3
+
+#define X86_PMC_IDX_GENERIC				        0
+#define X86_PMC_IDX_FIXED				       32
+#define X86_PMC_IDX_MAX					       64
+
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK				    0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_counters:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
+
+union cpuid10_edx {
+	struct {
+		unsigned int num_counters_fixed:4;
+		unsigned int reserved:28;
+	} split;
+	unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
+
+extern void set_perf_counter_pending(void);
+
+#define clear_perf_counter_pending()	do { } while (0)
+#define test_perf_counter_pending()	(0)
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(void);
+#else
+static inline void init_hw_perf_counters(void)		{ }
+static inline void perf_counters_lapic_init(void)	{ }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
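
The two unions above map directly onto CPUID leaf 0xa, Intel's architectural perfmon enumeration. A hedged sketch of reading them (cpuid() comes from <asm/processor.h>; the function and message are illustrative, not part of the patch):

	static void report_arch_perfmon(void)
	{
		union cpuid10_eax eax;
		union cpuid10_edx edx;
		unsigned int ebx, ecx;

		cpuid(0xa, &eax.full, &ebx, &ecx, &edx.full);

		printk(KERN_INFO "perfmon v%d: %d counters x %d bits, %d fixed\n",
		       eax.split.version_id, eax.split.num_counters,
		       eax.split.bit_width, edx.split.num_counters_fixed);
	}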

+ 1 - 0
arch/x86/include/asm/unistd_32.h

@@ -341,6 +341,7 @@
 #define __NR_preadv		333
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
+#define __NR_perf_counter_open	336
 
 #ifdef __KERNEL__
 

+ 2 - 1
arch/x86/include/asm/unistd_64.h

@@ -659,7 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
 __SYSCALL(__NR_pwritev, sys_pwritev)
 #define __NR_rt_tgsigqueueinfo			297
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-
+#define __NR_perf_counter_open			298
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
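
With the numbers wired up (336 on i386, 298 on x86_64, both defined above), userspace can reach the new syscall before any libc wrapper exists; the kernel entry point takes (attr, pid, cpu, group_fd, flags). A hedged userspace sketch, where struct perf_counter_attr and __NR_perf_counter_open come from installed kernel headers and the wrapper name is illustrative:

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	static int perf_counter_open(struct perf_counter_attr *attr,
				     pid_t pid, int cpu, int group_fd,
				     unsigned long flags)
	{
		return syscall(__NR_perf_counter_open,
			       attr, pid, cpu, group_fd, flags);
	}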

+ 3 - 0
arch/x86/kernel/apic/apic.c

@@ -14,6 +14,7 @@
  *	Mikael Pettersson	:	PM converted to driver model.
  */
 
+#include <linux/perf_counter.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
+#include <asm/perf_counter.h>
 #include <asm/pgalloc.h>
 #include <asm/atomic.h>
 #include <asm/mpspec.h>
@@ -1187,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
+	perf_counters_lapic_init();
 
 	preempt_disable();
 

+ 7 - 5
arch/x86/kernel/cpu/Makefile

@@ -1,5 +1,5 @@
 #
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
 #
 
 # Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_X86_MCE)	+= mcheck/
-obj-$(CONFIG_MTRR)	+= mtrr/
-obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
+obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o
 
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE)			+= mcheck/
+obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_CPU_FREQ)			+= cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@

+ 2 - 0
arch/x86/kernel/cpu/common.c

@@ -13,6 +13,7 @@
 #include <linux/io.h>
 
 #include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
 #include <asm/mmu_context.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
@@ -874,6 +875,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
+	init_hw_perf_counters();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)

+ 1704 - 0
arch/x86/kernel/cpu/perf_counter.c

@@ -0,0 +1,1704 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_counter_mask __read_mostly;
+
+struct cpu_hw_counters {
+	struct perf_counter	*counters[X86_PMC_IDX_MAX];
+	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		interrupts;
+	int			enabled;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+	const char	*name;
+	int		version;
+	int		(*handle_irq)(struct pt_regs *);
+	void		(*disable_all)(void);
+	void		(*enable_all)(void);
+	void		(*enable)(struct hw_perf_counter *, int);
+	void		(*disable)(struct hw_perf_counter *, int);
+	unsigned	eventsel;
+	unsigned	perfctr;
+	u64		(*event_map)(int);
+	u64		(*raw_event)(u64);
+	int		max_events;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		counter_bits;
+	u64		counter_mask;
+	u64		max_period;
+	u64		intel_ctrl;
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+	.enabled = 1,
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
+};
+
+static u64 intel_pmu_event_map(int event)
+{
+	return intel_perfmon_event_map[event];
+}
+
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK		\
+	(CORE_EVNTSEL_EVENT_MASK |	\
+	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_EDGE_MASK  |	\
+	 CORE_EVNTSEL_INV_MASK  |	\
+	 CORE_EVNTSEL_COUNTER_MASK)
+
+	return event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_0f_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
+		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+	return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
+#define K7_EVNTSEL_INV_MASK	0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK			\
+	(K7_EVNTSEL_EVENT_MASK |	\
+	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_EDGE_MASK  |	\
+	 K7_EVNTSEL_INV_MASK   |	\
+	 K7_EVNTSEL_COUNTER_MASK)
+
+	return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+			struct hw_perf_counter *hwc, int idx)
+{
+	int shift = 64 - x86_pmu.counter_bits;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+
+	/*
+	 * Careful: an NMI might modify the previous counter value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+static atomic_t active_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		disable_lapic_nmi_watchdog();
+
+	for (i = 0; i < x86_pmu.num_counters; i++) {
+		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+			goto perfctr_fail;
+	}
+
+	for (i = 0; i < x86_pmu.num_counters; i++) {
+		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+			goto eventsel_fail;
+	}
+
+	return true;
+
+eventsel_fail:
+	for (i--; i >= 0; i--)
+		release_evntsel_nmi(x86_pmu.eventsel + i);
+
+	i = x86_pmu.num_counters;
+
+perfctr_fail:
+	for (i--; i >= 0; i--)
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+
+	return false;
+}
+
+static void release_pmc_hardware(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.num_counters; i++) {
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+		release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
+static inline int x86_pmu_initialized(void)
+{
+	return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+	unsigned int cache_type, cache_op, cache_result;
+	u64 config, val;
+
+	config = attr->config;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+	if (val == 0)
+		return -ENOENT;
+
+	if (val == -1)
+		return -EINVAL;
+
+	hwc->config |= val;
+
+	return 0;
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+	struct perf_counter_attr *attr = &counter->attr;
+	struct hw_perf_counter *hwc = &counter->hw;
+	int err;
+
+	if (!x86_pmu_initialized())
+		return -ENODEV;
+
+	err = 0;
+	if (!atomic_inc_not_zero(&active_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
+			err = -EBUSY;
+		else
+			atomic_inc(&active_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	if (err)
+		return err;
+
+	/*
+	 * Generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * Count user and OS events unless requested not to.
+	 */
+	if (!attr->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!attr->exclude_kernel)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	counter->destroy = hw_perf_counter_destroy;
+
+	/*
+	 * Raw event type provides the config in the event structure
+	 */
+	if (attr->type == PERF_TYPE_RAW) {
+		hwc->config |= x86_pmu.raw_event(attr->config);
+		return 0;
+	}
+
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+	/*
+	 * The generic map:
+	 */
+	hwc->config |= x86_pmu.event_map(attr->config);
+
+	return 0;
+}
+
+static void intel_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void amd_pmu_disable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	int idx;
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+	/*
+	 * ensure we write the disable before we start disabling the
+	 * counters proper, so that amd_pmu_enable_counter() does the
+	 * right thing.
+	 */
+	barrier();
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		u64 val;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+			continue;
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+}
+
+void hw_perf_disable(void)
+{
+	if (!x86_pmu_initialized())
+		return;
+	return x86_pmu.disable_all();
+}
+
+static void intel_pmu_enable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static void amd_pmu_enable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	int idx;
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		u64 val;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			continue;
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+}
+
+void hw_perf_enable(void)
+{
+	if (!x86_pmu_initialized())
+		return;
+	x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	int err;
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	int err;
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+	int err;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_disable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	x86_pmu_disable_counter(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static int
+x86_perf_counter_set_period(struct perf_counter *counter,
+			     struct hw_perf_counter *hwc, int idx)
+{
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int err, ret = 0;
+
+	/*
+	 * If we are way outside a reasonable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+	/*
+	 * Quirk: certain CPUs don't like it if just 1 event is left:
+	 */
+	if (unlikely(left < 2))
+		left = 2;
+
+	if (left > x86_pmu.max_period)
+		left = x86_pmu.max_period;
+
+	per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+	/*
+	 * The hw counter starts counting from this counter offset,
+	 * mark it to be able to extract future deltas:
+	 */
+	atomic64_set(&hwc->prev_count, (u64)-left);
+
+	err = checking_wrmsrl(hwc->counter_base + idx,
+			     (u64)(-left) & x86_pmu.counter_mask);
+
+	return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+	int err;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_enable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (cpuc->enabled)
+		x86_pmu_enable_counter(hwc, idx);
+	else
+		x86_pmu_disable_counter(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+{
+	unsigned int event;
+
+	if (!x86_pmu.num_counters_fixed)
+		return -1;
+
+	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+		return X86_PMC_IDX_FIXED_CPU_CYCLES;
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+		return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+	return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
+static int x86_pmu_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx;
+
+	idx = fixed_mode_idx(counter, hwc);
+	if (idx >= 0) {
+		/*
+		 * Try to get the fixed counter, if that is already taken
+		 * then try to get a generic counter:
+		 */
+		if (test_and_set_bit(idx, cpuc->used_mask))
+			goto try_generic;
+
+		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+		/*
+		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+		 */
+		hwc->counter_base =
+			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+		hwc->idx = idx;
+	} else {
+		idx = hwc->idx;
+		/* Try to get the previous generic counter again */
+		if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+			idx = find_first_zero_bit(cpuc->used_mask,
+						  x86_pmu.num_counters);
+			if (idx == x86_pmu.num_counters)
+				return -EAGAIN;
+
+			set_bit(idx, cpuc->used_mask);
+			hwc->idx = idx;
+		}
+		hwc->config_base  = x86_pmu.eventsel;
+		hwc->counter_base = x86_pmu.perfctr;
+	}
+
+	perf_counters_lapic_init();
+
+	x86_pmu.disable(hwc, idx);
+
+	cpuc->counters[idx] = counter;
+	set_bit(idx, cpuc->active_mask);
+
+	x86_perf_counter_set_period(counter, hwc, idx);
+	x86_pmu.enable(hwc, idx);
+
+	return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+				cpuc->counters[hwc->idx] != counter))
+		return;
+
+	x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_counter_print_debug(void)
+{
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
+	int cpu, idx;
+
+	if (!x86_pmu.num_counters)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	if (x86_pmu.version >= 2) {
+		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+		pr_info("\n");
+		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+	}
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
+
+		prev_left = per_cpu(prev_left[idx], cpu);
+
+		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+			cpu, idx, pmc_ctrl);
+		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+			cpu, idx, prev_left);
+	}
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+	}
+	local_irq_restore(flags);
+}
+
+static void x86_pmu_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	/*
+	 * Must be done before we disable, otherwise the nmi handler
+	 * could reenable again:
+	 */
+	clear_bit(idx, cpuc->active_mask);
+	x86_pmu.disable(hwc, idx);
+
+	/*
+	 * Make sure the cleared pointer becomes visible before we
+	 * (potentially) free the counter:
+	 */
+	barrier();
+
+	/*
+	 * Drain the remaining delta count out of a counter
+	 * that we are disabling:
+	 */
+	x86_perf_counter_update(counter, hwc, idx);
+	cpuc->counters[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+	int ret;
+
+	x86_perf_counter_update(counter, hwc, idx);
+	ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		intel_pmu_enable_counter(hwc, idx);
+
+	return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+	unsigned long flags;
+	int idx;
+
+	if (!x86_pmu.num_counters)
+		return;
+
+	local_irq_save(flags);
+
+	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+	}
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+	}
+
+	local_irq_restore(flags);
+}
+
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_counters *cpuc;
+	int bit, cpu, loops;
+	u64 ack, status;
+
+	data.regs = regs;
+	data.addr = 0;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	perf_disable();
+	status = intel_pmu_get_status();
+	if (!status) {
+		perf_enable();
+		return 0;
+	}
+
+	loops = 0;
+again:
+	if (++loops > 100) {
+		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		perf_counter_print_debug();
+		intel_pmu_reset();
+		perf_enable();
+		return 1;
+	}
+
+	inc_irq_stat(apic_perf_irqs);
+	ack = status;
+	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		clear_bit(bit, (unsigned long *) &status);
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		if (!intel_pmu_save_and_restart(counter))
+			continue;
+
+		if (perf_counter_overflow(counter, 1, &data))
+			intel_pmu_disable_counter(&counter->hw, bit);
+	}
+
+	intel_pmu_ack_status(ack);
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = intel_pmu_get_status();
+	if (status)
+		goto again;
+
+	perf_enable();
+
+	return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_counters *cpuc;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	int cpu, idx, handled = 0;
+	u64 val;
+
+	data.regs = regs;
+	data.addr = 0;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		counter = cpuc->counters[idx];
+		hwc = &counter->hw;
+
+		val = x86_perf_counter_update(counter, hwc, idx);
+		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+			continue;
+
+		/*
+		 * counter overflow
+		 */
+		handled		= 1;
+		data.period	= counter->hw.last_period;
+
+		if (!x86_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
+		if (perf_counter_overflow(counter, 1, &data))
+			amd_pmu_disable_counter(hwc, idx);
+	}
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	perf_counter_do_pending();
+	irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
+void perf_counters_lapic_init(void)
+{
+	if (!x86_pmu_initialized())
+		return;
+
+	/*
+	 * Always use NMI for PMU
+	 */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+
+	if (!atomic_read(&active_counters))
+		return NOTIFY_DONE;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/*
+	 * Can't rely on the handled return value to say it was our NMI, two
+	 * counters could trigger 'simultaneously', raising two back-to-back NMIs.
+	 *
+	 * If the first NMI handles both, the latter will be empty and daze
+	 * the CPU.
+	 */
+	x86_pmu.handle_irq(regs);
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+	.notifier_call		= perf_counter_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
+};
+
+static struct x86_pmu intel_pmu = {
+	.name			= "Intel",
+	.handle_irq		= intel_pmu_handle_irq,
+	.disable_all		= intel_pmu_disable_all,
+	.enable_all		= intel_pmu_enable_all,
+	.enable			= intel_pmu_enable_counter,
+	.disable		= intel_pmu_disable_counter,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= intel_pmu_event_map,
+	.raw_event		= intel_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	.max_period		= (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+	.name			= "AMD",
+	.handle_irq		= amd_pmu_handle_irq,
+	.disable_all		= amd_pmu_disable_all,
+	.enable_all		= amd_pmu_enable_all,
+	.enable			= amd_pmu_enable_counter,
+	.disable		= amd_pmu_disable_counter,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= 4,
+	.counter_bits		= 48,
+	.counter_mask		= (1ULL << 48) - 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
+};
+
+static int intel_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int ebx;
+	int version;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return -ENODEV;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Branch Misses Retired Event or not.
+	 */
+	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version < 2)
+		return -ENODEV;
+
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.counter_bits		= eax.split.bit_width;
+	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
+
+	/*
+	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
+	 * assume at least 3 counters:
+	 */
+	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+	/*
+	 * Install the hw-cache-events table:
+	 */
+	switch (boot_cpu_data.x86_model) {
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 29: /* six-core 45 nm xeon "Dunnington" */
+		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Core2 events, ");
+		break;
+	default:
+	case 26:
+		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Nehalem/Corei7 events, ");
+		break;
+	case 28:
+		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("Atom events, ");
+		break;
+	}
+	return 0;
+}
+
+static int amd_pmu_init(void)
+{
+	x86_pmu = amd_pmu;
+
+	switch (boot_cpu_data.x86) {
+	case 0x0f:
+	case 0x10:
+	case 0x11:
+		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("AMD Family 0f/10/11 events, ");
+		break;
+	}
+	return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+	int err;
+
+	pr_info("Performance Counters: ");
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		err = intel_pmu_init();
+		break;
+	case X86_VENDOR_AMD:
+		err = amd_pmu_init();
+		break;
+	default:
+		return;
+	}
+	if (err != 0) {
+		pr_cont("no PMU driver, software counters only.\n");
+		return;
+	}
+
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+	}
+	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+	perf_max_counters = x86_pmu.num_counters;
+
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+	}
+
+	perf_counter_mask |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+
+	perf_counters_lapic_init();
+	register_die_notifier(&perf_counter_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+	int err;
+
+	err = __hw_perf_counter_init(counter);
+	if (err)
+		return ERR_PTR(err);
+
+	return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+	if (entry->nr < MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Don't bother with IRQ stacks for now */
+	return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	unsigned long bp;
+	char *stack;
+	int nr = entry->nr;
+
+	callchain_store(entry, instruction_pointer(regs));
+
+	stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+	bp = frame_pointer(regs);
+#else
+	bp = 0;
+#endif
+
+	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+	entry->kernel = entry->nr - nr;
+}
+
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	struct stack_frame frame;
+	const void __user *fp;
+	int nr = entry->nr;
+
+	regs = (struct pt_regs *)current->thread.sp0 - 1;
+	fp   = (void __user *)regs->bp;
+
+	callchain_store(entry, regs->ip);
+
+	while (entry->nr < MAX_STACK_DEPTH) {
+		frame.next_fp	     = NULL;
+		frame.return_address = 0;
+
+		if (!copy_stack_frame(fp, &frame))
+			break;
+
+		if ((unsigned long)fp < user_stack_pointer(regs))
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = frame.next_fp;
+	}
+
+	entry->user = entry->nr - nr;
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	if (in_nmi())
+		entry = &__get_cpu_var(nmi_entry);
+	else
+		entry = &__get_cpu_var(irq_entry);
+
+	entry->nr = 0;
+	entry->hv = 0;
+	entry->kernel = 0;
+	entry->user = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
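
A quick standalone sketch (not part of the patch; names and values are
illustrative) of the width-truncation trick used in x86_perf_counter_update()
above: counters narrower than 64 bits are shifted up into the top bits and
arithmetic-shifted back down, so a counter that wrapped still yields a small
positive delta rather than a huge negative one.

	#include <stdint.h>
	#include <stdio.h>

	/* mirrors: delta = (new << shift) - (prev << shift); delta >>= shift; */
	static int64_t counter_delta(uint64_t prev, uint64_t now, int counter_bits)
	{
		int shift = 64 - counter_bits;	/* e.g. 16 for a 48-bit AMD PMC */
		int64_t delta = (int64_t)(now << shift) - (int64_t)(prev << shift);

		return delta >> shift;		/* arithmetic shift sign-extends */
	}

	int main(void)
	{
		/* a 48-bit counter that wrapped from ...fff8 to 0x8: delta is 16 */
		printf("%lld\n", (long long)counter_delta(0xfffffffffff8ULL, 0x8ULL, 48));
		return 0;
	}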

+ 2 - 2
arch/x86/kernel/cpu/perfctr-watchdog.c

@@ -19,8 +19,8 @@
 #include <linux/nmi.h>
 #include <linux/kprobes.h>
 
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
 
 struct nmi_watchdog_ctlblk {
 	unsigned int cccr_msr;

+ 5 - 0
arch/x86/kernel/entry_64.S

@@ -1012,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PENDING_VECTOR \
+	perf_pending_interrupt smp_perf_pending_interrupt
+#endif
+
 /*
  * Exception entry points.
  */

+ 10 - 0
arch/x86/kernel/irq.c

@@ -62,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, "  Spurious interrupts\n");
+	seq_printf(p, "%*s: ", prec, "CNT");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+	seq_printf(p, "  Performance counter interrupts\n");
+	seq_printf(p, "%*s: ", prec, "PND");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+	seq_printf(p, "  Performance pending work\n");
 #endif
 	if (generic_interrupt_extension) {
 		seq_printf(p, "%*s: ", prec, "PLT");
@@ -165,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
+	sum += irq_stats(cpu)->apic_perf_irqs;
+	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
 	if (generic_interrupt_extension)
 		sum += irq_stats(cpu)->generic_irqs;
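
With the two rows added above, /proc/interrupts on a perf-enabled kernel
grows two extra lines, roughly as follows (counts invented for illustration,
column widths per the %10u format used above):

	CNT:       1024        981   Performance counter interrupts
	PND:         12          7   Performance pending work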

+ 6 - 9
arch/x86/kernel/irqinit.c

@@ -181,10 +181,15 @@ static void __init apic_intr_init(void)
 {
 	smp_intr_init();
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#ifdef CONFIG_X86_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
@@ -199,17 +204,9 @@ static void __init apic_intr_init(void)
 
 	/* Performance monitoring interrupts: */
 # ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
-#endif
-
-#ifdef CONFIG_X86_32
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
-	/* thermal monitor LVT interrupt */
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
 #endif
 }
 

+ 0 - 1
arch/x86/kernel/signal.c

@@ -6,7 +6,6 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>

+ 1 - 0
arch/x86/kernel/syscall_table_32.S

@@ -335,3 +335,4 @@ ENTRY(sys_call_table)
 	.long sys_preadv
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
+	.long sys_perf_counter_open
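
The entry is appended directly after sys_rt_tgsigqueueinfo (335), so on
32-bit x86 the new syscall works out to number 336; assuming exported
headers that define __NR_perf_counter_open accordingly, a minimal
self-profiling sketch looks like this (struct and enums are the ones added
in include/linux/perf_counter.h later in this diff):

	#include <linux/perf_counter.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		struct perf_counter_attr attr;
		uint64_t count;
		long fd;

		memset(&attr, 0, sizeof(attr));
		attr.type   = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_INSTRUCTIONS;

		/* pid 0: calling task, cpu -1: any CPU, no group, no flags */
		fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_counter_open");
			return 1;
		}

		/* ... workload to be measured runs here ... */

		read(fd, &count, sizeof(count));
		printf("instructions: %llu\n", (unsigned long long)count);
		return 0;
	}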

+ 6 - 6
arch/x86/kernel/traps.c

@@ -942,8 +942,13 @@ void __init trap_init(void)
 #endif
 	set_intr_gate(19, &simd_coprocessor_error);
 
+	/* Reserve all the builtin and the syscall vector: */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+		set_bit(i, used_vectors);
+
 #ifdef CONFIG_IA32_EMULATION
 	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 #ifdef CONFIG_X86_32
@@ -960,14 +965,9 @@ void __init trap_init(void)
 	}
 
 	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
 
-	/* Reserve all the builtin and the syscall vector: */
-	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-		set_bit(i, used_vectors);
-
-	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */

+ 10 - 2
arch/x86/mm/fault.c

@@ -10,6 +10,7 @@
 #include <linux/bootmem.h>		/* max_low_pfn			*/
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
+#include <linux/perf_counter.h>		/* perf_swcounter_event		*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -1013,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
@@ -1106,10 +1109,15 @@ good_area:
 		return;
 	}
 
-	if (fault & VM_FAULT_MAJOR)
+	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-	else
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
+	} else {
 		tsk->min_flt++;
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
+	}
 
 	check_v8086_mode(regs, address, tsk);
 

+ 4 - 3
arch/x86/oprofile/nmi_int.c

@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
 
 	switch (val) {
 	case DIE_NMI:
-		if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
-			ret = NOTIFY_STOP;
+	case DIE_NMI_IPI:
+		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		ret = NOTIFY_STOP;
 		break;
 	default:
 		break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
 static struct notifier_block profile_exceptions_nb = {
 	.notifier_call = profile_exceptions_notify,
 	.next = NULL,
-	.priority = 0
+	.priority = 2
 };
 
 static int nmi_setup(void)

+ 9 - 1
arch/x86/oprofile/op_model_ppro.c

@@ -18,7 +18,7 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 	u64 val;
 	int i;
 
+	/*
+	 * This can happen if perf counters are in use when
+	 * we steal the die notifier NMI.
+	 */
+	if (unlikely(!reset_value))
+		goto out;
+
 	for (i = 0 ; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 		}
 	}
 
+out:
 	/* Only P6-based Pentium M needs to re-unmask the apic vector, but it
 	 * doesn't hurt other P6 variants */
 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);

+ 5 - 1
arch/x86/vdso/vdso32-setup.c

@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		}
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	if (compat_uses_vma || !compat) {
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 			goto up_fail;
 	}
 
-	current->mm->context.vdso = (void *)addr;
 	current_thread_info()->sysenter_return =
 		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
 
   up_fail:
+	if (ret)
+		current->mm->context.vdso = NULL;
+
 	up_write(&mm->mmap_sem);
 
 	return ret;

+ 5 - 2
arch/x86/vdso/vma.c

@@ -116,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto up_fail;
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,
 				      vdso_pages);
-	if (ret)
+	if (ret) {
+		current->mm->context.vdso = NULL;
 		goto up_fail;
+	}
 
-	current->mm->context.vdso = (void *)addr;
 up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;

+ 2 - 0
drivers/char/sysrq.c

@@ -25,6 +25,7 @@
 #include <linux/kbd_kern.h>
 #include <linux/proc_fs.h>
 #include <linux/quotaops.h>
+#include <linux/perf_counter.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
 	struct pt_regs *regs = get_irq_regs();
 	if (regs)
 		show_regs(regs);
+	perf_counter_print_debug();
 }
 static struct sysrq_key_op sysrq_showregs_op = {
 	.handler	= sysrq_handle_showregs,

+ 9 - 0
fs/exec.c

@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	task_lock(tsk);
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
+	perf_counter_comm(tsk);
 }
 
 int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	current->personality &= ~bprm->per_clear;
 
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_counter_exit_task(current);
+
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 

+ 1 - 1
include/asm-generic/atomic.h

@@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
 #define atomic_long_cmpxchg(l, old, new) \
 	(atomic64_cmpxchg((atomic64_t *)(l), (old), (new)))
 #define atomic_long_xchg(v, new) \
-	(atomic64_xchg((atomic64_t *)(l), (new)))
+	(atomic64_xchg((atomic64_t *)(v), (new)))
 
 #else  /*  BITS_PER_LONG == 64  */
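
The one-character fix above is worth spelling out: the macro parameter is
named v but the old body referenced l, so the macro only compiled when the
caller happened to have an l in scope, and then it exchanged the wrong
variable. A contrived but runnable illustration (names invented for the
demo, not from the patch):

	#include <stdio.h>

	static long swap_in(long *p, long val)
	{
		long old = *p;
		*p = val;
		return old;
	}

	#define broken_xchg(v, new)	swap_in(&(l), (new))	/* the old bug */
	#define fixed_xchg(v, new)	swap_in(&(v), (new))	/* the fix */

	int main(void)
	{
		long l = 1, v = 2;

		broken_xchg(v, 5);	/* compiles only because 'l' exists; swaps 'l'! */
		fixed_xchg(v, 7);	/* swaps 'v', as intended */
		printf("l=%ld v=%ld\n", l, v);	/* prints l=5 v=7 */
		return 0;
	}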
 

+ 10 - 0
include/linux/init_task.h

@@ -108,6 +108,15 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk)					\
+	.perf_counter_mutex = 						\
+		 __MUTEX_INITIALIZER(tsk.perf_counter_mutex),		\
+	.perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +180,7 @@ extern struct cred init_cred;
 	},								\
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
+	INIT_PERF_COUNTERS(tsk)						\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\

+ 5 - 0
include/linux/kernel_stat.h

@@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 	return sum;
 }
 
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
 extern unsigned long long task_delta_exec(struct task_struct *);
+
 extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
 extern void account_steal_time(cputime_t);

+ 697 - 0
include/linux/perf_counter.h

@@ -0,0 +1,697 @@
+/*
+ *  Performance counters:
+ *
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *    Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_COUNTER_H
+#define _LINUX_PERF_COUNTER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <asm/byteorder.h>
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * attr.type
+ */
+enum perf_type_id {
+	PERF_TYPE_HARDWARE			= 0,
+	PERF_TYPE_SOFTWARE			= 1,
+	PERF_TYPE_TRACEPOINT			= 2,
+	PERF_TYPE_HW_CACHE			= 3,
+	PERF_TYPE_RAW				= 4,
+
+	PERF_TYPE_MAX,				/* non-ABI */
+};
+
+/*
+ * Generalized performance counter event types, used by the
+ * attr.event_id parameter of the sys_perf_counter_open()
+ * syscall:
+ */
+enum perf_hw_id {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+
+	PERF_COUNT_HW_MAX,			/* non-ABI */
+};
+
+/*
+ * Generalized hardware cache counters:
+ *
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum perf_hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D			= 0,
+	PERF_COUNT_HW_CACHE_L1I			= 1,
+	PERF_COUNT_HW_CACHE_LL			= 2,
+	PERF_COUNT_HW_CACHE_DTLB		= 3,
+	PERF_COUNT_HW_CACHE_ITLB		= 4,
+	PERF_COUNT_HW_CACHE_BPU			= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
+};
+
+enum perf_hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ		= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
+
+	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
+};
+
+enum perf_hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
+	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
+
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
+};
+
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum perf_sw_ids {
+	PERF_COUNT_SW_CPU_CLOCK			= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+
+	PERF_COUNT_SW_MAX,			/* non-ABI */
+};
+
+/*
+ * Bits that can be set in attr.sample_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_sample_format {
+	PERF_SAMPLE_IP				= 1U << 0,
+	PERF_SAMPLE_TID				= 1U << 1,
+	PERF_SAMPLE_TIME			= 1U << 2,
+	PERF_SAMPLE_ADDR			= 1U << 3,
+	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
+	PERF_SAMPLE_ID				= 1U << 6,
+	PERF_SAMPLE_CPU				= 1U << 7,
+	PERF_SAMPLE_PERIOD			= 1U << 8,
+};
+
+/*
+ * Bits that can be set in attr.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
+	PERF_FORMAT_ID				= 1U << 2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_attr {
+	/*
+	 * Major type: hardware/software/tracepoint/etc.
+	 */
+	__u32			type;
+	__u32			__reserved_1;
+
+	/*
+	 * Type specific configuration information.
+	 */
+	__u64			config;
+
+	union {
+		__u64		sample_period;
+		__u64		sample_freq;
+	};
+
+	__u64			sample_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+				mmap           :  1, /* include mmap data     */
+				comm	       :  1, /* include comm data     */
+				freq           :  1, /* use freq, not period  */
+
+				__reserved_2   : 53;
+
+	__u32			wakeup_events;	/* wakeup every n events */
+	__u32			__reserved_3;
+
+	__u64			__reserved_4;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
+#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
+
+enum perf_counter_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+	__u32	version;		/* version number of this structure */
+	__u32	compat_version;		/* lowest version this is compat with */
+
+	/*
+	 * Bits needed to read the hw counters in user-space.
+	 *
+	 *   u32 seq;
+	 *   s64 count;
+	 *
+	 *   do {
+	 *     seq = pc->lock;
+	 *
+	 *     barrier()
+	 *     if (pc->index) {
+	 *       count = pmc_read(pc->index - 1);
+	 *       count += pc->offset;
+	 *     } else
+	 *       goto regular_read;
+	 *
+	 *     barrier();
+	 *   } while (pc->lock != seq);
+	 *
+	 * NOTE: for obvious reasons this only works on self-monitoring
+	 *       processes.
+	 */
+	__u32	lock;			/* seqlock for synchronization */
+	__u32	index;			/* hardware counter identifier */
+	__s64	offset;			/* add to hardware counter value */
+
+	/*
+	 * Control data for the mmap() data buffer.
+	 *
+	 * User-space reading this value should issue an rmb(), on SMP capable
+	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 */
+	__u64   data_head;		/* head in the data section */
+};
+
+#define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN		(0 << 0)
+#define PERF_EVENT_MISC_KERNEL			(1 << 0)
+#define PERF_EVENT_MISC_USER			(2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR		(3 << 0)
+#define PERF_EVENT_MISC_OVERFLOW		(1 << 2)
+
+struct perf_event_header {
+	__u32	type;
+	__u16	misc;
+	__u16	size;
+};
+
+enum perf_event_type {
+
+	/*
+	 * The MMAP events record the PROT_EXEC mappings so that we can
+	 * correlate userspace IPs to code. They have the following structure:
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	char				filename[];
+	 * };
+	 */
+	PERF_EVENT_MMAP			= 1,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	char				comm[];
+	 * };
+	 */
+	PERF_EVENT_COMM			= 3,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				id;
+	 *	u64				sample_period;
+	 * };
+	 */
+	PERF_EVENT_PERIOD		= 4,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				id;
+	 * };
+	 */
+	PERF_EVENT_THROTTLE		= 5,
+	PERF_EVENT_UNTHROTTLE		= 6,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 * };
+	 */
+	PERF_EVENT_FORK			= 7,
+
+	/*
+	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+	 * will be PERF_RECORD_*
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	{ u64			ip;	  } && PERF_RECORD_IP
+	 *	{ u32			pid, tid; } && PERF_RECORD_TID
+	 *	{ u64			time;     } && PERF_RECORD_TIME
+	 *	{ u64			addr;     } && PERF_RECORD_ADDR
+	 *	{ u64			config;   } && PERF_RECORD_CONFIG
+	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
+	 *
+	 *	{ u64			nr;
+	 *	  { u64 id, val; }	cnt[nr];  } && PERF_RECORD_GROUP
+	 *
+	 *	{ u16			nr,
+	 *				hv,
+	 *				kernel,
+	 *				user;
+	 *	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
+	 * };
+	 */
+};
+
+#ifdef __KERNEL__
+/*
+ * Kernel-internal data types and definitions:
+ */
+
+#ifdef CONFIG_PERF_COUNTERS
+# include <asm/perf_counter.h>
+#endif
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/fs.h>
+#include <linux/pid_namespace.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
+/**
+ * struct hw_perf_counter - performance counter hardware details:
+ */
+struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
+	union {
+		struct { /* hardware */
+			u64		config;
+			unsigned long	config_base;
+			unsigned long	counter_base;
+			int		idx;
+		};
+		union { /* software */
+			atomic64_t	count;
+			struct hrtimer	hrtimer;
+		};
+	};
+	atomic64_t			prev_count;
+	u64				sample_period;
+	u64				last_period;
+	atomic64_t			period_left;
+	u64				interrupts;
+
+	u64				freq_count;
+	u64				freq_interrupts;
+	u64				freq_stamp;
+#endif
+};
+
+struct perf_counter;
+
+/**
+ * struct pmu - generic performance monitoring unit
+ */
+struct pmu {
+	int (*enable)			(struct perf_counter *counter);
+	void (*disable)			(struct perf_counter *counter);
+	void (*read)			(struct perf_counter *counter);
+	void (*unthrottle)		(struct perf_counter *counter);
+};
+
+/**
+ * enum perf_counter_active_state - the states of a counter
+ */
+enum perf_counter_active_state {
+	PERF_COUNTER_STATE_ERROR	= -2,
+	PERF_COUNTER_STATE_OFF		= -1,
+	PERF_COUNTER_STATE_INACTIVE	=  0,
+	PERF_COUNTER_STATE_ACTIVE	=  1,
+};
+
+struct file;
+
+struct perf_mmap_data {
+	struct rcu_head			rcu_head;
+	int				nr_pages;	/* nr of data pages  */
+	int				nr_locked;	/* nr pages mlocked  */
+
+	atomic_t			poll;		/* POLL_ for wakeups */
+	atomic_t			events;		/* event limit       */
+
+	atomic_long_t			head;		/* write position    */
+	atomic_long_t			done_head;	/* completed head    */
+
+	atomic_t			lock;		/* concurrent writes */
+
+	atomic_t			wakeup;		/* needs a wakeup    */
+
+	struct perf_counter_mmap_page   *user_page;
+	void				*data_pages[0];
+};
+
+struct perf_pending_entry {
+	struct perf_pending_entry *next;
+	void (*func)(struct perf_pending_entry *);
+};
+
+/**
+ * struct perf_counter - performance counter kernel representation:
+ */
+struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
+	struct list_head		list_entry;
+	struct list_head		event_entry;
+	struct list_head		sibling_list;
+	int				nr_siblings;
+	struct perf_counter		*group_leader;
+	const struct pmu		*pmu;
+
+	enum perf_counter_active_state	state;
+	atomic64_t			count;
+
+	/*
+	 * These are the total time in nanoseconds that the counter
+	 * has been enabled (i.e. eligible to run, and the task has
+	 * been scheduled in, if this is a per-task counter)
+	 * and running (scheduled onto the CPU), respectively.
+	 *
+	 * They are computed from tstamp_enabled, tstamp_running and
+	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
+	 */
+	u64				total_time_enabled;
+	u64				total_time_running;
+
+	/*
+	 * These are timestamps used for computing total_time_enabled
+	 * and total_time_running when the counter is in INACTIVE or
+	 * ACTIVE state, measured in nanoseconds from an arbitrary point
+	 * in time.
+	 * tstamp_enabled: the notional time when the counter was enabled
+	 * tstamp_running: the notional time when the counter was scheduled on
+	 * tstamp_stopped: in INACTIVE state, the notional time when the
+	 *	counter was scheduled off.
+	 */
+	u64				tstamp_enabled;
+	u64				tstamp_running;
+	u64				tstamp_stopped;
+
+	struct perf_counter_attr	attr;
+	struct hw_perf_counter		hw;
+
+	struct perf_counter_context	*ctx;
+	struct file			*filp;
+
+	/*
+	 * These accumulate total time (in nanoseconds) that children
+	 * counters have been enabled and running, respectively.
+	 */
+	atomic64_t			child_total_time_enabled;
+	atomic64_t			child_total_time_running;
+
+	/*
+	 * Protect attach/detach and child_list:
+	 */
+	struct mutex			child_mutex;
+	struct list_head		child_list;
+	struct perf_counter		*parent;
+
+	int				oncpu;
+	int				cpu;
+
+	struct list_head		owner_entry;
+	struct task_struct		*owner;
+
+	/* mmap bits */
+	struct mutex			mmap_mutex;
+	atomic_t			mmap_count;
+	struct perf_mmap_data		*data;
+
+	/* poll related */
+	wait_queue_head_t		waitq;
+	struct fasync_struct		*fasync;
+
+	/* delayed work for NMIs and such */
+	int				pending_wakeup;
+	int				pending_kill;
+	int				pending_disable;
+	struct perf_pending_entry	pending;
+
+	atomic_t			event_limit;
+
+	void (*destroy)(struct perf_counter *);
+	struct rcu_head			rcu_head;
+
+	struct pid_namespace		*ns;
+	u64				id;
+#endif
+};
+
+/**
+ * struct perf_counter_context - counter context structure
+ *
+ * Used as a container for task counters and CPU counters as well:
+ */
+struct perf_counter_context {
+	/*
+	 * Protect the states of the counters in the list,
+	 * nr_active, and the list:
+	 */
+	spinlock_t			lock;
+	/*
+	 * Protect the list of counters.  Locking either mutex or lock
+	 * is sufficient to ensure the list doesn't change; to change
+	 * the list you need to lock both the mutex and the spinlock.
+	 */
+	struct mutex			mutex;
+
+	struct list_head		counter_list;
+	struct list_head		event_list;
+	int				nr_counters;
+	int				nr_active;
+	int				is_active;
+	atomic_t			refcount;
+	struct task_struct		*task;
+
+	/*
+	 * Context clock, runs when context enabled.
+	 */
+	u64				time;
+	u64				timestamp;
+
+	/*
+	 * These fields let us detect when two contexts have both
+	 * been cloned (inherited) from a common ancestor.
+	 */
+	struct perf_counter_context	*parent_ctx;
+	u64				parent_gen;
+	u64				generation;
+	int				pin_count;
+	struct rcu_head			rcu_head;
+};
+
+/**
+ * struct perf_counter_cpu_context - per cpu counter context structure
+ */
+struct perf_cpu_context {
+	struct perf_counter_context	ctx;
+	struct perf_counter_context	*task_ctx;
+	int				active_oncpu;
+	int				max_pertask;
+	int				exclusive;
+
+	/*
+	 * Recursion avoidance:
+	 *
+	 * task, softirq, irq, nmi context
+	 */
+	int				recursion[4];
+};
+
+#ifdef CONFIG_PERF_COUNTERS
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_counters;
+
+extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
+
+extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task,
+					struct task_struct *next, int cpu);
+extern void perf_counter_task_tick(struct task_struct *task, int cpu);
+extern int perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
+extern void perf_counter_free_task(struct task_struct *task);
+extern void perf_counter_do_pending(void);
+extern void perf_counter_print_debug(void);
+extern void __perf_disable(void);
+extern bool __perf_enable(void);
+extern void perf_disable(void);
+extern void perf_enable(void);
+extern int perf_counter_task_disable(void);
+extern int perf_counter_task_enable(void);
+extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu);
+extern void perf_counter_update_userpage(struct perf_counter *counter);
+
+struct perf_sample_data {
+	struct pt_regs			*regs;
+	u64				addr;
+	u64				period;
+};
+
+extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
+				 struct perf_sample_data *data);
+
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return (counter->attr.type != PERF_TYPE_RAW) &&
+		(counter->attr.type != PERF_TYPE_HARDWARE);
+}
+
+extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
+
+extern void __perf_counter_mmap(struct vm_area_struct *vma);
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_EXEC)
+		__perf_counter_mmap(vma);
+}
+
+extern void perf_counter_comm(struct task_struct *tsk);
+extern void perf_counter_fork(struct task_struct *tsk);
+
+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
+#define MAX_STACK_DEPTH			255
+
+struct perf_callchain_entry {
+	u16				nr;
+	u16				hv;
+	u16				kernel;
+	u16				user;
+	u64				ip[MAX_STACK_DEPTH];
+};
+
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+
+extern int sysctl_perf_counter_paranoid;
+extern int sysctl_perf_counter_mlock;
+extern int sysctl_perf_counter_sample_rate;
+
+extern void perf_counter_init(void);
+
+#ifndef perf_misc_flags
+#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_EVENT_MISC_USER : \
+				 PERF_EVENT_MISC_KERNEL)
+#define perf_instruction_pointer(regs)	instruction_pointer(regs)
+#endif
+
+#else
+static inline void
+perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
+static inline void
+perf_counter_task_sched_out(struct task_struct *task,
+			    struct task_struct *next, int cpu)		{ }
+static inline void
+perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
+static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; }
+static inline void perf_counter_exit_task(struct task_struct *child)	{ }
+static inline void perf_counter_free_task(struct task_struct *task)	{ }
+static inline void perf_counter_do_pending(void)			{ }
+static inline void perf_counter_print_debug(void)			{ }
+static inline void perf_disable(void)					{ }
+static inline void perf_enable(void)					{ }
+static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
+static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
+
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi,
+		     struct pt_regs *regs, u64 addr)			{ }
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
+static inline void perf_counter_comm(struct task_struct *tsk)		{ }
+static inline void perf_counter_fork(struct task_struct *tsk)		{ }
+static inline void perf_counter_init(void)				{ }
+static inline void perf_counter_task_migration(struct task_struct *task,
+					       int cpu)			{ }
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_PERF_COUNTER_H */
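
The read loop sketched in the perf_counter_mmap_page comment above can be
made concrete. A hedged user-space version for x86 follows: rdpmc stands in
for the comment's hypothetical pmc_read(), the page is assumed to come from
mmap()ing a counter fd, and user-space rdpmc must be permitted by the CPU
setup (self-monitoring only, as the comment notes):

	#include <linux/perf_counter.h>
	#include <stdint.h>

	static inline uint64_t rdpmc(uint32_t idx)
	{
		uint32_t lo, hi;

		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
		return (uint64_t)hi << 32 | lo;
	}

	/*
	 * Returns 1 and fills *count if the counter is live on a hardware PMC;
	 * returns 0 when the caller must fall back to read() on the counter fd.
	 */
	static int mmap_read_self(volatile struct perf_counter_mmap_page *pc,
				  uint64_t *count)
	{
		uint32_t seq;

		do {
			seq = pc->lock;
			asm volatile("" ::: "memory");	/* the comment's barrier() */

			if (!pc->index)
				return 0;		/* the regular_read path */
			*count = rdpmc(pc->index - 1) + pc->offset;

			asm volatile("" ::: "memory");
		} while (pc->lock != seq);

		return 1;
	}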

+ 3 - 0
include/linux/prctl.h

@@ -85,4 +85,7 @@
 #define PR_SET_TIMERSLACK 29
 #define PR_GET_TIMERSLACK 30
 
+#define PR_TASK_PERF_COUNTERS_DISABLE		31
+#define PR_TASK_PERF_COUNTERS_ENABLE		32
+
 #endif /* _LINUX_PRCTL_H */
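
A small sketch of the new prctl pair in use (assuming a kernel built with
CONFIG_PERF_COUNTERS, where perf_counter_task_enable()/disable() declared in
the header above back these options): they flip every counter attached to
the calling task, which scopes measurement to a region of interest.

	#include <sys/prctl.h>
	#include <linux/prctl.h>

	void count_only(void (*region)(void *), void *arg)
	{
		prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
		region(arg);		/* only this call is counted */
		prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	}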

+ 20 - 1
include/linux/sched.h

@@ -99,6 +99,7 @@ struct robust_list_head;
 struct bio;
 struct fs_struct;
 struct bts_context;
+struct perf_counter_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -139,6 +140,7 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 extern void calc_global_load(void);
+extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -674,6 +676,10 @@ struct user_struct {
 	struct work_struct work;
 #endif
 #endif
+
+#ifdef CONFIG_PERF_COUNTERS
+	atomic_long_t locked_vm;
+#endif
 };
 
 extern int uids_sysfs_init(void);
@@ -1073,9 +1079,10 @@ struct sched_entity {
 	u64			last_wakeup;
 	u64			avg_overlap;
 
+	u64			nr_migrations;
+
 	u64			start_runtime;
 	u64			avg_wakeup;
-	u64			nr_migrations;
 
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
@@ -1396,6 +1403,11 @@ struct task_struct {
 	struct list_head pi_state_list;
 	struct futex_pi_state *pi_state_cache;
 #endif
+#ifdef CONFIG_PERF_COUNTERS
+	struct perf_counter_context *perf_counter_ctxp;
+	struct mutex perf_counter_mutex;
+	struct list_head perf_counter_list;
+#endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;
@@ -2410,6 +2422,13 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+				     void (*func) (void *info), void *info);
+
+
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);

+ 5 - 0
include/linux/syscalls.h

@@ -55,6 +55,7 @@ struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
+struct perf_counter_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -755,4 +756,8 @@ asmlinkage long sys_pipe(int __user *);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+
+asmlinkage long sys_perf_counter_open(
+		const struct perf_counter_attr __user *attr_uptr,
+		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
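For illustration, a hedged userspace wrapper for the new system call. The syscall number is architecture-specific and the attr layout comes from the <linux/perf_counter.h> added by this merge, so treat both as assumptions of the sketch:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

/* Sketch: open one hardware instruction counter on the calling task. */
static int perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
			     int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_counter_open, attr, pid, cpu,
		       group_fd, flags);
}

static int count_own_instructions(void)
{
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;

	return perf_counter_open(&attr, 0 /* self */, -1 /* any cpu */,
				 -1 /* no group */, 0);
}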

+ 34 - 0
init/Kconfig

@@ -933,6 +933,40 @@ config AIO
           by some high performance threaded applications. Disabling
           this option saves about 7k.
 
+config HAVE_PERF_COUNTERS
+	bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+	bool "Kernel Performance Counters"
+	depends on HAVE_PERF_COUNTERS
+	select ANON_INODES
+	help
+	  Enable kernel support for performance counter hardware.
+
+	  Performance counters are special hardware registers available
+	  on most modern CPUs. These registers count the number of certain
+	  types of hw events, such as instructions executed, cache misses
+	  suffered, or branches mis-predicted, without slowing down the
+	  kernel or applications. These registers can also trigger interrupts
+	  when a threshold number of events has passed, and can thus be
+	  used to profile the code that runs on that CPU.
+
+	  The Linux Performance Counter subsystem provides an abstraction of
+	  these hardware capabilities, available via a system call. It
+	  provides per task and per CPU counters, and it provides event
+	  capabilities on top of those.
+
+	  Say Y if unsure.
+
+config EVENT_PROFILE
+	bool "Tracepoint profile sources"
+	depends on PERF_COUNTERS && EVENT_TRACER
+	default y
+
+endmenu
+
 config VM_EVENT_COUNTERS
 	default y
 	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED

+ 1 - 0
kernel/Makefile

@@ -96,6 +96,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

+ 12 - 4
kernel/exit.c

@@ -48,6 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
+#include <linux/perf_counter.h>
 #include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
@@ -154,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
+#ifdef CONFIG_PERF_COUNTERS
+	WARN_ON_ONCE(tsk->perf_counter_ctxp);
+#endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
@@ -170,6 +174,7 @@ repeat:
 	atomic_dec(&__task_cred(p)->user->processes);
 
 	proc_flush_task(p);
+
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
@@ -971,16 +976,19 @@ NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
 #ifdef CONFIG_FUTEX
-	/*
-	 * This must happen late, after the PID is not
-	 * hashed anymore:
-	 */
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
 	if (unlikely(current->pi_state_cache))

+ 12 - 0
kernel/fork.c

@@ -62,6 +62,7 @@
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
+#include <linux/perf_counter.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1096,6 +1097,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
+	retval = perf_counter_init_task(p);
+	if (retval)
+		goto bad_fork_cleanup_policy;
+
 	if ((retval = audit_alloc(p)))
 		goto bad_fork_cleanup_policy;
 	/* copy all the process information */
@@ -1290,6 +1295,7 @@ bad_fork_cleanup_semundo:
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_policy:
+	perf_counter_free_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
@@ -1403,6 +1409,12 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			p->vfork_done = &vfork;
 			init_completion(&vfork);
+		} else if (!(clone_flags & CLONE_VM)) {
+			/*
+			 * vfork children will exec, and the exec
+			 * will call set_task_comm(); no fork event
+			 * is needed for them here.
+			 */
+			perf_counter_fork(p);
 		}
 
 		audit_finish_fork(p);

+ 1 - 1
kernel/mutex.c

@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
  *
  * This function is similar to (but not equivalent to) down().
  */
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
 {
 	might_sleep();
 	/*

+ 4260 - 0
kernel/perf_counter.c

@@ -0,0 +1,4260 @@
+/*
+ * Performance counter core code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ *  For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_counter.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_counters __read_mostly;
+static atomic_t nr_mmap_counters __read_mostly;
+static atomic_t nr_comm_counters __read_mostly;
+
+/*
+ * perf counter paranoia level:
+ *  0 - not paranoid
+ *  1 - disallow cpu counters for unprivileged users
+ *  2 - disallow kernel profiling for unprivileged users
+ */
+int sysctl_perf_counter_paranoid __read_mostly;
+
+static inline bool perf_paranoid_cpu(void)
+{
+	return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+	return sysctl_perf_counter_paranoid > 1;
+}
+
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_counter_id;
+
+/*
+ * Lock for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+	return NULL;
+}
+
+void __weak hw_perf_disable(void)		{ barrier(); }
+void __weak hw_perf_enable(void)		{ barrier(); }
+
+void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	return 0;
+}
+
+void __weak perf_counter_print_debug(void)	{ }
+
+static DEFINE_PER_CPU(int, disable_count);
+
+void __perf_disable(void)
+{
+	__get_cpu_var(disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+	return !--__get_cpu_var(disable_count);
+}
+
+void perf_disable(void)
+{
+	__perf_disable();
+	hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+	if (__perf_enable())
+		hw_perf_enable();
+}
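Note the asymmetry above: hw_perf_disable() is called on every perf_disable(), while hw_perf_enable() only runs when disable_count drops back to zero, so these sections nest safely on a CPU:

/* Sketch: nested disable/enable sections on one CPU. */
perf_disable();		/* count 0 -> 1, hw_perf_disable()       */
perf_disable();		/* count 1 -> 2, hw_perf_disable() again */
/* ... rearrange counter lists safely, even against NMIs ... */
perf_enable();		/* count 2 -> 1, hardware stays disabled */
perf_enable();		/* count 1 -> 0, hw_perf_enable() runs   */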
+
+static void get_ctx(struct perf_counter_context *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_counter_context *ctx;
+
+	ctx = container_of(head, struct perf_counter_context, rcu_head);
+	kfree(ctx);
+}
+
+static void put_ctx(struct perf_counter_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->refcount)) {
+		if (ctx->parent_ctx)
+			put_ctx(ctx->parent_ctx);
+		if (ctx->task)
+			put_task_struct(ctx->task);
+		call_rcu(&ctx->rcu_head, free_ctx);
+	}
+}
+
+/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+	struct perf_counter_context *ctx;
+
+	rcu_read_lock();
+ retry:
+	ctx = rcu_dereference(task->perf_counter_ctxp);
+	if (ctx) {
+		/*
+		 * If this context is a clone of another, it might
+		 * get swapped for another underneath us by
+		 * perf_counter_task_sched_out, though the
+		 * rcu_read_lock() protects us from any context
+		 * getting freed.  Lock the context and check if it
+		 * got swapped before we could get the lock, and retry
+		 * if so.  If we locked the right context, then it
+		 * can't get swapped on us any more.
+		 */
+		spin_lock_irqsave(&ctx->lock, *flags);
+		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			goto retry;
+		}
+	}
+	rcu_read_unlock();
+	return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+	struct perf_counter_context *ctx;
+	unsigned long flags;
+
+	ctx = perf_lock_task_context(task, &flags);
+	if (ctx) {
+		++ctx->pin_count;
+		get_ctx(ctx);
+		spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+	return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	--ctx->pin_count;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	put_ctx(ctx);
+}
+
+/*
+ * Add a counter to the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+	struct perf_counter *group_leader = counter->group_leader;
+
+	/*
+	 * Depending on whether it is a standalone or sibling counter,
+	 * add it straight to the context's counter list, or to the group
+	 * leader's sibling list:
+	 */
+	if (group_leader == counter)
+		list_add_tail(&counter->list_entry, &ctx->counter_list);
+	else {
+		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+		group_leader->nr_siblings++;
+	}
+
+	list_add_rcu(&counter->event_entry, &ctx->event_list);
+	ctx->nr_counters++;
+}
+
+/*
+ * Remove a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+	struct perf_counter *sibling, *tmp;
+
+	if (list_empty(&counter->list_entry))
+		return;
+	ctx->nr_counters--;
+
+	list_del_init(&counter->list_entry);
+	list_del_rcu(&counter->event_entry);
+
+	if (counter->group_leader != counter)
+		counter->group_leader->nr_siblings--;
+
+	/*
+	 * If this was a group counter with sibling counters then
+	 * upgrade the siblings to singleton counters by adding them
+	 * to the context list directly:
+	 */
+	list_for_each_entry_safe(sibling, tmp,
+				 &counter->sibling_list, list_entry) {
+
+		list_move_tail(&sibling->list_entry, &ctx->counter_list);
+		sibling->group_leader = sibling;
+	}
+}
+
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_stopped = ctx->time;
+	counter->pmu->disable(counter);
+	counter->oncpu = -1;
+
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu--;
+	ctx->nr_active--;
+	if (counter->attr.exclusive || !cpuctx->active_oncpu)
+		cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+		struct perf_cpu_context *cpuctx,
+		struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+
+	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	/*
+	 * Schedule out siblings (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->attr.exclusive)
+		cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_counter_remove_from_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level.
+	 */
+	perf_disable();
+
+	counter_sched_out(counter, cpuctx, ctx);
+
+	list_del_counter(counter, ctx);
+
+	if (!ctx->task) {
+		/*
+		 * Allow more per task counters with respect to the
+		 * reservation:
+		 */
+		cpuctx->max_pertask =
+			min(perf_max_counters - ctx->nr_counters,
+			    perf_max_counters - perf_reserved_percpu);
+	}
+
+	perf_enable();
+	spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_counter_remove_from_context(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Per cpu counters are removed via an smp call and
+		 * the removal is always successful.
+		 */
+		smp_call_function_single(counter->cpu,
+					 __perf_counter_remove_from_context,
+					 counter, 1);
+		return;
+	}
+
+retry:
+	task_oncpu_function_call(task, __perf_counter_remove_from_context,
+				 counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active we need to retry the smp call.
+	 */
+	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents this context from being scheduled in, so
+	 * we can remove the counter safely if the call above did not
+	 * succeed.
+	 */
+	if (!list_empty(&counter->list_entry))
+		list_del_counter(counter, ctx);
+	spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+	return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx)
+{
+	u64 now = perf_clock();
+
+	ctx->time += now - ctx->timestamp;
+	ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	u64 run_end;
+
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+		return;
+
+	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
+
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		run_end = counter->tstamp_stopped;
+	else
+		run_end = ctx->time;
+
+	counter->total_time_running = run_end - counter->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	update_counter_times(leader);
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		update_counter_times(counter);
+}
+
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+	struct perf_counter *counter = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = counter->ctx;
+
+	/*
+	 * If this is a per-task counter, need to check whether this
+	 * counter's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * If the counter is on, turn it off.
+	 * If it is in error state, leave it in error state.
+	 */
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		update_context_time(ctx);
+		update_counter_times(counter);
+		if (counter == counter->group_leader)
+			group_sched_out(counter, cpuctx, ctx);
+		else
+			counter_sched_out(counter, cpuctx, ctx);
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Disable the counter on the cpu that it's on
+		 */
+		smp_call_function_single(counter->cpu, __perf_counter_disable,
+					 counter, 1);
+		return;
+	}
+
+ retry:
+	task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the counter is still active, we need to retry the cross-call.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
+
+	spin_unlock_irq(&ctx->lock);
+}
+
+static int
+counter_sched_in(struct perf_counter *counter,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_counter_context *ctx,
+		 int cpu)
+{
+	if (counter->state <= PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	/*
+	 * The new state must be visible before we turn it on in the hardware:
+	 */
+	smp_wmb();
+
+	if (counter->pmu->enable(counter)) {
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->oncpu = -1;
+		return -EAGAIN;
+	}
+
+	counter->tstamp_running += ctx->time - counter->tstamp_stopped;
+
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu++;
+	ctx->nr_active++;
+
+	if (counter->attr.exclusive)
+		cpuctx->exclusive = 1;
+
+	return 0;
+}
+
+static int
+group_sched_in(struct perf_counter *group_counter,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx,
+	       int cpu)
+{
+	struct perf_counter *counter, *partial_group;
+	int ret;
+
+	if (group_counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (ret)
+		return ret < 0 ? ret : 0;
+
+	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+		return -EAGAIN;
+
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+			partial_group = counter;
+			goto group_error;
+		}
+	}
+
+	return 0;
+
+group_error:
+	/*
+	 * Groups can be scheduled in as one unit only, so undo any
+	 * partial group before returning:
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		if (counter == partial_group)
+			break;
+		counter_sched_out(counter, cpuctx, ctx);
+	}
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+
+	return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+			   struct perf_cpu_context *cpuctx,
+			   int can_add_hw)
+{
+	/*
+	 * Groups consisting entirely of software counters can always go on.
+	 */
+	if (is_software_only_group(counter))
+		return 1;
+	/*
+	 * If an exclusive group is already on, no other hardware
+	 * counters can go on.
+	 */
+	if (cpuctx->exclusive)
+		return 0;
+	/*
+	 * If this group is exclusive and there are already
+	 * counters on the CPU, it can't go on.
+	 */
+	if (counter->attr.exclusive && cpuctx->active_oncpu)
+		return 0;
+	/*
+	 * Otherwise, try to add it if all previous groups were able
+	 * to go on.
+	 */
+	return can_add_hw;
+}
+
+static void add_counter_to_ctx(struct perf_counter *counter,
+			       struct perf_counter_context *ctx)
+{
+	list_add_counter(counter, ctx);
+	counter->tstamp_enabled = ctx->time;
+	counter->tstamp_running = ctx->time;
+	counter->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *leader = counter->group_leader;
+	int cpu = smp_processor_id();
+	int err;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 * Or possibly this is the right context but it isn't
+	 * on this cpu because it had no counters.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	update_context_time(ctx);
+
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level. NOP for non NMI based counters.
+	 */
+	perf_disable();
+
+	add_counter_to_ctx(counter, ctx);
+
+	/*
+	 * Don't put the counter on if it is disabled or if
+	 * it is in a group and the group isn't on.
+	 */
+	if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+		goto unlock;
+
+	/*
+	 * An exclusive counter can't go on if there are already active
+	 * hardware counters, and no hardware counter can go on if there
+	 * is already an exclusive counter on.
+	 */
+	if (!group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+	if (err) {
+		/*
+		 * This counter couldn't go on.  If it is in a group
+		 * then we have to pull the whole group off.
+		 * If the counter group is pinned then put it in error state.
+		 */
+		if (leader != counter)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->attr.pinned) {
+			update_group_times(leader);
+			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
+	}
+
+	if (!err && !ctx->task && cpuctx->max_pertask)
+		cpuctx->max_pertask--;
+
+ unlock:
+	perf_enable();
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+			struct perf_counter *counter,
+			int cpu)
+{
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Per cpu counters are installed via an smp call and
+		 * the install is always successful.
+		 */
+		smp_call_function_single(cpu, __perf_install_in_context,
+					 counter, 1);
+		return;
+	}
+
+retry:
+	task_oncpu_function_call(task, __perf_install_in_context,
+				 counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active and the counter has not been added,
+	 * we need to retry the smp call.
+	 */
+	if (ctx->is_active && list_empty(&counter->list_entry)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents this context from being scheduled in, so
+	 * we can add the counter safely if the call above did not
+	 * succeed.
+	 */
+	if (list_empty(&counter->list_entry))
+		add_counter_to_ctx(counter, ctx);
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
+{
+	struct perf_counter *counter = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *leader = counter->group_leader;
+	int err;
+
+	/*
+	 * If this is a per-task counter, need to check whether this
+	 * counter's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	update_context_time(ctx);
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		goto unlock;
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+
+	/*
+	 * If the counter is in a group and isn't the group leader,
+	 * then don't put it on unless the group is on.
+	 */
+	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+		goto unlock;
+
+	if (!group_can_go_on(counter, cpuctx, 1)) {
+		err = -EEXIST;
+	} else {
+		perf_disable();
+		if (counter == leader)
+			err = group_sched_in(counter, cpuctx, ctx,
+					     smp_processor_id());
+		else
+			err = counter_sched_in(counter, cpuctx, ctx,
+					       smp_processor_id());
+		perf_enable();
+	}
+
+	if (err) {
+		/*
+		 * If this counter can't go on and it's part of a
+		 * group, then the whole group has to come off.
+		 */
+		if (leader != counter)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->attr.pinned) {
+			update_group_times(leader);
+			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
+	}
+
+ unlock:
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Enable the counter on the cpu that it's on
+		 */
+		smp_call_function_single(counter->cpu, __perf_counter_enable,
+					 counter, 1);
+		return;
+	}
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		goto out;
+
+	/*
+	 * If the counter is in error state, clear that first.
+	 * That way, if we see the counter in error state below, we
+	 * know that it has gone back into error state, as distinct
+	 * from the task having been scheduled away before the
+	 * cross-call arrived.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+	spin_unlock_irq(&ctx->lock);
+	task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+	spin_lock_irq(&ctx->lock);
+
+	/*
+	 * If the context is active and the counter is still off,
+	 * we need to retry the cross-call.
+	 */
+	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+		goto retry;
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_OFF) {
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
+	}
+ out:
+	spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_counter_refresh(struct perf_counter *counter, int refresh)
+{
+	/*
+	 * not supported on inherited counters
+	 */
+	if (counter->attr.inherit)
+		return -EINVAL;
+
+	atomic_add(refresh, &counter->event_limit);
+	perf_counter_enable(counter);
+
+	return 0;
+}
+
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+			      struct perf_cpu_context *cpuctx)
+{
+	struct perf_counter *counter;
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 0;
+	if (likely(!ctx->nr_counters))
+		goto out;
+	update_context_time(ctx);
+
+	perf_disable();
+	if (ctx->nr_active) {
+		list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+			if (counter != counter->group_leader)
+				counter_sched_out(counter, cpuctx, ctx);
+			else
+				group_sched_out(counter, cpuctx, ctx);
+		}
+	}
+	perf_enable();
+ out:
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+			 struct perf_counter_context *ctx2)
+{
+	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& !ctx1->pin_count && !ctx2->pin_count;
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ */
+void perf_counter_task_sched_out(struct task_struct *task,
+				 struct task_struct *next, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+	struct perf_counter_context *next_ctx;
+	struct perf_counter_context *parent;
+	struct pt_regs *regs;
+	int do_switch = 1;
+
+	regs = task_pt_regs(task);
+	perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+	if (likely(!ctx || !cpuctx->task_ctx))
+		return;
+
+	update_context_time(ctx);
+
+	rcu_read_lock();
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_ctx = next->perf_counter_ctxp;
+	if (parent && next_ctx &&
+	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+		/*
+		 * Looks like the two contexts are clones, so we might be
+		 * able to optimize the context switch.  We lock both
+		 * contexts and check that they are clones under the
+		 * lock (including re-checking that neither has been
+		 * uncloned in the meantime).  It doesn't matter which
+		 * order we take the locks because no other cpu could
+		 * be trying to lock both of these tasks.
+		 */
+		spin_lock(&ctx->lock);
+		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+		if (context_equiv(ctx, next_ctx)) {
+			/*
+			 * XXX do we need a memory barrier of sorts
+			 * wrt rcu_dereference() of perf_counter_ctxp
+			 */
+			task->perf_counter_ctxp = next_ctx;
+			next->perf_counter_ctxp = ctx;
+			ctx->task = next;
+			next_ctx->task = task;
+			do_switch = 0;
+		}
+		spin_unlock(&next_ctx->lock);
+		spin_unlock(&ctx->lock);
+	}
+	rcu_read_unlock();
+
+	if (do_switch) {
+		__perf_counter_sched_out(ctx, cpuctx);
+		cpuctx->task_ctx = NULL;
+	}
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	__perf_counter_sched_out(ctx, cpuctx);
+	cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
+}
+
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+			struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct perf_counter *counter;
+	int can_add_hw = 1;
+
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	if (likely(!ctx->nr_counters))
+		goto out;
+
+	ctx->timestamp = perf_clock();
+
+	perf_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    !counter->attr.pinned)
+			continue;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		if (counter != counter->group_leader)
+			counter_sched_in(counter, cpuctx, ctx, cpu);
+		else {
+			if (group_can_go_on(counter, cpuctx, 1))
+				group_sched_in(counter, cpuctx, ctx, cpu);
+		}
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+			update_group_times(counter);
+			counter->state = PERF_COUNTER_STATE_ERROR;
+		}
+	}
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		/*
+		 * Ignore counters in OFF or ERROR state, and
+		 * ignore pinned counters since we did them already.
+		 */
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    counter->attr.pinned)
+			continue;
+
+		/*
+		 * Listen to the 'cpu' scheduling filter constraint
+		 * of counters:
+		 */
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		if (counter != counter->group_leader) {
+			if (counter_sched_in(counter, cpuctx, ctx, cpu))
+				can_add_hw = 0;
+		} else {
+			if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+				if (group_sched_in(counter, cpuctx, ctx, cpu))
+					can_add_hw = 0;
+			}
+		}
+	}
+	perf_enable();
+ out:
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+
+	if (likely(!ctx))
+		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
+	__perf_counter_sched_in(ctx, cpuctx, cpu);
+	cpuctx->task_ctx = ctx;
+}
+
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+
+	__perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_counter *counter, int enable);
+static void perf_log_period(struct perf_counter *counter, u64 period);
+
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 period, sample_period;
+	s64 delta;
+
+	events *= hwc->sample_period;
+	period = div64_u64(events, counter->attr.sample_freq);
+
+	delta = (s64)(period - hwc->sample_period);
+	delta = (delta + 7) / 8; /* low pass filter */
+
+	sample_period = hwc->sample_period + delta;
+
+	if (!sample_period)
+		sample_period = 1;
+
+	perf_log_period(counter, sample_period);
+
+	hwc->sample_period = sample_period;
+}
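A worked example of the filter arithmetic, with illustrative numbers: a counter asking for sample_freq = 1000 Hz that currently uses sample_period = 100000 and is observed generating 2 * 10^8 events/sec has an ideal period of 2 * 10^8 / 1000 = 200000. The raw delta is 100000, the low-pass filter keeps (100000 + 7) / 8 = 12500 of it, and the period steps from 100000 to 112500; successive ticks converge on the ideal period instead of oscillating around it.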
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	u64 interrupts, freq;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+			continue;
+
+		hwc = &counter->hw;
+
+		interrupts = hwc->interrupts;
+		hwc->interrupts = 0;
+
+		/*
+		 * unthrottle counters on the tick
+		 */
+		if (interrupts == MAX_INTERRUPTS) {
+			perf_log_throttle(counter, 1);
+			counter->pmu->unthrottle(counter);
+			interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
+		}
+
+		if (!counter->attr.freq || !counter->attr.sample_freq)
+			continue;
+
+		/*
+		 * if the specified freq < HZ then we need to skip ticks
+		 */
+		if (counter->attr.sample_freq < HZ) {
+			freq = counter->attr.sample_freq;
+
+			hwc->freq_count += freq;
+			hwc->freq_interrupts += interrupts;
+
+			if (hwc->freq_count < HZ)
+				continue;
+
+			interrupts = hwc->freq_interrupts;
+			hwc->freq_interrupts = 0;
+			hwc->freq_count -= HZ;
+		} else
+			freq = HZ;
+
+		perf_adjust_period(counter, freq * interrupts);
+
+		/*
+		 * In order to avoid being stalled by an (accidental) huge
+		 * sample period, force reset the sample period if we didn't
+		 * get any events in this freq period.
+		 */
+		if (!interrupts) {
+			perf_disable();
+			counter->pmu->disable(counter);
+			atomic_set(&hwc->period_left, 0);
+			counter->pmu->enable(counter);
+			perf_enable();
+		}
+	}
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+
+	if (!ctx->nr_counters)
+		return;
+
+	spin_lock(&ctx->lock);
+	/*
+	 * Rotate the first entry last (works just fine for group counters too):
+	 */
+	perf_disable();
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		list_move_tail(&counter->list_entry, &ctx->counter_list);
+		break;
+	}
+	perf_enable();
+
+	spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	if (!atomic_read(&nr_counters))
+		return;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	ctx = curr->perf_counter_ctxp;
+
+	perf_ctx_adjust_freq(&cpuctx->ctx);
+	if (ctx)
+		perf_ctx_adjust_freq(ctx);
+
+	perf_counter_cpu_sched_out(cpuctx);
+	if (ctx)
+		__perf_counter_task_sched_out(ctx);
+
+	rotate_ctx(&cpuctx->ctx);
+	if (ctx)
+		rotate_ctx(ctx);
+
+	perf_counter_cpu_sched_in(cpuctx, cpu);
+	if (ctx)
+		perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Cross CPU call to read the hardware counter
+ */
+static void __read(void *info)
+{
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (ctx->is_active)
+		update_context_time(ctx);
+	counter->pmu->read(counter);
+	update_counter_times(counter);
+	local_irq_restore(flags);
+}
+
+static u64 perf_counter_read(struct perf_counter *counter)
+{
+	/*
+	 * If counter is enabled and currently active on a CPU, update the
+	 * value in the counter structure:
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+		smp_call_function_single(counter->oncpu,
+					 __read, counter, 1);
+	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
+	}
+
+	return atomic64_read(&counter->count);
+}
+
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	INIT_LIST_HEAD(&ctx->event_list);
+	atomic_set(&ctx->refcount, 1);
+	ctx->task = task;
+}
+
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+	struct perf_counter_context *parent_ctx;
+	struct perf_counter_context *ctx;
+	struct perf_cpu_context *cpuctx;
+	struct task_struct *task;
+	unsigned long flags;
+	int err;
+
+	/*
+	 * If cpu is not a wildcard then this is a percpu counter:
+	 */
+	if (cpu != -1) {
+		/* Must be root to operate on a CPU counter: */
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EACCES);
+
+		if (cpu < 0 || cpu >= num_possible_cpus())
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * We could be clever and allow attaching a counter to an
+		 * offline CPU and activate it when the CPU comes up, but
+		 * that's for later.
+		 */
+		if (!cpu_isset(cpu, cpu_online_map))
+			return ERR_PTR(-ENODEV);
+
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+		get_ctx(ctx);
+
+		return ctx;
+	}
+
+	rcu_read_lock();
+	if (!pid)
+		task = current;
+	else
+		task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	/*
+	 * Can't attach counters to a dying task.
+	 */
+	err = -ESRCH;
+	if (task->flags & PF_EXITING)
+		goto errout;
+
+	/* Reuse ptrace permission checks for now. */
+	err = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto errout;
+
+ retry:
+	ctx = perf_lock_task_context(task, &flags);
+	if (ctx) {
+		parent_ctx = ctx->parent_ctx;
+		if (parent_ctx) {
+			put_ctx(parent_ctx);
+			ctx->parent_ctx = NULL;		/* no longer a clone */
+		}
+		/*
+		 * Get an extra reference before dropping the lock so that
+		 * this context won't get freed if the task exits.
+		 */
+		get_ctx(ctx);
+		spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+
+	if (!ctx) {
+		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+		err = -ENOMEM;
+		if (!ctx)
+			goto errout;
+		__perf_counter_init_context(ctx, task);
+		get_ctx(ctx);
+		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
+			/*
+			 * We raced with some other task; use
+			 * the context they set.
+			 */
+			kfree(ctx);
+			goto retry;
+		}
+		get_task_struct(task);
+	}
+
+	put_task_struct(task);
+	return ctx;
+
+ errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
+}
+
+static void free_counter_rcu(struct rcu_head *head)
+{
+	struct perf_counter *counter;
+
+	counter = container_of(head, struct perf_counter, rcu_head);
+	if (counter->ns)
+		put_pid_ns(counter->ns);
+	kfree(counter);
+}
+
+static void perf_pending_sync(struct perf_counter *counter);
+
+static void free_counter(struct perf_counter *counter)
+{
+	perf_pending_sync(counter);
+
+	atomic_dec(&nr_counters);
+	if (counter->attr.mmap)
+		atomic_dec(&nr_mmap_counters);
+	if (counter->attr.comm)
+		atomic_dec(&nr_comm_counters);
+
+	if (counter->destroy)
+		counter->destroy(counter);
+
+	put_ctx(counter->ctx);
+	call_rcu(&counter->rcu_head, free_counter_rcu);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct perf_counter *counter = file->private_data;
+	struct perf_counter_context *ctx = counter->ctx;
+
+	file->private_data = NULL;
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_counter_remove_from_context(counter);
+	mutex_unlock(&ctx->mutex);
+
+	mutex_lock(&counter->owner->perf_counter_mutex);
+	list_del_init(&counter->owner_entry);
+	mutex_unlock(&counter->owner->perf_counter_mutex);
+	put_task_struct(counter->owner);
+
+	free_counter(counter);
+
+	return 0;
+}
+
+/*
+ * Read the performance counter - simple non-blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+	u64 values[3];
+	int n;
+
+	/*
+	 * Return end-of-file for a read on a counter that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		return 0;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	mutex_lock(&counter->child_mutex);
+	values[0] = perf_counter_read(counter);
+	n = 1;
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	if (counter->attr.read_format & PERF_FORMAT_ID)
+		values[n++] = counter->id;
+	mutex_unlock(&counter->child_mutex);
+
+	if (count < n * sizeof(u64))
+		return -EINVAL;
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
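Seen from userspace, a read() on the counter fd therefore returns one to three u64 values, in the order they are filled in above. A hedged sketch for a counter opened with both time fields in read_format:

#include <stdio.h>
#include <unistd.h>

/* Sketch: read count plus TOTAL_TIME_ENABLED/TOTAL_TIME_RUNNING. */
static void print_counter(int fd)
{
	unsigned long long values[3];

	if (read(fd, values, sizeof(values)) == sizeof(values))
		printf("count=%llu enabled=%llu running=%llu\n",
		       values[0], values[1], values[2]);
}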
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct perf_counter *counter = file->private_data;
+
+	return perf_read_hw(counter, buf, count);
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+	struct perf_counter *counter = file->private_data;
+	struct perf_mmap_data *data;
+	unsigned int events = POLLHUP;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		events = atomic_xchg(&data->poll, 0);
+	rcu_read_unlock();
+
+	poll_wait(file, &counter->waitq, wait);
+
+	return events;
+}
+
+static void perf_counter_reset(struct perf_counter *counter)
+{
+	(void)perf_counter_read(counter);
+	atomic64_set(&counter->count, 0);
+	perf_counter_update_userpage(counter);
+}
+
+static void perf_counter_for_each_sibling(struct perf_counter *counter,
+					  void (*func)(struct perf_counter *))
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	counter = counter->group_leader;
+
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		func(sibling);
+	mutex_unlock(&ctx->mutex);
+}
+
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
+static void perf_counter_for_each_child(struct perf_counter *counter,
+					void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	mutex_lock(&counter->child_mutex);
+	func(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		func(child);
+	mutex_unlock(&counter->child_mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+				  void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	mutex_lock(&counter->child_mutex);
+	perf_counter_for_each_sibling(counter, func);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_for_each_sibling(child, func);
+	mutex_unlock(&counter->child_mutex);
+}
+
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	int ret = 0;
+	u64 value;
+
+	if (!counter->attr.sample_period)
+		return -EINVAL;
+
+	if (copy_from_user(&value, arg, sizeof(value)))
+		return -EFAULT;
+
+	if (!value)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->attr.freq) {
+		if (value > sysctl_perf_counter_sample_rate) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		counter->attr.sample_freq = value;
+	} else {
+		perf_log_period(counter, value);
+
+		counter->attr.sample_period = value;
+		counter->hw.sample_period = value;
+	}
+unlock:
+	spin_unlock_irq(&ctx->lock);
+
+	return ret;
+}
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct perf_counter *counter = file->private_data;
+	void (*func)(struct perf_counter *);
+	u32 flags = arg;
+
+	switch (cmd) {
+	case PERF_COUNTER_IOC_ENABLE:
+		func = perf_counter_enable;
+		break;
+	case PERF_COUNTER_IOC_DISABLE:
+		func = perf_counter_disable;
+		break;
+	case PERF_COUNTER_IOC_RESET:
+		func = perf_counter_reset;
+		break;
+
+	case PERF_COUNTER_IOC_REFRESH:
+		return perf_counter_refresh(counter, arg);
+
+	case PERF_COUNTER_IOC_PERIOD:
+		return perf_counter_period(counter, (u64 __user *)arg);
+
+	default:
+		return -ENOTTY;
+	}
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_counter_for_each(counter, func);
+	else
+		perf_counter_for_each_child(counter, func);
+
+	return 0;
+}
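From userspace these are ordinary ioctl(2) calls on the counter fd; putting PERF_IOC_FLAG_GROUP in the argument applies the operation to the whole group, per the dispatch above. A hedged sketch:

#include <sys/ioctl.h>
#include <linux/perf_counter.h>

/* Sketch: zero and restart an entire counter group via its leader fd. */
static void restart_group(int group_fd)
{
	ioctl(group_fd, PERF_COUNTER_IOC_RESET, PERF_IOC_FLAG_GROUP);
	ioctl(group_fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}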
+
+int perf_counter_task_enable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_enable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_disable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We cannot serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_counter_mmap_page *userpg;
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	userpg = data->user_page;
+
+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted.
+	 */
+	preempt_disable();
+	++userpg->lock;
+	barrier();
+	userpg->index = counter->hw.idx;
+	userpg->offset = atomic64_read(&counter->count);
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+	barrier();
+	++userpg->lock;
+	preempt_enable();
+unlock:
+	rcu_read_unlock();
+}
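The two ++userpg->lock increments bracket the update seqcount-style: the value is odd while an update is in flight. A userspace reader of the mmap()ed control page must retry until it sees the same even value on both sides of its reads; a hedged sketch (struct layout from <linux/perf_counter.h>, memory barrier spelled with a GCC builtin):

/* Sketch: consistent read of the self-monitoring control page. */
static long long read_offset(volatile struct perf_counter_mmap_page *pc)
{
	unsigned int seq;
	long long offset;

	do {
		seq = pc->lock;
		__sync_synchronize();	/* pairs with the kernel's barrier() */
		offset = pc->offset;
		__sync_synchronize();
	} while (pc->lock != seq || (seq & 1));	/* raced with an update */

	return offset;
}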
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff == 0) {
+		vmf->page = virt_to_page(data->user_page);
+	} else {
+		int nr = vmf->pgoff - 1;
+
+		if ((unsigned)nr >= data->nr_pages)
+			goto unlock;
+
+		vmf->page = virt_to_page(data->data_pages[nr]);
+	}
+	get_page(vmf->page);
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	int i;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += nr_pages * sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!data->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!data->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	data->nr_pages = nr_pages;
+	atomic_set(&data->lock, -1);
+
+	rcu_assign_pointer(counter->data, data);
+
+	return 0;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)data->data_pages[i]);
+
+	free_page((unsigned long)data->user_page);
+
+fail_user_page:
+	kfree(data);
+
+fail:
+	return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data;
+	int i;
+
+	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
+	free_page((unsigned long)data->user_page);
+	for (i = 0; i < data->nr_pages; i++)
+		free_page((unsigned long)data->data_pages[i]);
+	kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data = counter->data;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	rcu_assign_pointer(counter->data, NULL);
+	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
+		struct user_struct *user = current_user();
+
+		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
+		vma->vm_mm->locked_vm -= counter->data->nr_locked;
+		perf_mmap_data_free(counter);
+		mutex_unlock(&counter->mmap_mutex);
+	}
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+	.open  = perf_mmap_open,
+	.close = perf_mmap_close,
+	.fault = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = file->private_data;
+	unsigned long user_locked, user_lock_limit;
+	struct user_struct *user = current_user();
+	unsigned long locked, lock_limit;
+	unsigned long vma_size;
+	unsigned long nr_pages;
+	long user_extra, extra;
+	int ret = 0;
+
+	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+		return -EINVAL;
+
+	vma_size = vma->vm_end - vma->vm_start;
+	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	/*
+	 * If we have data pages, ensure they're a power-of-two number, so we
+	 * can do bitmasks instead of modulo.
+	 */
+	if (nr_pages != 0 && !is_power_of_2(nr_pages))
+		return -EINVAL;
+
+	if (vma_size != PAGE_SIZE * (1 + nr_pages))
+		return -EINVAL;
+
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count)) {
+		if (nr_pages != counter->data->nr_pages)
+			ret = -EINVAL;
+		goto unlock;
+	}
+
+	user_extra = nr_pages + 1;
+	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	user_lock_limit *= num_online_cpus();
+
+	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+	extra = 0;
+	if (user_locked > user_lock_limit)
+		extra = user_locked - user_lock_limit;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+	locked = vma->vm_mm->locked_vm + extra;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -EPERM;
+		goto unlock;
+	}
+
+	WARN_ON(counter->data);
+	ret = perf_mmap_data_alloc(counter, nr_pages);
+	if (ret)
+		goto unlock;
+
+	atomic_set(&counter->mmap_count, 1);
+	atomic_long_add(user_extra, &user->locked_vm);
+	vma->vm_mm->locked_vm += extra;
+	counter->data->nr_locked = extra;
+unlock:
+	mutex_unlock(&counter->mmap_mutex);
+
+	vma->vm_flags &= ~VM_MAYWRITE;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &perf_mmap_vmops;
+
+	return ret;
+}
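
A minimal user-space sketch of the mapping contract enforced above: a read-only
shared mapping at offset 0, covering one metadata page plus a power-of-two
number of data pages. The fd is assumed to come from sys_perf_counter_open();
the helper name is hypothetical, not part of the patch.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Hypothetical helper: map 1 metadata page + data_pages (must be 2^n). */
static void *map_counter_buffer(int fd, unsigned int data_pages)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	void *base = mmap(NULL, (1 + data_pages) * page,
			  PROT_READ, MAP_SHARED, fd, 0);

	if (base == MAP_FAILED) {
		perror("mmap");
		return NULL;
	}
	return base;	/* page 0 holds struct perf_counter_mmap_page */
}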
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct perf_counter *counter = filp->private_data;
+	int retval;
+
+	mutex_lock(&inode->i_mutex);
+	retval = fasync_helper(fd, filp, on, &counter->fasync);
+	mutex_unlock(&inode->i_mutex);
+
+	if (retval < 0)
+		return retval;
+
+	return 0;
+}
+
+static const struct file_operations perf_fops = {
+	.release		= perf_release,
+	.read			= perf_read,
+	.poll			= perf_poll,
+	.unlocked_ioctl		= perf_ioctl,
+	.compat_ioctl		= perf_ioctl,
+	.mmap			= perf_mmap,
+	.fasync			= perf_fasync,
+};
+
+/*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+	wake_up_all(&counter->waitq);
+
+	if (counter->pending_kill) {
+		kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
+		counter->pending_kill = 0;
+	}
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wake up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * singly-linked list and use cmpxchg() to add entries locklessly.
+ */
+
+static void perf_pending_counter(struct perf_pending_entry *entry)
+{
+	struct perf_counter *counter = container_of(entry,
+			struct perf_counter, pending);
+
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		perf_counter_disable(counter);
+	}
+
+	if (counter->pending_wakeup) {
+		counter->pending_wakeup = 0;
+		perf_counter_wakeup(counter);
+	}
+}
+
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
+	PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_pending_entry *entry,
+			       void (*func)(struct perf_pending_entry *))
+{
+	struct perf_pending_entry **head;
+
+	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
+		return;
+
+	entry->func = func;
+
+	head = &get_cpu_var(perf_pending_head);
+
+	do {
+		entry->next = *head;
+	} while (cmpxchg(head, entry->next, entry) != entry->next);
+
+	set_perf_counter_pending();
+
+	put_cpu_var(perf_pending_head);
+}
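
The queueing above is the classic lock-free stack push, with PENDING_TAIL
doubling as the "not queued" sentinel that the initial cmpxchg() tests for.
A user-space analogue in C11 atomics — purely illustrative, not kernel API:

#include <stdatomic.h>

struct pending {
	struct pending *next;
};

#define TAIL	((struct pending *)-1UL)	/* mirrors PENDING_TAIL */

static _Atomic(struct pending *) head = TAIL;

static void pending_push(struct pending *entry)
{
	struct pending *old = atomic_load(&head);

	do {
		entry->next = old;		/* mirrors entry->next = *head */
	} while (!atomic_compare_exchange_weak(&head, &old, entry));
}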
+
+static int __perf_pending_run(void)
+{
+	struct perf_pending_entry *list;
+	int nr = 0;
+
+	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
+	while (list != PENDING_TAIL) {
+		void (*func)(struct perf_pending_entry *);
+		struct perf_pending_entry *entry = list;
+
+		list = list->next;
+
+		func = entry->func;
+		entry->next = NULL;
+		/*
+		 * Ensure we observe the unqueue before we issue the wakeup,
+		 * so that we won't be waiting forever.
+		 * -- see perf_not_pending().
+		 */
+		smp_wmb();
+
+		func(entry);
+		nr++;
+	}
+
+	return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+	/*
+	 * If we flush on whatever CPU we run on, there is a chance we don't
+	 * need to wait.
+	 */
+	get_cpu();
+	__perf_pending_run();
+	put_cpu();
+
+	/*
+	 * Ensure we see the proper queue state before going to sleep
+	 * so that we do not miss the wakeup. -- see perf_pending_handle()
+	 */
+	smp_rmb();
+	return counter->pending.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+	wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+	__perf_pending_run();
+}
+
+/*
+ * Callchain support -- arch specific
+ */
+
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	return NULL;
+}
+
+/*
+ * Output
+ */
+
+struct perf_output_handle {
+	struct perf_counter	*counter;
+	struct perf_mmap_data	*data;
+	unsigned long		head;
+	unsigned long		offset;
+	int			nmi;
+	int			overflow;
+	int			locked;
+	unsigned long		flags;
+};
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+	atomic_set(&handle->data->poll, POLL_IN);
+
+	if (handle->nmi) {
+		handle->counter->pending_wakeup = 1;
+		perf_pending_queue(&handle->counter->pending,
+				   perf_pending_counter);
+	} else
+		perf_counter_wakeup(handle->counter);
+}
+
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int cpu;
+
+	handle->locked = 0;
+
+	local_irq_save(handle->flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi() && atomic_read(&data->lock) == cpu)
+		return;
+
+	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+		cpu_relax();
+
+	handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	unsigned long head;
+	int cpu;
+
+	data->done_head = data->head;
+
+	if (!handle->locked)
+		goto out;
+
+again:
+	/*
+	 * The xchg implies a full barrier that ensures all writes are done
+	 * before we publish the new head, matched by a rmb() in userspace when
+	 * reading this position.
+	 */
+	while ((head = atomic_long_xchg(&data->done_head, 0)))
+		data->user_page->data_head = head;
+
+	/*
+	 * NMI can happen here, which means we can miss a done_head update.
+	 */
+
+	cpu = atomic_xchg(&data->lock, -1);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
+	/*
+	 * Therefore we have to validate that we did not indeed miss one.
+	 */
+	if (unlikely(atomic_long_read(&data->done_head))) {
+		/*
+		 * Since we had it locked, we can lock it again.
+		 */
+		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+			cpu_relax();
+
+		goto again;
+	}
+
+	if (atomic_xchg(&data->wakeup, 0))
+		perf_output_wakeup(handle);
+out:
+	local_irq_restore(handle->flags);
+}
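
The "rmb() in userspace" that the comment above refers to looks roughly like
this on the reader side: load data_head, issue a read barrier, then consume
sample bytes up to that head. A sketch assuming the perf_counter.h header is
visible to user space; the barrier macro is a stand-in for a real arch rmb:

#include <linux/perf_counter.h>

#define rmb()	__sync_synchronize()	/* stand-in for the real arch rmb */

static __u64 read_data_head(volatile struct perf_counter_mmap_page *pg)
{
	__u64 head = pg->data_head;	/* published by the xchg loop above */

	rmb();	/* order the head load before reading the sample bytes */
	return head;
}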
+
+static int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi, int overflow)
+{
+	struct perf_mmap_data *data;
+	unsigned int offset, head;
+
+	/*
+	 * For inherited counters we send all the output towards the parent.
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	handle->data	 = data;
+	handle->counter	 = counter;
+	handle->nmi	 = nmi;
+	handle->overflow = overflow;
+
+	if (!data->nr_pages)
+		goto fail;
+
+	perf_output_lock(handle);
+
+	do {
+		offset = head = atomic_long_read(&data->head);
+		head += size;
+	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+	handle->offset	= offset;
+	handle->head	= head;
+
+	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+		atomic_set(&data->wakeup, 1);
+
+	return 0;
+
+fail:
+	perf_output_wakeup(handle);
+out:
+	rcu_read_unlock();
+
+	return -ENOSPC;
+}
+
+static void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset		= handle->offset;
+	pages_mask	= handle->data->nr_pages - 1;
+	pages		= handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len	    -= size;
+		buf	    += size;
+		offset	    += size;
+	} while (len);
+
+	handle->offset = offset;
+
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
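
A worked illustration of the wrap arithmetic above: because nr_pages is a
power of two, the data-page index is a mask operation rather than a modulo.
Hypothetical, with a fixed 4 KiB page:

#include <assert.h>

#define PAGE_SHIFT	12

static unsigned int page_index(unsigned long offset, unsigned int nr_pages)
{
	return (offset >> PAGE_SHIFT) & (nr_pages - 1);	/* nr_pages == 2^n */
}

int main(void)
{
	/* with 8 data pages, an offset 9 pages in wraps around to page 1 */
	assert(page_index(9UL << PAGE_SHIFT, 8) == 1);
	return 0;
}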
+
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
+static void perf_output_end(struct perf_output_handle *handle)
+{
+	struct perf_counter *counter = handle->counter;
+	struct perf_mmap_data *data = handle->data;
+
+	int wakeup_events = counter->attr.wakeup_events;
+
+	if (handle->overflow && wakeup_events) {
+		int events = atomic_inc_return(&data->events);
+		if (events >= wakeup_events) {
+			atomic_sub(wakeup_events, &data->events);
+			atomic_set(&data->wakeup, 1);
+		}
+	}
+
+	perf_output_unlock(handle);
+	rcu_read_unlock();
+}
+
+static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
+{
+	/*
+	 * only top level counters have the pid namespace they were created in
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	return task_tgid_nr_ns(p, counter->ns);
+}
+
+static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
+{
+	/*
+	 * only top level counters have the pid namespace they were created in
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	return task_pid_nr_ns(p, counter->ns);
+}
+
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data)
+{
+	int ret;
+	u64 sample_type = counter->attr.sample_type;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	u64 ip;
+	struct {
+		u32 pid, tid;
+	} tid_entry;
+	struct {
+		u64 id;
+		u64 counter;
+	} group_entry;
+	struct perf_callchain_entry *callchain = NULL;
+	int callchain_size = 0;
+	u64 time;
+	struct {
+		u32 cpu, reserved;
+	} cpu_entry;
+
+	header.type = 0;
+	header.size = sizeof(header);
+
+	header.misc = PERF_EVENT_MISC_OVERFLOW;
+	header.misc |= perf_misc_flags(data->regs);
+
+	if (sample_type & PERF_SAMPLE_IP) {
+		ip = perf_instruction_pointer(data->regs);
+		header.type |= PERF_SAMPLE_IP;
+		header.size += sizeof(ip);
+	}
+
+	if (sample_type & PERF_SAMPLE_TID) {
+		/* namespace issues */
+		tid_entry.pid = perf_counter_pid(counter, current);
+		tid_entry.tid = perf_counter_tid(counter, current);
+
+		header.type |= PERF_SAMPLE_TID;
+		header.size += sizeof(tid_entry);
+	}
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		/*
+		 * Maybe do better on x86 and provide cpu_clock_nmi()
+		 */
+		time = sched_clock();
+
+		header.type |= PERF_SAMPLE_TIME;
+		header.size += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_ADDR) {
+		header.type |= PERF_SAMPLE_ADDR;
+		header.size += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_ID) {
+		header.type |= PERF_SAMPLE_ID;
+		header.size += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		header.type |= PERF_SAMPLE_CPU;
+		header.size += sizeof(cpu_entry);
+
+		cpu_entry.cpu = raw_smp_processor_id();
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		header.type |= PERF_SAMPLE_PERIOD;
+		header.size += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_GROUP) {
+		header.type |= PERF_SAMPLE_GROUP;
+		header.size += sizeof(u64) +
+			counter->nr_siblings * sizeof(group_entry);
+	}
+
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+		callchain = perf_callchain(data->regs);
+
+		if (callchain) {
+			callchain_size = (1 + callchain->nr) * sizeof(u64);
+
+			header.type |= PERF_SAMPLE_CALLCHAIN;
+			header.size += callchain_size;
+		}
+	}
+
+	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, header);
+
+	if (sample_type & PERF_SAMPLE_IP)
+		perf_output_put(&handle, ip);
+
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(&handle, tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(&handle, time);
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		perf_output_put(&handle, data->addr);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(&handle, counter->id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(&handle, cpu_entry);
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		perf_output_put(&handle, data->period);
+
+	/*
+	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
+	 */
+	if (sample_type & PERF_SAMPLE_GROUP) {
+		struct perf_counter *leader, *sub;
+		u64 nr = counter->nr_siblings;
+
+		perf_output_put(&handle, nr);
+
+		leader = counter->group_leader;
+		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+			if (sub != counter)
+				sub->pmu->read(sub);
+
+			group_entry.id = sub->id;
+			group_entry.counter = atomic64_read(&sub->count);
+
+			perf_output_put(&handle, group_entry);
+		}
+	}
+
+	if (callchain)
+		perf_output_copy(&handle, callchain, callchain_size);
+
+	perf_output_end(&handle);
+}
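
For a counter opened with sample_type == PERF_SAMPLE_IP | PERF_SAMPLE_TID |
PERF_SAMPLE_TIME, the code above produces records with the layout below; a
hypothetical reader-side struct, with field order matching the
perf_output_put() sequence:

#include <linux/perf_counter.h>

struct sample_record {
	struct perf_event_header header;	/* .misc has OVERFLOW set */
	__u64	ip;				/* PERF_SAMPLE_IP */
	__u32	pid, tid;			/* PERF_SAMPLE_TID */
	__u64	time;				/* PERF_SAMPLE_TIME */
};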
+
+/*
+ * fork tracking
+ */
+
+struct perf_fork_event {
+	struct task_struct	*task;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				ppid;
+	} event;
+};
+
+static void perf_counter_fork_output(struct perf_counter *counter,
+				     struct perf_fork_event *fork_event)
+{
+	struct perf_output_handle handle;
+	int size = fork_event->event.header.size;
+	struct task_struct *task = fork_event->task;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	fork_event->event.pid = perf_counter_pid(counter, task);
+	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+
+	perf_output_put(&handle, fork_event->event);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_fork_match(struct perf_counter *counter)
+{
+	if (counter->attr.comm || counter->attr.mmap)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
+				  struct perf_fork_event *fork_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_fork_match(counter))
+			perf_counter_fork_output(counter, fork_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * It doesn't really matter which of the child contexts the
+	 * event ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_fork_ctx(ctx, fork_event);
+	rcu_read_unlock();
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+	struct perf_fork_event fork_event;
+
+	if (!atomic_read(&nr_comm_counters) &&
+	    !atomic_read(&nr_mmap_counters))
+		return;
+
+	fork_event = (struct perf_fork_event){
+		.task	= task,
+		.event  = {
+			.header = {
+				.type = PERF_EVENT_FORK,
+				.size = sizeof(fork_event.event),
+			},
+		},
+	};
+
+	perf_counter_fork_event(&fork_event);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+	struct task_struct	*task;
+	char			*comm;
+	int			comm_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+	} event;
+};
+
+static void perf_counter_comm_output(struct perf_counter *counter,
+				     struct perf_comm_event *comm_event)
+{
+	struct perf_output_handle handle;
+	int size = comm_event->event.header.size;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
+	comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
+
+	perf_output_put(&handle, comm_event->event);
+	perf_output_copy(&handle, comm_event->comm,
+				   comm_event->comm_size);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_comm_match(struct perf_counter *counter)
+{
+	if (counter->attr.comm)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
+				  struct perf_comm_event *comm_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_comm_match(counter))
+			perf_counter_comm_output(counter, comm_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_comm_event(struct perf_comm_event *comm_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+	unsigned int size;
+	char *comm = comm_event->task->comm;
+
+	size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+	comm_event->comm = comm;
+	comm_event->comm_size = size;
+
+	comm_event->event.header.size = sizeof(comm_event->event) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * It doesn't really matter which of the child contexts the
+	 * event ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_comm_ctx(ctx, comm_event);
+	rcu_read_unlock();
+}
+
+void perf_counter_comm(struct task_struct *task)
+{
+	struct perf_comm_event comm_event;
+
+	if (!atomic_read(&nr_comm_counters))
+		return;
+
+	comm_event = (struct perf_comm_event){
+		.task	= task,
+		.event  = {
+			.header = { .type = PERF_EVENT_COMM, },
+		},
+	};
+
+	perf_counter_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+	struct vm_area_struct	*vma;
+
+	const char		*file_name;
+	int			file_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+		u64				start;
+		u64				len;
+		u64				pgoff;
+	} event;
+};
+
+static void perf_counter_mmap_output(struct perf_counter *counter,
+				     struct perf_mmap_event *mmap_event)
+{
+	struct perf_output_handle handle;
+	int size = mmap_event->event.header.size;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	mmap_event->event.pid = perf_counter_pid(counter, current);
+	mmap_event->event.tid = perf_counter_tid(counter, current);
+
+	perf_output_put(&handle, mmap_event->event);
+	perf_output_copy(&handle, mmap_event->file_name,
+				   mmap_event->file_size);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_mmap_match(struct perf_counter *counter,
+				   struct perf_mmap_event *mmap_event)
+{
+	if (counter->attr.mmap)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
+				  struct perf_mmap_event *mmap_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_mmap_match(counter, mmap_event))
+			perf_counter_mmap_output(counter, mmap_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+	struct vm_area_struct *vma = mmap_event->vma;
+	struct file *file = vma->vm_file;
+	unsigned int size;
+	char tmp[16];
+	char *buf = NULL;
+	const char *name;
+
+	if (file) {
+		buf = kzalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = strncpy(tmp, "//enomem", sizeof(tmp));
+			goto got_name;
+		}
+		name = d_path(&file->f_path, buf, PATH_MAX);
+		if (IS_ERR(name)) {
+			name = strncpy(tmp, "//toolong", sizeof(tmp));
+			goto got_name;
+		}
+	} else {
+		name = arch_vma_name(mmap_event->vma);
+		if (name)
+			goto got_name;
+
+		if (!vma->vm_mm) {
+			name = strncpy(tmp, "[vdso]", sizeof(tmp));
+			goto got_name;
+		}
+
+		name = strncpy(tmp, "//anon", sizeof(tmp));
+		goto got_name;
+	}
+
+got_name:
+	size = ALIGN(strlen(name)+1, sizeof(u64));
+
+	mmap_event->file_name = name;
+	mmap_event->file_size = size;
+
+	mmap_event->event.header.size = sizeof(mmap_event->event) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * It doesn't really matter which of the child contexts the
+	 * event ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_mmap_ctx(ctx, mmap_event);
+	rcu_read_unlock();
+
+	kfree(buf);
+}
+
+void __perf_counter_mmap(struct vm_area_struct *vma)
+{
+	struct perf_mmap_event mmap_event;
+
+	if (!atomic_read(&nr_mmap_counters))
+		return;
+
+	mmap_event = (struct perf_mmap_event){
+		.vma	= vma,
+		.event  = {
+			.header = { .type = PERF_EVENT_MMAP, },
+			.start  = vma->vm_start,
+			.len    = vma->vm_end - vma->vm_start,
+			.pgoff  = vma->vm_pgoff,
+		},
+	};
+
+	perf_counter_mmap_event(&mmap_event);
+}
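
A hypothetical reader-side view of one PERF_EVENT_MMAP record as emitted
above; the file name is NUL-terminated and padded to a u64 boundary by the
ALIGN() in perf_counter_mmap_event():

#include <linux/perf_counter.h>

struct mmap_record {
	struct perf_event_header header;	/* .type == PERF_EVENT_MMAP */
	__u32	pid, tid;
	__u64	start, len, pgoff;
	char	filename[];			/* padded to an 8-byte multiple */
};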
+
+/*
+ * Log sample_period changes so that analyzing tools can re-normalize the
+ * event flow.
+ */
+
+struct freq_event {
+	struct perf_event_header	header;
+	u64				time;
+	u64				id;
+	u64				period;
+};
+
+static void perf_log_period(struct perf_counter *counter, u64 period)
+{
+	struct perf_output_handle handle;
+	struct freq_event event;
+	int ret;
+
+	if (counter->hw.sample_period == period)
+		return;
+
+	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
+		return;
+
+	event = (struct freq_event) {
+		.header = {
+			.type = PERF_EVENT_PERIOD,
+			.misc = 0,
+			.size = sizeof(event),
+		},
+		.time = sched_clock(),
+		.id = counter->id,
+		.period = period,
+	};
+
+	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, event);
+	perf_output_end(&handle);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+	struct perf_output_handle handle;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				time;
+		u64				id;
+	} throttle_event = {
+		.header = {
+			.type = PERF_EVENT_THROTTLE + enable,
+			.misc = 0,
+			.size = sizeof(throttle_event),
+		},
+		.time	= sched_clock(),
+		.id	= counter->id,
+	};
+
+	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, throttle_event);
+	perf_output_end(&handle);
+}
+
+/*
+ * Generic counter overflow handling.
+ */
+
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+			  struct perf_sample_data *data)
+{
+	int events = atomic_read(&counter->event_limit);
+	int throttle = counter->pmu->unthrottle != NULL;
+	struct hw_perf_counter *hwc = &counter->hw;
+	int ret = 0;
+
+	if (!throttle) {
+		hwc->interrupts++;
+	} else {
+		if (hwc->interrupts != MAX_INTERRUPTS) {
+			hwc->interrupts++;
+			if (HZ * hwc->interrupts >
+					(u64)sysctl_perf_counter_sample_rate) {
+				hwc->interrupts = MAX_INTERRUPTS;
+				perf_log_throttle(counter, 0);
+				ret = 1;
+			}
+		} else {
+			/*
+			 * Keep re-disabling the counter even though we
+			 * disabled it on the previous pass - just in case we
+			 * raced with a sched-in and the counter got enabled
+			 * again:
+			 */
+			ret = 1;
+		}
+	}
+
+	if (counter->attr.freq) {
+		u64 now = sched_clock();
+		s64 delta = now - hwc->freq_stamp;
+
+		hwc->freq_stamp = now;
+
+		if (delta > 0 && delta < TICK_NSEC)
+			perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+	}
+
+	/*
+	 * XXX event_limit might not quite work as expected on inherited
+	 * counters
+	 */
+
+	counter->pending_kill = POLL_IN;
+	if (events && atomic_dec_and_test(&counter->event_limit)) {
+		ret = 1;
+		counter->pending_kill = POLL_HUP;
+		if (nmi) {
+			counter->pending_disable = 1;
+			perf_pending_queue(&counter->pending,
+					   perf_pending_counter);
+		} else
+			perf_counter_disable(counter);
+	}
+
+	perf_counter_output(counter, nmi, data);
+	return ret;
+}
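
The freq path above turns the gap between two overflows into an instantaneous
rate: one overflow every delta nanoseconds is NSEC_PER_SEC / delta overflows
per second, which perf_adjust_period() (earlier in this file) uses to steer
the period toward the requested sample frequency. A worked check of the
arithmetic:

#include <assert.h>

#define NSEC_PER_SEC	1000000000LL

int main(void)
{
	long long delta = 250000;	/* ns between consecutive overflows */

	assert(NSEC_PER_SEC / delta == 4000);	/* 4000 overflows/sec */
	return 0;
}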
+
+/*
+ * Generic software counter infrastructure
+ */
+
+static void perf_swcounter_update(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 prev, now;
+	s64 delta;
+
+again:
+	prev = atomic64_read(&hwc->prev_count);
+	now = atomic64_read(&hwc->count);
+	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
+		goto again;
+
+	delta = now - prev;
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
+static void perf_swcounter_set_period(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_add(period, &hwc->period_left);
+		hwc->last_period = period;
+	}
+
+	atomic64_set(&hwc->prev_count, -left);
+	atomic64_set(&hwc->count, -left);
+}
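
The negative start value is what lets atomic64_add_negative() in
perf_swcounter_add() double as the overflow test: the count begins at
-period_left and an overflow is due exactly when it stops being negative.
A small worked sketch:

#include <assert.h>

int main(void)
{
	long long count = -100;	/* period_left == sample_period == 100 */
	int events = 0;

	while (count < 0) {	/* atomic64_add_negative() still true */
		count += 1;	/* one software event, nr == 1 */
		events++;
	}
	assert(events == 100);	/* overflow fires after one full period */
	return 0;
}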
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct perf_counter *counter;
+	u64 period;
+
+	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->pmu->read(counter);
+
+	data.addr = 0;
+	data.regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->attr.exclude_kernel || !data.regs) &&
+			!counter->attr.exclude_user)
+		data.regs = task_pt_regs(current);
+
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
+			ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, counter->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct pt_regs *regs, u64 addr)
+{
+	struct perf_sample_data data = {
+		.regs	= regs,
+		.addr	= addr,
+		.period	= counter->hw.last_period,
+	};
+
+	perf_swcounter_update(counter);
+	perf_swcounter_set_period(counter);
+	if (perf_counter_overflow(counter, nmi, &data))
+		/* soft-disable the counter */
+		;
+}
+
+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx;
+	unsigned long flags;
+	int count;
+
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		return 1;
+
+	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+		return 0;
+
+	/*
+	 * If the counter is inactive, it could be just because
+	 * its task is scheduled out, or because it's in a group
+	 * which could not go on the PMU.  We want to count in
+	 * the first case but not the second.  If the context is
+	 * currently active then an inactive software counter must
+	 * be the second case.  If it's not currently active then
+	 * we need to know whether the counter was active when the
+	 * context was last active, which we can determine by
+	 * comparing counter->tstamp_stopped with ctx->time.
+	 *
+	 * We are within an RCU read-side critical section,
+	 * which protects the existence of *ctx.
+	 */
+	ctx = counter->ctx;
+	spin_lock_irqsave(&ctx->lock, flags);
+	count = 1;
+	/* Re-check state now we have the lock */
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->ctx->is_active ||
+	    counter->tstamp_stopped < ctx->time)
+		count = 0;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return count;
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+				enum perf_type_id type,
+				u32 event, struct pt_regs *regs)
+{
+	if (!perf_swcounter_is_counting(counter))
+		return 0;
+
+	if (counter->attr.type != type)
+		return 0;
+	if (counter->attr.config != event)
+		return 0;
+
+	if (regs) {
+		if (counter->attr.exclude_user && user_mode(regs))
+			return 0;
+
+		if (counter->attr.exclude_kernel && !user_mode(regs))
+			return 0;
+	}
+
+	return 1;
+}
+
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct pt_regs *regs, u64 addr)
+{
+	int neg = atomic64_add_negative(nr, &counter->hw.count);
+
+	if (counter->hw.sample_period && !neg && regs)
+		perf_swcounter_overflow(counter, nmi, regs, addr);
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+				     enum perf_type_id type, u32 event,
+				     u64 nr, int nmi, struct pt_regs *regs,
+				     u64 addr)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_swcounter_match(counter, type, event, regs))
+			perf_swcounter_add(counter, nr, nmi, regs, addr);
+	}
+	rcu_read_unlock();
+}
+
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+	if (in_nmi())
+		return &cpuctx->recursion[3];
+
+	if (in_irq())
+		return &cpuctx->recursion[2];
+
+	if (in_softirq())
+		return &cpuctx->recursion[1];
+
+	return &cpuctx->recursion[0];
+}
+
+static void __perf_swcounter_event(enum perf_type_id type, u32 event,
+				   u64 nr, int nmi, struct pt_regs *regs,
+				   u64 addr)
+{
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int *recursion = perf_swcounter_recursion_context(cpuctx);
+	struct perf_counter_context *ctx;
+
+	if (*recursion)
+		goto out;
+
+	(*recursion)++;
+	barrier();
+
+	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
+				 nr, nmi, regs, addr);
+	rcu_read_lock();
+	/*
+	 * It doesn't really matter which of the child contexts the
+	 * event ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+	rcu_read_unlock();
+
+	barrier();
+	(*recursion)--;
+
+out:
+	put_cpu_var(perf_cpu_context);
+}
+
+void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{
+	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+	perf_swcounter_set_period(counter);
+	return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+}
+
+static const struct pmu perf_ops_generic = {
+	.enable		= perf_swcounter_enable,
+	.disable	= perf_swcounter_disable,
+	.read		= perf_swcounter_read,
+};
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+	int cpu = raw_smp_processor_id();
+	s64 prev;
+	u64 now;
+
+	now = cpu_clock(cpu);
+	prev = atomic64_read(&counter->hw.prev_count);
+	atomic64_set(&counter->hw.prev_count, now);
+	atomic64_add(now - prev, &counter->count);
+}
+
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int cpu = raw_smp_processor_id();
+
+	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swcounter_hrtimer;
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+
+	return 0;
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+	if (counter->hw.sample_period)
+		hrtimer_cancel(&counter->hw.hrtimer);
+	cpu_clock_perf_counter_update(counter);
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+	cpu_clock_perf_counter_update(counter);
+}
+
+static const struct pmu perf_ops_cpu_clock = {
+	.enable		= cpu_clock_perf_counter_enable,
+	.disable	= cpu_clock_perf_counter_disable,
+	.read		= cpu_clock_perf_counter_read,
+};
+
+/*
+ * Software counter: task time clock
+ */
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+	u64 prev;
+	s64 delta;
+
+	prev = atomic64_xchg(&counter->hw.prev_count, now);
+	delta = now - prev;
+	atomic64_add(delta, &counter->count);
+}
+
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 now;
+
+	now = counter->ctx->time;
+
+	atomic64_set(&hwc->prev_count, now);
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swcounter_hrtimer;
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+
+	return 0;
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+	if (counter->hw.sample_period)
+		hrtimer_cancel(&counter->hw.hrtimer);
+	task_clock_perf_counter_update(counter, counter->ctx->time);
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+	u64 time;
+
+	if (!in_nmi()) {
+		update_context_time(counter->ctx);
+		time = counter->ctx->time;
+	} else {
+		u64 now = perf_clock();
+		u64 delta = now - counter->ctx->timestamp;
+		time = counter->ctx->time + delta;
+	}
+
+	task_clock_perf_counter_update(counter, time);
+}
+
+static const struct pmu perf_ops_task_clock = {
+	.enable		= task_clock_perf_counter_enable,
+	.disable	= task_clock_perf_counter_disable,
+	.read		= task_clock_perf_counter_read,
+};
+
+/*
+ * Software counter: cpu migrations
+ */
+void perf_counter_task_migration(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx;
+
+	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+				 PERF_COUNT_SW_CPU_MIGRATIONS,
+				 1, 1, NULL, 0);
+
+	ctx = perf_pin_task_context(task);
+	if (ctx) {
+		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+					 PERF_COUNT_SW_CPU_MIGRATIONS,
+					 1, 1, NULL, 0);
+		perf_unpin_context(ctx);
+	}
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tpcounter_event(int event_id)
+{
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
+
+	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
+}
+EXPORT_SYMBOL_GPL(perf_tpcounter_event);
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_counter_destroy(struct perf_counter *counter)
+{
+	ftrace_profile_disable(perf_event_id(&counter->attr));
+}
+
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+	int event_id = perf_event_id(&counter->attr);
+	int ret;
+
+	ret = ftrace_profile_enable(event_id);
+	if (ret)
+		return NULL;
+
+	counter->destroy = tp_perf_counter_destroy;
+
+	return &perf_ops_generic;
+}
+#else
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+	return NULL;
+}
+#endif
+
+static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
+{
+	const struct pmu *pmu = NULL;
+
+	/*
+	 * Software counters (currently) can't in general distinguish
+	 * between user, kernel and hypervisor events.
+	 * However, context switches and cpu migrations are considered
+	 * to be kernel events, and page faults are never hypervisor
+	 * events.
+	 */
+	switch (counter->attr.config) {
+	case PERF_COUNT_SW_CPU_CLOCK:
+		pmu = &perf_ops_cpu_clock;
+
+		break;
+	case PERF_COUNT_SW_TASK_CLOCK:
+		/*
+		 * If the user instantiates this as a per-cpu counter,
+		 * use the cpu_clock counter instead.
+		 */
+		if (counter->ctx->task)
+			pmu = &perf_ops_task_clock;
+		else
+			pmu = &perf_ops_cpu_clock;
+
+		break;
+	case PERF_COUNT_SW_PAGE_FAULTS:
+	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+	case PERF_COUNT_SW_CONTEXT_SWITCHES:
+	case PERF_COUNT_SW_CPU_MIGRATIONS:
+		pmu = &perf_ops_generic;
+		break;
+	}
+
+	return pmu;
+}
+
+/*
+ * Allocate and initialize a counter structure
+ */
+static struct perf_counter *
+perf_counter_alloc(struct perf_counter_attr *attr,
+		   int cpu,
+		   struct perf_counter_context *ctx,
+		   struct perf_counter *group_leader,
+		   gfp_t gfpflags)
+{
+	const struct pmu *pmu;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	long err;
+
+	counter = kzalloc(sizeof(*counter), gfpflags);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Single counters are their own group leaders, with an
+	 * empty sibling list:
+	 */
+	if (!group_leader)
+		group_leader = counter;
+
+	mutex_init(&counter->child_mutex);
+	INIT_LIST_HEAD(&counter->child_list);
+
+	INIT_LIST_HEAD(&counter->list_entry);
+	INIT_LIST_HEAD(&counter->event_entry);
+	INIT_LIST_HEAD(&counter->sibling_list);
+	init_waitqueue_head(&counter->waitq);
+
+	mutex_init(&counter->mmap_mutex);
+
+	counter->cpu		= cpu;
+	counter->attr		= *attr;
+	counter->group_leader	= group_leader;
+	counter->pmu		= NULL;
+	counter->ctx		= ctx;
+	counter->oncpu		= -1;
+
+	counter->ns		= get_pid_ns(current->nsproxy->pid_ns);
+	counter->id		= atomic64_inc_return(&perf_counter_id);
+
+	counter->state		= PERF_COUNTER_STATE_INACTIVE;
+
+	if (attr->disabled)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
+	pmu = NULL;
+
+	hwc = &counter->hw;
+	hwc->sample_period = attr->sample_period;
+	if (attr->freq && attr->sample_freq)
+		hwc->sample_period = 1;
+
+	atomic64_set(&hwc->period_left, hwc->sample_period);
+
+	/*
+	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+	 */
+	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+		goto done;
+
+	if (attr->type == PERF_TYPE_RAW) {
+		pmu = hw_perf_counter_init(counter);
+		goto done;
+	}
+
+	switch (attr->type) {
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		pmu = hw_perf_counter_init(counter);
+		break;
+
+	case PERF_TYPE_SOFTWARE:
+		pmu = sw_perf_counter_init(counter);
+		break;
+
+	case PERF_TYPE_TRACEPOINT:
+		pmu = tp_perf_counter_init(counter);
+		break;
+	}
+done:
+	err = 0;
+	if (!pmu)
+		err = -EINVAL;
+	else if (IS_ERR(pmu))
+		err = PTR_ERR(pmu);
+
+	if (err) {
+		if (counter->ns)
+			put_pid_ns(counter->ns);
+		kfree(counter);
+		return ERR_PTR(err);
+	}
+
+	counter->pmu = pmu;
+
+	atomic_inc(&nr_counters);
+	if (counter->attr.mmap)
+		atomic_inc(&nr_mmap_counters);
+	if (counter->attr.comm)
+		atomic_inc(&nr_comm_counters);
+
+	return counter;
+}
+
+/**
+ * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
+ *
+ * @attr_uptr:	event type attributes for monitoring/sampling
+ * @pid:		target pid
+ * @cpu:		target cpu
+ * @group_fd:		group leader counter fd
+ */
+SYSCALL_DEFINE5(perf_counter_open,
+		const struct perf_counter_attr __user *, attr_uptr,
+		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+	struct perf_counter *counter, *group_leader;
+	struct perf_counter_attr attr;
+	struct perf_counter_context *ctx;
+	struct file *counter_file = NULL;
+	struct file *group_file = NULL;
+	int fput_needed = 0;
+	int fput_needed2 = 0;
+	int ret;
+
+	/* for future expandability... */
+	if (flags)
+		return -EINVAL;
+
+	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
+		return -EFAULT;
+
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
+	if (attr.freq) {
+		if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+			return -EINVAL;
+	}
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * Look up the group leader (we will attach this counter to it):
+	 */
+	group_leader = NULL;
+	if (group_fd != -1) {
+		ret = -EINVAL;
+		group_file = fget_light(group_fd, &fput_needed);
+		if (!group_file)
+			goto err_put_context;
+		if (group_file->f_op != &perf_fops)
+			goto err_put_context;
+
+		group_leader = group_file->private_data;
+		/*
+		 * Do not allow a recursive hierarchy (this new sibling
+		 * becoming part of another group-sibling):
+		 */
+		if (group_leader->group_leader != group_leader)
+			goto err_put_context;
+		/*
+		 * Do not allow attaching to a group in a different
+		 * task or CPU context:
+		 */
+		if (group_leader->ctx != ctx)
+			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (attr.exclusive || attr.pinned)
+			goto err_put_context;
+	}
+
+	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
+				     GFP_KERNEL);
+	ret = PTR_ERR(counter);
+	if (IS_ERR(counter))
+		goto err_put_context;
+
+	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (ret < 0)
+		goto err_free_put_context;
+
+	counter_file = fget_light(ret, &fput_needed2);
+	if (!counter_file)
+		goto err_free_put_context;
+
+	counter->filp = counter_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_install_in_context(ctx, counter, cpu);
+	++ctx->generation;
+	mutex_unlock(&ctx->mutex);
+
+	counter->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_counter_mutex);
+	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	fput_light(counter_file, fput_needed2);
+
+out_fput:
+	fput_light(group_file, fput_needed);
+
+	return ret;
+
+err_free_put_context:
+	kfree(counter);
+
+err_put_context:
+	put_ctx(ctx);
+
+	goto out_fput;
+}
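
A minimal user-space sketch of the syscall above: count task clock for the
calling task and read back the running total. The syscall number is
architecture specific and the header's availability to user space is an
assumption, not part of the patch:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open	336	/* x86-32 value in this merge; assumption */
#endif

int main(void)
{
	struct perf_counter_attr attr;
	unsigned long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = syscall(__NR_perf_counter_open, &attr,
		     0 /* this task */, -1 /* any cpu */,
		     -1 /* no group */, 0 /* no flags */);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", count);

	return 0;
}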
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static struct perf_counter *
+inherit_counter(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter *group_leader,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *child_counter;
+
+	/*
+	 * Instead of creating recursive hierarchies of counters,
+	 * we link inherited counters back to the original parent,
+	 * which is guaranteed to have a filp that we use as the
+	 * reference count:
+	 */
+	if (parent_counter->parent)
+		parent_counter = parent_counter->parent;
+
+	child_counter = perf_counter_alloc(&parent_counter->attr,
+					   parent_counter->cpu, child_ctx,
+					   group_leader, GFP_KERNEL);
+	if (IS_ERR(child_counter))
+		return child_counter;
+	get_ctx(child_ctx);
+
+	/*
+	 * Make the child state follow the state of the parent counter,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_counter_{en, dis}able_family.
+	 */
+	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+	else
+		child_counter->state = PERF_COUNTER_STATE_OFF;
+
+	if (parent_counter->attr.freq)
+		child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	add_counter_to_ctx(child_counter, child_ctx);
+
+	child_counter->parent = parent_counter;
+	/*
+	 * inherit into child's child as well:
+	 */
+	child_counter->attr.inherit = 1;
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child counter exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_counter->filp->f_count);
+
+	/*
+	 * Link this into the parent counter's child list
+	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+	mutex_lock(&parent_counter->child_mutex);
+	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+	mutex_unlock(&parent_counter->child_mutex);
+
+	return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *leader;
+	struct perf_counter *sub;
+	struct perf_counter *child_ctr;
+
+	leader = inherit_counter(parent_counter, parent, parent_ctx,
+				 child, NULL, child_ctx);
+	if (IS_ERR(leader))
+		return PTR_ERR(leader);
+	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+		child_ctr = inherit_counter(sub, parent, parent_ctx,
+					    child, leader, child_ctx);
+		if (IS_ERR(child_ctr))
+			return PTR_ERR(child_ctr);
+	}
+	return 0;
+}
+
+static void sync_child_counter(struct perf_counter *child_counter,
+			       struct perf_counter *parent_counter)
+{
+	u64 child_val;
+
+	child_val = atomic64_read(&child_counter->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_counter->count);
+	atomic64_add(child_counter->total_time_enabled,
+		     &parent_counter->child_total_time_enabled);
+	atomic64_add(child_counter->total_time_running,
+		     &parent_counter->child_total_time_running);
+
+	/*
+	 * Remove this counter from the parent's list
+	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+	mutex_lock(&parent_counter->child_mutex);
+	list_del_init(&child_counter->child_list);
+	mutex_unlock(&parent_counter->child_mutex);
+
+	/*
+	 * Release the parent counter, if this was the last
+	 * reference to it.
+	 */
+	fput(parent_counter->filp);
+}
+
+static void
+__perf_counter_exit_task(struct perf_counter *child_counter,
+			 struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *parent_counter;
+
+	update_counter_times(child_counter);
+	perf_counter_remove_from_context(child_counter);
+
+	parent_counter = child_counter->parent;
+	/*
+	 * It can happen that the parent exits first, and has counters
+	 * that are still around due to the child reference. These
+	 * counters need to be zapped - but otherwise they linger.
+	 */
+	if (parent_counter) {
+		sync_child_counter(child_counter, parent_counter);
+		free_counter(child_counter);
+	}
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+	struct perf_counter *child_counter, *tmp;
+	struct perf_counter_context *child_ctx;
+	unsigned long flags;
+
+	if (likely(!child->perf_counter_ctxp))
+		return;
+
+	local_irq_save(flags);
+	/*
+	 * We can't reschedule here because interrupts are disabled,
+	 * and either the child is current or it is a task that can't be
+	 * scheduled, so we are now safe from rescheduling changing
+	 * our context.
+	 */
+	child_ctx = child->perf_counter_ctxp;
+	__perf_counter_task_sched_out(child_ctx);
+
+	/*
+	 * Take the context lock here so that if find_get_context is
+	 * reading child->perf_counter_ctxp, we wait until it has
+	 * incremented the context's refcount before we do put_ctx below.
+	 */
+	spin_lock(&child_ctx->lock);
+	child->perf_counter_ctxp = NULL;
+	if (child_ctx->parent_ctx) {
+		/*
+		 * This context is a clone; unclone it so it can't get
+		 * swapped to another process while we're removing all
+		 * the counters from it.
+		 */
+		put_ctx(child_ctx->parent_ctx);
+		child_ctx->parent_ctx = NULL;
+	}
+	spin_unlock(&child_ctx->lock);
+	local_irq_restore(flags);
+
+	/*
+	 * We can recurse on the same lock type through:
+	 *
+	 *   __perf_counter_exit_task()
+	 *     sync_child_counter()
+	 *       fput(parent_counter->filp)
+	 *         perf_release()
+	 *           mutex_lock(&ctx->mutex)
+	 *
+	 * But since it's the parent context it won't be the same instance.
+	 */
+	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+
+again:
+	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+				 list_entry)
+		__perf_counter_exit_task(child_counter, child_ctx);
+
+	/*
+	 * If the last counter was a group counter, it will have appended all
+	 * its siblings to the list, but we obtained 'tmp' before that, so it
+	 * will still point to the list head terminating the iteration.
+	 */
+	if (!list_empty(&child_ctx->counter_list))
+		goto again;
+
+	mutex_unlock(&child_ctx->mutex);
+
+	put_ctx(child_ctx);
+}
+
+/*
+ * Free an unexposed, unused context, as created by inheritance in
+ * perf_counter_init_task() below; used by fork() in case of failure.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+	struct perf_counter *counter, *tmp;
+
+	if (!ctx)
+		return;
+
+	mutex_lock(&ctx->mutex);
+again:
+	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+		struct perf_counter *parent = counter->parent;
+
+		if (WARN_ON_ONCE(!parent))
+			continue;
+
+		mutex_lock(&parent->child_mutex);
+		list_del_init(&counter->child_list);
+		mutex_unlock(&parent->child_mutex);
+
+		fput(parent->filp);
+
+		list_del_counter(counter, ctx);
+		free_counter(counter);
+	}
+
+	if (!list_empty(&ctx->counter_list))
+		goto again;
+
+	mutex_unlock(&ctx->mutex);
+
+	put_ctx(ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+int perf_counter_init_task(struct task_struct *child)
+{
+	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter_context *cloned_ctx;
+	struct perf_counter *counter;
+	struct task_struct *parent = current;
+	int inherited_all = 1;
+	int ret = 0;
+
+	child->perf_counter_ctxp = NULL;
+
+	mutex_init(&child->perf_counter_mutex);
+	INIT_LIST_HEAD(&child->perf_counter_list);
+
+	if (likely(!parent->perf_counter_ctxp))
+		return 0;
+
+	/*
+	 * This is executed from the parent task context, so inherit
+	 * counters that have been marked for cloning.
+	 * First allocate and initialize a context for the child.
+	 */
+
+	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+	if (!child_ctx)
+		return -ENOMEM;
+
+	__perf_counter_init_context(child_ctx, child);
+	child->perf_counter_ctxp = child_ctx;
+	get_task_struct(child);
+
+	/*
+	 * If the parent's context is a clone, pin it so it won't get
+	 * swapped under us.
+	 */
+	parent_ctx = perf_pin_task_context(parent);
+
+	/*
+	 * No need to check if parent_ctx != NULL here; since we saw
+	 * it non-NULL earlier, the only reason for it to become NULL
+	 * is if we exit, and since we're currently in the middle of
+	 * a fork we can't be exiting at the same time.
+	 */
+
+	/*
+	 * Lock the parent list. No need to lock the child - not PID
+	 * hashed yet and not running, so nobody can access it.
+	 */
+	mutex_lock(&parent_ctx->mutex);
+
+	/*
+	 * We don't have to disable NMIs - we are only looking at
+	 * the list, not manipulating it:
+	 */
+	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
+		if (counter != counter->group_leader)
+			continue;
+
+		if (!counter->attr.inherit) {
+			inherited_all = 0;
+			continue;
+		}
+
+		ret = inherit_group(counter, parent, parent_ctx,
+					     child, child_ctx);
+		if (ret) {
+			inherited_all = 0;
+			break;
+		}
+	}
+
+	if (inherited_all) {
+		/*
+		 * Mark the child context as a clone of the parent
+		 * context, or of whatever the parent is a clone of.
+		 * Note that if the parent is a clone, it could get
+		 * uncloned at any point, but that doesn't matter
+		 * because the list of counters and the generation
+		 * count can't have changed since we took the mutex.
+		 */
+		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+		if (cloned_ctx) {
+			child_ctx->parent_ctx = cloned_ctx;
+			child_ctx->parent_gen = parent_ctx->parent_gen;
+		} else {
+			child_ctx->parent_ctx = parent_ctx;
+			child_ctx->parent_gen = parent_ctx->generation;
+		}
+		get_ctx(child_ctx->parent_ctx);
+	}
+
+	mutex_unlock(&parent_ctx->mutex);
+
+	perf_unpin_context(parent_ctx);
+
+	return ret;
+}
+
+static void __cpuinit perf_counter_init_cpu(int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	__perf_counter_init_context(&cpuctx->ctx, NULL);
+
+	spin_lock(&perf_resource_lock);
+	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+	spin_unlock(&perf_resource_lock);
+
+	hw_perf_counter_setup(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_counter_exit_cpu(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+	struct perf_counter *counter, *tmp;
+
+	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+		__perf_counter_remove_from_context(counter);
+}
+static void perf_counter_exit_cpu(int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+
+	mutex_lock(&ctx->mutex);
+	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+	mutex_unlock(&ctx->mutex);
+}
+#else
+static inline void perf_counter_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action) {
+
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		perf_counter_init_cpu(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		perf_counter_exit_cpu(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+	.notifier_call		= perf_cpu_notify,
+	.priority		= 20,
+};
+
+void __init perf_counter_init(void)
+{
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	register_cpu_notifier(&perf_cpu_nb);
+}
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+			const char *buf,
+			size_t count)
+{
+	struct perf_cpu_context *cpuctx;
+	unsigned long val;
+	int err, cpu, mpt;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > perf_max_counters)
+		return -EINVAL;
+
+	spin_lock(&perf_resource_lock);
+	perf_reserved_percpu = val;
+	for_each_online_cpu(cpu) {
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		spin_lock_irq(&cpuctx->ctx.lock);
+		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
+			  perf_max_counters - perf_reserved_percpu);
+		cpuctx->max_pertask = mpt;
+		spin_unlock_irq(&cpuctx->ctx.lock);
+	}
+	spin_unlock(&perf_resource_lock);
+
+	return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 1)
+		return -EINVAL;
+
+	spin_lock(&perf_resource_lock);
+	perf_overcommit = val;
+	spin_unlock(&perf_resource_lock);
+
+	return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+				reserve_percpu,
+				0644,
+				perf_show_reserve_percpu,
+				perf_set_reserve_percpu
+			);
+
+static SYSDEV_CLASS_ATTR(
+				overcommit,
+				0644,
+				perf_show_overcommit,
+				perf_set_overcommit
+			);
+
+static struct attribute *perfclass_attrs[] = {
+	&attr_reserve_percpu.attr,
+	&attr_overcommit.attr,
+	NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+	.attrs			= perfclass_attrs,
+	.name			= "perf_counters",
+};
+
+static int __init perf_counter_sysfs_init(void)
+{
+	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+				  &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);

+ 51 - 6
kernel/sched.c

@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -579,6 +580,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -691,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -1968,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+		perf_counter_task_migration(p, new_cpu);
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2368,6 +2374,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, in which case the function is called directly.
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2535,6 +2562,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 	p->se.start_runtime		= 0;
@@ -2765,6 +2793,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (post_schedule)
@@ -2979,6 +3008,15 @@ static void calc_load_account_active(struct rq *this_rq)
 	}
 }
 
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
 /*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -5077,6 +5115,8 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
+	perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
@@ -5292,6 +5332,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -7535,8 +7576,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
@@ -9214,7 +9257,7 @@ void __init sched_init(void)
 		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 		 * then A0's share of the cpu resource is:
 		 *
-		 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 		 *
 		 * We achieve this by letting init_task_group's tasks sit
 		 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9319,6 +9362,8 @@ void __init sched_init(void)
 	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
+	perf_counter_init();
+
 	scheduler_running = 1;
 }
 

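task_oncpu_function_call() above is the primitive the perf_counter core uses to touch a counter context on whichever CPU its task is running, without racing against the scheduler. A hedged sketch of the calling pattern; the helper names here are illustrative rather than the exact ones in kernel/perf_counter.c:

/* Illustrative kernel-side sketch: read out a counter on the CPU
 * where its task runs.  If the task is not currently on any CPU,
 * the cross-call is skipped and no live PMU state needs reading. */
static void remote_read(void *info)
{
	struct perf_counter *counter = info;

	/* runs on the task's CPU, with the counter live in the PMU */
	counter->pmu->read(counter);
}

static void read_task_counter(struct task_struct *task,
			      struct perf_counter *counter)
{
	task_oncpu_function_call(task, remote_read, counter);
}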
+ 7 - 0
kernel/sys.c

@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
+		case PR_TASK_PERF_COUNTERS_DISABLE:
+			error = perf_counter_task_disable();
+			break;
+		case PR_TASK_PERF_COUNTERS_ENABLE:
+			error = perf_counter_task_enable();
+			break;
 		case PR_GET_TIMERSLACK:
 			error = current->timer_slack_ns;
 			break;
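These two prctl options give a task a single switch for every counter attached to it, which is handy for excluding setup work from a measurement. A minimal user-space sketch; the fallback numeric values are assumptions for builds where the updated <linux/prctl.h> is not yet installed:

/* Toggle all of this task's counters around a region of interest.
 * The fallback constant values below are assumptions. */
#include <sys/prctl.h>

#ifndef PR_TASK_PERF_COUNTERS_DISABLE
#define PR_TASK_PERF_COUNTERS_DISABLE	31
#define PR_TASK_PERF_COUNTERS_ENABLE	32
#endif

int main(void)
{
	prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	/* ... setup that should not be counted ... */
	prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
	/* ... region of interest ... */
	return 0;
}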

+ 3 - 0
kernel/sys_ni.c

@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);

+ 27 - 0
kernel/sysctl.c

@@ -49,6 +49,7 @@
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
 #include <linux/slow-work.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -932,6 +933,32 @@ static struct ctl_table kern_table[] = {
 		.child		= slow_work_sysctls,
 	},
 #endif
+#ifdef CONFIG_PERF_COUNTERS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_paranoid",
+		.data		= &sysctl_perf_counter_paranoid,
+		.maxlen		= sizeof(sysctl_perf_counter_paranoid),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_mlock_kb",
+		.data		= &sysctl_perf_counter_mlock,
+		.maxlen		= sizeof(sysctl_perf_counter_mlock),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_max_sample_rate",
+		.data		= &sysctl_perf_counter_sample_rate,
+		.maxlen		= sizeof(sysctl_perf_counter_sample_rate),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
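All three entries are CTL_UNNUMBERED, so they are reachable only by name under /proc/sys/kernel/. A quick sanity-check program, with the file names taken directly from the procname fields above:

/* Print the current values of the three perf_counter sysctls. */
#include <stdio.h>

static int read_knob(const char *name)
{
	char path[128];
	FILE *f;
	int val = -1;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("perf_counter_paranoid        = %d\n",
	       read_knob("perf_counter_paranoid"));
	printf("perf_counter_mlock_kb        = %d\n",
	       read_knob("perf_counter_mlock_kb"));
	printf("perf_counter_max_sample_rate = %d\n",
	       read_knob("perf_counter_max_sample_rate"));
	return 0;
}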

+ 3 - 0
kernel/timer.c

@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1129,6 +1130,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+	perf_counter_do_pending();
+
 	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
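perf_counter_do_pending() drains work that counter interrupts cannot do directly because they may fire in NMI context, chiefly wakeups of tasks sleeping on a counter's output buffer. A hedged, self-contained sketch of that deferral pattern; the names are illustrative, and the real implementation uses lock-free per-cpu lists rather than this single-threaded toy:

/* Illustrative NMI-to-softirq deferral: interrupt context pushes
 * work onto a list, a later softirq pops and runs it. */
#include <stdio.h>

struct pending_entry {
	struct pending_entry *next;
	void (*func)(struct pending_entry *);
};

static struct pending_entry *pending_head;

static void pend_work(struct pending_entry *e)
{
	/* the real code uses cmpxchg() here so NMI context is safe */
	e->next = pending_head;
	pending_head = e;
}

static void drain_pending(void)
{
	/* this is the role run_timer_softirq() fills above */
	struct pending_entry *e = pending_head;

	pending_head = NULL;
	while (e) {
		struct pending_entry *next = e->next;

		e->func(e);
		e = next;
	}
}

static void say_hello(struct pending_entry *e)
{
	(void)e;
	printf("deferred work ran\n");
}

int main(void)
{
	struct pending_entry e = { .next = NULL, .func = say_hello };

	pend_work(&e);		/* stand-in for interrupt context */
	drain_pending();	/* stand-in for the softirq */
	return 0;
}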

+ 5 - 0
mm/mmap.c

@@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1222,6 +1223,8 @@ munmap_back:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
+	perf_counter_mmap(vma);
+
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
@@ -2308,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm,
 
 	mm->total_vm += len >> PAGE_SHIFT;
 
+	perf_counter_mmap(vma);
+
 	return 0;
 }
 

+ 2 - 0
mm/mprotect.c

@@ -23,6 +23,7 @@
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/perf_counter.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			goto out;
+		perf_counter_mmap(vma);
 		nstart = tmp;
 
 		if (nstart < prev->vm_end)

+ 16 - 0
tools/perf/.gitignore

@@ -0,0 +1,16 @@
+PERF-BUILD-OPTIONS
+PERF-CFLAGS
+PERF-GUI-VARS
+PERF-VERSION-FILE
+perf
+perf-help
+perf-record
+perf-report
+perf-stat
+perf-top
+perf*.1
+perf*.xml
+common-cmds.h
+tags
+TAGS
+cscope*

+ 300 - 0
tools/perf/Documentation/Makefile

@@ -0,0 +1,300 @@
+MAN1_TXT= \
+	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
+		$(wildcard perf-*.txt)) \
+	perf.txt
+MAN5_TXT=
+MAN7_TXT=
+
+MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
+MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+
+DOC_HTML=$(MAN_HTML)
+
+ARTICLES =
+# Special articles, with their own formatting rules:
+SP_ARTICLES =
+API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
+SP_ARTICLES += $(API_DOCS)
+SP_ARTICLES += technical/api-index
+
+DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+
+DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+
+prefix?=$(HOME)
+bindir?=$(prefix)/bin
+htmldir?=$(prefix)/share/doc/perf-doc
+pdfdir?=$(prefix)/share/doc/perf-doc
+mandir?=$(prefix)/share/man
+man1dir=$(mandir)/man1
+man5dir=$(mandir)/man5
+man7dir=$(mandir)/man7
+# DESTDIR=
+
+ASCIIDOC=asciidoc
+ASCIIDOC_EXTRA =
+MANPAGE_XSL = manpage-normal.xsl
+XMLTO_EXTRA =
+INSTALL?=install
+RM ?= rm -f
+DOC_REF = origin/man
+HTML_REF = origin/html
+
+infodir?=$(prefix)/share/info
+MAKEINFO=makeinfo
+INSTALL_INFO=install-info
+DOCBOOK2X_TEXI=docbook2x-texi
+DBLATEX=dblatex
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+-include ../config.mak.autogen
+-include ../config.mak
+
+#
+# For asciidoc ...
+#	-7.1.2,	no extra settings are needed.
+#	8.0-,	set ASCIIDOC8.
+#
+
+#
+# For docbook-xsl ...
+#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
+#	1.69.0,		no extra settings are needed?
+#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP?
+#	1.71.1,		no extra settings are needed?
+#	1.72.0,		set DOCBOOK_XSL_172.
+#	1.73.0-,	set ASCIIDOC_NO_ROFF
+#
+
+#
+# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
+# of 'the ".ft C" problem' in your generated manpages, and you
+# instead ended up with weird characters around callouts, try
+# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
+#
+
+ifdef ASCIIDOC8
+ASCIIDOC_EXTRA += -a asciidoc7compatible
+endif
+ifdef DOCBOOK_XSL_172
+ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+MANPAGE_XSL = manpage-1.72.xsl
+else
+	ifdef ASCIIDOC_NO_ROFF
+	# docbook-xsl after 1.72 needs the regular XSL, but will not
+	# pass-thru raw roff codes from asciidoc.conf, so turn them off.
+	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+	endif
+endif
+ifdef MAN_BOLD_LITERAL
+XMLTO_EXTRA += -m manpage-bold-literal.xsl
+endif
+ifdef DOCBOOK_SUPPRESS_SP
+XMLTO_EXTRA += -m manpage-suppress-sp.xsl
+endif
+
+SHELL_PATH ?= $(SHELL)
+# Shell quote;
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+
+#
+# Please note that there is a minor bug in asciidoc.
+# The version after 6.0.3 _will_ include the patch found here:
+#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
+#
+# Until that version is released you may have to apply the patch
+# yourself - yes, all 6 characters of it!
+#
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_ASCIIDOC	= @echo '   ' ASCIIDOC $@;
+	QUIET_XMLTO	= @echo '   ' XMLTO $@;
+	QUIET_DB2TEXI	= @echo '   ' DB2TEXI $@;
+	QUIET_MAKEINFO	= @echo '   ' MAKEINFO $@;
+	QUIET_DBLATEX	= @echo '   ' DBLATEX $@;
+	QUIET_XSLTPROC	= @echo '   ' XSLTPROC $@;
+	QUIET_GEN	= @echo '   ' GEN $@;
+	QUIET_STDERR	= 2> /dev/null
+	QUIET_SUBDIR0	= +@subdir=
+	QUIET_SUBDIR1	= ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			  $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+endif
+endif
+
+all: html man
+
+html: $(DOC_HTML)
+
+$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
+
+man: man1 man5 man7
+man1: $(DOC_MAN1)
+man5: $(DOC_MAN5)
+man7: $(DOC_MAN7)
+
+info: perf.info perfman.info
+
+pdf: user-manual.pdf
+
+install: install-man
+
+install-man: man
+	$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
+	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
+#	$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
+#	$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-info: info
+	$(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
+	$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
+	if test -r $(DESTDIR)$(infodir)/dir; then \
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+	else \
+	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
+	fi
+
+install-pdf: pdf
+	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
+	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
+
+install-html: html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+
+../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+
+-include ../PERF-VERSION-FILE
+
+#
+# Determine "include::" file references in asciidoc files.
+#
+doc.dep : $(wildcard *.txt) build-docdep.perl
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
+	mv $@+ $@
+
+-include doc.dep
+
+cmds_txt = cmds-ancillaryinterrogators.txt \
+	cmds-ancillarymanipulators.txt \
+	cmds-mainporcelain.txt \
+	cmds-plumbinginterrogators.txt \
+	cmds-plumbingmanipulators.txt \
+	cmds-synchingrepositories.txt \
+	cmds-synchelpers.txt \
+	cmds-purehelpers.txt \
+	cmds-foreignscminterface.txt
+
+$(cmds_txt): cmd-list.made
+
+cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
+	$(QUIET_GEN)$(RM) $@ && \
+	$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
+	date >$@
+
+clean:
+	$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
+	$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
+	$(RM) howto-index.txt howto/*.html doc.dep
+	$(RM) technical/api-*.html technical/api-index.txt
+	$(RM) $(cmds_txt) *.made
+
+$(MAN_HTML): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+%.1 %.5 %.7 : %.xml
+	$(QUIET_XMLTO)$(RM) $@ && \
+	xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+
+%.xml : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+XSLT = docbook.xsl
+XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
+
+user-manual.html: user-manual.xml
+	$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
+
+perf.info: user-manual.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
+
+user-manual.texi: user-manual.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
+	$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+user-manual.pdf: user-manual.xml
+	$(QUIET_DBLATEX)$(RM) $@+ $@ && \
+	$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
+	mv $@+ $@
+
+perfman.texi: $(MAN_XML) cat-texi.perl
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
+		--to-stdout $(xml) &&) true) > $@++ && \
+	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+perfman.info: perfman.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
+
+$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
+	mv $@+ $@
+
+howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	'$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
+	mv $@+ $@
+
+$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
+	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
+
+WEBDOC_DEST = /pub/software/tools/perf/docs
+
+$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
+	mv $@+ $@
+
+install-webdoc : html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+
+quick-install: quick-install-man
+
+quick-install-man:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+
+quick-install-html:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+
+.PHONY: .FORCE-PERF-VERSION-FILE

+ 91 - 0
tools/perf/Documentation/asciidoc.conf

@@ -0,0 +1,91 @@
+## linkperf: macro
+#
+# Usage: linkperf:command[manpage-section]
+#
+# Note, {0} is the manpage section, while {target} is the command.
+#
+# Show PERF link as: <command>(<section>) if section is defined, else just
+# show the command.
+
+[macros]
+(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
+
+[attributes]
+asterisk=&#42;
+plus=&#43;
+caret=&#94;
+startsb=&#91;
+endsb=&#93;
+tilde=&#126;
+
+ifdef::backend-docbook[]
+[linkperf-inlinemacro]
+{0%{target}}
+{0#<citerefentry>}
+{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
+{0#</citerefentry>}
+endif::backend-docbook[]
+
+ifdef::backend-docbook[]
+ifndef::perf-asciidoc-no-roff[]
+# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
+# v1.72 breaks with this because it replaces dots not in roff requests.
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+ifdef::doctype-manpage[]
+&#10;.ft C&#10;
+endif::doctype-manpage[]
+|
+ifdef::doctype-manpage[]
+&#10;.ft&#10;
+endif::doctype-manpage[]
+</literallayout>
+{title#}</example>
+endif::perf-asciidoc-no-roff[]
+
+ifdef::perf-asciidoc-no-roff[]
+ifdef::doctype-manpage[]
+# The following two small workarounds insert a simple paragraph after screen
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+|
+</literallayout><simpara></simpara>
+{title#}</example>
+
+[verseblock]
+<formalpara{id? id="{id}"}><title>{title}</title><para>
+{title%}<literallayout{id? id="{id}"}>
+{title#}<literallayout>
+|
+</literallayout>
+{title#}</para></formalpara>
+{title%}<simpara></simpara>
+endif::doctype-manpage[]
+endif::perf-asciidoc-no-roff[]
+endif::backend-docbook[]
+
+ifdef::doctype-manpage[]
+ifdef::backend-docbook[]
+[header]
+template::[header-declarations]
+<refentry>
+<refmeta>
+<refentrytitle>{mantitle}</refentrytitle>
+<manvolnum>{manvolnum}</manvolnum>
+<refmiscinfo class="source">perf</refmiscinfo>
+<refmiscinfo class="version">{perf_version}</refmiscinfo>
+<refmiscinfo class="manual">perf Manual</refmiscinfo>
+</refmeta>
+<refnamediv>
+  <refname>{manname}</refname>
+  <refpurpose>{manpurpose}</refpurpose>
+</refnamediv>
+endif::backend-docbook[]
+endif::doctype-manpage[]
+
+ifdef::backend-xhtml11[]
+[linkperf-inlinemacro]
+<a href="{target}.html">{target}{0?({0})}</a>
+endif::backend-xhtml11[]

+ 14 - 0
tools/perf/Documentation/manpage-1.72.xsl

@@ -0,0 +1,14 @@
+<!-- manpage-1.72.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles peculiarities in docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the special values for the roff control characters
+     needed for docbook-xsl 1.72.0 -->
+<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
+<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
+
+</xsl:stylesheet>

+ 35 - 0
tools/perf/Documentation/manpage-base.xsl

@@ -0,0 +1,35 @@
+<!-- manpage-base.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- these params silence some output from xmlto -->
+<xsl:param name="man.output.quietly" select="1"/>
+<xsl:param name="refentry.meta.get.quietly" select="1"/>
+
+<!-- convert asciidoc callouts to man page format;
+     git.docbook.backslash and git.docbook.dot params
+     must be supplied by another XSL file or other means -->
+<xsl:template match="co">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB(',
+			      substring-after(@id,'-'),')',
+			      $git.docbook.backslash,'fR')"/>
+</xsl:template>
+<xsl:template match="calloutlist">
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>sp&#10;</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:text>&#10;</xsl:text>
+</xsl:template>
+<xsl:template match="callout">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB',
+			      substring-after(@arearefs,'-'),
+			      '. ',$git.docbook.backslash,'fR')"/>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>br&#10;</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>

+ 17 - 0
tools/perf/Documentation/manpage-bold-literal.xsl

@@ -0,0 +1,17 @@
+<!-- manpage-bold-literal.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- render literal text as bold (instead of plain or monospace);
+     this makes literal text easier to distinguish in manpages
+     viewed on a tty -->
+<xsl:template match="literal">
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fB</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fR</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>

+ 13 - 0
tools/perf/Documentation/manpage-normal.xsl

@@ -0,0 +1,13 @@
+<!-- manpage-normal.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles anything we want to keep away from docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the normal values for the roff control characters -->
+<xsl:param name="git.docbook.backslash">\</xsl:param>
+<xsl:param name="git.docbook.dot"	>.</xsl:param>
+
+</xsl:stylesheet>

+ 21 - 0
tools/perf/Documentation/manpage-suppress-sp.xsl

@@ -0,0 +1,21 @@
+<!-- manpage-suppress-sp.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles erroneous, inline .sp in manpage output of some
+     versions of docbook-xsl -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- attempt to work around spurious .sp at the tail of the line
+     that some versions of docbook stylesheets seem to add -->
+<xsl:template match="simpara">
+  <xsl:variable name="content">
+    <xsl:apply-templates/>
+  </xsl:variable>
+  <xsl:value-of select="normalize-space($content)"/>
+  <xsl:if test="not(ancestor::authorblurb) and
+                not(ancestor::personblurb)">
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>

+ 29 - 0
tools/perf/Documentation/perf-annotate.txt

@@ -0,0 +1,29 @@
+perf-annotate(1)
+================
+
+NAME
+----
+perf-annotate - Read perf.data (created by perf record) and display annotated code
+
+SYNOPSIS
+--------
+[verse]
+'perf annotate' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays an annotated version of the
+code. If the object file has debug symbols then the source code will be
+displayed alongside assembly code.
+
+If there is no debug info in the object, then annotated assembly is displayed.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-record[1]

+ 38 - 0
tools/perf/Documentation/perf-help.txt

@@ -0,0 +1,38 @@
+perf-help(1)
+============
+
+NAME
+----
+perf-help - display help information about perf
+
+SYNOPSIS
+--------
+'perf help' [-a|--all] [COMMAND]
+
+DESCRIPTION
+-----------
+
+With no options and no COMMAND given, the synopsis of the 'perf'
+command and a list of the most commonly used perf commands are printed
+on the standard output.
+
+If the option '--all' or '-a' is given, then all available commands are
+printed on the standard output.
+
+If a perf command is named, a manual page for that command is brought
+up. The 'man' program is used by default for this purpose, but this
+can be overridden by other options or configuration variables.
+
+Note that `perf --help ...` is identical to `perf help ...` because the
+former is internally converted into the latter.
+
+OPTIONS
+-------
+-a::
+--all::
+	Prints all the available commands on the standard output. This
+	option supersedes any other option.
+
+PERF
+----
+Part of the linkperf:perf[1] suite

+ 25 - 0
tools/perf/Documentation/perf-list.txt

@@ -0,0 +1,25 @@
+perf-list(1)
+============
+
+NAME
+----
+perf-list - List all symbolic event types
+
+SYNOPSIS
+--------
+[verse]
+'perf list'
+
+DESCRIPTION
+-----------
+This command displays the symbolic event types which can be selected in the
+various perf commands with the -e option.
+
+OPTIONS
+-------
+None
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1]

+ 42 - 0
tools/perf/Documentation/perf-record.txt

@@ -0,0 +1,42 @@
+perf-record(1)
+==============
+
+NAME
+----
+perf-record - Run a command and record its profile into perf.data
+
+SYNOPSIS
+--------
+[verse]
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it, into perf.data - without displaying anything.
+
+This file can then be inspected later on, using 'perf report'.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
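The rNNN form above maps straight onto a raw hardware event number. A small sketch of the kind of conversion involved; this is a hypothetical helper, not the tool's actual parse-events code, and r1a8 is just a made-up event:

/* Hypothetical: turn an "r1a8"-style event string into a raw config. */
#include <stdio.h>
#include <stdlib.h>

static int parse_raw_event(const char *str, unsigned long long *config)
{
	char *end;

	if (str[0] != 'r')
		return -1;
	*config = strtoull(str + 1, &end, 16);	/* NNN is hexadecimal */
	return (*end != '\0' || end == str + 1) ? -1 : 0;
}

int main(void)
{
	unsigned long long config;

	if (parse_raw_event("r1a8", &config) == 0)
		printf("raw config: %#llx\n", config);
	return 0;
}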

+ 26 - 0
tools/perf/Documentation/perf-report.txt

@@ -0,0 +1,26 @@
+perf-report(1)
+==============
+
+NAME
+----
+perf-report - Read perf.data (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf record.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]

+ 66 - 0
tools/perf/Documentation/perf-stat.txt

@@ -0,0 +1,66 @@
+perf-stat(1)
+============
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	hexadecimal event descriptor.
+
+-i::
+--inherit::
+        child tasks inherit counters
+
+-p::
+--pid=<pid>::
+        stat events on existing pid
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+EXAMPLES
+--------
+
+$ perf stat -- make -j
+
+ Performance counter stats for 'make -j':
+
+    8117.370256  task clock ticks     #      11.281 CPU utilization factor
+            678  context switches     #       0.000 M/sec
+            133  CPU migrations       #       0.000 M/sec
+         235724  pagefaults           #       0.029 M/sec
+    24821162526  CPU cycles           #    3057.784 M/sec
+    18687303457  instructions         #    2302.138 M/sec
+      172158895  cache references     #      21.209 M/sec
+       27075259  cache misses         #       3.335 M/sec
+
+ Wall-clock time elapsed:   719.554352 msecs
+
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-list[1]

+ 39 - 0
tools/perf/Documentation/perf-top.txt

@@ -0,0 +1,39 @@
+perf-top(1)
+===========
+
+NAME
+----
+perf-top - Run a command and profile it
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]

+ 24 - 0
tools/perf/Documentation/perf.txt

@@ -0,0 +1,24 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] COMMAND [ARGS]
+
+DESCRIPTION
+-----------
+Performance counters for Linux are a new kernel-based subsystem
+that provides a framework for all things performance analysis. It
+covers hardware-level features (CPU/PMU, the Performance Monitoring
+Unit) as well as software features (software counters, tracepoints).
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]

+ 929 - 0
tools/perf/Makefile

@@ -0,0 +1,929 @@
+# The default target of this Makefile is...
+all::
+
+# Define V=1 to have a more verbose compile.
+#
+# Define SNPRINTF_RETURNS_BOGUS if you are on a system where snprintf()
+# or vsnprintf() return -1 instead of the number of characters that would
+# have been written to the final string if enough space had been available.
+#
+# Define FREAD_READS_DIRECTORIES if you are on a system where reading
+# from an fopen'ed directory succeeds.
+#
+# Define NO_OPENSSL environment variable if you do not have OpenSSL.
+# This also implies MOZILLA_SHA1.
+#
+# Define CURLDIR=/foo/bar if your curl header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define EXPATDIR=/foo/bar if your expat header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
+#
+# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
+# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
+#
+# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et al.)
+# do not support the 'size specifiers' introduced by C99, namely ll, hh,
+# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
+# some C compilers supported these specifiers prior to C99 as an extension.
+#
+# Define NO_STRCASESTR if you don't have strcasestr.
+#
+# Define NO_MEMMEM if you don't have memmem.
+#
+# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
+# If your compiler also does not support long long or does not have
+# strtoull, define NO_STRTOULL.
+#
+# Define NO_SETENV if you don't have setenv in the C library.
+#
+# Define NO_UNSETENV if you don't have unsetenv in the C library.
+#
+# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
+#
+# Define NO_SYS_SELECT_H if you don't have sys/select.h.
+#
+# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
+# Enable it on Windows.  By default, symrefs are still used.
+#
+# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
+# tests.  These tests take up a significant amount of the total test time
+# but are not needed unless you plan to talk to SVN repos.
+#
+# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
+# installed in /sw, but don't want PERF to link against any libraries
+# installed there.  If defined you may specify your own (or Fink's)
+# include directories and library directories by defining CFLAGS
+# and LDFLAGS appropriately.
+#
+# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
+# have DarwinPorts installed in /opt/local, but don't want PERF to
+# link against any libraries installed there.  If defined you may
+# specify your own (or DarwinPort's) include directories and
+# library directories by defining CFLAGS and LDFLAGS appropriately.
+#
+# Define PPC_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for PowerPC.
+#
+# Define ARM_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for ARM.
+#
+# Define MOZILLA_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
+# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
+# choice) has very fast version optimized for i586.
+#
+# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
+#
+# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
+#
+# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
+# Patrick Mauritz).
+#
+# Define NO_MMAP if you want to avoid mmap.
+#
+# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
+#
+# Define NO_PREAD if you have a problem with pread() system call (e.g.
+# cygwin.dll before v1.5.22).
+#
+# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
+# generally faster on your platform than accessing the working directory.
+#
+# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
+# the executable mode bit, but doesn't really do so.
+#
+# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
+#
+# Define NO_SOCKADDR_STORAGE if your platform does not have struct
+# sockaddr_storage.
+#
+# Define NO_ICONV if your libc does not properly support iconv.
+#
+# Define OLD_ICONV if your library has an old iconv(), where the second
+# (input buffer pointer) parameter is declared with type (const char **).
+#
+# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
+#
+# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
+# that tells runtime paths to dynamic libraries;
+# "-Wl,-rpath=/path/lib" is used instead.
+#
+# Define USE_NSEC below if you want perf to care about sub-second file mtimes
+# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
+# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
+# randomly break unless your underlying filesystem supports those sub-second
+# times (my ext3 doesn't).
+#
+# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
+# "st_ctim"
+#
+# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
+# available.  This automatically turns USE_NSEC off.
+#
+# Define USE_STDEV below if you want perf to care about the underlying device
+# change being considered an inode change from the update-index perspective.
+#
+# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
+# field that counts the on-disk footprint in 512-byte blocks.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
+# MakeMaker (e.g. using ActiveState under Cygwin).
+#
+# Define NO_PERL if you do not want Perl scripts or libraries at all.
+#
+# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
+# is a simplified version of the merge sort used in glibc. This is
+# recommended if Git triggers O(n^2) behavior in your platform's qsort().
+#
+# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
+# your external grep (e.g., if your system lacks grep, if its grep is
+# broken, or if spawning an external process is slower than perf's built-in grep).
+
+PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	@$(SHELL_PATH) util/PERF-VERSION-GEN
+-include PERF-VERSION-FILE
+
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
+uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
+uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
+uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
+uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
+
+# CFLAGS and LDFLAGS are for the users to override from the command line.
+
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+LDFLAGS = -lpthread -lrt -lelf
+ALL_CFLAGS = $(CFLAGS)
+ALL_LDFLAGS = $(LDFLAGS)
+STRIP ?= strip
+
+# Among the variables below, these:
+#   perfexecdir
+#   template_dir
+#   mandir
+#   infodir
+#   htmldir
+#   ETC_PERFCONFIG (but not sysconfdir)
+# can be specified as a relative path some/where/else;
+# this is interpreted as relative to $(prefix) and "perf" at
+# runtime figures out where they are based on the path to the executable.
+# This can help installing the suite in a relocatable way.
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+mandir = share/man
+infodir = share/info
+perfexecdir = libexec/perf-core
+sharedir = $(prefix)/share
+template_dir = share/perf-core/templates
+htmldir = share/doc/perf-doc
+ifeq ($(prefix),/usr)
+sysconfdir = /etc
+ETC_PERFCONFIG = $(sysconfdir)/perfconfig
+else
+sysconfdir = $(prefix)/etc
+ETC_PERFCONFIG = etc/perfconfig
+endif
+lib = lib
+# DESTDIR=
+
+export prefix bindir sharedir sysconfdir
+
+CC = gcc
+AR = ar
+RM = rm -f
+TAR = tar
+FIND = find
+INSTALL = install
+RPMBUILD = rpmbuild
+PTHREAD_LIBS = -lpthread
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly what architecture to check for. Fix this up for yours.
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+
+
+### --- END CONFIGURATION SECTION ---
+
+# Those must not be GNU-specific; they are shared with perl/ which may
+# be built by a different compiler. (Note that this is an artifact now
+# but it still might be nice to keep that distinction.)
+BASIC_CFLAGS =
+BASIC_LDFLAGS =
+
+# Guard against environment variables
+BUILTIN_OBJS =
+BUILT_INS =
+COMPAT_CFLAGS =
+COMPAT_OBJS =
+LIB_H =
+LIB_OBJS =
+SCRIPT_PERL =
+SCRIPT_SH =
+TEST_PROGRAMS =
+
+#
+# No scripts right now:
+#
+
+# SCRIPT_SH += perf-am.sh
+
+#
+# No Perl scripts right now:
+#
+
+# SCRIPT_PERL += perf-add--interactive.perl
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
+	  $(patsubst %.perl,%,$(SCRIPT_PERL))
+
+# Empty...
+EXTRA_PROGRAMS =
+
+# ... and all the rest that could be moved out of bindir to perfexecdir
+PROGRAMS += $(EXTRA_PROGRAMS)
+
+#
+# Single 'perf' binary right now:
+#
+PROGRAMS += perf
+
+# List built-in command $C whose implementation cmd_$C() is not in
+# builtin-$C.o but is linked in as part of some other command.
+#
+# None right now:
+#
+# BUILT_INS += perf-init $X
+
+# what 'all' will build and 'install' will install, in perfexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = perf$X
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+	SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+LIB_FILE=libperf.a
+
+LIB_H += ../../include/linux/perf_counter.h
+LIB_H += perf.h
+LIB_H += util/list.h
+LIB_H += util/rbtree.h
+LIB_H += util/levenshtein.h
+LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/help.h
+LIB_H += util/strbuf.h
+LIB_H += util/string.h
+LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
+LIB_H += util/symbol.h
+LIB_H += util/color.h
+
+LIB_OBJS += util/abspath.o
+LIB_OBJS += util/alias.o
+LIB_OBJS += util/config.o
+LIB_OBJS += util/ctype.o
+LIB_OBJS += util/environment.o
+LIB_OBJS += util/exec_cmd.o
+LIB_OBJS += util/help.o
+LIB_OBJS += util/levenshtein.o
+LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/parse-events.o
+LIB_OBJS += util/path.o
+LIB_OBJS += util/rbtree.o
+LIB_OBJS += util/run-command.o
+LIB_OBJS += util/quote.o
+LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/string.o
+LIB_OBJS += util/usage.o
+LIB_OBJS += util/wrapper.o
+LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/symbol.o
+LIB_OBJS += util/color.o
+LIB_OBJS += util/pager.o
+
+BUILTIN_OBJS += builtin-annotate.o
+BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-list.o
+BUILTIN_OBJS += builtin-record.o
+BUILTIN_OBJS += builtin-report.o
+BUILTIN_OBJS += builtin-stat.o
+BUILTIN_OBJS += builtin-top.o
+
+PERFLIBS = $(LIB_FILE)
+EXTLIBS =
+
+#
+# Platform specific tweaks
+#
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain.  If
+# we had "elif" things would have been much nicer...
+
+-include config.mak.autogen
+-include config.mak
+
+ifeq ($(uname_S),Darwin)
+	ifndef NO_FINK
+		ifeq ($(shell test -d /sw/lib && echo y),y)
+			BASIC_CFLAGS += -I/sw/include
+			BASIC_LDFLAGS += -L/sw/lib
+		endif
+	endif
+	ifndef NO_DARWIN_PORTS
+		ifeq ($(shell test -d /opt/local/lib && echo y),y)
+			BASIC_CFLAGS += -I/opt/local/include
+			BASIC_LDFLAGS += -L/opt/local/lib
+		endif
+	endif
+	PTHREAD_LIBS =
+endif
+
+ifndef CC_LD_DYNPATH
+	ifdef NO_R_TO_GCC_LINKER
+		# Some gcc does not accept and pass -R to the linker to specify
+		# the runtime dynamic library path.
+		CC_LD_DYNPATH = -Wl,-rpath,
+	else
+		CC_LD_DYNPATH = -R
+	endif
+endif
+
+ifdef ZLIB_PATH
+	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
+	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
+endif
+EXTLIBS += -lz
+
+ifdef NEEDS_SOCKET
+	EXTLIBS += -lsocket
+endif
+ifdef NEEDS_NSL
+	EXTLIBS += -lnsl
+endif
+ifdef NO_D_TYPE_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
+endif
+ifdef NO_D_INO_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
+endif
+ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
+endif
+ifdef USE_NSEC
+	BASIC_CFLAGS += -DUSE_NSEC
+endif
+ifdef USE_ST_TIMESPEC
+	BASIC_CFLAGS += -DUSE_ST_TIMESPEC
+endif
+ifdef NO_NSEC
+	BASIC_CFLAGS += -DNO_NSEC
+endif
+ifdef NO_C99_FORMAT
+	BASIC_CFLAGS += -DNO_C99_FORMAT
+endif
+ifdef SNPRINTF_RETURNS_BOGUS
+	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
+	COMPAT_OBJS += compat/snprintf.o
+endif
+ifdef FREAD_READS_DIRECTORIES
+	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
+	COMPAT_OBJS += compat/fopen.o
+endif
+ifdef NO_SYMLINK_HEAD
+	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
+endif
+ifdef NO_STRCASESTR
+	COMPAT_CFLAGS += -DNO_STRCASESTR
+	COMPAT_OBJS += compat/strcasestr.o
+endif
+ifdef NO_STRTOUMAX
+	COMPAT_CFLAGS += -DNO_STRTOUMAX
+	COMPAT_OBJS += compat/strtoumax.o
+endif
+ifdef NO_STRTOULL
+	COMPAT_CFLAGS += -DNO_STRTOULL
+endif
+ifdef NO_SETENV
+	COMPAT_CFLAGS += -DNO_SETENV
+	COMPAT_OBJS += compat/setenv.o
+endif
+ifdef NO_MKDTEMP
+	COMPAT_CFLAGS += -DNO_MKDTEMP
+	COMPAT_OBJS += compat/mkdtemp.o
+endif
+ifdef NO_UNSETENV
+	COMPAT_CFLAGS += -DNO_UNSETENV
+	COMPAT_OBJS += compat/unsetenv.o
+endif
+ifdef NO_SYS_SELECT_H
+	BASIC_CFLAGS += -DNO_SYS_SELECT_H
+endif
+ifdef NO_MMAP
+	COMPAT_CFLAGS += -DNO_MMAP
+	COMPAT_OBJS += compat/mmap.o
+else
+	ifdef USE_WIN32_MMAP
+		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
+		COMPAT_OBJS += compat/win32mmap.o
+	endif
+endif
+ifdef NO_PREAD
+	COMPAT_CFLAGS += -DNO_PREAD
+	COMPAT_OBJS += compat/pread.o
+endif
+ifdef NO_FAST_WORKING_DIRECTORY
+	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
+endif
+ifdef NO_TRUSTABLE_FILEMODE
+	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
+endif
+ifdef NO_IPV6
+	BASIC_CFLAGS += -DNO_IPV6
+endif
+ifdef NO_UINTMAX_T
+	BASIC_CFLAGS += -Duintmax_t=uint32_t
+endif
+ifdef NO_SOCKADDR_STORAGE
+ifdef NO_IPV6
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
+else
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
+endif
+endif
+ifdef NO_INET_NTOP
+	LIB_OBJS += compat/inet_ntop.o
+endif
+ifdef NO_INET_PTON
+	LIB_OBJS += compat/inet_pton.o
+endif
+
+ifdef NO_ICONV
+	BASIC_CFLAGS += -DNO_ICONV
+endif
+
+ifdef OLD_ICONV
+	BASIC_CFLAGS += -DOLD_ICONV
+endif
+
+ifdef NO_DEFLATE_BOUND
+	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
+endif
+
+ifdef PPC_SHA1
+	SHA1_HEADER = "ppc/sha1.h"
+	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
+ifdef ARM_SHA1
+	SHA1_HEADER = "arm/sha1.h"
+	LIB_OBJS += arm/sha1.o arm/sha1_arm.o
+else
+ifdef MOZILLA_SHA1
+	SHA1_HEADER = "mozilla-sha1/sha1.h"
+	LIB_OBJS += mozilla-sha1/sha1.o
+else
+	SHA1_HEADER = <openssl/sha.h>
+	EXTLIBS += $(LIB_4_CRYPTO)
+endif
+endif
+endif
+ifdef NO_PERL_MAKEMAKER
+	export NO_PERL_MAKEMAKER
+endif
+ifdef NO_HSTRERROR
+	COMPAT_CFLAGS += -DNO_HSTRERROR
+	COMPAT_OBJS += compat/hstrerror.o
+endif
+ifdef NO_MEMMEM
+	COMPAT_CFLAGS += -DNO_MEMMEM
+	COMPAT_OBJS += compat/memmem.o
+endif
+ifdef INTERNAL_QSORT
+	COMPAT_CFLAGS += -DINTERNAL_QSORT
+	COMPAT_OBJS += compat/qsort.o
+endif
+ifdef RUNTIME_PREFIX
+	COMPAT_CFLAGS += -DRUNTIME_PREFIX
+endif
+
+ifdef DIR_HAS_BSD_GROUP_SEMANTICS
+	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
+endif
+ifdef NO_EXTERNAL_GREP
+	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
+endif
+
+ifeq ($(PERL_PATH),)
+NO_PERL=NoThanks
+endif
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_CC       = @echo '   ' CC $@;
+	QUIET_AR       = @echo '   ' AR $@;
+	QUIET_LINK     = @echo '   ' LINK $@;
+	QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
+	QUIET_GEN      = @echo '   ' GEN $@;
+	QUIET_SUBDIR0  = +@subdir=
+	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			 $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+	export QUIET_GEN
+	export QUIET_BUILT_IN
+endif
+endif
+
+ifdef ASCIIDOC8
+	export ASCIIDOC8
+endif
+
+# Shell quote (do not use $(call) to accommodate ancient setups);
+
+SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
+ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+mandir_SQ = $(subst ','\'',$(mandir))
+infodir_SQ = $(subst ','\'',$(infodir))
+perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
+template_dir_SQ = $(subst ','\'',$(template_dir))
+htmldir_SQ = $(subst ','\'',$(htmldir))
+prefix_SQ = $(subst ','\'',$(prefix))
+
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
+
+LIBS = $(PERFLIBS) $(EXTLIBS)
+
+BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
+	$(COMPAT_CFLAGS)
+LIB_OBJS += $(COMPAT_OBJS)
+
+ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_LDFLAGS += $(BASIC_LDFLAGS)
+
+export TAR INSTALL DESTDIR SHELL_PATH
+
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+endif
+
+all::
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+	@$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) perf$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
+
+perf.o: perf.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		$(ALL_CFLAGS) -c $(filter %.c,$^)
+
+perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
+		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+
+builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
+		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(BUILT_INS): perf$X
+	$(QUIET_BUILT_IN)$(RM) $@ && \
+	ln perf$X $@ 2>/dev/null || \
+	ln -s perf$X $@ 2>/dev/null || \
+	cp perf$X $@
+
+common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+common-cmds.h: $(wildcard Documentation/perf-*.txt)
+	$(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
+	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
+	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+	    $@.sh >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+configure: configure.ac
+	$(QUIET_GEN)$(RM) $@ $<+ && \
+	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
+	    $< > $<+ && \
+	autoconf -o $@ $<+ && \
+	$(RM) $<+
+
+# These can record PERF_VERSION
+perf.o perf.spec \
+	$(patsubst %.sh,%,$(SCRIPT_SH)) \
+	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
+	: PERF-VERSION-FILE
+
+%.o: %.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+%.s: %.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
+%.o: %.S
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+
+util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
+		'-DBINDIR="$(bindir_relative_SQ)"' \
+		'-DPREFIX="$(prefix_SQ)"' \
+		$<
+
+builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
+
+util/config.o: util/config.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+perf-%$X: %.o $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+builtin-revert.o wt-status.o: wt-status.h
+
+$(LIB_FILE): $(LIB_OBJS)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+doc:
+	$(MAKE) -C Documentation all
+
+man:
+	$(MAKE) -C Documentation man
+
+html:
+	$(MAKE) -C Documentation html
+
+info:
+	$(MAKE) -C Documentation info
+
+pdf:
+	$(MAKE) -C Documentation pdf
+
+TAGS:
+	$(RM) TAGS
+	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
+
+tags:
+	$(RM) tags
+	$(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+
+cscope:
+	$(RM) cscope*
+	$(FIND) . -name '*.[hcS]' -print | xargs cscope -b
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
+             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+
+PERF-CFLAGS: .FORCE-PERF-CFLAGS
+	@FLAGS='$(TRACK_CFLAGS)'; \
+	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
+		echo 1>&2 "    * new build flags or prefix"; \
+		echo "$$FLAGS" >PERF-CFLAGS; \
+            fi
+
+# We need to apply sq twice, once to protect from the shell
+# that runs PERF-BUILD-OPTIONS, and then again to protect it
+# and the first level quoting from the shell that runs "echo".
+PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
+	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
+	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
+	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
+	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
+
+### Testing rules
+
+#
+# None right now:
+#
+# TEST_PROGRAMS += test-something$X
+
+all:: $(TEST_PROGRAMS)
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
+
+export NO_SVN_TESTS
+
+check: common-cmds.h
+	if sparse; \
+	then \
+		for i in *.c */*.c; \
+		do \
+			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+		done; \
+	else \
+		echo >&2 "Did you mean 'make test'?"; \
+		exit 1; \
+	fi
+
+remove-dashes:
+	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
+
+### Installation rules
+
+ifneq ($(filter /%,$(firstword $(template_dir))),)
+template_instdir = $(template_dir)
+else
+template_instdir = $(prefix)/$(template_dir)
+endif
+export template_instdir
+
+ifneq ($(filter /%,$(firstword $(perfexecdir))),)
+perfexec_instdir = $(perfexecdir)
+else
+perfexec_instdir = $(prefix)/$(perfexecdir)
+endif
+perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+export perfexec_instdir
+
+install: all
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+ifdef BUILT_INS
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
+endif
+endif
+
+install-doc:
+	$(MAKE) -C Documentation install
+
+install-man:
+	$(MAKE) -C Documentation install-man
+
+install-html:
+	$(MAKE) -C Documentation install-html
+
+install-info:
+	$(MAKE) -C Documentation install-info
+
+install-pdf:
+	$(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+	$(MAKE) -C Documentation quick-install
+
+quick-install-man:
+	$(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+	$(MAKE) -C Documentation quick-install-html
+
+
+### Maintainer's dist rules
+#
+# None right now
+#
+#
+# perf.spec: perf.spec.in
+#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
+#	mv $@+ $@
+#
+# PERF_TARNAME=perf-$(PERF_VERSION)
+# dist: perf.spec perf-archive$(X) configure
+#	./perf-archive --format=tar \
+#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+#	@mkdir -p $(PERF_TARNAME)
+#	@cp perf.spec configure $(PERF_TARNAME)
+#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+#	$(TAR) rf $(PERF_TARNAME).tar \
+#		$(PERF_TARNAME)/perf.spec \
+#		$(PERF_TARNAME)/configure \
+#		$(PERF_TARNAME)/version
+#	@$(RM) -r $(PERF_TARNAME)
+#	gzip -f -9 $(PERF_TARNAME).tar
+#
+# htmldocs = perf-htmldocs-$(PERF_VERSION)
+# manpages = perf-manpages-$(PERF_VERSION)
+# dist-doc:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir .doc-tmp-dir
+#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+#	gzip -n -9 -f $(htmldocs).tar
+#	:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+#	$(MAKE) -C Documentation DESTDIR=./ \
+#		man1dir=../.doc-tmp-dir/man1 \
+#		man5dir=../.doc-tmp-dir/man5 \
+#		man7dir=../.doc-tmp-dir/man7 \
+#		install
+#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+#	gzip -n -9 -f $(manpages).tar
+#	$(RM) -r .doc-tmp-dir
+#
+# rpm: dist
+#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
+
+### Cleaning rules
+
+distclean: clean
+#	$(RM) configure
+
+clean:
+	$(RM) *.o */*.o $(LIB_FILE)
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
+	$(RM) $(TEST_PROGRAMS)
+	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
+	$(RM) -r autom4te.cache
+	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
+	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
+	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
+	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+	$(MAKE) -C Documentation/ clean
+	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
+
+.PHONY: all install clean strip
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
+.PHONY: .FORCE-PERF-BUILD-OPTIONS
+
+### Make sure built-ins do not have dups and are listed in perf.c
+#
+check-builtins::
+	./check-builtins.sh
+
+### Test suite coverage testing
+#
+# None right now
+#
+# .PHONY: coverage coverage-clean coverage-build coverage-report
+#
+# coverage:
+#	$(MAKE) coverage-build
+#	$(MAKE) coverage-report
+#
+# coverage-clean:
+#	rm -f *.gcda *.gcno
+#
+# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+#
+# coverage-build: coverage-clean
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+#		-j1 test
+#
+# coverage-report:
+#	gcov -b *.c */*.c
+#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
+#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+#		| tee coverage-untested-functions

+ 1356 - 0
tools/perf/builtin-annotate.c

@@ -0,0 +1,1356 @@
+/*
+ * builtin-annotate.c
+ *
+ * Builtin annotate command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char		const *input_name = "perf.data";
+static char		*vmlinux = "vmlinux";
+
+static char		default_sort_order[] = "comm,symbol";
+static char		*sort_order = default_sort_order;
+
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
+static int		verbose;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+static const char	*sym_hist_filter;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	char comm[16];
+};
+
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
+struct period_event {
+	struct perf_event_header header;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+	int nr;
+
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
+
+	nr = dso__load(dso, NULL, verbose);
+	if (nr < 0) {
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
+	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+	int err;
+
+	kernel_dso = dso__new("[kernel]", 0);
+	if (!kernel_dso)
+		return -1;
+
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
+
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
+	return err;
+}
+
+struct map {
+	struct list_head node;
+	__u64	 start;
+	__u64	 end;
+	__u64	 pgoff;
+	__u64	 (*map_ip)(struct map *, __u64);
+	struct dso	 *dso;
+};
+
+static __u64 map__map_ip(struct map *map, __u64 ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
+{
+	return ip;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		const char *filename = event->filename;
+
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(filename);
+		if (self->dso == NULL)
+			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %Lx-%Lx %Lx %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+	struct rb_node	 rb_node;
+	struct list_head maps;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = malloc(32);
+		if (self->comm)
+			snprintf(self->comm, 32, ":%d", self->pid);
+		INIT_LIST_HEAD(&self->maps);
+	}
+
+	return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
+
+	/*
+	 * Front-end cache - PID lookups come in blocks,
+	 * so most of the time we don't have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
+
+		if (th->pid == pid) {
+			last_match = th;
+			return th;
+		}
+
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
+	}
+
+	return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
+	list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, __u64 ip)
+{
+	struct map *pos;
+
+	if (self == NULL)
+		return NULL;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	__u64	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	char *header;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+	.header = "         Command:  Pid",
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+	.header		= "         Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	if (self->dso)
+		return fprintf(fp, "%-25s", self->dso->name);
+
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+	.header = "Shared Object            ",
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	__u64 ip_l, ip_r;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.header = "Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strncasecmp(tok, sd->name, strlen(tok)))
+			continue;
+
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+/*
+ * collect histogram counts
+ */
+static void hist_hit(struct hist_entry *he, __u64 ip)
+{
+	unsigned int sym_size, offset;
+	struct symbol *sym = he->sym;
+
+	he->count++;
+
+	if (!sym || !sym->hist)
+		return;
+
+	sym_size = sym->end - sym->start;
+	offset = ip - sym->start;
+
+	if (offset >= sym_size)
+		return;
+
+	sym->hist_sum++;
+	sym->hist[offset]++;
+
+	if (verbose >= 3)
+		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
+			(void *)(unsigned long)he->sym->start,
+			he->sym->name,
+			(void *)(unsigned long)ip, ip - he->sym->start,
+			sym->hist[offset]);
+}
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, __u64 ip, char level)
+{
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			hist_hit(he, ip);
+
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &output_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		if (he->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+	struct rb_root *tree = &hist;
+
+	if (sort__need_collapse)
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, tree);
+		output__insert_entry(n);
+	}
+}
+
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	__u64 ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total++;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
+	/*
+	 * We don't process them right now but they are fine:
+	 */
+
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
+{
+	char *line = NULL, *tmp, *tmp2;
+	unsigned int offset;
+	size_t line_len = 0;
+	__u64 line_ip;
+	int ret;
+	char *c;
+
+	if (getline(&line, &line_len, file) < 0) {
+		free(line);
+		return -1;
+	}
+	if (!line)
+		return -1;
+
+	c = strchr(line, '\n');
+	if (c)
+		*c = 0;
+
+	line_ip = -1;
+	offset = 0;
+	ret = -2;
+
+	/*
+	 * Strip leading spaces:
+	 */
+	tmp = line;
+	while (*tmp) {
+		if (*tmp != ' ')
+			break;
+		tmp++;
+	}
+
+	if (*tmp) {
+		/*
+		 * Parse hex addresses followed by ':'
+		 */
+		line_ip = strtoull(tmp, &tmp2, 16);
+		if (*tmp2 != ':')
+			line_ip = -1;
+	}
+
+	if (line_ip != -1) {
+		unsigned int hits = 0;
+		double percent = 0.0;
+		char *color = PERF_COLOR_NORMAL;
+
+		offset = line_ip - start;
+		if (offset < len)
+			hits = sym->hist[offset];
+
+		if (sym->hist_sum)
+			percent = 100.0 * hits / sym->hist_sum;
+
+		/*
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		else {
+			if (percent > 0.5)
+				color = PERF_COLOR_GREEN;
+		}
+
+		color_fprintf(stdout, color, " %7.2f", percent);
+		printf(" :	");
+		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
+	} else {
+		if (!*line)
+			printf("         :\n");
+		else
+			printf("         :	%s\n", line);
+	}
+
+	free(line);
+
+	return 0;
+}
+
+static void annotate_sym(struct dso *dso, struct symbol *sym)
+{
+	char *filename = dso->name;
+	__u64 start, end, len;
+	char command[PATH_MAX*2];
+	FILE *file;
+
+	if (!filename)
+		return;
+	if (dso == kernel_dso)
+		filename = vmlinux;
+
+	printf("\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose >= 2)
+		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+
+	start = sym->obj_start;
+	if (!start)
+		start = sym->start;
+
+	end = start + sym->end - sym->start + 1;
+	len = sym->end - sym->start;
+
+	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
+
+	if (verbose >= 3)
+		printf("doing: %s\n", command);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	while (!feof(file)) {
+		if (parse_line(file, sym, start, len) < 0)
+			break;
+	}
+
+	pclose(file);
+}
+
+static void find_annotations(void)
+{
+	struct rb_node *nd;
+	struct dso *dso;
+	int count = 0;
+
+	list_for_each_entry(dso, &dsos, node) {
+
+		for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
+			struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+
+			if (sym->hist) {
+				annotate_sym(dso, sym);
+				count++;
+			}
+		}
+	}
+
+	if (!count)
+		printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
+}
+
+static int __cmd_annotate(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf(" unknown events: %10ld\n", total_unknown);
+
+	if (dump_trace)
+		return 0;
+
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
+	if (verbose >= 2)
+		dsos__fprintf(stdout);
+
+	collapse__resort();
+	output__resort();
+
+	find_annotations();
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf annotate [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
+		    "symbol to annotate"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(annotate_usage, options);
+		}
+	}
+
+	free(str);
+}
+
+int cmd_annotate(int argc, const char **argv, const char *prefix)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+
+	setup_sorting();
+
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume that
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+
+		sym_hist_filter = argv[0];
+	}
+
+	if (!sym_hist_filter)
+		usage_with_options(annotate_usage, options);
+
+	setup_pager();
+
+	return __cmd_annotate();
+}
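
The remap/more loop in __cmd_annotate() above never maps perf.data whole; it slides a fixed window of pages across the file and re-maps whenever the read head gets too close to the window's end. A self-contained sketch of just that windowing arithmetic, with a stand-in process_one() in place of the real event dispatch and a MAX_EVENT bound assumed (neither is part of perf), might look like:

	#include <sys/mman.h>
	#include <unistd.h>

	#define WINDOW_PAGES	32	/* mirrors mmap_window above */
	#define MAX_EVENT	4096	/* assumed bound on one record's size */

	/* Stand-in: consume one record at buf+head, return its size. */
	static unsigned long process_one(char *buf, unsigned long head)
	{
		return 8;
	}

	static void stream_file(int fd, unsigned long file_size)
	{
		unsigned long page_size = (unsigned long)sysconf(_SC_PAGE_SIZE);
		unsigned long window = page_size * WINDOW_PAGES;
		unsigned long offset = 0, head = 0;
		char *buf;

	remap:
		buf = mmap(NULL, window, PROT_READ, MAP_SHARED, fd, offset);
		if (buf == MAP_FAILED)
			return;
	more:
		if (head + MAX_EVENT >= window) {
			/* slide forward, page-aligned, so that 'head'
			 * lands inside the new mapping: */
			unsigned long shift = page_size * (head / page_size);

			munmap(buf, window);
			offset += shift;
			head -= shift;
			goto remap;
		}
		head += process_one(buf, head);
		if (offset + head < file_size)
			goto more;
		munmap(buf, window);
	}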

+ 461 - 0
tools/perf/builtin-help.c

@@ -0,0 +1,461 @@
+/*
+ * builtin-help.c
+ *
+ * Builtin help command
+ */
+#include "util/cache.h"
+#include "builtin.h"
+#include "util/exec_cmd.h"
+#include "common-cmds.h"
+#include "util/parse-options.h"
+#include "util/run-command.h"
+#include "util/help.h"
+
+static struct man_viewer_list {
+	struct man_viewer_list *next;
+	char name[FLEX_ARRAY];
+} *man_viewer_list;
+
+static struct man_viewer_info_list {
+	struct man_viewer_info_list *next;
+	const char *info;
+	char name[FLEX_ARRAY];
+} *man_viewer_info_list;
+
+enum help_format {
+	HELP_FORMAT_MAN,
+	HELP_FORMAT_INFO,
+	HELP_FORMAT_WEB,
+};
+
+static int show_all = 0;
+static enum help_format help_format = HELP_FORMAT_MAN;
+static struct option builtin_help_options[] = {
+	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
+	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+	OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
+			HELP_FORMAT_WEB),
+	OPT_SET_INT('i', "info", &help_format, "show info page",
+			HELP_FORMAT_INFO),
+	OPT_END(),
+};
+
+static const char * const builtin_help_usage[] = {
+	"perf help [--all] [--man|--web|--info] [command]",
+	NULL
+};
+
+static enum help_format parse_help_format(const char *format)
+{
+	if (!strcmp(format, "man"))
+		return HELP_FORMAT_MAN;
+	if (!strcmp(format, "info"))
+		return HELP_FORMAT_INFO;
+	if (!strcmp(format, "web") || !strcmp(format, "html"))
+		return HELP_FORMAT_WEB;
+	die("unrecognized help format '%s'", format);
+}
+
+static const char *get_man_viewer_info(const char *name)
+{
+	struct man_viewer_info_list *viewer;
+
+	for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
+	{
+		if (!strcasecmp(name, viewer->name))
+			return viewer->info;
+	}
+	return NULL;
+}
+
+static int check_emacsclient_version(void)
+{
+	struct strbuf buffer = STRBUF_INIT;
+	struct child_process ec_process;
+	const char *argv_ec[] = { "emacsclient", "--version", NULL };
+	int version;
+
+	/* emacsclient prints its version number on stderr */
+	memset(&ec_process, 0, sizeof(ec_process));
+	ec_process.argv = argv_ec;
+	ec_process.err = -1;
+	ec_process.stdout_to_stderr = 1;
+	if (start_command(&ec_process)) {
+		fprintf(stderr, "Failed to start emacsclient.\n");
+		return -1;
+	}
+	strbuf_read(&buffer, ec_process.err, 20);
+	close(ec_process.err);
+
+	/*
+	 * Don't bother checking return value, because "emacsclient --version"
+	 * seems to always exit with code 1.
+	 */
+	finish_command(&ec_process);
+
+	if (prefixcmp(buffer.buf, "emacsclient")) {
+		fprintf(stderr, "Failed to parse emacsclient version.\n");
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_remove(&buffer, 0, strlen("emacsclient"));
+	version = atoi(buffer.buf);
+
+	if (version < 22) {
+		fprintf(stderr,
+			"emacsclient version '%d' too old (< 22).\n",
+			version);
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_release(&buffer);
+	return 0;
+}
+
+static void exec_woman_emacs(const char* path, const char *page)
+{
+	if (!check_emacsclient_version()) {
+		/* This works only with emacsclient version >= 22. */
+		struct strbuf man_page = STRBUF_INIT;
+
+		if (!path)
+			path = "emacsclient";
+		strbuf_addf(&man_page, "(woman \"%s\")", page);
+		execlp(path, "emacsclient", "-e", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_konqueror(const char* path, const char *page)
+{
+	const char *display = getenv("DISPLAY");
+	if (display && *display) {
+		struct strbuf man_page = STRBUF_INIT;
+		const char *filename = "kfmclient";
+
+		/* It's simpler to launch konqueror using kfmclient. */
+		if (path) {
+			const char *file = strrchr(path, '/');
+			if (file && !strcmp(file + 1, "konqueror")) {
+				char *new = strdup(path);
+				char *dest = strrchr(new, '/');
+
+				/* strlen("konqueror") == strlen("kfmclient") */
+				strcpy(dest + 1, "kfmclient");
+				path = new;
+			}
+			if (file)
+				filename = file;
+		} else
+			path = "kfmclient";
+		strbuf_addf(&man_page, "man:%s(1)", page);
+		execlp(path, filename, "newTab", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_man(const char* path, const char *page)
+{
+	if (!path)
+		path = "man";
+	execlp(path, "man", page, NULL);
+	warning("failed to exec '%s': %s", path, strerror(errno));
+}
+
+static void exec_man_cmd(const char *cmd, const char *page)
+{
+	struct strbuf shell_cmd = STRBUF_INIT;
+	strbuf_addf(&shell_cmd, "%s %s", cmd, page);
+	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
+	warning("failed to exec '%s': %s", cmd, strerror(errno));
+}
+
+static void add_man_viewer(const char *name)
+{
+	struct man_viewer_list **p = &man_viewer_list;
+	size_t len = strlen(name);
+
+	while (*p)
+		p = &((*p)->next);
+	*p = calloc(1, (sizeof(**p) + len + 1));
+	strncpy((*p)->name, name, len);
+}
+
+static int supported_man_viewer(const char *name, size_t len)
+{
+	return (!strncasecmp("man", name, len) ||
+		!strncasecmp("woman", name, len) ||
+		!strncasecmp("konqueror", name, len));
+}
+
+static void do_add_man_viewer_info(const char *name,
+				   size_t len,
+				   const char *value)
+{
+	struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
+
+	strncpy(new->name, name, len);
+	new->info = strdup(value);
+	new->next = man_viewer_info_list;
+	man_viewer_info_list = new;
+}
+
+static int add_man_viewer_path(const char *name,
+			       size_t len,
+			       const char *value)
+{
+	if (supported_man_viewer(name, len))
+		do_add_man_viewer_info(name, len, value);
+	else
+		warning("'%s': path for unsupported man viewer.\n"
+			"Please consider using 'man.<tool>.cmd' instead.",
+			name);
+
+	return 0;
+}
+
+static int add_man_viewer_cmd(const char *name,
+			      size_t len,
+			      const char *value)
+{
+	if (supported_man_viewer(name, len))
+		warning("'%s': cmd for supported man viewer.\n"
+			"Please consider using 'man.<tool>.path' instead.",
+			name);
+	else
+		do_add_man_viewer_info(name, len, value);
+
+	return 0;
+}
+
+static int add_man_viewer_info(const char *var, const char *value)
+{
+	const char *name = var + 4;
+	const char *subkey = strrchr(name, '.');
+
+	if (!subkey)
+		return error("Config with no key for man viewer: %s", name);
+
+	if (!strcmp(subkey, ".path")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_path(name, subkey - name, value);
+	}
+	if (!strcmp(subkey, ".cmd")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_cmd(name, subkey - name, value);
+	}
+
+	warning("'%s': unsupported man viewer sub key.", subkey);
+	return 0;
+}
+
+static int perf_help_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.format")) {
+		if (!value)
+			return config_error_nonbool(var);
+		help_format = parse_help_format(value);
+		return 0;
+	}
+	if (!strcmp(var, "man.viewer")) {
+		if (!value)
+			return config_error_nonbool(var);
+		add_man_viewer(value);
+		return 0;
+	}
+	if (!prefixcmp(var, "man."))
+		return add_man_viewer_info(var, value);
+
+	return perf_default_config(var, value, cb);
+}
+
+static struct cmdnames main_cmds, other_cmds;
+
+void list_common_cmds_help(void)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts(" The most commonly used perf commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %s   ", common_cmds[i].name);
+		mput_char(' ', longest - strlen(common_cmds[i].name));
+		puts(common_cmds[i].help);
+	}
+}
+
+static int is_perf_command(const char *s)
+{
+	return is_in_cmdlist(&main_cmds, s) ||
+		is_in_cmdlist(&other_cmds, s);
+}
+
+static const char *prepend(const char *prefix, const char *cmd)
+{
+	size_t pre_len = strlen(prefix);
+	size_t cmd_len = strlen(cmd);
+	char *p = malloc(pre_len + cmd_len + 1);
+	memcpy(p, prefix, pre_len);
+	strcpy(p + pre_len, cmd);
+	return p;
+}
+
+static const char *cmd_to_page(const char *perf_cmd)
+{
+	if (!perf_cmd)
+		return "perf";
+	else if (!prefixcmp(perf_cmd, "perf"))
+		return perf_cmd;
+	else
+		return prepend("perf-", perf_cmd);
+}
+
+static void setup_man_path(void)
+{
+	struct strbuf new_path = STRBUF_INIT;
+	const char *old_path = getenv("MANPATH");
+
+	/* We should always put ':' after our path. If there is no
+	 * old_path, the ':' at the end will let 'man' try
+	 * system-wide paths after ours to find the manual page. If
+	 * there is old_path, we need ':' as delimiter. */
+	strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
+	strbuf_addch(&new_path, ':');
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+
+	setenv("MANPATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+static void exec_viewer(const char *name, const char *page)
+{
+	const char *info = get_man_viewer_info(name);
+
+	if (!strcasecmp(name, "man"))
+		exec_man_man(info, page);
+	else if (!strcasecmp(name, "woman"))
+		exec_woman_emacs(info, page);
+	else if (!strcasecmp(name, "konqueror"))
+		exec_man_konqueror(info, page);
+	else if (info)
+		exec_man_cmd(info, page);
+	else
+		warning("'%s': unknown man viewer.", name);
+}
+
+static void show_man_page(const char *perf_cmd)
+{
+	struct man_viewer_list *viewer;
+	const char *page = cmd_to_page(perf_cmd);
+	const char *fallback = getenv("PERF_MAN_VIEWER");
+
+	setup_man_path();
+	for (viewer = man_viewer_list; viewer; viewer = viewer->next)
+	{
+		exec_viewer(viewer->name, page); /* will return when unable */
+	}
+	if (fallback)
+		exec_viewer(fallback, page);
+	exec_viewer("man", page);
+	die("no man viewer handled the request");
+}
+
+static void show_info_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
+	execlp("info", "info", "perfman", page, NULL);
+}
+
+static void get_html_page_path(struct strbuf *page_path, const char *page)
+{
+	struct stat st;
+	const char *html_path = system_path(PERF_HTML_PATH);
+
+	/* Check that we have a perf documentation directory. */
+	if (stat(mkpath("%s/perf.html", html_path), &st)
+	    || !S_ISREG(st.st_mode))
+		die("'%s': not a documentation directory.", html_path);
+
+	strbuf_init(page_path, 0);
+	strbuf_addf(page_path, "%s/%s.html", html_path, page);
+}
+
+/*
+ * If open_html is not defined in a platform-specific way (see for
+ * example compat/mingw.h), we use the script web--browse to display
+ * HTML.
+ */
+#ifndef open_html
+static void open_html(const char *path)
+{
+	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
+}
+#endif
+
+static void show_html_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	struct strbuf page_path; /* it leaks but we exec below */
+
+	get_html_page_path(&page_path, page);
+
+	open_html(page_path.buf);
+}
+
+int cmd_help(int argc, const char **argv, const char *prefix)
+{
+	const char *alias;
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	perf_config(perf_help_config, NULL);
+
+	argc = parse_options(argc, argv, builtin_help_options,
+			builtin_help_usage, 0);
+
+	if (show_all) {
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_commands("perf commands", &main_cmds, &other_cmds);
+		printf(" %s\n\n", perf_more_info_string);
+		return 0;
+	}
+
+	if (!argv[0]) {
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n %s\n\n", perf_more_info_string);
+		return 0;
+	}
+
+	alias = alias_lookup(argv[0]);
+	if (alias && !is_perf_command(argv[0])) {
+		printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
+		return 0;
+	}
+
+	switch (help_format) {
+	case HELP_FORMAT_MAN:
+		show_man_page(argv[0]);
+		break;
+	case HELP_FORMAT_INFO:
+		show_info_page(argv[0]);
+		break;
+	case HELP_FORMAT_WEB:
+		show_html_page(argv[0]);
+		break;
+	}
+
+	return 0;
+}
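
add_man_viewer() above appends to a singly-linked list by walking a pointer-to-pointer, so the empty list and the tail need no separate cases. A minimal self-contained sketch of the same idiom (the names here are illustrative, not perf's):

	#include <stdio.h>
	#include <stdlib.h>

	struct node {
		struct node *next;
		char name[32];
	};

	static struct node *list;

	static void append(const char *name)
	{
		struct node **p = &list;

		while (*p)			/* walk to the NULL tail slot */
			p = &(*p)->next;
		*p = calloc(1, sizeof(**p));	/* fill that slot in place */
		if (*p)
			snprintf((*p)->name, sizeof((*p)->name), "%s", name);
	}

	int main(void)
	{
		append("man");
		append("konqueror");
		for (struct node *n = list; n; n = n->next)
			printf("%s\n", n->name);
		return 0;
	}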

+ 20 - 0
tools/perf/builtin-list.c

@@ -0,0 +1,20 @@
+/*
+ * builtin-list.c
+ *
+ * Builtin list command: list all event types
+ *
+ * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+int cmd_list(int argc, const char **argv, const char *prefix)
+{
+	print_events();
+	return 0;
+}

+ 582 - 0
tools/perf/builtin-record.c

@@ -0,0 +1,582 @@
+/*
+ * builtin-record.c
+ *
+ * Builtin record command: Record the profile of a workload
+ * (or a CPU, or a PID) into the perf.data output file - for
+ * later analysis via perf report.
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+#include "util/string.h"
+
+#include <unistd.h>
+#include <sched.h>
+
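+/*
+ * ALIGN rounds x up to the next multiple of a, which must be a power
+ * of two: for example, ALIGN(13, 8) == (13 + 7) & ~7 == 16.
+ */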
+#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
+
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static long			default_interval		= 100000;
+
+static int			nr_cpus				= 0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 128;
+static int			freq				= 0;
+static int			output;
+static const char		*output_name			= "perf.data";
+static int			group				= 0;
+static unsigned int		realtime_prio			= 0;
+static int			system_wide			= 0;
+static pid_t			target_pid			= -1;
+static int			inherit				= 1;
+static int			force				= 0;
+static int			append_file			= 0;
+static int			verbose				= 0;
+
+static long			samples;
+static struct timeval		last_read;
+static struct timeval		this_read;
+
+static __u64			bytes_written;
+
+static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
+
+static int			nr_poll;
+static int			nr_cpu;
+
+struct mmap_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	__u64				start;
+	__u64				len;
+	__u64				pgoff;
+	char				filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	char				comm[16];
+};
+
+
+struct mmap_data {
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
+};
+
+static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb(); /* pairs with the kernel's barrier after it updates data_head */
+
+	return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	unsigned long size;
+	void *buf;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and mess up the samples under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	if (old != head)
+		samples++;
+
+	size = head - old;
+
+	if ((old & md->mask) + size != (head & md->mask)) {
+		buf = &data[old & md->mask];
+		size = md->mask + 1 - (old & md->mask);
+		old += size;
+
+		while (size) {
+			int ret = write(output, buf, size);
+
+			if (ret < 0)
+				die("failed to write");
+
+			size -= ret;
+			buf += ret;
+
+			bytes_written += ret;
+		}
+	}
+
+	buf = &data[old & md->mask];
+	size = head - old;
+	old += size;
+
+	while (size) {
+		int ret = write(output, buf, size);
+
+		if (ret < 0)
+			die("failed to write");
+
+		size -= ret;
+		buf += ret;
+
+		bytes_written += ret;
+	}
+
+	md->prev = old;
+}
+
+static volatile int done = 0;
+static volatile int signr = -1;
+
+static void sig_handler(int sig)
+{
+	done = 1;
+	signr = sig;
+}
+
+static void sig_atexit(void)
+{
+	if (signr == -1)
+		return;
+
+	signal(signr, SIG_DFL);
+	kill(getpid(), signr);
+}
+
+static void pid_synthesize_comm_event(pid_t pid, int full)
+{
+	struct comm_event comm_ev;
+	char filename[PATH_MAX];
+	char bf[BUFSIZ];
+	int fd, ret;
+	size_t size;
+	char *field, *sep;
+	DIR *tasks;
+	struct dirent dirent, *next;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	ret = read(fd, bf, sizeof(bf) - 1);
+	if (ret < 0) {
+		fprintf(stderr, "couldn't read %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	bf[ret] = '\0'; /* read() does not NUL-terminate for us */
+	close(fd);
+
+	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
+	memset(&comm_ev, 0, sizeof(comm_ev));
+	field = strchr(bf, '(');
+	if (field == NULL)
+		goto out_failure;
+	sep = strchr(++field, ')');
+	if (sep == NULL)
+		goto out_failure;
+	size = sep - field;
+	memcpy(comm_ev.comm, field, size++);
+
+	comm_ev.pid = pid;
+	comm_ev.header.type = PERF_EVENT_COMM;
+	size = ALIGN(size, sizeof(__u64));
+	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
+
+	if (!full) {
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		return;
+	}
+
+	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
+
+	tasks = opendir(filename);
+	if (tasks == NULL)
+		goto out_failure;
+	while (!readdir_r(tasks, &dirent, &next) && next) {
+		char *end;
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end)
+			continue;
+
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+	}
+	closedir(tasks);
+	return;
+
+out_failure:
+	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+		filename);
+	exit(EXIT_FAILURE);
+}
+
+static void pid_synthesize_mmap_samples(pid_t pid)
+{
+	char filename[PATH_MAX];
+	FILE *fp;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
+
+	fp = fopen(filename, "r");
+	if (fp == NULL) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	while (1) {
+		char bf[BUFSIZ], *pbf = bf;
+		struct mmap_event mmap_ev = {
+			.header.type = PERF_EVENT_MMAP,
+		};
+		int n;
+		size_t size;
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			break;
+
+		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+		n = hex2u64(pbf, &mmap_ev.start);
+		if (n < 0)
+			continue;
+		pbf += n + 1;
+		n = hex2u64(pbf, &mmap_ev.len);
+		if (n < 0)
+			continue;
+		pbf += n + 3;
+		if (*pbf == 'x') { /* vm_exec */
+			char *execname = strrchr(bf, ' ');
+
+			if (execname == NULL || execname[1] != '/')
+				continue;
+
+			execname += 1;
+			size = strlen(execname);
+			execname[size - 1] = '\0'; /* Remove \n */
+			memcpy(mmap_ev.filename, execname, size);
+			size = ALIGN(size, sizeof(__u64));
+			mmap_ev.len -= mmap_ev.start;
+			mmap_ev.header.size = (sizeof(mmap_ev) -
+					       (sizeof(mmap_ev.filename) - size));
+			mmap_ev.pid = pid;
+			mmap_ev.tid = pid;
+
+			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+		}
+	}
+
+	fclose(fp);
+}
+
+static void synthesize_samples(void)
+{
+	DIR *proc;
+	struct dirent dirent, *next;
+
+	proc = opendir("/proc");
+
+	while (!readdir_r(proc, &dirent, &next) && next) {
+		char *end;
+		pid_t pid;
+
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end) /* only interested in proper numerical dirents */
+			continue;
+
+		pid_synthesize_comm_event(pid, 1);
+		pid_synthesize_mmap_samples(pid);
+	}
+
+	closedir(proc);
+}
+
+static int group_fd;
+
+static void create_counter(int counter, int cpu, pid_t pid)
+{
+	struct perf_counter_attr *attr = attrs + counter;
+	int track = 1;
+
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	if (freq) {
+		attr->sample_type	|= PERF_SAMPLE_PERIOD;
+		attr->freq		= 1;
+		attr->sample_freq	= freq;
+	}
+	attr->mmap		= track;
+	attr->comm		= track;
+	attr->inherit		= (cpu < 0) && inherit;
+	attr->disabled		= 1;
+
+	track = 0; /* only the first counter needs these */
+
+try_again:
+	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
+
+	if (fd[nr_cpu][counter] < 0) {
+		int err = errno;
+
+		if (err == EPERM)
+			die("Permission error - are you root?\n");
+
+		/*
+		 * If it's cycles then fall back to hrtimer
+		 * based cpu-clock-tick sw counter, which
+		 * is always available even without PMU support:
+		 */
+		if (attr->type == PERF_TYPE_HARDWARE
+			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+
+			if (verbose)
+				warning(" ... trying to fall back to cpu-clock-ticks\n");
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
+			goto try_again;
+		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[nr_cpu][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+	}
+
+	assert(fd[nr_cpu][counter] >= 0);
+	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[nr_cpu][counter];
+
+	event_array[nr_poll].fd = fd[nr_cpu][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[nr_cpu][counter].counter = counter;
+	mmap_array[nr_cpu][counter].prev = 0;
+	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+		exit(-1);
+	}
+
+	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
+}
+
+static void open_counters(int cpu, pid_t pid)
+{
+	int counter;
+
+	if (pid > 0) {
+		pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_samples(pid);
+	}
+
+	group_fd = -1;
+	for (counter = 0; counter < nr_counters; counter++)
+		create_counter(counter, cpu, pid);
+
+	nr_cpu++;
+}
+
+static int __cmd_record(int argc, const char **argv)
+{
+	int i, counter;
+	struct stat st;
+	pid_t pid;
+	int flags;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (!stat(output_name, &st) && !force && !append_file) {
+		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
+				output_name);
+		exit(-1);
+	}
+
+	flags = O_CREAT|O_RDWR;
+	if (append_file)
+		flags |= O_APPEND;
+	else
+		flags |= O_TRUNC;
+
+	output = open(output_name, flags, S_IRUSR|S_IWUSR);
+	if (output < 0) {
+		perror("failed to create output file");
+		exit(-1);
+	}
+
+	if (!system_wide) {
+		open_counters(-1, target_pid != -1 ? target_pid : getpid());
+	} else for (i = 0; i < nr_cpus; i++)
+		open_counters(i, target_pid);
+
+	atexit(sig_atexit);
+	signal(SIGCHLD, sig_handler);
+	signal(SIGINT, sig_handler);
+
+	if (target_pid == -1 && argc) {
+		pid = fork();
+		if (pid < 0)
+			perror("failed to fork");
+
+		if (!pid) {
+			if (execvp(argv[0], (char **)argv)) {
+				perror(argv[0]);
+				exit(-1);
+			}
+		}
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	if (system_wide)
+		synthesize_samples();
+
+	while (!done) {
+		int hits = samples;
+
+		for (i = 0; i < nr_cpu; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == samples)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	/*
+	 * Approximate RIP event size: 24 bytes.
+	 */
+	fprintf(stderr,
+		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
+		(double)bytes_written / 1024.0 / 1024.0,
+		output_name,
+		bytes_written / 24);
+
+	return 0;
+}
+
+static const char * const record_usage[] = {
+	"perf record [<options>] [<command>]",
+	"perf record [<options>] -- <command> [<options>]",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "record events on existing pid"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('A', "append", &append_file,
+			    "append to the output file to do incremental profiling"),
+	OPT_BOOLEAN('f', "force", &force,
+			"overwrite existing data file"),
+	OPT_LONG('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
+	OPT_END()
+};
+
+int cmd_record(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	argc = parse_options(argc, argv, options, record_usage, 0);
+	if (!argc && target_pid == -1 && !system_wide)
+		usage_with_options(record_usage, options);
+
+	if (!nr_counters)
+		nr_counters = 1;
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (attrs[counter].sample_period)
+			continue;
+
+		attrs[counter].sample_period = default_interval;
+	}
+
+	return __cmd_record(argc, argv);
+}
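
mmap_read() above drains the kernel's ring buffer from the last-seen position up to data_head, and when that byte range crosses the end of the power-of-two buffer it splits the copy in two. A distilled sketch of that wrap-around handling, with a hypothetical consume() standing in for the write(output, ...) calls:

	static void consume(const unsigned char *buf, unsigned long size)
	{
		/* the real tool write()s this range out to perf.data */
	}

	/* data: ring of mask+1 bytes (mask = size - 1, size a power of two) */
	static void drain_ring(unsigned char *data, unsigned int mask,
			       unsigned int *prev, unsigned int head)
	{
		unsigned int old = *prev;

		if ((old & mask) + (head - old) != (head & mask)) {
			/* the range wraps: first the piece up to the end */
			unsigned long chunk = mask + 1 - (old & mask);

			consume(&data[old & mask], chunk);
			old += chunk;
		}
		/* then the (possibly empty) remainder from the start */
		consume(&data[old & mask], head - old);
		*prev = head;
	}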

+ 1316 - 0
tools/perf/builtin-report.c

@@ -0,0 +1,1316 @@
+/*
+ * builtin-report.c
+ *
+ * Builtin report command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
+
+static char		default_sort_order[] = "comm,dso";
+static char		*sort_order = default_sort_order;
+
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
+static int		verbose;
+static int		full_paths;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+	__u64 period;
+};
+
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	char comm[16];
+};
+
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
+struct period_event {
+	struct perf_event_header header;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+	int nr;
+
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
+
+	nr = dso__load(dso, NULL, verbose);
+	if (nr < 0) {
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
+	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+	int err;
+
+	kernel_dso = dso__new("[kernel]", 0);
+	if (!kernel_dso)
+		return -1;
+
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
+
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
+	return err;
+}
+
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
+{
+	int n = 0;
+
+	while (n < cwdlen && pathname[n] == cwd[n])
+		++n;
+
+	return n;
+}
+
+struct map {
+	struct list_head node;
+	__u64	 start;
+	__u64	 end;
+	__u64	 pgoff;
+	__u64	 (*map_ip)(struct map *, __u64);
+	struct dso	 *dso;
+};
+
+static __u64 map__map_ip(struct map *map, __u64 ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
+{
+	return ip;
+}
+
+static inline int is_anon_memory(const char *filename)
+{
+	return strcmp(filename, "//anon") == 0;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		const char *filename = event->filename;
+		char newfilename[PATH_MAX];
+		int anon;
+
+		if (cwd) {
+			int n = strcommon(filename);
+
+			if (n == cwdlen) {
+				snprintf(newfilename, sizeof(newfilename),
+					 ".%s", filename + n);
+				filename = newfilename;
+			}
+		}
+
+		anon = is_anon_memory(filename);
+
+		if (anon) {
+			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
+			filename = newfilename;
+		}
+
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(filename);
+		if (self->dso == NULL)
+			goto out_delete;
+
+		if (self->dso == vdso || anon)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %Lx-%Lx %Lx %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+	struct rb_node	 rb_node;
+	struct list_head maps;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = malloc(32);
+		if (self->comm)
+			snprintf(self->comm, 32, ":%d", self->pid);
+		INIT_LIST_HEAD(&self->maps);
+	}
+
+	return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
+
+	/*
+	 * Front-end cache - PID lookups come in blocks,
+	 * so most of the time we don't have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
+
+		if (th->pid == pid) {
+			last_match = th;
+			return th;
+		}
+
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
+	}
+
+	return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
+	list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, __u64 ip)
+{
+	struct map *pos;
+
+	if (self == NULL)
+		return NULL;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	__u64		 ip;
+	char		 level;
+
+	__u64		 count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	char *header;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+	.header = "         Command:  Pid",
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+	.header		= "         Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	if (self->dso)
+		return fprintf(fp, "%-25s", self->dso->name);
+
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+	.header = "Shared Object            ",
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	__u64 ip_l, ip_r;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.header = "Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strncasecmp(tok, sd->name, strlen(tok)))
+			continue;
+
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
+{
+	struct sort_entry *se;
+	size_t ret;
+
+	if (total_samples) {
+		double percent = self->count * 100.0 / total_samples;
+		char *color = PERF_COLOR_NORMAL;
+
+		/*
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
+		 */
+		if (percent >= 5.0) {
+			color = PERF_COLOR_RED;
+		} else {
+			if (percent >= 0.5)
+				color = PERF_COLOR_GREEN;
+		}
+
+		ret = color_fprintf(fp, color, "   %6.2f%%", percent);
+	} else
+		ret = fprintf(fp, "%12Ld ", self->count);
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		fprintf(fp, "  ");
+		ret += se->print(fp, self);
+	}
+
+	ret += fprintf(fp, "\n");
+
+	return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, __u64 ip, char level, __u64 count)
+{
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= count,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			he->count += count;
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * Collapse the histogram: merge entries that are equal under the
+ * collapse keys (e.g. all threads sharing the same comm), before the
+ * final sort on count:
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &output_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		if (he->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+	struct rb_root *tree = &hist;
+
+	if (sort__need_collapse)
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, tree);
+		output__insert_entry(n);
+	}
+}
+
+static size_t output__fprintf(FILE *fp, __u64 total_samples)
+{
+	struct hist_entry *pos;
+	struct sort_entry *se;
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	fprintf(fp, "\n");
+	fprintf(fp, "#\n");
+	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
+	fprintf(fp, "#\n");
+
+	fprintf(fp, "# Overhead");
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		fprintf(fp, "  %s", se->header);
+	fprintf(fp, "\n");
+
+	fprintf(fp, "# ........");
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int i;
+
+		fprintf(fp, "  ");
+		for (i = 0; i < strlen(se->header); i++)
+			fprintf(fp, ".");
+	}
+	fprintf(fp, "\n");
+
+	fprintf(fp, "#\n");
+
+	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct hist_entry, rb_node);
+		ret += hist_entry__fprintf(fp, pos, total_samples);
+	}
+
+	if (!strcmp(sort_order, default_sort_order)) {
+		fprintf(fp, "#\n");
+		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
+		fprintf(fp, "#\n");
+	}
+	fprintf(fp, "\n");
+
+	return ret;
+}
+
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	__u64 ip = event->ip.ip;
+	__u64 period = 1;
+	struct map *map = NULL;
+
+	if (event->header.type & PERF_SAMPLE_PERIOD)
+		period = event->ip.period;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip,
+		(long long)period);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total += period;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
+	/*
+	 * We don't process these right now, but they are fine:
+	 */
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+}
+
+static int __cmd_report(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		fprintf(stderr, " failed to open file: %s", input_name);
+		if (!strcmp(input_name, "perf.data"))
+			fprintf(stderr, "  (try 'perf record' first)");
+		fprintf(stderr, "\n");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+	if (!full_paths) {
+		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else {
+		cwd = NULL;
+		cwdlen = 0;
+	}
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		total_unknown++;
+
+		/*
+		 * Assume we lost track of the stream, check alignment, and
+		 * advance by a single u64 in the hope of catching up again
+		 * 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf(" unknown events: %10ld\n", total_unknown);
+
+	if (dump_trace)
+		return 0;
+
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
+	if (verbose >= 2)
+		dsos__fprintf(stdout);
+
+	collapse__resort();
+	output__resort();
+	output__fprintf(stdout, total);
+
+	return rc;
+}
+
+static const char * const report_usage[] = {
+	"perf report [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+	OPT_BOOLEAN('P', "full-paths", &full_paths,
+		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(report_usage, options);
+		}
+	}
+
+	free(str);
+}
+
+int cmd_report(int argc, const char **argv, const char *prefix)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, report_usage, 0);
+
+	setup_sorting();
+
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(report_usage, options);
+
+	setup_pager();
+
+	return __cmd_report();
+}

+ 367 - 0
tools/perf/builtin-stat.c

@@ -0,0 +1,367 @@
+/*
+ * builtin-stat.c
+ *
+ * Builtin stat command: Give a precise performance counter summary
+ * overview of any workload, CPU or specific PID.
+ *
+ * Sample output:
+
+   $ perf stat ~/hackbench 10
+   Time: 0.104
+
+    Performance counter stats for '/home/mingo/hackbench':
+
+       1255.538611  task clock ticks     #      10.143 CPU utilization factor
+             54011  context switches     #       0.043 M/sec
+               385  CPU migrations       #       0.000 M/sec
+             17755  pagefaults           #       0.014 M/sec
+        3808323185  CPU cycles           #    3033.219 M/sec
+        1575111190  instructions         #    1254.530 M/sec
+          17367895  cache references     #      13.833 M/sec
+           7674421  cache misses         #       6.112 M/sec
+
+    Wall-clock time elapsed:   123.786620 msecs
+
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "perf.h"
+#include "builtin.h"
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#include <sys/prctl.h>
+
+static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
+
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS	},
+
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES	},
+
+};
+
+static int			system_wide			=  0;
+static int			inherit				=  1;
+static int			verbose				=  0;
+
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int			target_pid			= -1;
+static int			nr_cpus				=  0;
+static unsigned int		page_size;
+
+static int			scale				=  1;
+
+static const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static __u64			event_res[MAX_COUNTERS][3];
+static __u64			event_scaled[MAX_COUNTERS];
+
+static __u64			runtime_nsecs;
+static __u64			walltime_nsecs;
+static __u64			runtime_cycles;
+
+static void create_perf_stat_counter(int counter)
+{
+	struct perf_counter_attr *attr = attrs + counter;
+
+	if (scale)
+		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+				    PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu++) {
+			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0 && verbose) {
+				printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno));
+			}
+		}
+	} else {
+		attr->inherit	= inherit;
+		attr->disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
+		if (fd[0][counter] < 0 && verbose) {
+			printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[0][counter], strerror(errno));
+		}
+	}
+}
+
+/*
+ * Does the counter have nsecs as a unit?
+ */
+static inline int nsec_counter(int counter)
+{
+	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
+		return 0;
+
+	if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
+		return 1;
+
+	if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read out the results of a single counter:
+ */
+static void read_counter(int counter)
+{
+	__u64 *count, single_count[3];
+	ssize_t res;
+	int cpu, nv;
+	int scaled;
+
+	count = event_res[counter];
+
+	count[0] = count[1] = count[2] = 0;
+
+	nv = scale ? 3 : 1;
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (fd[cpu][counter] < 0)
+			continue;
+
+		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
+		assert(res == nv * sizeof(__u64));
+
+		count[0] += single_count[0];
+		if (scale) {
+			count[1] += single_count[1];
+			count[2] += single_count[2];
+		}
+	}
+
+	scaled = 0;
+	if (scale) {
+		if (count[2] == 0) {
+			event_scaled[counter] = -1;
+			count[0] = 0;
+			return;
+		}
+
+		if (count[2] < count[1]) {
+			event_scaled[counter] = 1;
+			count[0] = (unsigned long long)
+				((double)count[0] * count[1] / count[2] + 0.5);
+		}
+	}
+	/*
+	 * Save the full runtime - to allow normalization during printout:
+	 */
+	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+		runtime_nsecs = count[0];
+	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
+		attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
+		runtime_cycles = count[0];
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 *count;
+	int scaled;
+
+	count = event_res[counter];
+	scaled = event_scaled[counter];
+
+	if (scaled == -1) {
+		fprintf(stderr, " %14s  %-20s\n",
+			"<not counted>", event_name(counter));
+		return;
+	}
+
+	if (nsec_counter(counter)) {
+		double msecs = (double)count[0] / 1000000;
+
+		fprintf(stderr, " %14.6f  %-20s",
+			msecs, event_name(counter));
+		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+			attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
+
+			if (walltime_nsecs)
+				fprintf(stderr, " # %11.3f CPU utilization factor",
+					(double)count[0] / (double)walltime_nsecs);
+		}
+	} else {
+		fprintf(stderr, " %14Ld  %-20s",
+			count[0], event_name(counter));
+		if (runtime_nsecs)
+			fprintf(stderr, " # %11.3f M/sec",
+				(double)count[0]/runtime_nsecs*1000.0);
+		if (runtime_cycles &&
+			attrs[counter].type == PERF_TYPE_HARDWARE &&
+				attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
+
+			fprintf(stderr, " # %1.3f per cycle",
+				(double)count[0] / (double)runtime_cycles);
+		}
+	}
+	if (scaled)
+		fprintf(stderr, "  (scaled from %.2f%%)",
+			(double) count[2] / count[1] * 100);
+	fprintf(stderr, "\n");
+}
+
+static int do_perf_stat(int argc, const char **argv)
+{
+	unsigned long long t0, t1;
+	int counter;
+	int status;
+	int pid;
+	int i;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perf_stat_counter(counter);
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+
+	if (!pid) {
+		if (execvp(argv[0], (char **)argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+
+	while (wait(&status) >= 0)
+		;
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	walltime_nsecs = t1 - t0;
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
+
+	for (i = 1; i < argc; i++)
+		fprintf(stderr, " %s", argv[i]);
+
+	fprintf(stderr, "\':\n");
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++)
+		read_counter(counter);
+
+	for (counter = 0; counter < nr_counters; counter++)
+		print_counter(counter);
+
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
+
+static volatile int signr = -1;
+
+static void skip_signal(int signo)
+{
+	signr = signo;
+}
+
+static void sig_atexit(void)
+{
+	if (signr == -1)
+		return;
+
+	signal(signr, SIG_DFL);
+	kill(getpid(), signr);
+}
+
+static const char * const stat_usage[] = {
+	"perf stat [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "stat events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('S', "scale", &scale,
+			    "scale/normalize counters"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
+	OPT_END()
+};
+
+int cmd_stat(int argc, const char **argv, const char *prefix)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	memcpy(attrs, default_attrs, sizeof(attrs));
+
+	argc = parse_options(argc, argv, options, stat_usage, 0);
+	if (!argc)
+		usage_with_options(stat_usage, options);
+
+	if (!nr_counters)
+		nr_counters = 8;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	/*
+	 * We don't want to block the signals - that would cause
+	 * child tasks to inherit them and Ctrl-C would not work.
+	 * What we want is for Ctrl-C to work in the exec()-ed
+	 * task, but to be ignored by perf stat itself:
+	 */
+	atexit(sig_atexit);
+	signal(SIGINT,  skip_signal);
+	signal(SIGALRM, skip_signal);
+	signal(SIGABRT, skip_signal);
+
+	return do_perf_stat(argc, argv);
+}

+ 736 - 0
tools/perf/builtin-top.c

@@ -0,0 +1,736 @@
+/*
+ * builtin-top.c
+ *
+ * Builtin top command: Display a continuously updated profile of
+ * any workload, CPU or specific PID.
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/symbol.h"
+#include "util/color.h"
+#include "util/util.h"
+#include "util/rbtree.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#include <assert.h>
+#include <fcntl.h>
+
+#include <stdio.h>
+
+#include <errno.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int			system_wide			=  0;
+
+static int			default_interval		= 100000;
+
+static int			count_filter			=  5;
+static int			print_entries			= 15;
+
+static int			target_pid			= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static unsigned int		realtime_prio			=  0;
+static int			group				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 16;
+static int			freq				=  0;
+static int			verbose				=  0;
+
+static char			*sym_filter;
+static unsigned long		filter_start;
+static unsigned long		filter_end;
+
+static int			delay_secs			=  2;
+static int			zero;
+static int			dump_symtab;
+
+/*
+ * Symbols
+ */
+
+static __u64			min_ip;
+static __u64			max_ip = -1ll;
+
+struct sym_entry {
+	struct rb_node		rb_node;
+	struct list_head	node;
+	unsigned long		count[MAX_COUNTERS];
+	unsigned long		snap_count;
+	double			weight;
+	int			skip;
+};
+
+struct sym_entry		*sym_filter_entry;
+
+struct dso			*kernel_dso;
+
+/*
+ * Symbols are added here in record_ip() and removed again once
+ * their counts have decayed away:
+ */
+static LIST_HEAD(active_symbols);
+static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Ordering weight: count-1 * count-2 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight = sym->snap_count;
+	int counter;
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static long			samples;
+static long			userspace_samples;
+static const char		CONSOLE_CLEAR[] = "\033[H\033[2J";
+
+static void __list_insert_active_sym(struct sym_entry *syme)
+{
+	list_add(&syme->node, &active_symbols);
+}
+
+static void list_remove_active_sym(struct sym_entry *syme)
+{
+	pthread_mutex_lock(&active_symbols_lock);
+	list_del_init(&syme->node);
+	pthread_mutex_unlock(&active_symbols_lock);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+	struct rb_node **p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct sym_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_entry, rb_node);
+
+		if (se->weight > iter->weight)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&se->rb_node, parent, p);
+	rb_insert_color(&se->rb_node, tree);
+}
+
+static void print_sym_table(void)
+{
+	int printed = 0, j;
+	int counter;
+	float samples_per_sec = samples/delay_secs;
+	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float sum_ksamples = 0.0;
+	struct sym_entry *syme, *n;
+	struct rb_root tmp = RB_ROOT;
+	struct rb_node *nd;
+
+	samples = userspace_samples = 0;
+
+	/* Sort the active symbols */
+	pthread_mutex_lock(&active_symbols_lock);
+	syme = list_entry(active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+		syme->snap_count = syme->count[0];
+		if (syme->snap_count != 0) {
+			syme->weight = sym_weight(syme);
+			rb_insert_active_sym(&tmp, syme);
+			sum_ksamples += syme->snap_count;
+
+			for (j = 0; j < nr_counters; j++)
+				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
+		} else
+			list_remove_active_sym(syme);
+	}
+
+	puts(CONSOLE_CLEAR);
+
+	printf(
+"------------------------------------------------------------------------------\n");
+	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
+		samples_per_sec,
+		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
+
+	if (nr_counters == 1) {
+		printf("%Ld", attrs[0].sample_period);
+		if (freq)
+			printf("Hz ");
+		else
+			printf(" ");
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (counter)
+			printf("/");
+
+		printf("%s", event_name(counter));
+	}
+
+	printf( "], ");
+
+	if (target_pid != -1)
+		printf(" (target_pid: %d", target_pid);
+	else
+		printf(" (all");
+
+	if (profile_cpu != -1)
+		printf(", cpu: %d)\n", profile_cpu);
+	else {
+		if (target_pid != -1)
+			printf(")\n");
+		else
+			printf(", %d CPUs)\n", nr_cpus);
+	}
+
+	printf("------------------------------------------------------------------------------\n\n");
+
+	if (nr_counters == 1)
+		printf("             samples    pcnt");
+	else
+		printf("  weight     samples    pcnt");
+
+	printf("         RIP          kernel function\n"
+	       	       "  ______     _______   _____   ________________   _______________\n\n"
+	);
+
+	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
+		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+		struct symbol *sym = (struct symbol *)(syme + 1);
+		char *color = PERF_COLOR_NORMAL;
+		double pcnt;
+
+		if (++printed > print_entries || syme->snap_count < count_filter)
+			continue;
+
+		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
+					 sum_ksamples));
+
+		/*
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
+		 */
+		if (pcnt >= 5.0) {
+			color = PERF_COLOR_RED;
+		} else {
+			if (pcnt >= 0.5)
+				color = PERF_COLOR_GREEN;
+		}
+
+		if (nr_counters == 1)
+			printf("%20.2f - ", syme->weight);
+		else
+			printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
+
+		color_fprintf(stdout, color, "%4.1f%%", pcnt);
+		printf(" - %016llx : %s\n", sym->start, sym->name);
+	}
+}
+
+static void *display_thread(void *arg)
+{
+	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+	int delay_msecs = delay_secs * 1000;
+
+	printf("PerfTop refresh period: %d seconds\n", delay_secs);
+
+	do {
+		print_sym_table();
+	} while (poll(&stdin_poll, 1, delay_msecs) == 0);
+
+	printf("key pressed - exiting.\n");
+	exit(0);
+
+	return NULL;
+}
+
+static int symbol_filter(struct dso *self, struct symbol *sym)
+{
+	static int filter_match;
+	struct sym_entry *syme;
+	const char *name = sym->name;
+
+	if (!strcmp(name, "_text") ||
+	    !strcmp(name, "_etext") ||
+	    !strcmp(name, "_sinittext") ||
+	    !strncmp("init_module", name, 11) ||
+	    !strncmp("cleanup_module", name, 14) ||
+	    strstr(name, "_text_start") ||
+	    strstr(name, "_text_end"))
+		return 1;
+
+	syme = dso__sym_priv(self, sym);
+	/* Tag samples to be skipped. */
+	if (!strcmp("default_idle", name) ||
+	    !strcmp("cpu_idle", name) ||
+	    !strcmp("enter_idle", name) ||
+	    !strcmp("exit_idle", name) ||
+	    !strcmp("mwait_idle", name))
+		syme->skip = 1;
+
+	if (filter_match == 1) {
+		filter_end = sym->start;
+		filter_match = -1;
+		if (filter_end - filter_start > 10000) {
+			fprintf(stderr,
+				"hm, too large filter symbol <%s> - skipping.\n",
+				sym_filter);
+			fprintf(stderr, "symbol filter start: %016lx\n",
+				filter_start);
+			fprintf(stderr, "                end: %016lx\n",
+				filter_end);
+			filter_end = filter_start = 0;
+			sym_filter = NULL;
+			sleep(1);
+		}
+	}
+
+	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
+		filter_match = 1;
+		filter_start = sym->start;
+	}
+
+
+	return 0;
+}
+
+static int parse_symbols(void)
+{
+	struct rb_node *node;
+	struct symbol  *sym;
+
+	kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
+	if (kernel_dso == NULL)
+		return -1;
+
+	if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
+		goto out_delete_dso;
+
+	node = rb_first(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	min_ip = sym->start;
+
+	node = rb_last(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	max_ip = sym->end;
+
+	if (dump_symtab)
+		dso__fprintf(kernel_dso, stderr);
+
+	return 0;
+
+out_delete_dso:
+	dso__delete(kernel_dso);
+	kernel_dso = NULL;
+	return -1;
+}
+
+#define TRACE_COUNT     3
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(__u64 ip, int counter)
+{
+	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
+
+	if (sym != NULL) {
+		struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
+
+		if (!syme->skip) {
+			syme->count[counter]++;
+			pthread_mutex_lock(&active_symbols_lock);
+			if (list_empty(&syme->node) || !syme->node.next)
+				__list_insert_active_sym(syme);
+			pthread_mutex_unlock(&active_symbols_lock);
+			return;
+		}
+	}
+
+	samples--;
+}
+
+static void process_event(__u64 ip, int counter)
+{
+	samples++;
+
+	if (ip < min_ip || ip > max_ip) {
+		userspace_samples++;
+		return;
+	}
+
+	record_ip(ip, counter);
+}
+
+struct mmap_data {
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
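+	/*
+	 * Read barrier: make sure we see the ring-buffer data that the
+	 * kernel published before it advanced data_head:
+	 */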
+	rmb();
+
+	return head;
+}
+
+struct timeval last_read, this_read;
+
+static void mmap_read_counter(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and mess up the samples under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	for (; old != head;) {
+		struct ip_event {
+			struct perf_event_header header;
+			__u64 ip;
+			__u32 pid, target_pid;
+		};
+		struct mmap_event {
+			struct perf_event_header header;
+			__u32 pid, target_pid;
+			__u64 start;
+			__u64 len;
+			__u64 pgoff;
+			char filename[PATH_MAX];
+		};
+
+		typedef union event_union {
+			struct perf_event_header header;
+			struct ip_event ip;
+			struct mmap_event mmap;
+		} event_t;
+
+		event_t *event = (event_t *)&data[old & md->mask];
+
+		event_t event_copy;
+
+		size_t size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = min(sizeof(*event), size), cpy;
+			void *dst = &event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &event_copy;
+		}
+
+		old += size;
+
+		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+			if (event->header.type & PERF_SAMPLE_IP)
+				process_event(event->ip.ip, md->counter);
+		}
+	}
+
+	md->prev = old;
+}
+
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static void mmap_read(void)
+{
+	int i, counter;
+
+	for (i = 0; i < nr_cpus; i++) {
+		for (counter = 0; counter < nr_counters; counter++)
+			mmap_read_counter(&mmap_array[i][counter]);
+	}
+}
+
+int nr_poll;
+int group_fd;
+
+static void start_counter(int i, int counter)
+{
+	struct perf_counter_attr *attr;
+	unsigned int cpu;
+
+	cpu = profile_cpu;
+	if (target_pid == -1 && profile_cpu == -1)
+		cpu = i;
+
+	attr = attrs + counter;
+
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr->freq		= freq;
+
+try_again:
+	fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+
+	if (fd[i][counter] < 0) {
+		int err = errno;
+
+		if (err == EPERM)
+			die("No permission - are you root?\n");
+		/*
+		 * If it's cycles then fall back to hrtimer
+		 * based cpu-clock-tick sw counter, which
+		 * is always available even if no PMU support:
+		 */
+		if (attr->type == PERF_TYPE_HARDWARE
+			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+
+			if (verbose)
+				warning(" ... trying to fall back to cpu-clock-ticks\n");
+
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
+			goto try_again;
+		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[i][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+		exit(-1);
+	}
+	assert(fd[i][counter] >= 0);
+	fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[i][counter];
+
+	event_array[nr_poll].fd = fd[i][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[i][counter].counter = counter;
+	mmap_array[i][counter].prev = 0;
+	mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[i][counter], 0);
+	if (mmap_array[i][counter].base == MAP_FAILED)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+}
+
+static int __cmd_top(void)
+{
+	pthread_t thread;
+	int i, counter;
+	int ret;
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++)
+			start_counter(i, counter);
+	}
+
+	/* Wait for a minimal set of events before starting the snapshot */
+	poll(event_array, nr_poll, 100);
+
+	mmap_read();
+
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	while (1) {
+		int hits = samples;
+
+		mmap_read();
+
+		if (hits == samples)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
+
+static const char * const top_usage[] = {
+	"perf top [<options>]",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "profile events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_INTEGER('C', "CPU", &profile_cpu,
+		    "CPU to profile on"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_INTEGER('d', "delay", &delay_secs,
+		    "number of seconds to delay between refreshes"),
+	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+			    "dump the symbol table used for profiling"),
+	OPT_INTEGER('f', "count-filter", &count_filter,
+		    "only display functions with more events than this"),
+	OPT_BOOLEAN('g', "group", &group,
+			    "put the counters into a counter group"),
+	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
+		    "only display symbols matchig this pattern"),
+	OPT_BOOLEAN('z', "zero", &zero,
+		    "zero history across updates"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
+	OPT_INTEGER('E', "entries", &print_entries,
+		    "display this many functions"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
+	OPT_END()
+};
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	argc = parse_options(argc, argv, options, top_usage, 0);
+	if (argc)
+		usage_with_options(top_usage, options);
+
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
+
+	/* CPU and PID are mutually exclusive */
+	if (target_pid != -1 && profile_cpu != -1) {
+		printf("WARNING: PID switch overriding CPU\n");
+		sleep(1);
+		profile_cpu = -1;
+	}
+
+	if (!nr_counters)
+		nr_counters = 1;
+
+	if (delay_secs < 1)
+		delay_secs = 1;
+
+	parse_symbols();
+
+	/*
+	 * Fill in the ones not specifically initialized via -c:
+	 */
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (attrs[counter].sample_period)
+			continue;
+
+		attrs[counter].sample_period = default_interval;
+	}
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (target_pid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	return __cmd_top();
+}

+ 26 - 0
tools/perf/builtin.h

@@ -0,0 +1,26 @@
+#ifndef BUILTIN_H
+#define BUILTIN_H
+
+#include "util/util.h"
+#include "util/strbuf.h"
+
+extern const char perf_version_string[];
+extern const char perf_usage_string[];
+extern const char perf_more_info_string[];
+
+extern void list_common_cmds_help(void);
+extern const char *help_unknown_cmd(const char *cmd);
+extern void prune_packed_objects(int);
+extern int read_line_with_nul(char *buf, int size, FILE *file);
+extern int check_pager_config(const char *cmd);
+
+extern int cmd_annotate(int argc, const char **argv, const char *prefix);
+extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_record(int argc, const char **argv, const char *prefix);
+extern int cmd_report(int argc, const char **argv, const char *prefix);
+extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
+
+#endif

+ 10 - 0
tools/perf/command-list.txt

@@ -0,0 +1,10 @@
+#
+# List of known perf commands.
+# command name			category [deprecated] [common]
+#
+perf-annotate			mainporcelain common
+perf-list			mainporcelain common
+perf-record			mainporcelain common
+perf-report			mainporcelain common
+perf-stat			mainporcelain common
+perf-top			mainporcelain common

+ 442 - 0
tools/perf/design.txt

@@ -0,0 +1,442 @@
+
+Performance Counters for Linux
+------------------------------
+
+Performance counters are special hardware registers available on most modern
+CPUs. These registers count the number of certain types of hw events, such
+as instructions executed, cache misses suffered, or branches mis-predicted -
+without slowing down the kernel or applications. These registers can also
+trigger interrupts when a threshold number of events has passed - and can
+thus be used to profile the code that runs on that CPU.
+
+The Linux Performance Counter subsystem provides an abstraction of these
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.  It
+provides "virtual" 64-bit counters, regardless of the width of the
+underlying hardware counters.
+
+Performance counters are accessed via special file descriptors.
+There's one file descriptor per virtual counter used.
+
+The special file descriptor is opened via the perf_counter_open()
+system call:
+
+   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+			     pid_t pid, int cpu, int group_fd,
+			     unsigned long flags);
+
+The syscall returns the new fd. The fd can be used via the normal
+VFS system calls: read() can be used to read the counter, fcntl()
+can be used to set the blocking mode, etc.
+
+Multiple counters can be kept open at a time, and the counters
+can be poll()ed.
+
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+struct perf_counter_hw_event {
+        /*
+         * The MSB of the config word signifies if the rest contains cpu
+         * specific (raw) counter configuration data; if unset, the next
+         * 7 bits are an event type and the rest of the bits are the event
+         * identifier.
+         */
+        __u64                   config;
+
+        __u64                   irq_period;
+        __u32                   record_type;
+        __u32                   read_format;
+
+        __u64                   disabled       :  1, /* off by default        */
+                                inherit        :  1, /* children inherit it   */
+                                pinned         :  1, /* must always be on PMU */
+                                exclusive      :  1, /* only group on PMU     */
+                                exclude_user   :  1, /* don't count user      */
+                                exclude_kernel :  1, /* ditto kernel          */
+                                exclude_hv     :  1, /* ditto hypervisor      */
+                                exclude_idle   :  1, /* don't count when idle */
+                                mmap           :  1, /* include mmap data     */
+                                munmap         :  1, /* include munmap data   */
+                                comm           :  1, /* include comm data     */
+
+                                __reserved_1   : 52;
+
+        __u32                   extra_config_len;
+        __u32                   wakeup_events;  /* wakeup every n events */
+
+        __u64                   __reserved_2;
+        __u64                   __reserved_3;
+};
+
+The 'config' field specifies what the counter should count.  It
+is divided into 3 bit-fields:
+
+raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000
+type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff
+
+If 'raw_type' is 1, then the counter will count a hardware event
+specified by the remaining 63 bits of 'config'.  The encoding is
+machine-specific.
+
+If 'raw_type' is 0, then the 'type' field says what kind of counter
+this is, with the following encoding:
+
+enum perf_event_types {
+	PERF_TYPE_HARDWARE		= 0,
+	PERF_TYPE_SOFTWARE		= 1,
+	PERF_TYPE_TRACEPOINT		= 2,
+};
+
+A counter of PERF_TYPE_HARDWARE will count the hardware event
+specified by 'event_id':
+
+/*
+ * Generalized performance counter event types, used by the hw_event.event_id
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_ids {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+};
+
+These are standardized types of events that work relatively uniformly
+on all CPUs that implement Performance Counters support under Linux,
+although there may be variations (e.g., different CPUs might count
+cache references and misses at different levels of the cache hierarchy).
+If a CPU is not able to count the selected event, then the system call
+will return -EINVAL.
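+
+As a minimal, illustrative sketch (assuming the era's
+<linux/perf_counter.h> for the structure and enum definitions, and that
+__NR_perf_counter_open is provided by the installed kernel headers),
+opening a counting counter for the current task and reading it could
+look like:
+
+#include <linux/perf_counter.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int main(void)
+{
+	struct perf_counter_hw_event hw_event;
+	unsigned long long count;
+	int fd;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	/*
+	 * raw_type = 0 and type = PERF_TYPE_HARDWARE (0), so the config
+	 * word is simply the event_id:
+	 */
+	hw_event.config = PERF_COUNT_HW_INSTRUCTIONS;
+
+	/* pid 0, cpu -1: a per task counter for the current task: */
+	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
+	if (fd < 0) {
+		perror("perf_counter_open");
+		return 1;
+	}
+	if (read(fd, &count, sizeof(count)) == sizeof(count))
+		printf("instructions: %llu\n", count);
+	close(fd);
+	return 0;
+}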
+
+More hw_event_types are supported as well, but they are CPU-specific
+and accessed as raw events.  For example, to count "External bus
+cycles while bus lock signal asserted" events on Intel Core CPUs, pass
+in a 0x4064 event_id value and set hw_event.raw_type to 1.
+
+A counter of type PERF_TYPE_SOFTWARE will count one of the available
+software events, selected by 'event_id':
+
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum sw_event_ids {
+	PERF_COUNT_SW_CPU_CLOCK		= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
+};
+
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
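+
+Such an id can then be placed in the low bits of 'config' (a sketch;
+the sched_switch path is only an example):
+
+	/* id read from e.g. /debug/tracing/events/sched/sched_switch/id */
+	hw_event.config = ((__u64)PERF_TYPE_TRACEPOINT << 56) | tracepoint_id;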
+
+
+Counters come in two flavours: counting counters and sampling
+counters.  A "counting" counter is one used simply to count the number
+of events that occur, and is characterised by having irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possibly
+additional values as specified by 'read_format'; each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
+        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
+};
+
+Using these additional values, one can establish the overcommit ratio for a
+particular counter, allowing one to take the round-robin scheduling effect
+into account.
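+
+For example, with both format bits set, read() returns three u64
+values, and the count can be scaled up to estimate what it would have
+been had the counter stayed on the PMU the whole time (a sketch):
+
+	__u64 values[3];	/* count, time_enabled, time_running */
+	__u64 scaled = 0;
+
+	read(fd, values, sizeof(values));
+	if (values[2])		/* guard against a division by zero */
+		scaled = values[0] * values[1] / values[2];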
+
+
+A "sampling" counter is one that is set up to generate an interrupt
+every N events, where N is given by 'irq_period'.  A sampling counter
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
+
+/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+        PERF_RECORD_IP          = 1U << 0,
+        PERF_RECORD_TID         = 1U << 1,
+        PERF_RECORD_TIME        = 1U << 2,
+        PERF_RECORD_ADDR        = 1U << 3,
+        PERF_RECORD_GROUP       = 1U << 4,
+        PERF_RECORD_CALLCHAIN   = 1U << 5,
+};
+
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
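+
+Setting up a sampling counter thus amounts to choosing a period and
+the record bits, for instance (a sketch):
+
+	hw_event.config      = PERF_COUNT_HW_CPU_CYCLES;
+	hw_event.irq_period  = 100000;		/* one sample per 100k cycles */
+	hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;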
+
+The 'disabled' bit specifies whether the counter starts out disabled
+or enabled.  If it is initially disabled, it can be enabled by ioctl
+or prctl (see below).
+
+The 'inherit' bit, if set, specifies that this counter should count
+events on descendant tasks as well as the task specified.  This only
+applies to new descendants, not to any existing descendants at the
+time the counter is created (nor to any new descendants of existing
+descendants).
+
+The 'pinned' bit, if set, specifies that the counter should always be
+on the CPU if at all possible.  It only applies to hardware counters
+and only to group leaders.  If a pinned counter cannot be put onto the
+CPU (e.g. because there are not enough hardware counters or because of
+a conflict with some other event), then the counter goes into an
+'error' state, where reads return end-of-file (i.e. read() returns 0)
+until the counter is subsequently enabled or disabled.
+
+The 'exclusive' bit, if set, specifies that when this counter's group
+is on the CPU, it should be the only group using the CPU's counters.
+In future, this will allow sophisticated monitoring programs to supply
+extra configuration information via 'extra_config_len' to exploit
+advanced features of the CPU's Performance Monitor Unit (PMU) that are
+not otherwise accessible and that might disrupt other hardware
+counters.
+
+The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
+way to request that counting of events be restricted to times when the
+CPU is in user, kernel and/or hypervisor mode.
+
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations.  These can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone.  These
+events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
+
+The 'pid' parameter to the perf_counter_open() system call allows the
+counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so).
+
+ pid < 0: all tasks are counted (per-CPU counters).
+
+The 'cpu' parameter allows a counter to be made specific to a CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
+
+A 'pid > 0' and 'cpu == -1' counter is a per-task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets scheduled to.  Per-task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per-CPU counter that counts
+all events on CPU-x.  Per-CPU counters need CAP_SYS_ADMIN privilege.
+
+The 'flags' parameter is currently unused and must be zero.
+
+The 'group_fd' parameter allows counter "groups" to be set up.  A
+counter group has one counter which is the group "leader".  The leader
+is created first, with group_fd = -1 in the perf_counter_open call
+that creates it.  The rest of the group members are created
+subsequently, with group_fd giving the fd of the group leader.
+(A single counter on its own is created with group_fd = -1 and is
+considered to be a group with only 1 member.)
+
+A counter group is scheduled onto the CPU as a unit, that is, it will
+only be put onto the CPU if all of the counters in the group can be
+put onto the CPU.  This means that the values of the member counters
+can be meaningfully compared, added, divided (to get ratios), etc.,
+with each other, since they have counted events for the same set of
+executed instructions.
+
+
+As stated above, asynchronous events, like counter overflow or PROT_EXEC
+mmap tracking, are logged into a ring-buffer.  This ring-buffer is created
+and accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
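+
+For example, to map a meta-data page plus an 8-page (2^3) ring-buffer
+(a sketch, assuming 4096-byte pages):
+
+	struct perf_counter_mmap_page *pc;
+	void *base, *data;
+
+	base = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, fd, 0);
+	pc   = base;		/* first page: meta-data */
+	data = base + 4096;	/* remaining pages: ring-buffer */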
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+        __u32   version;                /* version number of this structure */
+        __u32   compat_version;         /* lowest version this is compat with */
+
+        /*
+         * Bits needed to read the hw counters in user-space.
+         *
+         *   u32 seq;
+         *   s64 count;
+         *
+         *   do {
+         *     seq = pc->lock;
+         *
+         *     barrier()
+         *     if (pc->index) {
+         *       count = pmc_read(pc->index - 1);
+         *       count += pc->offset;
+         *     } else
+         *       goto regular_read;
+         *
+         *     barrier();
+         *   } while (pc->lock != seq);
+         *
+         * NOTE: for obvious reasons this only works on self-monitoring
+         *       processes.
+         */
+        __u32   lock;                   /* seqlock for synchronization */
+        __u32   index;                  /* hardware counter identifier */
+        __s64   offset;                 /* add to hardware counter value */
+
+        /*
+         * Control data for the mmap() data buffer.
+         *
+         * User-space should issue an rmb(), on SMP-capable platforms,
+         * after reading this value -- see perf_counter_wakeup().
+         */
+        __u32   data_head;              /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+      implemented on powerpc.
+
+The following 2^n pages are the ring-buffer, which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL          (1 << 0)
+#define PERF_EVENT_MISC_USER            (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+
+struct perf_event_header {
+        __u32   type;
+        __u16   misc;
+        __u16   size;
+};
+
+enum perf_event_type {
+
+        /*
+         * The MMAP events record the PROT_EXEC mappings so that we can
+         * correlate userspace IPs to code. They have the following structure:
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      u64                             addr;
+         *      u64                             len;
+         *      u64                             pgoff;
+         *      char                            filename[];
+         * };
+         */
+        PERF_EVENT_MMAP                 = 1,
+        PERF_EVENT_MUNMAP               = 2,
+
+        /*
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      char                            comm[];
+         * };
+         */
+        PERF_EVENT_COMM                 = 3,
+
+        /*
+         * When header.misc & PERF_EVENT_MISC_OVERFLOW is set, the event_type
+         * field will be PERF_RECORD_*
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      { u64                   ip;       } && PERF_RECORD_IP
+         *      { u32                   pid, tid; } && PERF_RECORD_TID
+         *      { u64                   time;     } && PERF_RECORD_TIME
+         *      { u64                   addr;     } && PERF_RECORD_ADDR
+         *
+         *      { u64                   nr;
+         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
+         *
+         *      { u16                   nr,
+         *                              hv,
+         *                              kernel,
+         *                              user;
+         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+         * };
+         */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+      on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+via fcntl()-managed signals.
+
+Normally a notification is generated for every page filled; however, one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
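+
+A consumer might therefore block in poll() and then walk the newly
+filled region record by record (a sketch; 'data' points at the first
+ring-buffer page, 'mask' is the ring-buffer size minus 1, 'tail' is the
+consumer's own read position, and records wrapping around the buffer
+edge are ignored here):
+
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+	__u64 head;
+
+	poll(&pfd, 1, -1);
+	head = pc->data_head;
+	rmb();			/* see the data_head comment above */
+	while (tail < head) {
+		struct perf_event_header *hdr = data + (tail & mask);
+
+		/* ... process hdr->size bytes of record type hdr->type ... */
+		tail += hdr->size;
+	}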
+
+Future work will include a splice() interface to the ring-buffer.
+
+
+Counters can be enabled and disabled in two ways: via ioctl and via
+prctl.  When a counter is disabled, it doesn't count or generate
+events but does continue to exist and maintain its count value.
+
+An individual counter or counter group can be enabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+
+or disabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+Enabling or disabling the leader of a group enables or disables the
+whole group; that is, while the group leader is disabled, none of the
+counters in the group will count.  Enabling or disabling a member of a
+group other than the leader only affects that counter - disabling a
+non-leader stops that counter from counting but doesn't affect any
+other counter.
+
+Additionally, non-inherited overflow counters can use
+
+	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
+A process can enable or disable all the counter groups that are
+attached to it, using prctl:
+
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+
+This applies to all counters on the current process, whether created
+by this process or by another, and doesn't affect any counters that
+this process has created on other processes.  It only enables or
+disables the group leaders, not any other members in the groups.
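+
+Putting these pieces together, a self-monitoring task could bracket a
+region of interest like this (a sketch, continuing the counting
+counter example above):
+
+	hw_event.disabled = 1;			/* start out disabled */
+	fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+
+	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+	/* ... region of interest ... */
+	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+	read(fd, &count, sizeof(count));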
+

+ 428 - 0
tools/perf/perf.c

@@ -0,0 +1,428 @@
+/*
+ * perf.c
+ *
+ * Performance analysis utility.
+ *
+ * This is the main hub from which the sub-commands (perf stat,
+ * perf top, perf record, perf report, etc.) are started.
+ */
+#include "builtin.h"
+
+#include "util/exec_cmd.h"
+#include "util/cache.h"
+#include "util/quote.h"
+#include "util/run-command.h"
+
+const char perf_usage_string[] =
+	"perf [--version] [--help] COMMAND [ARGS]";
+
+const char perf_more_info_string[] =
+	"See 'perf help COMMAND' for more information on a specific command.";
+
+static int use_pager = -1;
+struct pager_config {
+	const char *cmd;
+	int val;
+};
+
+static int pager_command_config(const char *var, const char *value, void *data)
+{
+	struct pager_config *c = data;
+	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+		c->val = perf_config_bool(var, value);
+	return 0;
+}
+
+/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
+int check_pager_config(const char *cmd)
+{
+	struct pager_config c;
+	c.cmd = cmd;
+	c.val = -1;
+	perf_config(pager_command_config, &c);
+	return c.val;
+}
+
+static void commit_pager_choice(void)
+{
+	switch (use_pager) {
+	case 0:
+		setenv("PERF_PAGER", "cat", 1);
+		break;
+	case 1:
+		/* setup_pager(); */
+		break;
+	default:
+		break;
+	}
+}
+
+static int handle_options(const char ***argv, int *argc, int *envchanged)
+{
+	int handled = 0;
+
+	while (*argc > 0) {
+		const char *cmd = (*argv)[0];
+		if (cmd[0] != '-')
+			break;
+
+		/*
+		 * For legacy reasons, the "version" and "help"
+		 * commands can be written with "--" prepended
+		 * to make them look like flags.
+		 */
+		if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
+			break;
+
+		/*
+		 * Check remaining flags.
+		 */
+		if (!prefixcmp(cmd, "--exec-path")) {
+			cmd += 11;
+			if (*cmd == '=')
+				perf_set_argv_exec_path(cmd + 1);
+			else {
+				puts(perf_exec_path());
+				exit(0);
+			}
+		} else if (!strcmp(cmd, "--html-path")) {
+			puts(system_path(PERF_HTML_PATH));
+			exit(0);
+		} else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
+			use_pager = 1;
+		} else if (!strcmp(cmd, "--no-pager")) {
+			use_pager = 0;
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--perf-dir")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --perf-dir.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+			handled++;
+		} else if (!prefixcmp(cmd, "--perf-dir=")) {
+			setenv(PERF_DIR_ENVIRONMENT, cmd + 11, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--work-tree")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --work-tree.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+		} else if (!prefixcmp(cmd, "--work-tree=")) {
+			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else {
+			fprintf(stderr, "Unknown option: %s\n", cmd);
+			usage(perf_usage_string);
+		}
+
+		(*argv)++;
+		(*argc)--;
+		handled++;
+	}
+	return handled;
+}
+
+static int handle_alias(int *argcp, const char ***argv)
+{
+	int envchanged = 0, ret = 0, saved_errno = errno;
+	int count, option_count;
+	const char **new_argv;
+	const char *alias_command;
+	char *alias_string;
+
+	alias_command = (*argv)[0];
+	alias_string = alias_lookup(alias_command);
+	if (alias_string) {
+		if (alias_string[0] == '!') {
+			if (*argcp > 1) {
+				struct strbuf buf;
+
+				strbuf_init(&buf, PATH_MAX);
+				strbuf_addstr(&buf, alias_string);
+				sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+				free(alias_string);
+				alias_string = buf.buf;
+			}
+			ret = system(alias_string + 1);
+			if (ret >= 0 && WIFEXITED(ret) &&
+			    WEXITSTATUS(ret) != 127)
+				exit(WEXITSTATUS(ret));
+			die("Failed to run '%s' when expanding alias '%s'",
+			    alias_string + 1, alias_command);
+		}
+		count = split_cmdline(alias_string, &new_argv);
+		if (count < 0)
+			die("Bad alias.%s string", alias_command);
+		option_count = handle_options(&new_argv, &count, &envchanged);
+		if (envchanged)
+			die("alias '%s' changes environment variables\n"
+				 "You can use '!perf' in the alias to do this.",
+				 alias_command);
+		memmove(new_argv - option_count, new_argv,
+				count * sizeof(char *));
+		new_argv -= option_count;
+
+		if (count < 1)
+			die("empty alias for %s", alias_command);
+
+		if (!strcmp(alias_command, new_argv[0]))
+			die("recursive alias: %s", alias_command);
+
+		new_argv = realloc(new_argv, sizeof(char*) *
+				    (count + *argcp + 1));
+		/* insert after command name */
+		memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
+		new_argv[count+*argcp] = NULL;
+
+		*argv = new_argv;
+		*argcp += count - 1;
+
+		ret = 1;
+	}
+
+	errno = saved_errno;
+
+	return ret;
+}
+
+const char perf_version_string[] = PERF_VERSION;
+
+#define RUN_SETUP	(1<<0)
+#define USE_PAGER	(1<<1)
+/*
+ * require working tree to be present -- anything using this needs
+ * RUN_SETUP for reading from the configuration file.
+ */
+#define NEED_WORK_TREE	(1<<2)
+
+struct cmd_struct {
+	const char *cmd;
+	int (*fn)(int, const char **, const char *);
+	int option;
+};
+
+static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
+{
+	int status;
+	struct stat st;
+	const char *prefix;
+
+	prefix = NULL;
+	if (p->option & RUN_SETUP)
+		prefix = NULL; /* setup_perf_directory(); */
+
+	if (use_pager == -1 && p->option & RUN_SETUP)
+		use_pager = check_pager_config(p->cmd);
+	if (use_pager == -1 && p->option & USE_PAGER)
+		use_pager = 1;
+	commit_pager_choice();
+
+	if (p->option & NEED_WORK_TREE)
+		/* setup_work_tree() */;
+
+	status = p->fn(argc, argv, prefix);
+	if (status)
+		return status & 0xff;
+
+	/* Somebody closed stdout? */
+	if (fstat(fileno(stdout), &st))
+		return 0;
+	/* Ignore write errors for pipes and sockets.. */
+	if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
+		return 0;
+
+	/* Check for ENOSPC and EIO errors.. */
+	if (fflush(stdout))
+		die("write failure on standard output: %s", strerror(errno));
+	if (ferror(stdout))
+		die("unknown write failure on standard output");
+	if (fclose(stdout))
+		die("close failed on standard output: %s", strerror(errno));
+	return 0;
+}
+
+static void handle_internal_command(int argc, const char **argv)
+{
+	const char *cmd = argv[0];
+	static struct cmd_struct commands[] = {
+		{ "help", cmd_help, 0 },
+		{ "list", cmd_list, 0 },
+		{ "record", cmd_record, 0 },
+		{ "report", cmd_report, 0 },
+		{ "stat", cmd_stat, 0 },
+		{ "top", cmd_top, 0 },
+		{ "annotate", cmd_annotate, 0 },
+		{ "version", cmd_version, 0 },
+	};
+	int i;
+	static const char ext[] = STRIP_EXTENSION;
+
+	if (sizeof(ext) > 1) {
+		i = strlen(argv[0]) - strlen(ext);
+		if (i > 0 && !strcmp(argv[0] + i, ext)) {
+			char *argv0 = strdup(argv[0]);
+			argv[0] = cmd = argv0;
+			argv0[i] = '\0';
+		}
+	}
+
+	/* Turn "perf cmd --help" into "perf help cmd" */
+	if (argc > 1 && !strcmp(argv[1], "--help")) {
+		argv[1] = argv[0];
+		argv[0] = cmd = "help";
+	}
+
+	for (i = 0; i < ARRAY_SIZE(commands); i++) {
+		struct cmd_struct *p = commands+i;
+		if (strcmp(p->cmd, cmd))
+			continue;
+		exit(run_builtin(p, argc, argv));
+	}
+}
+
+static void execv_dashed_external(const char **argv)
+{
+	struct strbuf cmd = STRBUF_INIT;
+	const char *tmp;
+	int status;
+
+	strbuf_addf(&cmd, "perf-%s", argv[0]);
+
+	/*
+	 * argv[0] must be the perf command, but the argv array
+	 * belongs to the caller, and may be reused in
+	 * subsequent loop iterations. Save argv[0] and
+	 * restore it on error.
+	 */
+	tmp = argv[0];
+	argv[0] = cmd.buf;
+
+	/*
+	 * if we fail because the command is not found, it is
+	 * OK to return. Otherwise, we just pass along the status code.
+	 */
+	status = run_command_v_opt(argv, 0);
+	if (status != -ERR_RUN_COMMAND_EXEC) {
+		if (IS_RUN_COMMAND_ERR(status))
+			die("unable to run '%s'", argv[0]);
+		exit(-status);
+	}
+	errno = ENOENT; /* as if we called execvp */
+
+	argv[0] = tmp;
+
+	strbuf_release(&cmd);
+}
+
+static int run_argv(int *argcp, const char ***argv)
+{
+	int done_alias = 0;
+
+	while (1) {
+		/* See if it's an internal command */
+		handle_internal_command(*argcp, *argv);
+
+		/* .. then try the external ones */
+		execv_dashed_external(*argv);
+
+		/* It could be an alias -- this works around the insanity
+		 * of overriding "perf log" with "perf show" by having
+		 * alias.log = show
+		 */
+		if (done_alias || !handle_alias(argcp, argv))
+			break;
+		done_alias = 1;
+	}
+
+	return done_alias;
+}
+
+
+int main(int argc, const char **argv)
+{
+	const char *cmd;
+
+	cmd = perf_extract_argv0_path(argv[0]);
+	if (!cmd)
+		cmd = "perf-help";
+
+	/*
+	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
+	 *
+	 *  - cannot take flags in between the "perf" and the "xxxx".
+	 *  - cannot execute it externally (since it would just do
+	 *    the same thing over again)
+	 *
+	 * So we just directly call the internal command handler, and
+	 * die if that one cannot handle it.
+	 */
+	if (!prefixcmp(cmd, "perf-")) {
+		cmd += 5;
+		argv[0] = cmd;
+		handle_internal_command(argc, argv);
+		die("cannot handle %s internally", cmd);
+	}
+
+	/* Look for flags.. */
+	argv++;
+	argc--;
+	handle_options(&argv, &argc, NULL);
+	commit_pager_choice();
+	if (argc > 0) {
+		if (!prefixcmp(argv[0], "--"))
+			argv[0] += 2;
+	} else {
+		/* The user didn't specify a command; give them help */
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n %s\n\n", perf_more_info_string);
+		exit(1);
+	}
+	cmd = argv[0];
+
+	/*
+	 * We use PATH to find perf commands, but we prepend some higher
+	 * precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
+	 * environment, and the $(perfexecdir) from the Makefile at build
+	 * time.
+	 */
+	setup_path();
+
+	while (1) {
+		static int done_help = 0;
+		static int was_alias = 0;
+
+		was_alias = run_argv(&argc, &argv);
+		if (errno != ENOENT)
+			break;
+
+		if (was_alias) {
+			fprintf(stderr, "Expansion of alias '%s' failed; "
+				"'%s' is not a perf-command\n",
+				cmd, argv[0]);
+			exit(1);
+		}
+		if (!done_help) {
+			cmd = argv[0] = help_unknown_cmd(cmd);
+			done_help = 1;
+		} else
+			break;
+	}
+
+	fprintf(stderr, "Failed to run command '%s': %s\n",
+		cmd, strerror(errno));
+
+	return 1;
+}

+ 67 - 0
tools/perf/perf.h

@@ -0,0 +1,67 @@
+#ifndef _PERF_PERF_H
+#define _PERF_PERF_H
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory")
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/asm/unistd.h"
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory")
+#endif
+
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../../include/linux/perf_counter.h"
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#ifndef NSEC_PER_SEC
+# define NSEC_PER_SEC			1000000000ULL
+#endif
+
+static inline unsigned long long rdclock(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
+}
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+static inline int
+sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
+		      pid_t pid, int cpu, int group_fd,
+		      unsigned long flags)
+{
+	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
+		       group_fd, flags);
+}
+
+#define MAX_COUNTERS			256
+#define MAX_NR_CPUS			256
+
+#endif

+ 42 - 0
tools/perf/util/PERF-VERSION-GEN

@@ -0,0 +1,42 @@
+#!/bin/sh
+
+GVF=PERF-VERSION-FILE
+DEF_VER=v0.0.1.PERF
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+	VN=$(cat version) || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN="$DEF_VER"
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "PERF_VERSION = $VN"
+	echo "PERF_VERSION = $VN" >$GVF
+}
+
+

+ 117 - 0
tools/perf/util/abspath.c

@@ -0,0 +1,117 @@
+#include "cache.h"
+
+/*
+ * Do not use this for inspecting *tracked* content.  When path is a
+ * symlink to a directory, we do not want to say it is a directory when
+ * dealing with tracked content in the working tree.
+ */
+static int is_directory(const char *path)
+{
+	struct stat st;
+	return (!stat(path, &st) && S_ISDIR(st.st_mode));
+}
+
+/* We allow "recursive" symbolic links. Only within reason, though. */
+#define MAXDEPTH 5
+
+const char *make_absolute_path(const char *path)
+{
+	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
+	char cwd[1024] = "";
+	int buf_index = 1, len;
+
+	int depth = MAXDEPTH;
+	char *last_elem = NULL;
+	struct stat st;
+
+	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+		die ("Too long path: %.*s", 60, path);
+
+	while (depth--) {
+		if (!is_directory(buf)) {
+			char *last_slash = strrchr(buf, '/');
+			if (last_slash) {
+				*last_slash = '\0';
+				last_elem = xstrdup(last_slash + 1);
+			} else {
+				last_elem = xstrdup(buf);
+				*buf = '\0';
+			}
+		}
+
+		if (*buf) {
+			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
+				die ("Could not get current working directory");
+
+			if (chdir(buf))
+				die ("Could not switch to '%s'", buf);
+		}
+		if (!getcwd(buf, PATH_MAX))
+			die ("Could not get current working directory");
+
+		if (last_elem) {
+			int len = strlen(buf);
+			if (len + strlen(last_elem) + 2 > PATH_MAX)
+				die ("Too long path name: '%s/%s'",
+						buf, last_elem);
+			buf[len] = '/';
+			strcpy(buf + len + 1, last_elem);
+			free(last_elem);
+			last_elem = NULL;
+		}
+
+		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
+			len = readlink(buf, next_buf, PATH_MAX);
+			if (len < 0)
+				die ("Invalid symlink: %s", buf);
+			if (PATH_MAX <= len)
+				die("symbolic link too long: %s", buf);
+			next_buf[len] = '\0';
+			buf = next_buf;
+			buf_index = 1 - buf_index;
+			next_buf = bufs[buf_index];
+		} else
+			break;
+	}
+
+	if (*cwd && chdir(cwd))
+		die ("Could not change back to '%s'", cwd);
+
+	return buf;
+}
+
+static const char *get_pwd_cwd(void)
+{
+	static char cwd[PATH_MAX + 1];
+	char *pwd;
+	struct stat cwd_stat, pwd_stat;
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return NULL;
+	pwd = getenv("PWD");
+	if (pwd && strcmp(pwd, cwd)) {
+		stat(cwd, &cwd_stat);
+		if (!stat(pwd, &pwd_stat) &&
+		    pwd_stat.st_dev == cwd_stat.st_dev &&
+		    pwd_stat.st_ino == cwd_stat.st_ino) {
+			strlcpy(cwd, pwd, PATH_MAX);
+		}
+	}
+	return cwd;
+}
+
+const char *make_nonrelative_path(const char *path)
+{
+	static char buf[PATH_MAX + 1];
+
+	if (is_absolute_path(path)) {
+		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	} else {
+		const char *cwd = get_pwd_cwd();
+		if (!cwd)
+			die("Cannot determine the current working directory");
+		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	}
+	return buf;
+}

Some files were not shown because too many files changed in this diff