perf_counter.c

/*
 * Performance counter core code
 *
 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/perf_counter.h>

/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
        return ERR_PTR(-EINVAL);
}

u64 __weak hw_perf_save_disable(void)   { return 0; }
void __weak hw_perf_restore(u64 ctrl)   { }
void __weak hw_perf_counter_setup(void) { }
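
/*
 * The weak definitions above are placeholders: architecture code that
 * implements hardware counters overrides them. The hw_perf_save_disable()
 * / hw_perf_restore() pair is used below to bracket counter-list
 * manipulation so that it stays safe against NMI-based counters.
 */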

#if BITS_PER_LONG == 64

/*
 * Read the cached counter value in counter->count, safe against cross
 * CPU / NMI modifications. 64 bit version - no complications.
 */
static inline u64 perf_counter_read_safe(struct perf_counter *counter)
{
        return (u64) atomic64_read(&counter->count);
}

void atomic64_counter_set(struct perf_counter *counter, u64 val)
{
        atomic64_set(&counter->count, val);
}

u64 atomic64_counter_read(struct perf_counter *counter)
{
        return atomic64_read(&counter->count);
}

#else

/*
 * Read the cached counter value in counter->count32[], safe against
 * cross CPU / NMI modifications. 32 bit version.
 */
static u64 perf_counter_read_safe(struct perf_counter *counter)
{
        u32 cntl, cnth;

        local_irq_disable();
        do {
                cnth = atomic_read(&counter->count32[1]);
                cntl = atomic_read(&counter->count32[0]);
        } while (cnth != atomic_read(&counter->count32[1]));
        local_irq_enable();

        return cntl | ((u64) cnth) << 32;
}

void atomic64_counter_set(struct perf_counter *counter, u64 val64)
{
        u32 *val32 = (void *)&val64;

        atomic_set(counter->count32 + 0, *(val32 + 0));
        atomic_set(counter->count32 + 1, *(val32 + 1));
}

u64 atomic64_counter_read(struct perf_counter *counter)
{
        return atomic_read(counter->count32 + 0) |
                (u64) atomic_read(counter->count32 + 1) << 32;
}

#endif
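
/*
 * Counter-list management helpers. All callers hold ctx->lock:
 */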
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
        struct perf_counter *group_leader = counter->group_leader;

        /*
         * Depending on whether it is a standalone or sibling counter,
         * add it straight to the context's counter list, or to the group
         * leader's sibling list:
         */
        if (counter->group_leader == counter)
                list_add_tail(&counter->list_entry, &ctx->counter_list);
        else
                list_add_tail(&counter->list_entry, &group_leader->sibling_list);
}

static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
        struct perf_counter *sibling, *tmp;

        list_del_init(&counter->list_entry);

        /*
         * If this was a group counter with sibling counters then
         * upgrade the siblings to singleton counters by adding them
         * to the context list directly:
         */
        list_for_each_entry_safe(sibling, tmp,
                                 &counter->sibling_list, list_entry) {

                list_del_init(&sibling->list_entry);
                list_add_tail(&sibling->list_entry, &ctx->counter_list);
                WARN_ON_ONCE(!sibling->group_leader);
                WARN_ON_ONCE(sibling->group_leader == sibling);
                sibling->group_leader = sibling;
        }
}

/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        u64 perf_flags;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        spin_lock(&ctx->lock);

        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                counter->hw_ops->hw_perf_counter_disable(counter);
                counter->state = PERF_COUNTER_STATE_INACTIVE;
                ctx->nr_active--;
                cpuctx->active_oncpu--;
                counter->task = NULL;
        }
        ctx->nr_counters--;

        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level. NOP for non NMI based counters.
         */
        perf_flags = hw_perf_save_disable();
        list_del_counter(counter, ctx);
        hw_perf_restore(perf_flags);

        if (!ctx->task) {
                /*
                 * Allow more per task counters with respect to the
                 * reservation:
                 */
                cpuctx->max_pertask =
                        min(perf_max_counters - ctx->nr_counters,
                            perf_max_counters - perf_reserved_percpu);
        }

        spin_unlock(&ctx->lock);
}

/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Per cpu counters are removed via an smp call and
                 * the removal is always successful.
                 */
                smp_call_function_single(counter->cpu,
                                         __perf_counter_remove_from_context,
                                         counter, 1);
                return;
        }

retry:
        task_oncpu_function_call(task, __perf_counter_remove_from_context,
                                 counter);

        spin_lock_irq(&ctx->lock);
        /*
         * If the context is active we need to retry the smp call.
         */
        if (ctx->nr_active && !list_empty(&counter->list_entry)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * The lock prevents the context from being scheduled in, so we
         * can remove the counter safely if the call above did not
         * succeed.
         */
        if (!list_empty(&counter->list_entry)) {
                ctx->nr_counters--;
                list_del_counter(counter, ctx);
                counter->task = NULL;
        }
        spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        int cpu = smp_processor_id();
        u64 perf_flags;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        spin_lock(&ctx->lock);

        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level. NOP for non NMI based counters.
         */
        perf_flags = hw_perf_save_disable();
        list_add_counter(counter, ctx);
        hw_perf_restore(perf_flags);

        ctx->nr_counters++;

        if (cpuctx->active_oncpu < perf_max_counters) {
                counter->hw_ops->hw_perf_counter_enable(counter);
                counter->state = PERF_COUNTER_STATE_ACTIVE;
                counter->oncpu = cpu;
                ctx->nr_active++;
                cpuctx->active_oncpu++;
        }

        if (!ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;

        spin_unlock(&ctx->lock);
}

/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
                        struct perf_counter *counter,
                        int cpu)
{
        struct task_struct *task = ctx->task;

        counter->ctx = ctx;
        if (!task) {
                /*
                 * Per cpu counters are installed via an smp call and
                 * the install is always successful.
                 */
                smp_call_function_single(cpu, __perf_install_in_context,
                                         counter, 1);
                return;
        }

        counter->task = task;
retry:
        task_oncpu_function_call(task, __perf_install_in_context,
                                 counter);

        spin_lock_irq(&ctx->lock);
        /*
         * If the context is active and the counter has not been added,
         * we need to retry the smp call.
         */
        if (ctx->nr_active && list_empty(&counter->list_entry)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * The lock prevents the context from being scheduled in, so we
         * can add the counter safely if the call above did not
         * succeed.
         */
        if (list_empty(&counter->list_entry)) {
                list_add_counter(counter, ctx);
                ctx->nr_counters++;
        }
        spin_unlock_irq(&ctx->lock);
}
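
/*
 * Deactivate one counter: disable it in hardware, mark it inactive and
 * update the active-counter accounting in the CPU and task contexts.
 */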
static void
counter_sched_out(struct perf_counter *counter,
                  struct perf_cpu_context *cpuctx,
                  struct perf_counter_context *ctx)
{
        if (counter->state != PERF_COUNTER_STATE_ACTIVE)
                return;

        counter->hw_ops->hw_perf_counter_disable(counter);
        counter->state = PERF_COUNTER_STATE_INACTIVE;
        counter->oncpu = -1;

        cpuctx->active_oncpu--;
        ctx->nr_active--;
}

static void
group_sched_out(struct perf_counter *group_counter,
                struct perf_cpu_context *cpuctx,
                struct perf_counter_context *ctx)
{
        struct perf_counter *counter;

        counter_sched_out(group_counter, cpuctx, ctx);

        /*
         * Schedule out siblings (if any):
         */
        list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
                counter_sched_out(counter, cpuctx, ctx);
}

/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but hw_perf_counter_disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If an NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = &task->perf_counter_ctx;
        struct perf_counter *counter;

        if (likely(!cpuctx->task_ctx))
                return;

        spin_lock(&ctx->lock);
        if (ctx->nr_active) {
                list_for_each_entry(counter, &ctx->counter_list, list_entry)
                        group_sched_out(counter, cpuctx, ctx);
        }
        spin_unlock(&ctx->lock);
        cpuctx->task_ctx = NULL;
}
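
/*
 * Activate one counter on this CPU (unless it has been turned OFF) and
 * update the active-counter accounting.
 */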
static void
counter_sched_in(struct perf_counter *counter,
                 struct perf_cpu_context *cpuctx,
                 struct perf_counter_context *ctx,
                 int cpu)
{
        if (counter->state == PERF_COUNTER_STATE_OFF)
                return;

        counter->hw_ops->hw_perf_counter_enable(counter);
        counter->state = PERF_COUNTER_STATE_ACTIVE;
        counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */

        cpuctx->active_oncpu++;
        ctx->nr_active++;
}

static void
group_sched_in(struct perf_counter *group_counter,
               struct perf_cpu_context *cpuctx,
               struct perf_counter_context *ctx,
               int cpu)
{
        struct perf_counter *counter;

        counter_sched_in(group_counter, cpuctx, ctx, cpu);

        /*
         * Schedule in siblings as one group (if any):
         */
        list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
                counter_sched_in(counter, cpuctx, ctx, cpu);
}

/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but hw_perf_counter_enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If an NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = &task->perf_counter_ctx;
        struct perf_counter *counter;

        if (likely(!ctx->nr_counters))
                return;

        spin_lock(&ctx->lock);
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                if (ctx->nr_active == cpuctx->max_pertask)
                        break;

                /*
                 * Listen to the 'cpu' scheduling filter constraint
                 * of counters:
                 */
                if (counter->cpu != -1 && counter->cpu != cpu)
                        continue;

                group_sched_in(counter, cpuctx, ctx, cpu);
        }
        spin_unlock(&ctx->lock);

        cpuctx->task_ctx = ctx;
}
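
/*
 * Turn off all counters in the current task's context and mark them
 * OFF, so that they stay inactive until perf_counter_task_enable()
 * turns them back on.
 */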
int perf_counter_task_disable(void)
{
        struct task_struct *curr = current;
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
        u64 perf_flags;
        int cpu;

        if (likely(!ctx->nr_counters))
                return 0;

        local_irq_disable();
        cpu = smp_processor_id();

        perf_counter_task_sched_out(curr, cpu);

        spin_lock(&ctx->lock);

        /*
         * Disable all the counters:
         */
        perf_flags = hw_perf_save_disable();

        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
                counter->state = PERF_COUNTER_STATE_OFF;
        }
        hw_perf_restore(perf_flags);

        spin_unlock(&ctx->lock);

        local_irq_enable();

        return 0;
}
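
/*
 * The counterpart of perf_counter_task_disable(): mark the OFF counters
 * inactive again and reschedule them on the current CPU.
 */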
int perf_counter_task_enable(void)
{
        struct task_struct *curr = current;
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
        u64 perf_flags;
        int cpu;

        if (likely(!ctx->nr_counters))
                return 0;

        local_irq_disable();
        cpu = smp_processor_id();

        spin_lock(&ctx->lock);

        /*
         * Enable all the counters:
         */
        perf_flags = hw_perf_save_disable();

        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                if (counter->state != PERF_COUNTER_STATE_OFF)
                        continue;
                counter->state = PERF_COUNTER_STATE_INACTIVE;
        }
        hw_perf_restore(perf_flags);

        spin_unlock(&ctx->lock);

        perf_counter_task_sched_in(curr, cpu);

        local_irq_enable();

        return 0;
}
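
/*
 * Round-robin the context's counters: rotate the first list entry to
 * the tail so that, over successive ticks, groups which did not fit on
 * the hardware also get a chance to run.
 */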
void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
        u64 perf_flags;

        if (likely(!ctx->nr_counters))
                return;

        perf_counter_task_sched_out(curr, cpu);

        spin_lock(&ctx->lock);

        /*
         * Rotate the first entry last (works just fine for group counters too):
         */
        perf_flags = hw_perf_save_disable();
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                list_del(&counter->list_entry);
                list_add_tail(&counter->list_entry, &ctx->counter_list);
                break;
        }
        hw_perf_restore(perf_flags);

        spin_unlock(&ctx->lock);

        perf_counter_task_sched_in(curr, cpu);
}

/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
                            struct task_struct *task)
{
        spin_lock_init(&ctx->lock);
        INIT_LIST_HEAD(&ctx->counter_list);
        ctx->nr_counters = 0;
        ctx->task = task;
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *task)
{
        __perf_counter_init_context(&task->perf_counter_ctx, task);
}

/*
 * Cross CPU call to read the hardware counter
 */
static void __hw_perf_counter_read(void *info)
{
        struct perf_counter *counter = info;

        counter->hw_ops->hw_perf_counter_read(counter);
}

static u64 perf_counter_read(struct perf_counter *counter)
{
        /*
         * If counter is enabled and currently active on a CPU, update the
         * value in the counter structure:
         */
        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                smp_call_function_single(counter->oncpu,
                                         __hw_perf_counter_read, counter, 1);
        }

        return perf_counter_read_safe(counter);
}

/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_data *oldirqdata = counter->irqdata;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task) {
                if (cpuctx->task_ctx != ctx)
                        return;
                spin_lock(&ctx->lock);
        }

        /* Change the pointer in an NMI-safe way: */
        atomic_long_set((atomic_long_t *)&counter->irqdata,
                        (unsigned long) counter->usrdata);
        counter->usrdata = oldirqdata;

        if (ctx->task)
                spin_unlock(&ctx->lock);
}

static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_data *oldirqdata = counter->irqdata;
        struct task_struct *task = ctx->task;

        if (!task) {
                smp_call_function_single(counter->cpu,
                                         __perf_switch_irq_data,
                                         counter, 1);
                return counter->usrdata;
        }

retry:
        spin_lock_irq(&ctx->lock);
        if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
                counter->irqdata = counter->usrdata;
                counter->usrdata = oldirqdata;
                spin_unlock_irq(&ctx->lock);
                return oldirqdata;
        }
        spin_unlock_irq(&ctx->lock);
        task_oncpu_function_call(task, __perf_switch_irq_data, counter);

        /* Might have failed, because task was scheduled out */
        if (counter->irqdata == oldirqdata)
                goto retry;

        return counter->usrdata;
}
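
/*
 * Context lookup: find_get_context() resolves a (pid, cpu) pair either
 * to a per-CPU context (cpu != -1, root only) or to the target task's
 * context, taking a reference on the task which put_context() drops.
 */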
static void put_context(struct perf_counter_context *ctx)
{
        if (ctx->task)
                put_task_struct(ctx->task);
}

static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
        struct task_struct *task;

        /*
         * If cpu is not a wildcard then this is a percpu counter:
         */
        if (cpu != -1) {
                /* Must be root to operate on a CPU counter: */
                if (!capable(CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);

                if (cpu < 0 || cpu > num_possible_cpus())
                        return ERR_PTR(-EINVAL);

                /*
                 * We could be clever and allow attaching a counter to an
                 * offline CPU and activate it when the CPU comes up, but
                 * that's for later.
                 */
                if (!cpu_isset(cpu, cpu_online_map))
                        return ERR_PTR(-ENODEV);

                cpuctx = &per_cpu(perf_cpu_context, cpu);
                ctx = &cpuctx->ctx;

                WARN_ON_ONCE(ctx->task);
                return ctx;
        }

        rcu_read_lock();
        if (!pid)
                task = current;
        else
                task = find_task_by_vpid(pid);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        if (!task)
                return ERR_PTR(-ESRCH);

        ctx = &task->perf_counter_ctx;
        ctx->task = task;

        /* Reuse ptrace permission checks for now. */
        if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
                put_context(ctx);
                return ERR_PTR(-EACCES);
        }

        return ctx;
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_counter *counter = file->private_data;
        struct perf_counter_context *ctx = counter->ctx;

        file->private_data = NULL;

        mutex_lock(&counter->mutex);

        perf_counter_remove_from_context(counter);
        put_context(ctx);

        mutex_unlock(&counter->mutex);

        kfree(counter);

        return 0;
}

/*
 * Read the performance counter - simple non-blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
        u64 cntval;

        if (count != sizeof(cntval))
                return -EINVAL;

        mutex_lock(&counter->mutex);
        cntval = perf_counter_read(counter);
        mutex_unlock(&counter->mutex);

        return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}

static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
        if (!usrdata->len)
                return 0;

        count = min(count, (size_t)usrdata->len);
        if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
                return -EFAULT;

        /* Adjust the counters */
        usrdata->len -= count;
        if (!usrdata->len)
                usrdata->rd_idx = 0;
        else
                usrdata->rd_idx += count;

        return count;
}

static ssize_t
perf_read_irq_data(struct perf_counter *counter,
                   char __user *buf,
                   size_t count,
                   int nonblocking)
{
        struct perf_data *irqdata, *usrdata;
        DECLARE_WAITQUEUE(wait, current);
        ssize_t res;

        irqdata = counter->irqdata;
        usrdata = counter->usrdata;

        if (usrdata->len + irqdata->len >= count)
                goto read_pending;

        if (nonblocking)
                return -EAGAIN;

        spin_lock_irq(&counter->waitq.lock);
        __add_wait_queue(&counter->waitq, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (usrdata->len + irqdata->len >= count)
                        break;

                if (signal_pending(current))
                        break;

                spin_unlock_irq(&counter->waitq.lock);
                schedule();
                spin_lock_irq(&counter->waitq.lock);
        }
        __remove_wait_queue(&counter->waitq, &wait);
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&counter->waitq.lock);

        if (usrdata->len + irqdata->len < count)
                return -ERESTARTSYS;

read_pending:
        mutex_lock(&counter->mutex);

        /* Drain pending data first: */
        res = perf_copy_usrdata(usrdata, buf, count);
        if (res < 0 || res == count)
                goto out;

        /* Switch irq buffer: */
        usrdata = perf_switch_irq_data(counter);
        if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
                if (!res)
                        res = -EFAULT;
        } else {
                res = count;
        }
out:
        mutex_unlock(&counter->mutex);

        return res;
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_counter *counter = file->private_data;

        switch (counter->hw_event.record_type) {
        case PERF_RECORD_SIMPLE:
                return perf_read_hw(counter, buf, count);

        case PERF_RECORD_IRQ:
        case PERF_RECORD_GROUP:
                return perf_read_irq_data(counter, buf, count,
                                          file->f_flags & O_NONBLOCK);
        }
        return -EINVAL;
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
        struct perf_counter *counter = file->private_data;
        unsigned int events = 0;
        unsigned long flags;

        poll_wait(file, &counter->waitq, wait);

        spin_lock_irqsave(&counter->waitq.lock, flags);
        if (counter->usrdata->len || counter->irqdata->len)
                events |= POLLIN;
        spin_unlock_irqrestore(&counter->waitq.lock, flags);

        return events;
}

static const struct file_operations perf_fops = {
        .release                = perf_release,
        .read                   = perf_read,
        .poll                   = perf_poll,
};
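
/*
 * Illustrative sketch: userspace obtains a counter fd from
 * sys_perf_counter_open() and consumes it via the file operations
 * above, e.g.:
 *
 *	u64 value;
 *	if (read(fd, &value, sizeof(value)) == sizeof(value))
 *		... use the counter value ...
 *
 * For PERF_RECORD_SIMPLE counters the read size must be exactly
 * sizeof(u64); IRQ/group record data is read through the same fd and
 * can be waited for with poll().
 */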

static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
}

static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
{
}

static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
        int cpu = raw_smp_processor_id();

        atomic64_counter_set(counter, cpu_clock(cpu));
}

static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
        .hw_perf_counter_enable         = cpu_clock_perf_counter_enable,
        .hw_perf_counter_disable        = cpu_clock_perf_counter_disable,
        .hw_perf_counter_read           = cpu_clock_perf_counter_read,
};

static void task_clock_perf_counter_enable(struct perf_counter *counter)
{
}

static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
}

static void task_clock_perf_counter_read(struct perf_counter *counter)
{
        atomic64_counter_set(counter, current->se.sum_exec_runtime);
}

static const struct hw_perf_counter_ops perf_ops_task_clock = {
        .hw_perf_counter_enable         = task_clock_perf_counter_enable,
        .hw_perf_counter_disable        = task_clock_perf_counter_disable,
        .hw_perf_counter_read           = task_clock_perf_counter_read,
};
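
/*
 * Software counters (CPU clock, task clock) are implemented entirely in
 * this file; everything else falls through to the architecture's
 * hw_perf_counter_init():
 */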
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
        const struct hw_perf_counter_ops *hw_ops = NULL;

        switch (counter->hw_event.type) {
        case PERF_COUNT_CPU_CLOCK:
                hw_ops = &perf_ops_cpu_clock;
                break;
        case PERF_COUNT_TASK_CLOCK:
                hw_ops = &perf_ops_task_clock;
                break;
        default:
                break;
        }
        return hw_ops;
}

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
                   int cpu,
                   struct perf_counter *group_leader)
{
        const struct hw_perf_counter_ops *hw_ops;
        struct perf_counter *counter;

        counter = kzalloc(sizeof(*counter), GFP_KERNEL);
        if (!counter)
                return NULL;

        /*
         * Single counters are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = counter;

        mutex_init(&counter->mutex);
        INIT_LIST_HEAD(&counter->list_entry);
        INIT_LIST_HEAD(&counter->sibling_list);
        init_waitqueue_head(&counter->waitq);

        counter->irqdata                = &counter->data[0];
        counter->usrdata                = &counter->data[1];
        counter->cpu                    = cpu;
        counter->hw_event               = *hw_event;
        counter->wakeup_pending         = 0;
        counter->group_leader           = group_leader;
        counter->hw_ops                 = NULL;

        hw_ops = NULL;
        if (!hw_event->raw && hw_event->type < 0)
                hw_ops = sw_perf_counter_init(counter);
        if (!hw_ops)
                hw_ops = hw_perf_counter_init(counter);

        /*
         * The weak hw_perf_counter_init() above returns an ERR_PTR()
         * value, so treat both NULL and error pointers as failure:
         */
        if (!hw_ops || IS_ERR(hw_ops)) {
                kfree(counter);
                return NULL;
        }
        counter->hw_ops = hw_ops;

        return counter;
}

/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
asmlinkage int
sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
                      pid_t pid, int cpu, int group_fd)
{
        struct perf_counter *counter, *group_leader;
        struct perf_counter_hw_event hw_event;
        struct perf_counter_context *ctx;
        struct file *group_file = NULL;
        int fput_needed = 0;
        int ret;

        if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
                return -EFAULT;

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(pid, cpu);
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);

        /*
         * Look up the group leader (we will attach this counter to it):
         */
        group_leader = NULL;
        if (group_fd != -1) {
                ret = -EINVAL;
                group_file = fget_light(group_fd, &fput_needed);
                if (!group_file)
                        goto err_put_context;
                if (group_file->f_op != &perf_fops)
                        goto err_put_context;

                group_leader = group_file->private_data;
                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_put_context;
                /*
                 * Do not allow attaching to a group in a different
                 * task or CPU context:
                 */
                if (group_leader->ctx != ctx)
                        goto err_put_context;
        }

        ret = -EINVAL;
        counter = perf_counter_alloc(&hw_event, cpu, group_leader);
        if (!counter)
                goto err_put_context;

        perf_install_in_context(ctx, counter, cpu);

        ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
        if (ret < 0)
                goto err_remove_free_put_context;

out_fput:
        fput_light(group_file, fput_needed);

        return ret;

err_remove_free_put_context:
        mutex_lock(&counter->mutex);
        perf_counter_remove_from_context(counter);
        mutex_unlock(&counter->mutex);
        kfree(counter);

err_put_context:
        put_context(ctx);

        goto out_fput;
}
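
/*
 * CPU hotplug / boot support: set up the per-CPU context and the
 * per-CPU counter reservation limit when a CPU comes up.
 */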
static void __cpuinit perf_counter_init_cpu(int cpu)
{
        struct perf_cpu_context *cpuctx;

        cpuctx = &per_cpu(perf_cpu_context, cpu);
        __perf_counter_init_context(&cpuctx->ctx, NULL);

        mutex_lock(&perf_resource_mutex);
        cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
        mutex_unlock(&perf_resource_mutex);

        hw_perf_counter_setup();
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = &cpuctx->ctx;
        struct perf_counter *counter, *tmp;

        list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
                __perf_counter_remove_from_context(counter);
}

static void perf_counter_exit_cpu(int cpu)
{
        smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
        unsigned int cpu = (long)hcpu;

        switch (action) {

        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                perf_counter_init_cpu(cpu);
                break;

        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                perf_counter_exit_cpu(cpu);
                break;

        default:
                break;
        }

        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
        .notifier_call          = perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
        register_cpu_notifier(&perf_cpu_nb);

        return 0;
}
early_initcall(perf_counter_init);
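
/*
 * sysfs knobs for the counter reservation: these attributes are created
 * in the "perf_counters" group of the CPU sysdev class (typically
 * /sys/devices/system/cpu/perf_counters/{reserve_percpu,overcommit}).
 */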
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
        return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
                        const char *buf,
                        size_t count)
{
        struct perf_cpu_context *cpuctx;
        unsigned long val;
        int err, cpu, mpt;

        err = strict_strtoul(buf, 10, &val);
        if (err)
                return err;
        if (val > perf_max_counters)
                return -EINVAL;

        mutex_lock(&perf_resource_mutex);
        perf_reserved_percpu = val;
        for_each_online_cpu(cpu) {
                cpuctx = &per_cpu(perf_cpu_context, cpu);
                spin_lock_irq(&cpuctx->ctx.lock);
                mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
                          perf_max_counters - perf_reserved_percpu);
                cpuctx->max_pertask = mpt;
                spin_unlock_irq(&cpuctx->ctx.lock);
        }
        mutex_unlock(&perf_resource_mutex);

        return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
        return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
        unsigned long val;
        int err;

        err = strict_strtoul(buf, 10, &val);
        if (err)
                return err;
        if (val > 1)
                return -EINVAL;

        mutex_lock(&perf_resource_mutex);
        perf_overcommit = val;
        mutex_unlock(&perf_resource_mutex);

        return count;
}

static SYSDEV_CLASS_ATTR(
                                reserve_percpu,
                                0644,
                                perf_show_reserve_percpu,
                                perf_set_reserve_percpu
                        );

static SYSDEV_CLASS_ATTR(
                                overcommit,
                                0644,
                                perf_show_overcommit,
                                perf_set_overcommit
                        );

static struct attribute *perfclass_attrs[] = {
        &attr_reserve_percpu.attr,
        &attr_overcommit.attr,
        NULL
};

static struct attribute_group perfclass_attr_group = {
        .attrs                  = perfclass_attrs,
        .name                   = "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
        return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
                                  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);