perf_counter.c

  1. /*
  2. * Performance counter core code
  3. *
  4. * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  5. * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  6. *
  7. * For licensing details see kernel-base/COPYING
  8. */
  9. #include <linux/fs.h>
  10. #include <linux/cpu.h>
  11. #include <linux/smp.h>
  12. #include <linux/file.h>
  13. #include <linux/poll.h>
  14. #include <linux/sysfs.h>
  15. #include <linux/ptrace.h>
  16. #include <linux/percpu.h>
  17. #include <linux/uaccess.h>
  18. #include <linux/syscalls.h>
  19. #include <linux/anon_inodes.h>
  20. #include <linux/kernel_stat.h>
  21. #include <linux/perf_counter.h>
  22. #include <linux/mm.h>
  23. #include <linux/vmstat.h>
  24. #include <linux/rculist.h>
  25. #include <asm/irq_regs.h>
  26. /*
  27. * Each CPU has a list of per CPU counters:
  28. */
  29. DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  30. int perf_max_counters __read_mostly = 1;
  31. static int perf_reserved_percpu __read_mostly;
  32. static int perf_overcommit __read_mostly = 1;
  33. /*
  34. * Mutex for (sysadmin-configurable) counter reservations:
  35. */
  36. static DEFINE_MUTEX(perf_resource_mutex);
  37. /*
  38. * Architecture provided APIs - weak aliases:
  39. */
  40. extern __weak const struct hw_perf_counter_ops *
  41. hw_perf_counter_init(struct perf_counter *counter)
  42. {
  43. return NULL;
  44. }
  45. u64 __weak hw_perf_save_disable(void) { return 0; }
  46. void __weak hw_perf_restore(u64 ctrl) { barrier(); }
  47. void __weak hw_perf_counter_setup(int cpu) { barrier(); }
  48. int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  49. struct perf_cpu_context *cpuctx,
  50. struct perf_counter_context *ctx, int cpu)
  51. {
  52. return 0;
  53. }
  54. void __weak perf_counter_print_debug(void) { }
  55. static void
  56. list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  57. {
  58. struct perf_counter *group_leader = counter->group_leader;
  59. /*
  60. * Depending on whether it is a standalone or sibling counter,
  61. * add it straight to the context's counter list, or to the group
  62. * leader's sibling list:
  63. */
  64. if (counter->group_leader == counter)
  65. list_add_tail(&counter->list_entry, &ctx->counter_list);
  66. else
  67. list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  68. list_add_rcu(&counter->event_entry, &ctx->event_list);
  69. }
  70. static void
  71. list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  72. {
  73. struct perf_counter *sibling, *tmp;
  74. list_del_init(&counter->list_entry);
  75. list_del_rcu(&counter->event_entry);
  76. /*
  77. * If this was a group counter with sibling counters then
  78. * upgrade the siblings to singleton counters by adding them
  79. * to the context list directly:
  80. */
  81. list_for_each_entry_safe(sibling, tmp,
  82. &counter->sibling_list, list_entry) {
  83. list_move_tail(&sibling->list_entry, &ctx->counter_list);
  84. sibling->group_leader = sibling;
  85. }
  86. }
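/*
 * Stop a single counter: mark it INACTIVE, disable it in hardware and
 * update the context's and CPU's active-counter accounting.
 */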
  87. static void
  88. counter_sched_out(struct perf_counter *counter,
  89. struct perf_cpu_context *cpuctx,
  90. struct perf_counter_context *ctx)
  91. {
  92. if (counter->state != PERF_COUNTER_STATE_ACTIVE)
  93. return;
  94. counter->state = PERF_COUNTER_STATE_INACTIVE;
  95. counter->hw_ops->disable(counter);
  96. counter->oncpu = -1;
  97. if (!is_software_counter(counter))
  98. cpuctx->active_oncpu--;
  99. ctx->nr_active--;
  100. if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
  101. cpuctx->exclusive = 0;
  102. }
  103. static void
  104. group_sched_out(struct perf_counter *group_counter,
  105. struct perf_cpu_context *cpuctx,
  106. struct perf_counter_context *ctx)
  107. {
  108. struct perf_counter *counter;
  109. if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
  110. return;
  111. counter_sched_out(group_counter, cpuctx, ctx);
  112. /*
  113. * Schedule out siblings (if any):
  114. */
  115. list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
  116. counter_sched_out(counter, cpuctx, ctx);
  117. if (group_counter->hw_event.exclusive)
  118. cpuctx->exclusive = 0;
  119. }
  120. /*
  121. * Cross CPU call to remove a performance counter
  122. *
  123. * We disable the counter on the hardware level first. After that we
  124. * remove it from the context list.
  125. */
  126. static void __perf_counter_remove_from_context(void *info)
  127. {
  128. struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  129. struct perf_counter *counter = info;
  130. struct perf_counter_context *ctx = counter->ctx;
  131. unsigned long flags;
  132. u64 perf_flags;
  133. /*
  134. * If this is a task context, we need to check whether it is
  135. * the current task context of this cpu. If not it has been
  136. * scheduled out before the smp call arrived.
  137. */
  138. if (ctx->task && cpuctx->task_ctx != ctx)
  139. return;
  140. curr_rq_lock_irq_save(&flags);
  141. spin_lock(&ctx->lock);
  142. counter_sched_out(counter, cpuctx, ctx);
  143. counter->task = NULL;
  144. ctx->nr_counters--;
  145. /*
  146. * Protect the list operation against NMI by disabling the
  147. * counters on a global level. NOP for non NMI based counters.
  148. */
  149. perf_flags = hw_perf_save_disable();
  150. list_del_counter(counter, ctx);
  151. hw_perf_restore(perf_flags);
  152. if (!ctx->task) {
  153. /*
  154. * Allow more per task counters with respect to the
  155. * reservation:
  156. */
  157. cpuctx->max_pertask =
  158. min(perf_max_counters - ctx->nr_counters,
  159. perf_max_counters - perf_reserved_percpu);
  160. }
  161. spin_unlock(&ctx->lock);
  162. curr_rq_unlock_irq_restore(&flags);
  163. }
  164. /*
  165. * Remove the counter from a task's (or a CPU's) list of counters.
  166. *
  167. * Must be called with counter->mutex and ctx->mutex held.
  168. *
  169. * CPU counters are removed with a smp call. For task counters we only
  170. * call when the task is on a CPU.
  171. */
  172. static void perf_counter_remove_from_context(struct perf_counter *counter)
  173. {
  174. struct perf_counter_context *ctx = counter->ctx;
  175. struct task_struct *task = ctx->task;
  176. if (!task) {
  177. /*
  178. * Per cpu counters are removed via an smp call and
  180. * the removal is always successful.
  180. */
  181. smp_call_function_single(counter->cpu,
  182. __perf_counter_remove_from_context,
  183. counter, 1);
  184. return;
  185. }
  186. retry:
  187. task_oncpu_function_call(task, __perf_counter_remove_from_context,
  188. counter);
  189. spin_lock_irq(&ctx->lock);
  190. /*
  191. * If the context is active we need to retry the smp call.
  192. */
  193. if (ctx->nr_active && !list_empty(&counter->list_entry)) {
  194. spin_unlock_irq(&ctx->lock);
  195. goto retry;
  196. }
  197. /*
  198. * The lock prevents this context from being scheduled in, so
  199. * we can remove the counter safely if the call above did not
  200. * succeed.
  201. */
  202. if (!list_empty(&counter->list_entry)) {
  203. ctx->nr_counters--;
  204. list_del_counter(counter, ctx);
  205. counter->task = NULL;
  206. }
  207. spin_unlock_irq(&ctx->lock);
  208. }
  209. /*
  210. * Cross CPU call to disable a performance counter
  211. */
  212. static void __perf_counter_disable(void *info)
  213. {
  214. struct perf_counter *counter = info;
  215. struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  216. struct perf_counter_context *ctx = counter->ctx;
  217. unsigned long flags;
  218. /*
  219. * If this is a per-task counter, we need to check whether this
  220. * counter's task is the current task on this cpu.
  221. */
  222. if (ctx->task && cpuctx->task_ctx != ctx)
  223. return;
  224. curr_rq_lock_irq_save(&flags);
  225. spin_lock(&ctx->lock);
  226. /*
  227. * If the counter is on, turn it off.
  228. * If it is in error state, leave it in error state.
  229. */
  230. if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
  231. if (counter == counter->group_leader)
  232. group_sched_out(counter, cpuctx, ctx);
  233. else
  234. counter_sched_out(counter, cpuctx, ctx);
  235. counter->state = PERF_COUNTER_STATE_OFF;
  236. }
  237. spin_unlock(&ctx->lock);
  238. curr_rq_unlock_irq_restore(&flags);
  239. }
  240. /*
  241. * Disable a counter.
  242. */
  243. static void perf_counter_disable(struct perf_counter *counter)
  244. {
  245. struct perf_counter_context *ctx = counter->ctx;
  246. struct task_struct *task = ctx->task;
  247. if (!task) {
  248. /*
  249. * Disable the counter on the cpu that it's on
  250. */
  251. smp_call_function_single(counter->cpu, __perf_counter_disable,
  252. counter, 1);
  253. return;
  254. }
  255. retry:
  256. task_oncpu_function_call(task, __perf_counter_disable, counter);
  257. spin_lock_irq(&ctx->lock);
  258. /*
  259. * If the counter is still active, we need to retry the cross-call.
  260. */
  261. if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
  262. spin_unlock_irq(&ctx->lock);
  263. goto retry;
  264. }
  265. /*
  266. * Since we have the lock this context can't be scheduled
  267. * in, so we can change the state safely.
  268. */
  269. if (counter->state == PERF_COUNTER_STATE_INACTIVE)
  270. counter->state = PERF_COUNTER_STATE_OFF;
  271. spin_unlock_irq(&ctx->lock);
  272. }
  273. /*
  274. * Disable a counter and all its children.
  275. */
  276. static void perf_counter_disable_family(struct perf_counter *counter)
  277. {
  278. struct perf_counter *child;
  279. perf_counter_disable(counter);
  280. /*
  281. * Lock the mutex to protect the list of children
  282. */
  283. mutex_lock(&counter->mutex);
  284. list_for_each_entry(child, &counter->child_list, child_list)
  285. perf_counter_disable(child);
  286. mutex_unlock(&counter->mutex);
  287. }
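/*
 * Start a single counter: mark it ACTIVE and enable it in hardware.
 * If the hardware refuses, roll the state back to INACTIVE and return
 * -EAGAIN so the caller can undo any partially scheduled group.
 */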
  288. static int
  289. counter_sched_in(struct perf_counter *counter,
  290. struct perf_cpu_context *cpuctx,
  291. struct perf_counter_context *ctx,
  292. int cpu)
  293. {
  294. if (counter->state <= PERF_COUNTER_STATE_OFF)
  295. return 0;
  296. counter->state = PERF_COUNTER_STATE_ACTIVE;
  297. counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
  298. /*
  299. * The new state must be visible before we turn it on in the hardware:
  300. */
  301. smp_wmb();
  302. if (counter->hw_ops->enable(counter)) {
  303. counter->state = PERF_COUNTER_STATE_INACTIVE;
  304. counter->oncpu = -1;
  305. return -EAGAIN;
  306. }
  307. if (!is_software_counter(counter))
  308. cpuctx->active_oncpu++;
  309. ctx->nr_active++;
  310. if (counter->hw_event.exclusive)
  311. cpuctx->exclusive = 1;
  312. return 0;
  313. }
  314. /*
  315. * Return 1 for a group consisting entirely of software counters,
  316. * 0 if the group contains any hardware counters.
  317. */
  318. static int is_software_only_group(struct perf_counter *leader)
  319. {
  320. struct perf_counter *counter;
  321. if (!is_software_counter(leader))
  322. return 0;
  323. list_for_each_entry(counter, &leader->sibling_list, list_entry)
  324. if (!is_software_counter(counter))
  325. return 0;
  326. return 1;
  327. }
  328. /*
  329. * Work out whether we can put this counter group on the CPU now.
  330. */
  331. static int group_can_go_on(struct perf_counter *counter,
  332. struct perf_cpu_context *cpuctx,
  333. int can_add_hw)
  334. {
  335. /*
  336. * Groups consisting entirely of software counters can always go on.
  337. */
  338. if (is_software_only_group(counter))
  339. return 1;
  340. /*
  341. * If an exclusive group is already on, no other hardware
  342. * counters can go on.
  343. */
  344. if (cpuctx->exclusive)
  345. return 0;
  346. /*
  347. * If this group is exclusive and there are already
  348. * counters on the CPU, it can't go on.
  349. */
  350. if (counter->hw_event.exclusive && cpuctx->active_oncpu)
  351. return 0;
  352. /*
  353. * Otherwise, try to add it if all previous groups were able
  354. * to go on.
  355. */
  356. return can_add_hw;
  357. }
  358. /*
  359. * Cross CPU call to install and enable a performance counter
  360. */
  361. static void __perf_install_in_context(void *info)
  362. {
  363. struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  364. struct perf_counter *counter = info;
  365. struct perf_counter_context *ctx = counter->ctx;
  366. struct perf_counter *leader = counter->group_leader;
  367. int cpu = smp_processor_id();
  368. unsigned long flags;
  369. u64 perf_flags;
  370. int err;
  371. /*
  372. * If this is a task context, we need to check whether it is
  373. * the current task context of this cpu. If not it has been
  374. * scheduled out before the smp call arrived.
  375. */
  376. if (ctx->task && cpuctx->task_ctx != ctx)
  377. return;
  378. curr_rq_lock_irq_save(&flags);
  379. spin_lock(&ctx->lock);
  380. /*
  381. * Protect the list operation against NMI by disabling the
  382. * counters on a global level. NOP for non NMI based counters.
  383. */
  384. perf_flags = hw_perf_save_disable();
  385. list_add_counter(counter, ctx);
  386. ctx->nr_counters++;
  387. counter->prev_state = PERF_COUNTER_STATE_OFF;
  388. /*
  389. * Don't put the counter on if it is disabled or if
  390. * it is in a group and the group isn't on.
  391. */
  392. if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
  393. (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
  394. goto unlock;
  395. /*
  396. * An exclusive counter can't go on if there are already active
  397. * hardware counters, and no hardware counter can go on if there
  398. * is already an exclusive counter on.
  399. */
  400. if (!group_can_go_on(counter, cpuctx, 1))
  401. err = -EEXIST;
  402. else
  403. err = counter_sched_in(counter, cpuctx, ctx, cpu);
  404. if (err) {
  405. /*
  406. * This counter couldn't go on. If it is in a group
  407. * then we have to pull the whole group off.
  408. * If the counter group is pinned then put it in error state.
  409. */
  410. if (leader != counter)
  411. group_sched_out(leader, cpuctx, ctx);
  412. if (leader->hw_event.pinned)
  413. leader->state = PERF_COUNTER_STATE_ERROR;
  414. }
  415. if (!err && !ctx->task && cpuctx->max_pertask)
  416. cpuctx->max_pertask--;
  417. unlock:
  418. hw_perf_restore(perf_flags);
  419. spin_unlock(&ctx->lock);
  420. curr_rq_unlock_irq_restore(&flags);
  421. }
  422. /*
  423. * Attach a performance counter to a context
  424. *
  425. * First we add the counter to the list with the hardware enable bit
  426. * in counter->hw_config cleared.
  427. *
  428. * If the counter is attached to a task which is on a CPU we use a smp
  429. * call to enable it in the task context. The task might have been
  430. * scheduled away, but we check this in the smp call again.
  431. *
  432. * Must be called with ctx->mutex held.
  433. */
  434. static void
  435. perf_install_in_context(struct perf_counter_context *ctx,
  436. struct perf_counter *counter,
  437. int cpu)
  438. {
  439. struct task_struct *task = ctx->task;
  440. if (!task) {
  441. /*
  442. * Per cpu counters are installed via an smp call and
  444. * the install is always successful.
  444. */
  445. smp_call_function_single(cpu, __perf_install_in_context,
  446. counter, 1);
  447. return;
  448. }
  449. counter->task = task;
  450. retry:
  451. task_oncpu_function_call(task, __perf_install_in_context,
  452. counter);
  453. spin_lock_irq(&ctx->lock);
  454. /*
  455. * If the context is active we need to retry the smp call.
  456. */
  457. if (ctx->is_active && list_empty(&counter->list_entry)) {
  458. spin_unlock_irq(&ctx->lock);
  459. goto retry;
  460. }
  461. /*
  462. * The lock prevents this context from being scheduled in, so
  463. * we can add the counter safely if the call above did not
  464. * succeed.
  465. */
  466. if (list_empty(&counter->list_entry)) {
  467. list_add_counter(counter, ctx);
  468. ctx->nr_counters++;
  469. }
  470. spin_unlock_irq(&ctx->lock);
  471. }
  472. /*
  473. * Cross CPU call to enable a performance counter
  474. */
  475. static void __perf_counter_enable(void *info)
  476. {
  477. struct perf_counter *counter = info;
  478. struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  479. struct perf_counter_context *ctx = counter->ctx;
  480. struct perf_counter *leader = counter->group_leader;
  481. unsigned long flags;
  482. int err;
  483. /*
  484. * If this is a per-task counter, we need to check whether this
  485. * counter's task is the current task on this cpu.
  486. */
  487. if (ctx->task && cpuctx->task_ctx != ctx)
  488. return;
  489. curr_rq_lock_irq_save(&flags);
  490. spin_lock(&ctx->lock);
  491. counter->prev_state = counter->state;
  492. if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
  493. goto unlock;
  494. counter->state = PERF_COUNTER_STATE_INACTIVE;
  495. /*
  496. * If the counter is in a group and isn't the group leader,
  497. * then don't put it on unless the group is on.
  498. */
  499. if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
  500. goto unlock;
  501. if (!group_can_go_on(counter, cpuctx, 1))
  502. err = -EEXIST;
  503. else
  504. err = counter_sched_in(counter, cpuctx, ctx,
  505. smp_processor_id());
  506. if (err) {
  507. /*
  508. * If this counter can't go on and it's part of a
  509. * group, then the whole group has to come off.
  510. */
  511. if (leader != counter)
  512. group_sched_out(leader, cpuctx, ctx);
  513. if (leader->hw_event.pinned)
  514. leader->state = PERF_COUNTER_STATE_ERROR;
  515. }
  516. unlock:
  517. spin_unlock(&ctx->lock);
  518. curr_rq_unlock_irq_restore(&flags);
  519. }
  520. /*
  521. * Enable a counter.
  522. */
  523. static void perf_counter_enable(struct perf_counter *counter)
  524. {
  525. struct perf_counter_context *ctx = counter->ctx;
  526. struct task_struct *task = ctx->task;
  527. if (!task) {
  528. /*
  529. * Enable the counter on the cpu that it's on
  530. */
  531. smp_call_function_single(counter->cpu, __perf_counter_enable,
  532. counter, 1);
  533. return;
  534. }
  535. spin_lock_irq(&ctx->lock);
  536. if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
  537. goto out;
  538. /*
  539. * If the counter is in error state, clear that first.
  540. * That way, if we see the counter in error state below, we
  541. * know that it has gone back into error state, as distinct
  542. * from the task having been scheduled away before the
  543. * cross-call arrived.
  544. */
  545. if (counter->state == PERF_COUNTER_STATE_ERROR)
  546. counter->state = PERF_COUNTER_STATE_OFF;
  547. retry:
  548. spin_unlock_irq(&ctx->lock);
  549. task_oncpu_function_call(task, __perf_counter_enable, counter);
  550. spin_lock_irq(&ctx->lock);
  551. /*
  552. * If the context is active and the counter is still off,
  553. * we need to retry the cross-call.
  554. */
  555. if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
  556. goto retry;
  557. /*
  558. * Since we have the lock this context can't be scheduled
  559. * in, so we can change the state safely.
  560. */
  561. if (counter->state == PERF_COUNTER_STATE_OFF)
  562. counter->state = PERF_COUNTER_STATE_INACTIVE;
  563. out:
  564. spin_unlock_irq(&ctx->lock);
  565. }
  566. /*
  567. * Enable a counter and all its children.
  568. */
  569. static void perf_counter_enable_family(struct perf_counter *counter)
  570. {
  571. struct perf_counter *child;
  572. perf_counter_enable(counter);
  573. /*
  574. * Lock the mutex to protect the list of children
  575. */
  576. mutex_lock(&counter->mutex);
  577. list_for_each_entry(child, &counter->child_list, child_list)
  578. perf_counter_enable(child);
  579. mutex_unlock(&counter->mutex);
  580. }
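/*
 * Deschedule every counter group in a context. The hardware counters
 * are globally disabled around the list walk, so the operation is not
 * disturbed by counter NMIs.
 */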
  581. void __perf_counter_sched_out(struct perf_counter_context *ctx,
  582. struct perf_cpu_context *cpuctx)
  583. {
  584. struct perf_counter *counter;
  585. u64 flags;
  586. spin_lock(&ctx->lock);
  587. ctx->is_active = 0;
  588. if (likely(!ctx->nr_counters))
  589. goto out;
  590. flags = hw_perf_save_disable();
  591. if (ctx->nr_active) {
  592. list_for_each_entry(counter, &ctx->counter_list, list_entry)
  593. group_sched_out(counter, cpuctx, ctx);
  594. }
  595. hw_perf_restore(flags);
  596. out:
  597. spin_unlock(&ctx->lock);
  598. }
  599. /*
  600. * Called from scheduler to remove the counters of the current task,
  601. * with interrupts disabled.
  602. *
  603. * We stop each counter and update the counter value in counter->count.
  604. *
  605. * This does not protect us against NMI, but disable()
  606. * sets the disabled bit in the control field of counter _before_
  607. * accessing the counter control register. If an NMI hits, then it will
  608. * not restart the counter.
  609. */
  610. void perf_counter_task_sched_out(struct task_struct *task, int cpu)
  611. {
  612. struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
  613. struct perf_counter_context *ctx = &task->perf_counter_ctx;
  614. if (likely(!cpuctx->task_ctx))
  615. return;
  616. __perf_counter_sched_out(ctx, cpuctx);
  617. cpuctx->task_ctx = NULL;
  618. }
  619. static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
  620. {
  621. __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
  622. }
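/*
 * Schedule in a counter group as one unit. A non-zero return from
 * hw_perf_group_sched_in() means the architecture code has either
 * scheduled the whole group itself (> 0) or failed (< 0); otherwise
 * schedule the leader and each sibling individually and undo the
 * partial group if any member fails.
 */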
  623. static int
  624. group_sched_in(struct perf_counter *group_counter,
  625. struct perf_cpu_context *cpuctx,
  626. struct perf_counter_context *ctx,
  627. int cpu)
  628. {
  629. struct perf_counter *counter, *partial_group;
  630. int ret;
  631. if (group_counter->state == PERF_COUNTER_STATE_OFF)
  632. return 0;
  633. ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
  634. if (ret)
  635. return ret < 0 ? ret : 0;
  636. group_counter->prev_state = group_counter->state;
  637. if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
  638. return -EAGAIN;
  639. /*
  640. * Schedule in siblings as one group (if any):
  641. */
  642. list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
  643. counter->prev_state = counter->state;
  644. if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
  645. partial_group = counter;
  646. goto group_error;
  647. }
  648. }
  649. return 0;
  650. group_error:
  651. /*
  652. * Groups can be scheduled in as one unit only, so undo any
  653. * partial group before returning:
  654. */
  655. list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
  656. if (counter == partial_group)
  657. break;
  658. counter_sched_out(counter, cpuctx, ctx);
  659. }
  660. counter_sched_out(group_counter, cpuctx, ctx);
  661. return -EAGAIN;
  662. }
  663. static void
  664. __perf_counter_sched_in(struct perf_counter_context *ctx,
  665. struct perf_cpu_context *cpuctx, int cpu)
  666. {
  667. struct perf_counter *counter;
  668. u64 flags;
  669. int can_add_hw = 1;
  670. spin_lock(&ctx->lock);
  671. ctx->is_active = 1;
  672. if (likely(!ctx->nr_counters))
  673. goto out;
  674. flags = hw_perf_save_disable();
  675. /*
  676. * First go through the list and put on any pinned groups
  677. * in order to give them the best chance of going on.
  678. */
  679. list_for_each_entry(counter, &ctx->counter_list, list_entry) {
  680. if (counter->state <= PERF_COUNTER_STATE_OFF ||
  681. !counter->hw_event.pinned)
  682. continue;
  683. if (counter->cpu != -1 && counter->cpu != cpu)
  684. continue;
  685. if (group_can_go_on(counter, cpuctx, 1))
  686. group_sched_in(counter, cpuctx, ctx, cpu);
  687. /*
  688. * If this pinned group hasn't been scheduled,
  689. * put it in error state.
  690. */
  691. if (counter->state == PERF_COUNTER_STATE_INACTIVE)
  692. counter->state = PERF_COUNTER_STATE_ERROR;
  693. }
  694. list_for_each_entry(counter, &ctx->counter_list, list_entry) {
  695. /*
  696. * Ignore counters in OFF or ERROR state, and
  697. * ignore pinned counters since we did them already.
  698. */
  699. if (counter->state <= PERF_COUNTER_STATE_OFF ||
  700. counter->hw_event.pinned)
  701. continue;
  702. /*
  703. * Listen to the 'cpu' scheduling filter constraint
  704. * of counters:
  705. */
  706. if (counter->cpu != -1 && counter->cpu != cpu)
  707. continue;
  708. if (group_can_go_on(counter, cpuctx, can_add_hw)) {
  709. if (group_sched_in(counter, cpuctx, ctx, cpu))
  710. can_add_hw = 0;
  711. }
  712. }
  713. hw_perf_restore(flags);
  714. out:
  715. spin_unlock(&ctx->lock);
  716. }
  717. /*
  718. * Called from scheduler to add the counters of the current task
  719. * with interrupts disabled.
  720. *
  721. * We restore the counter value and then enable it.
  722. *
  723. * This does not protect us against NMI, but enable()
  724. * sets the enabled bit in the control field of counter _before_
  725. * accessing the counter control register. If an NMI hits, then it will
  726. * keep the counter running.
  727. */
  728. void perf_counter_task_sched_in(struct task_struct *task, int cpu)
  729. {
  730. struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
  731. struct perf_counter_context *ctx = &task->perf_counter_ctx;
  732. __perf_counter_sched_in(ctx, cpuctx, cpu);
  733. cpuctx->task_ctx = ctx;
  734. }
  735. static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
  736. {
  737. struct perf_counter_context *ctx = &cpuctx->ctx;
  738. __perf_counter_sched_in(ctx, cpuctx, cpu);
  739. }
  740. int perf_counter_task_disable(void)
  741. {
  742. struct task_struct *curr = current;
  743. struct perf_counter_context *ctx = &curr->perf_counter_ctx;
  744. struct perf_counter *counter;
  745. unsigned long flags;
  746. u64 perf_flags;
  747. int cpu;
  748. if (likely(!ctx->nr_counters))
  749. return 0;
  750. curr_rq_lock_irq_save(&flags);
  751. cpu = smp_processor_id();
  752. /* force the update of the task clock: */
  753. __task_delta_exec(curr, 1);
  754. perf_counter_task_sched_out(curr, cpu);
  755. spin_lock(&ctx->lock);
  756. /*
  757. * Disable all the counters:
  758. */
  759. perf_flags = hw_perf_save_disable();
  760. list_for_each_entry(counter, &ctx->counter_list, list_entry) {
  761. if (counter->state != PERF_COUNTER_STATE_ERROR)
  762. counter->state = PERF_COUNTER_STATE_OFF;
  763. }
  764. hw_perf_restore(perf_flags);
  765. spin_unlock(&ctx->lock);
  766. curr_rq_unlock_irq_restore(&flags);
  767. return 0;
  768. }
  769. int perf_counter_task_enable(void)
  770. {
  771. struct task_struct *curr = current;
  772. struct perf_counter_context *ctx = &curr->perf_counter_ctx;
  773. struct perf_counter *counter;
  774. unsigned long flags;
  775. u64 perf_flags;
  776. int cpu;
  777. if (likely(!ctx->nr_counters))
  778. return 0;
  779. curr_rq_lock_irq_save(&flags);
  780. cpu = smp_processor_id();
  781. /* force the update of the task clock: */
  782. __task_delta_exec(curr, 1);
  783. perf_counter_task_sched_out(curr, cpu);
  784. spin_lock(&ctx->lock);
  785. /*
  786. * Disable the hardware globally while the counters are re-enabled:
  787. */
  788. perf_flags = hw_perf_save_disable();
  789. list_for_each_entry(counter, &ctx->counter_list, list_entry) {
  790. if (counter->state > PERF_COUNTER_STATE_OFF)
  791. continue;
  792. counter->state = PERF_COUNTER_STATE_INACTIVE;
  793. counter->hw_event.disabled = 0;
  794. }
  795. hw_perf_restore(perf_flags);
  796. spin_unlock(&ctx->lock);
  797. perf_counter_task_sched_in(curr, cpu);
  798. curr_rq_unlock_irq_restore(&flags);
  799. return 0;
  800. }
  801. /*
  802. * Round-robin a context's counters:
  803. */
  804. static void rotate_ctx(struct perf_counter_context *ctx)
  805. {
  806. struct perf_counter *counter;
  807. u64 perf_flags;
  808. if (!ctx->nr_counters)
  809. return;
  810. spin_lock(&ctx->lock);
  811. /*
  812. * Rotate the first entry last (works just fine for group counters too):
  813. */
  814. perf_flags = hw_perf_save_disable();
  815. list_for_each_entry(counter, &ctx->counter_list, list_entry) {
  816. list_move_tail(&counter->list_entry, &ctx->counter_list);
  817. break;
  818. }
  819. hw_perf_restore(perf_flags);
  820. spin_unlock(&ctx->lock);
  821. }
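/*
 * Called from the scheduler tick: schedule the current task's counters
 * out, rotate the context round-robin and schedule everything back in,
 * so counters that do not all fit on the PMU at once take turns.
 */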
  822. void perf_counter_task_tick(struct task_struct *curr, int cpu)
  823. {
  824. struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
  825. struct perf_counter_context *ctx = &curr->perf_counter_ctx;
  826. const int rotate_percpu = 0;
  827. if (rotate_percpu)
  828. perf_counter_cpu_sched_out(cpuctx);
  829. perf_counter_task_sched_out(curr, cpu);
  830. if (rotate_percpu)
  831. rotate_ctx(&cpuctx->ctx);
  832. rotate_ctx(ctx);
  833. if (rotate_percpu)
  834. perf_counter_cpu_sched_in(cpuctx, cpu);
  835. perf_counter_task_sched_in(curr, cpu);
  836. }
  837. /*
  838. * Cross CPU call to read the hardware counter
  839. */
  840. static void __read(void *info)
  841. {
  842. struct perf_counter *counter = info;
  843. unsigned long flags;
  844. curr_rq_lock_irq_save(&flags);
  845. counter->hw_ops->read(counter);
  846. curr_rq_unlock_irq_restore(&flags);
  847. }
  848. static u64 perf_counter_read(struct perf_counter *counter)
  849. {
  850. /*
  851. * If counter is enabled and currently active on a CPU, update the
  852. * value in the counter structure:
  853. */
  854. if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
  855. smp_call_function_single(counter->oncpu,
  856. __read, counter, 1);
  857. }
  858. return atomic64_read(&counter->count);
  859. }
  860. /*
  861. * Cross CPU call to switch performance data pointers
  862. */
  863. static void __perf_switch_irq_data(void *info)
  864. {
  865. struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  866. struct perf_counter *counter = info;
  867. struct perf_counter_context *ctx = counter->ctx;
  868. struct perf_data *oldirqdata = counter->irqdata;
  869. /*
  870. * If this is a task context, we need to check whether it is
  871. * the current task context of this cpu. If not it has been
  872. * scheduled out before the smp call arrived.
  873. */
  874. if (ctx->task) {
  875. if (cpuctx->task_ctx != ctx)
  876. return;
  877. spin_lock(&ctx->lock);
  878. }
  879. /* Swap the irqdata and usrdata pointers in an NMI-safe way */
  880. atomic_long_set((atomic_long_t *)&counter->irqdata,
  881. (unsigned long) counter->usrdata);
  882. counter->usrdata = oldirqdata;
  883. if (ctx->task)
  884. spin_unlock(&ctx->lock);
  885. }
  886. static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
  887. {
  888. struct perf_counter_context *ctx = counter->ctx;
  889. struct perf_data *oldirqdata = counter->irqdata;
  890. struct task_struct *task = ctx->task;
  891. if (!task) {
  892. smp_call_function_single(counter->cpu,
  893. __perf_switch_irq_data,
  894. counter, 1);
  895. return counter->usrdata;
  896. }
  897. retry:
  898. spin_lock_irq(&ctx->lock);
  899. if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
  900. counter->irqdata = counter->usrdata;
  901. counter->usrdata = oldirqdata;
  902. spin_unlock_irq(&ctx->lock);
  903. return oldirqdata;
  904. }
  905. spin_unlock_irq(&ctx->lock);
  906. task_oncpu_function_call(task, __perf_switch_irq_data, counter);
  907. /* Might have failed, because task was scheduled out */
  908. if (counter->irqdata == oldirqdata)
  909. goto retry;
  910. return counter->usrdata;
  911. }
  912. static void put_context(struct perf_counter_context *ctx)
  913. {
  914. if (ctx->task)
  915. put_task_struct(ctx->task);
  916. }
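/*
 * Map a (pid, cpu) pair to the counter context it designates: the
 * per-CPU context if cpu is not -1 (requires CAP_SYS_ADMIN and an
 * online CPU), otherwise the context of the task identified by pid
 * (current if pid is 0), subject to a ptrace-style permission check.
 */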
  917. static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
  918. {
  919. struct perf_cpu_context *cpuctx;
  920. struct perf_counter_context *ctx;
  921. struct task_struct *task;
  922. /*
  923. * If cpu is not a wildcard then this is a percpu counter:
  924. */
  925. if (cpu != -1) {
  926. /* Must be root to operate on a CPU counter: */
  927. if (!capable(CAP_SYS_ADMIN))
  928. return ERR_PTR(-EACCES);
  929. if (cpu < 0 || cpu > num_possible_cpus())
  930. return ERR_PTR(-EINVAL);
  931. /*
  932. * We could be clever and allow attaching a counter to an
  933. * offline CPU and activate it when the CPU comes up, but
  934. * that's for later.
  935. */
  936. if (!cpu_isset(cpu, cpu_online_map))
  937. return ERR_PTR(-ENODEV);
  938. cpuctx = &per_cpu(perf_cpu_context, cpu);
  939. ctx = &cpuctx->ctx;
  940. return ctx;
  941. }
  942. rcu_read_lock();
  943. if (!pid)
  944. task = current;
  945. else
  946. task = find_task_by_vpid(pid);
  947. if (task)
  948. get_task_struct(task);
  949. rcu_read_unlock();
  950. if (!task)
  951. return ERR_PTR(-ESRCH);
  952. ctx = &task->perf_counter_ctx;
  953. ctx->task = task;
  954. /* Reuse ptrace permission checks for now. */
  955. if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
  956. put_context(ctx);
  957. return ERR_PTR(-EACCES);
  958. }
  959. return ctx;
  960. }
  961. static void free_counter_rcu(struct rcu_head *head)
  962. {
  963. struct perf_counter *counter;
  964. counter = container_of(head, struct perf_counter, rcu_head);
  965. kfree(counter);
  966. }
  967. /*
  968. * Called when the last reference to the file is gone.
  969. */
  970. static int perf_release(struct inode *inode, struct file *file)
  971. {
  972. struct perf_counter *counter = file->private_data;
  973. struct perf_counter_context *ctx = counter->ctx;
  974. file->private_data = NULL;
  975. mutex_lock(&ctx->mutex);
  976. mutex_lock(&counter->mutex);
  977. perf_counter_remove_from_context(counter);
  978. mutex_unlock(&counter->mutex);
  979. mutex_unlock(&ctx->mutex);
  980. call_rcu(&counter->rcu_head, free_counter_rcu);
  981. put_context(ctx);
  982. return 0;
  983. }
  984. /*
  985. * Read the performance counter - simple non blocking version for now
  986. */
  987. static ssize_t
  988. perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
  989. {
  990. u64 cntval;
  991. if (count != sizeof(cntval))
  992. return -EINVAL;
  993. /*
  994. * Return end-of-file for a read on a counter that is in
  995. * error state (i.e. because it was pinned but it couldn't be
  996. * scheduled on to the CPU at some point).
  997. */
  998. if (counter->state == PERF_COUNTER_STATE_ERROR)
  999. return 0;
  1000. mutex_lock(&counter->mutex);
  1001. cntval = perf_counter_read(counter);
  1002. mutex_unlock(&counter->mutex);
  1003. return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
  1004. }
  1005. static ssize_t
  1006. perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
  1007. {
  1008. if (!usrdata->len)
  1009. return 0;
  1010. count = min(count, (size_t)usrdata->len);
  1011. if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
  1012. return -EFAULT;
  1013. /* Adjust the counters */
  1014. usrdata->len -= count;
  1015. if (!usrdata->len)
  1016. usrdata->rd_idx = 0;
  1017. else
  1018. usrdata->rd_idx += count;
  1019. return count;
  1020. }
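/*
 * Read sampled (IRQ) data, blocking until at least 'count' bytes are
 * available unless O_NONBLOCK was requested: drain the user-side
 * buffer first, then switch buffers and drain what the IRQ side
 * collected.
 */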
  1021. static ssize_t
  1022. perf_read_irq_data(struct perf_counter *counter,
  1023. char __user *buf,
  1024. size_t count,
  1025. int nonblocking)
  1026. {
  1027. struct perf_data *irqdata, *usrdata;
  1028. DECLARE_WAITQUEUE(wait, current);
  1029. ssize_t res, res2;
  1030. irqdata = counter->irqdata;
  1031. usrdata = counter->usrdata;
  1032. if (usrdata->len + irqdata->len >= count)
  1033. goto read_pending;
  1034. if (nonblocking)
  1035. return -EAGAIN;
  1036. spin_lock_irq(&counter->waitq.lock);
  1037. __add_wait_queue(&counter->waitq, &wait);
  1038. for (;;) {
  1039. set_current_state(TASK_INTERRUPTIBLE);
  1040. if (usrdata->len + irqdata->len >= count)
  1041. break;
  1042. if (signal_pending(current))
  1043. break;
  1044. if (counter->state == PERF_COUNTER_STATE_ERROR)
  1045. break;
  1046. spin_unlock_irq(&counter->waitq.lock);
  1047. schedule();
  1048. spin_lock_irq(&counter->waitq.lock);
  1049. }
  1050. __remove_wait_queue(&counter->waitq, &wait);
  1051. __set_current_state(TASK_RUNNING);
  1052. spin_unlock_irq(&counter->waitq.lock);
  1053. if (usrdata->len + irqdata->len < count &&
  1054. counter->state != PERF_COUNTER_STATE_ERROR)
  1055. return -ERESTARTSYS;
  1056. read_pending:
  1057. mutex_lock(&counter->mutex);
  1058. /* Drain pending data first: */
  1059. res = perf_copy_usrdata(usrdata, buf, count);
  1060. if (res < 0 || res == count)
  1061. goto out;
  1062. /* Switch irq buffer: */
  1063. usrdata = perf_switch_irq_data(counter);
  1064. res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
  1065. if (res2 < 0) {
  1066. if (!res)
  1067. res = -EFAULT;
  1068. } else {
  1069. res += res2;
  1070. }
  1071. out:
  1072. mutex_unlock(&counter->mutex);
  1073. return res;
  1074. }
  1075. static ssize_t
  1076. perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
  1077. {
  1078. struct perf_counter *counter = file->private_data;
  1079. switch (counter->hw_event.record_type) {
  1080. case PERF_RECORD_SIMPLE:
  1081. return perf_read_hw(counter, buf, count);
  1082. case PERF_RECORD_IRQ:
  1083. case PERF_RECORD_GROUP:
  1084. return perf_read_irq_data(counter, buf, count,
  1085. file->f_flags & O_NONBLOCK);
  1086. }
  1087. return -EINVAL;
  1088. }
  1089. static unsigned int perf_poll(struct file *file, poll_table *wait)
  1090. {
  1091. struct perf_counter *counter = file->private_data;
  1092. unsigned int events = 0;
  1093. unsigned long flags;
  1094. poll_wait(file, &counter->waitq, wait);
  1095. spin_lock_irqsave(&counter->waitq.lock, flags);
  1096. if (counter->usrdata->len || counter->irqdata->len)
  1097. events |= POLLIN;
  1098. spin_unlock_irqrestore(&counter->waitq.lock, flags);
  1099. return events;
  1100. }
  1101. static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  1102. {
  1103. struct perf_counter *counter = file->private_data;
  1104. int err = 0;
  1105. switch (cmd) {
  1106. case PERF_COUNTER_IOC_ENABLE:
  1107. perf_counter_enable_family(counter);
  1108. break;
  1109. case PERF_COUNTER_IOC_DISABLE:
  1110. perf_counter_disable_family(counter);
  1111. break;
  1112. default:
  1113. err = -ENOTTY;
  1114. }
  1115. return err;
  1116. }
  1117. static const struct file_operations perf_fops = {
  1118. .release = perf_release,
  1119. .read = perf_read,
  1120. .poll = perf_poll,
  1121. .unlocked_ioctl = perf_ioctl,
  1122. .compat_ioctl = perf_ioctl,
  1123. };
  1124. /*
  1125. * Generic software counter infrastructure
  1126. */
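/*
 * Fold the events accumulated in hw.count since the last update into
 * counter->count and subtract them from the remaining sample period.
 * The cmpxchg loop keeps the update consistent if an NMI interrupts it.
 */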
  1127. static void perf_swcounter_update(struct perf_counter *counter)
  1128. {
  1129. struct hw_perf_counter *hwc = &counter->hw;
  1130. u64 prev, now;
  1131. s64 delta;
  1132. again:
  1133. prev = atomic64_read(&hwc->prev_count);
  1134. now = atomic64_read(&hwc->count);
  1135. if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
  1136. goto again;
  1137. delta = now - prev;
  1138. atomic64_add(delta, &counter->count);
  1139. atomic64_sub(delta, &hwc->period_left);
  1140. }
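/*
 * Re-arm the counter for the next sample period: bias hw.count and
 * hw.prev_count to -left so the counter turns non-negative (and thus
 * overflows) after 'left' more events.
 */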
  1141. static void perf_swcounter_set_period(struct perf_counter *counter)
  1142. {
  1143. struct hw_perf_counter *hwc = &counter->hw;
  1144. s64 left = atomic64_read(&hwc->period_left);
  1145. s64 period = hwc->irq_period;
  1146. if (unlikely(left <= -period)) {
  1147. left = period;
  1148. atomic64_set(&hwc->period_left, left);
  1149. }
  1150. if (unlikely(left <= 0)) {
  1151. left += period;
  1152. atomic64_add(period, &hwc->period_left);
  1153. }
  1154. atomic64_set(&hwc->prev_count, -left);
  1155. atomic64_set(&hwc->count, -left);
  1156. }
  1157. static void perf_swcounter_save_and_restart(struct perf_counter *counter)
  1158. {
  1159. perf_swcounter_update(counter);
  1160. perf_swcounter_set_period(counter);
  1161. }
  1162. static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
  1163. {
  1164. struct perf_data *irqdata = counter->irqdata;
  1165. if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
  1166. irqdata->overrun++;
  1167. } else {
  1168. u64 *p = (u64 *) &irqdata->data[irqdata->len];
  1169. *p = data;
  1170. irqdata->len += sizeof(u64);
  1171. }
  1172. }
  1173. static void perf_swcounter_handle_group(struct perf_counter *sibling)
  1174. {
  1175. struct perf_counter *counter, *group_leader = sibling->group_leader;
  1176. list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
  1177. counter->hw_ops->read(counter);
  1178. perf_swcounter_store_irq(sibling, counter->hw_event.type);
  1179. perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
  1180. }
  1181. }
  1182. static void perf_swcounter_interrupt(struct perf_counter *counter,
  1183. int nmi, struct pt_regs *regs)
  1184. {
  1185. switch (counter->hw_event.record_type) {
  1186. case PERF_RECORD_SIMPLE:
  1187. break;
  1188. case PERF_RECORD_IRQ:
  1189. perf_swcounter_store_irq(counter, instruction_pointer(regs));
  1190. break;
  1191. case PERF_RECORD_GROUP:
  1192. perf_swcounter_handle_group(counter);
  1193. break;
  1194. }
  1195. if (nmi) {
  1196. counter->wakeup_pending = 1;
  1197. set_perf_counter_pending();
  1198. } else
  1199. wake_up(&counter->waitq);
  1200. }
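/*
 * hrtimer callback used by the clock based software counters for
 * sampling: read the counter, pick the best pt_regs we are allowed to
 * report, emit a sample and re-arm the timer for the next period.
 */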
  1201. static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
  1202. {
  1203. struct perf_counter *counter;
  1204. struct pt_regs *regs;
  1205. counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
  1206. counter->hw_ops->read(counter);
  1207. regs = get_irq_regs();
  1208. /*
  1209. * In case we exclude kernel IPs or are somehow not in interrupt
  1210. * context, provide the next best thing, the user IP.
  1211. */
  1212. if ((counter->hw_event.exclude_kernel || !regs) &&
  1213. !counter->hw_event.exclude_user)
  1214. regs = task_pt_regs(current);
  1215. if (regs)
  1216. perf_swcounter_interrupt(counter, 0, regs);
  1217. hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
  1218. return HRTIMER_RESTART;
  1219. }
  1220. static void perf_swcounter_overflow(struct perf_counter *counter,
  1221. int nmi, struct pt_regs *regs)
  1222. {
  1223. perf_swcounter_save_and_restart(counter);
  1224. perf_swcounter_interrupt(counter, nmi, regs);
  1225. }
  1226. static int perf_swcounter_match(struct perf_counter *counter,
  1227. enum hw_event_types event,
  1228. struct pt_regs *regs)
  1229. {
  1230. if (counter->state != PERF_COUNTER_STATE_ACTIVE)
  1231. return 0;
  1232. if (counter->hw_event.raw)
  1233. return 0;
  1234. if (counter->hw_event.type != event)
  1235. return 0;
  1236. if (counter->hw_event.exclude_user && user_mode(regs))
  1237. return 0;
  1238. if (counter->hw_event.exclude_kernel && !user_mode(regs))
  1239. return 0;
  1240. return 1;
  1241. }
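/*
 * Add 'nr' events to the software counter and, if a sample period is
 * configured and the count has crossed zero, treat it as an overflow
 * and generate a sample.
 */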
  1242. static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
  1243. int nmi, struct pt_regs *regs)
  1244. {
  1245. int neg = atomic64_add_negative(nr, &counter->hw.count);
  1246. if (counter->hw.irq_period && !neg)
  1247. perf_swcounter_overflow(counter, nmi, regs);
  1248. }
  1249. static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
  1250. enum hw_event_types event, u64 nr,
  1251. int nmi, struct pt_regs *regs)
  1252. {
  1253. struct perf_counter *counter;
  1254. if (list_empty(&ctx->event_list))
  1255. return;
  1256. rcu_read_lock();
  1257. list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
  1258. if (perf_swcounter_match(counter, event, regs))
  1259. perf_swcounter_add(counter, nr, nmi, regs);
  1260. }
  1261. rcu_read_unlock();
  1262. }
  1263. void perf_swcounter_event(enum hw_event_types event, u64 nr,
  1264. int nmi, struct pt_regs *regs)
  1265. {
  1266. struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
  1267. perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
  1268. if (cpuctx->task_ctx)
  1269. perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
  1270. put_cpu_var(perf_cpu_context);
  1271. }
  1272. static void perf_swcounter_read(struct perf_counter *counter)
  1273. {
  1274. perf_swcounter_update(counter);
  1275. }
  1276. static int perf_swcounter_enable(struct perf_counter *counter)
  1277. {
  1278. perf_swcounter_set_period(counter);
  1279. return 0;
  1280. }
  1281. static void perf_swcounter_disable(struct perf_counter *counter)
  1282. {
  1283. perf_swcounter_update(counter);
  1284. }
  1285. static const struct hw_perf_counter_ops perf_ops_generic = {
  1286. .enable = perf_swcounter_enable,
  1287. .disable = perf_swcounter_disable,
  1288. .read = perf_swcounter_read,
  1289. };
  1290. /*
  1291. * Software counter: cpu wall time clock
  1292. */
  1293. static void cpu_clock_perf_counter_update(struct perf_counter *counter)
  1294. {
  1295. int cpu = raw_smp_processor_id();
  1296. s64 prev;
  1297. u64 now;
  1298. now = cpu_clock(cpu);
  1299. prev = atomic64_read(&counter->hw.prev_count);
  1300. atomic64_set(&counter->hw.prev_count, now);
  1301. atomic64_add(now - prev, &counter->count);
  1302. }
  1303. static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
  1304. {
  1305. struct hw_perf_counter *hwc = &counter->hw;
  1306. int cpu = raw_smp_processor_id();
  1307. atomic64_set(&hwc->prev_count, cpu_clock(cpu));
  1308. hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  1309. hwc->hrtimer.function = perf_swcounter_hrtimer;
  1310. if (hwc->irq_period) {
  1311. __hrtimer_start_range_ns(&hwc->hrtimer,
  1312. ns_to_ktime(hwc->irq_period), 0,
  1313. HRTIMER_MODE_REL, 0);
  1314. }
  1315. return 0;
  1316. }
  1317. static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
  1318. {
  1319. hrtimer_cancel(&counter->hw.hrtimer);
  1320. cpu_clock_perf_counter_update(counter);
  1321. }
  1322. static void cpu_clock_perf_counter_read(struct perf_counter *counter)
  1323. {
  1324. cpu_clock_perf_counter_update(counter);
  1325. }
  1326. static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
  1327. .enable = cpu_clock_perf_counter_enable,
  1328. .disable = cpu_clock_perf_counter_disable,
  1329. .read = cpu_clock_perf_counter_read,
  1330. };
  1331. /*
  1332. * Software counter: task time clock
  1333. */
  1334. /*
  1335. * Called from within the scheduler:
  1336. */
  1337. static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
  1338. {
  1339. struct task_struct *curr = counter->task;
  1340. u64 delta;
  1341. delta = __task_delta_exec(curr, update);
  1342. return curr->se.sum_exec_runtime + delta;
  1343. }
  1344. static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
  1345. {
  1346. u64 prev;
  1347. s64 delta;
  1348. prev = atomic64_read(&counter->hw.prev_count);
  1349. atomic64_set(&counter->hw.prev_count, now);
  1350. delta = now - prev;
  1351. atomic64_add(delta, &counter->count);
  1352. }
  1353. static int task_clock_perf_counter_enable(struct perf_counter *counter)
  1354. {
  1355. struct hw_perf_counter *hwc = &counter->hw;
  1356. atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
  1357. hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  1358. hwc->hrtimer.function = perf_swcounter_hrtimer;
  1359. if (hwc->irq_period) {
  1360. __hrtimer_start_range_ns(&hwc->hrtimer,
  1361. ns_to_ktime(hwc->irq_period), 0,
  1362. HRTIMER_MODE_REL, 0);
  1363. }
  1364. return 0;
  1365. }
  1366. static void task_clock_perf_counter_disable(struct perf_counter *counter)
  1367. {
  1368. hrtimer_cancel(&counter->hw.hrtimer);
  1369. task_clock_perf_counter_update(counter,
  1370. task_clock_perf_counter_val(counter, 0));
  1371. }
  1372. static void task_clock_perf_counter_read(struct perf_counter *counter)
  1373. {
  1374. task_clock_perf_counter_update(counter,
  1375. task_clock_perf_counter_val(counter, 1));
  1376. }
  1377. static const struct hw_perf_counter_ops perf_ops_task_clock = {
  1378. .enable = task_clock_perf_counter_enable,
  1379. .disable = task_clock_perf_counter_disable,
  1380. .read = task_clock_perf_counter_read,
  1381. };
  1382. /*
  1383. * Software counter: context switches
  1384. */
static u64 get_context_switches(struct perf_counter *counter)
{
	struct task_struct *curr = counter->ctx->task;

	if (curr)
		return curr->nvcsw + curr->nivcsw;
	return cpu_nr_switches(smp_processor_id());
}

static void context_switches_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_context_switches(counter);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void context_switches_perf_counter_read(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}
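/*
 * Take a fresh baseline only if the counter was previously switched off;
 * otherwise keep the existing prev_count so that no events are double
 * counted or lost across a reschedule.
 */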
static int context_switches_perf_counter_enable(struct perf_counter *counter)
{
	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
		atomic64_set(&counter->hw.prev_count,
			     get_context_switches(counter));
	return 0;
}

static void context_switches_perf_counter_disable(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_context_switches = {
	.enable		= context_switches_perf_counter_enable,
	.disable	= context_switches_perf_counter_disable,
	.read		= context_switches_perf_counter_read,
};

/*
 * Software counter: cpu migrations
 */
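/*
 * For a per-task counter, report how often the task has migrated between
 * CPUs; for a per-cpu counter (ctx->task == NULL), report the migration
 * count of this CPU.
 */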
static inline u64 get_cpu_migrations(struct perf_counter *counter)
{
	struct task_struct *curr = counter->ctx->task;

	if (curr)
		return curr->se.nr_migrations;
	return cpu_nr_migrations(smp_processor_id());
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations(counter);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
		atomic64_set(&counter->hw.prev_count,
			     get_cpu_migrations(counter));
	return 0;
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
	.enable		= cpu_migrations_perf_counter_enable,
	.disable	= cpu_migrations_perf_counter_disable,
	.read		= cpu_migrations_perf_counter_read,
};
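/*
 * Map a software event type to its hw_perf_counter_ops, or return NULL if
 * the event cannot be supported (e.g. context switches with exclude_kernel
 * set).  For the clock counters the sampling period is clamped to at least
 * 10 usecs: a requested irq_period of, say, 1000 ns is raised to 10000 ns
 * before the hrtimer is armed.
 */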
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
	const struct hw_perf_counter_ops *hw_ops = NULL;
	struct hw_perf_counter *hwc = &counter->hw;

	/*
	 * Software counters (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (counter->hw_event.type) {
	case PERF_COUNT_CPU_CLOCK:
		hw_ops = &perf_ops_cpu_clock;

		if (hw_event->irq_period && hw_event->irq_period < 10000)
			hw_event->irq_period = 10000;
		break;
	case PERF_COUNT_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu counter,
		 * use the cpu_clock counter instead.
		 */
		if (counter->ctx->task)
			hw_ops = &perf_ops_task_clock;
		else
			hw_ops = &perf_ops_cpu_clock;

		if (hw_event->irq_period && hw_event->irq_period < 10000)
			hw_event->irq_period = 10000;
		break;
	case PERF_COUNT_PAGE_FAULTS:
	case PERF_COUNT_PAGE_FAULTS_MIN:
	case PERF_COUNT_PAGE_FAULTS_MAJ:
		hw_ops = &perf_ops_generic;
		break;
	case PERF_COUNT_CONTEXT_SWITCHES:
		if (!counter->hw_event.exclude_kernel)
			hw_ops = &perf_ops_context_switches;
		break;
	case PERF_COUNT_CPU_MIGRATIONS:
		if (!counter->hw_event.exclude_kernel)
			hw_ops = &perf_ops_cpu_migrations;
		break;
	default:
		break;
	}

	if (hw_ops)
		hwc->irq_period = hw_event->irq_period;

	return hw_ops;
}
/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter_context *ctx,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct hw_perf_counter_ops *hw_ops;
	struct perf_counter *counter;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return NULL;

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->event_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	INIT_LIST_HEAD(&counter->child_list);

	counter->irqdata	= &counter->data[0];
	counter->usrdata	= &counter->data[1];
	counter->cpu		= cpu;
	counter->hw_event	= *hw_event;
	counter->wakeup_pending	= 0;
	counter->group_leader	= group_leader;
	counter->hw_ops		= NULL;
	counter->ctx		= ctx;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	if (hw_event->disabled)
		counter->state = PERF_COUNTER_STATE_OFF;

	hw_ops = NULL;
	if (!hw_event->raw && hw_event->type < 0)
		hw_ops = sw_perf_counter_init(counter);
	else
		hw_ops = hw_perf_counter_init(counter);

	if (!hw_ops) {
		kfree(counter);
		return NULL;
	}
	counter->hw_ops = hw_ops;

	return counter;
}
/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
SYSCALL_DEFINE5(perf_counter_open,
		const struct perf_counter_hw_event __user *, hw_event_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	/* for future expandability... */
	if (flags)
		return -EINVAL;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (hw_event.exclusive || hw_event.pinned)
			goto err_put_context;
	}

	ret = -EINVAL;
	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
				     GFP_KERNEL);
	if (!counter)
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	mutex_unlock(&ctx->mutex);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}
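/*
 * Illustrative userspace sketch (not part of this file): one plausible way
 * to open a single task-clock counter through this syscall.  The
 * __NR_perf_counter_open number and the pid == 0 / cpu == -1 semantics
 * (current task, not a per-cpu counter) are assumptions for the example;
 * the hw_event fields and the group_fd/flags values mirror the checks
 * performed above.
 *
 *	struct perf_counter_hw_event hw_event;
 *	int fd;
 *
 *	memset(&hw_event, 0, sizeof(hw_event));
 *	hw_event.type = PERF_COUNT_TASK_CLOCK;
 *
 *	fd = syscall(__NR_perf_counter_open, &hw_event,
 *		     0,		// pid 0: assumed to mean the current task
 *		     -1,	// cpu -1: assumed to mean "not per-cpu"
 *		     -1,	// group_fd -1: no group leader
 *		     0);	// flags must be 0 for now
 *	if (fd < 0)
 *		perror("perf_counter_open");
 */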
/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->counter_list);
	INIT_LIST_HEAD(&ctx->event_list);
	ctx->task = task;
}

/*
 * inherit a counter from parent task to child task:
 */
static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
		struct task_struct *parent,
		struct perf_counter_context *parent_ctx,
		struct task_struct *child,
		struct perf_counter *group_leader,
		struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	/*
	 * Instead of creating recursive hierarchies of counters,
	 * we link inherited counters back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_counter->parent)
		parent_counter = parent_counter->parent;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, child_ctx,
					   group_leader, GFP_KERNEL);
	if (!child_counter)
		return NULL;

	/*
	 * Link it up in the child's context:
	 */
	child_counter->task = child;
	list_add_counter(child_counter, child_ctx);
	child_ctx->nr_counters++;

	child_counter->parent = parent_counter;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	/*
	 * Link this into the parent counter's child list
	 */
	mutex_lock(&parent_counter->mutex);
	list_add_tail(&child_counter->child_list, &parent_counter->child_list);

	/*
	 * Make the child state follow the state of the parent counter,
	 * not its hw_event.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_counter_{en,dis}able_family.
	 */
	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
	else
		child_counter->state = PERF_COUNTER_STATE_OFF;

	mutex_unlock(&parent_counter->mutex);

	return child_counter;
}
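/*
 * Inherit a whole counter group: clone the group leader first, then each
 * sibling as a member of the new leader's group in the child context.
 */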
static int inherit_group(struct perf_counter *parent_counter,
			 struct task_struct *parent,
			 struct perf_counter_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *leader;
	struct perf_counter *sub;

	leader = inherit_counter(parent_counter, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (!leader)
		return -ENOMEM;
	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
		if (!inherit_counter(sub, parent, parent_ctx,
				     child, leader, child_ctx))
			return -ENOMEM;
	}
	return 0;
}
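/*
 * Fold a child counter's final count back into its parent, unlink it from
 * the parent's child list and drop the reference on the parent's filp.
 */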
static void sync_child_counter(struct perf_counter *child_counter,
			       struct perf_counter *parent_counter)
{
	u64 parent_val, child_val;

	parent_val = atomic64_read(&parent_counter->count);
	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);

	/*
	 * Remove this counter from the parent's list
	 */
	mutex_lock(&parent_counter->mutex);
	list_del_init(&child_counter->child_list);
	mutex_unlock(&parent_counter->mutex);

	/*
	 * Release the parent counter, if this was the last
	 * reference to it.
	 */
	fput(parent_counter->filp);
}
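/*
 * Tear down one counter of an exiting child task: take it off the child
 * context (scheduling it out first if the child is the current task) and,
 * if it was inherited, feed its counts back into the parent hierarchy and
 * free it.
 */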
static void
__perf_counter_exit_task(struct task_struct *child,
			 struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;
	struct perf_counter *sub, *tmp;

	/*
	 * If we do not self-reap then we have to wait for the
	 * child task to unschedule (it will happen for sure),
	 * so that its counter is at its final count. (This
	 * condition triggers rarely - child tasks usually get
	 * off their CPU before the parent has a chance to
	 * get this far into the reaping action)
	 */
	if (child != current) {
		wait_task_inactive(child, 0);
		list_del_init(&child_counter->list_entry);
	} else {
		struct perf_cpu_context *cpuctx;
		unsigned long flags;
		u64 perf_flags;

		/*
		 * Disable and unlink this counter.
		 *
		 * Be careful about zapping the list - IRQ/NMI context
		 * could still be processing it:
		 */
		curr_rq_lock_irq_save(&flags);
		perf_flags = hw_perf_save_disable();

		cpuctx = &__get_cpu_var(perf_cpu_context);

		group_sched_out(child_counter, cpuctx, child_ctx);

		list_del_init(&child_counter->list_entry);

		child_ctx->nr_counters--;

		hw_perf_restore(perf_flags);
		curr_rq_unlock_irq_restore(&flags);
	}

	parent_counter = child_counter->parent;
	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (parent_counter) {
		sync_child_counter(child_counter, parent_counter);
		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
					 list_entry) {
			if (sub->parent) {
				sync_child_counter(sub, sub->parent);
				kfree(sub);
			}
		}
		kfree(child_counter);
	}
}
/*
 * When a child task exits, feed back counter values to parent counters.
 *
 * Note: we may be running in child context, but the PID is not hashed
 * anymore so new counters will not be added.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;

	child_ctx = &child->perf_counter_ctx;

	if (likely(!child_ctx->nr_counters))
		return;

	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child, child_counter, child_ctx);
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter *counter;
	struct task_struct *parent = current;

	child_ctx  = &child->perf_counter_ctx;
	parent_ctx = &parent->perf_counter_ctx;

	__perf_counter_init_context(child_ctx, child);

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning:
	 */
	if (likely(!parent_ctx->nr_counters))
		return;

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
		if (!counter->hw_event.inherit)
			continue;

		if (inherit_group(counter, parent,
				  parent_ctx, child, child_ctx))
			break;
	}

	mutex_unlock(&parent_ctx->mutex);
}
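/*
 * Set up the per-cpu context when a CPU comes up: initialize its counter
 * context and recompute how many per-task counters it may still carry.
 */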
static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}

static void perf_counter_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif
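/*
 * CPU hotplug callback: bring the per-cpu counter context up before a CPU
 * goes online and tear it down before the CPU is taken offline.
 */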
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);
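/*
 * sysfs knobs, exported as a "perf_counters" attribute group on the cpu
 * sysdev class: "reserve_percpu" (0..perf_max_counters) sets how many
 * counter slots are reserved on each CPU for per-cpu counters, and
 * "overcommit" accepts 0 or 1.
 */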
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);