blktrace.c

/*
 * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <asm/uaccess.h>

static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
static unsigned int blktrace_seq __read_mostly = 1;
/*
 * Send out a notify for this process, if we haven't done so since a trace
 * started
 */
static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
	struct blk_io_trace *t;

	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
	if (t) {
		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->device = bt->dev;
		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
		t->pid = tsk->pid;
		t->cpu = smp_processor_id();
		t->pdu_len = sizeof(tsk->comm);
		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
		tsk->btrace_seq = blktrace_seq;
	}
}
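
/*
 * Check an event against the trace filters: returns non-zero ("skip it")
 * if the action mask, the sector range or the pid filter rules it out.
 */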
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
		return 1;
	if (sector < bt->start_lba || sector > bt->end_lba)
		return 1;
	if (bt->pid && pid != bt->pid)
		return 1;

	return 0;
}
/*
 * Data direction bit lookup
 */
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };

/*
 * Bio action bits of interest
 */
static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };

/*
 * More could be added as needed, taking care to increment the decrementer
 * to get correct indexing
 */
#define trace_barrier_bit(rw)	\
	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
#define trace_sync_bit(rw)	\
	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))

/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 */
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
	struct task_struct *tsk = current;
	struct blk_io_trace *t;
	unsigned long flags;
	unsigned long *sequence;
	pid_t pid;
	int cpu;

	if (unlikely(bt->trace_state != Blktrace_running))
		return;

	what |= ddir_act[rw & WRITE];
	what |= bio_act[trace_barrier_bit(rw)];
	what |= bio_act[trace_sync_bit(rw)];

	pid = tsk->pid;
	if (unlikely(act_log_check(bt, what, sector, pid)))
		return;

	/*
	 * A word about the locking here - we disable interrupts to reserve
	 * some space in the relay per-cpu buffer, to prevent an irq
	 * from coming in and stepping on our toes. Once reserved, it's
	 * enough to get preemption disabled to prevent read of this data
	 * before we are through filling it. get_cpu()/put_cpu() does this
	 * for us
	 */
	local_irq_save(flags);

	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(bt, tsk);

	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
	if (t) {
		cpu = smp_processor_id();
		sequence = per_cpu_ptr(bt->sequence, cpu);

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->sequence = ++(*sequence);
		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
		t->sector = sector;
		t->bytes = bytes;
		t->action = what;
		t->pid = pid;
		t->device = bt->dev;
		t->cpu = cpu;
		t->error = error;
		t->pdu_len = pdu_len;

		if (pdu_len)
			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
	}

	local_irq_restore(flags);
}

EXPORT_SYMBOL_GPL(__blk_add_trace);
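
/*
 * The debugfs "block" root directory is shared by all traced devices;
 * root_users counts the per-device directories below it, so the root
 * can be removed once the last one goes away. All of this is serialized
 * by blk_tree_mutex.
 */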
static struct dentry *blk_tree_root;
static struct mutex blk_tree_mutex;
static unsigned int root_users;

static inline void blk_remove_root(void)
{
	if (blk_tree_root) {
		debugfs_remove(blk_tree_root);
		blk_tree_root = NULL;
	}
}

static void blk_remove_tree(struct dentry *dir)
{
	mutex_lock(&blk_tree_mutex);
	debugfs_remove(dir);
	if (--root_users == 0)
		blk_remove_root();
	mutex_unlock(&blk_tree_mutex);
}

static struct dentry *blk_create_tree(const char *blk_name)
{
	struct dentry *dir = NULL;

	mutex_lock(&blk_tree_mutex);

	if (!blk_tree_root) {
		blk_tree_root = debugfs_create_dir("block", NULL);
		if (!blk_tree_root)
			goto err;
	}

	dir = debugfs_create_dir(blk_name, blk_tree_root);
	if (dir)
		root_users++;
	else
		blk_remove_root();

err:
	mutex_unlock(&blk_tree_mutex);
	return dir;
}
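
/*
 * Release everything a trace holds: the relay channel, the debugfs
 * "dropped" file and per-device directory, the per-cpu sequence
 * counters, and finally the blk_trace structure itself.
 */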
static void blk_trace_cleanup(struct blk_trace *bt)
{
	relay_close(bt->rchan);
	debugfs_remove(bt->dropped_file);
	blk_remove_tree(bt->dir);
	free_percpu(bt->sequence);
	kfree(bt);
}

static int blk_trace_remove(request_queue_t *q)
{
	struct blk_trace *bt;

	bt = xchg(&q->blk_trace, NULL);
	if (!bt)
		return -EINVAL;

	if (bt->trace_state == Blktrace_setup ||
	    bt->trace_state == Blktrace_stopped)
		blk_trace_cleanup(bt);

	return 0;
}
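
/*
 * The "dropped" debugfs file: reading it reports how many events were
 * lost because the relay subbuffers were full, see
 * blk_subbuf_start_callback() below.
 */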
static int blk_dropped_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->u.generic_ip;

	return 0;
}

static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	struct blk_trace *bt = filp->private_data;
	char buf[16];

	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));

	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}

static struct file_operations blk_dropped_fops = {
	.owner =	THIS_MODULE,
	.open =		blk_dropped_open,
	.read =		blk_dropped_read,
};

/*
 * Keep track of how many times we encountered a full subbuffer, to aid
 * the user space app in telling how many lost events there were.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
				     void *prev_subbuf, size_t prev_padding)
{
	struct blk_trace *bt;

	if (!relay_buf_full(buf))
		return 1;

	bt = buf->chan->private_data;
	atomic_inc(&bt->dropped);
	return 0;
}
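
/*
 * Glue between relay and debugfs: relay calls these to create and
 * remove the per-cpu trace buffer files under the device directory.
 */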
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct dentry *blk_create_buf_file_callback(const char *filename,
						   struct dentry *parent,
						   int mode,
						   struct rchan_buf *buf,
						   int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
					&relay_file_operations);
}

static struct rchan_callbacks blk_relay_callbacks = {
	.subbuf_start		= blk_subbuf_start_callback,
	.create_buf_file	= blk_create_buf_file_callback,
	.remove_buf_file	= blk_remove_buf_file_callback,
};

/*
 * Setup everything required to start tracing
 */
static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
			   char __user *arg)
{
	struct blk_user_trace_setup buts;
	struct blk_trace *old_bt, *bt = NULL;
	struct dentry *dir = NULL;
	char b[BDEVNAME_SIZE];
	int ret, i;

	if (copy_from_user(&buts, arg, sizeof(buts)))
		return -EFAULT;

	if (!buts.buf_size || !buts.buf_nr)
		return -EINVAL;

	strcpy(buts.name, bdevname(bdev, b));

	/*
	 * some device names have larger paths - convert the slashes
	 * to underscores for this to work as expected
	 */
	for (i = 0; i < strlen(buts.name); i++)
		if (buts.name[i] == '/')
			buts.name[i] = '_';

	if (copy_to_user(arg, &buts, sizeof(buts)))
		return -EFAULT;

	ret = -ENOMEM;
	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
	if (!bt)
		goto err;

	bt->sequence = alloc_percpu(unsigned long);
	if (!bt->sequence)
		goto err;

	ret = -ENOENT;
	dir = blk_create_tree(buts.name);
	if (!dir)
		goto err;

	bt->dir = dir;
	bt->dev = bdev->bd_dev;
	atomic_set(&bt->dropped, 0);

	ret = -EIO;
	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
	if (!bt->dropped_file)
		goto err;

	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
	if (!bt->rchan)
		goto err;
	bt->rchan->private_data = bt;

	bt->act_mask = buts.act_mask;
	if (!bt->act_mask)
		bt->act_mask = (u16) -1;

	bt->start_lba = buts.start_lba;
	bt->end_lba = buts.end_lba;
	if (!bt->end_lba)
		bt->end_lba = -1ULL;

	bt->pid = buts.pid;
	bt->trace_state = Blktrace_setup;

	ret = -EBUSY;
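	/*
	 * Atomically install the new trace on the queue; if someone
	 * else raced us and installed theirs first, put it back and
	 * fail with -EBUSY.
	 */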
	old_bt = xchg(&q->blk_trace, bt);
	if (old_bt) {
		(void) xchg(&q->blk_trace, old_bt);
		goto err;
	}

	return 0;
err:
	if (dir)
		blk_remove_tree(dir);
	if (bt) {
		if (bt->dropped_file)
			debugfs_remove(bt->dropped_file);
		if (bt->sequence)
			free_percpu(bt->sequence);
		if (bt->rchan)
			relay_close(bt->rchan);
		kfree(bt);
	}
	return ret;
}
static int blk_trace_startstop(request_queue_t *q, int start)
{
	struct blk_trace *bt;
	int ret;

	if ((bt = q->blk_trace) == NULL)
		return -EINVAL;

	/*
	 * For starting a trace, we can transition from a setup or stopped
	 * trace. For stopping a trace, the state must be running
	 */
	ret = -EINVAL;
	if (start) {
		if (bt->trace_state == Blktrace_setup ||
		    bt->trace_state == Blktrace_stopped) {
			blktrace_seq++;
			smp_mb();
			bt->trace_state = Blktrace_running;
			ret = 0;
		}
	} else {
		if (bt->trace_state == Blktrace_running) {
			bt->trace_state = Blktrace_stopped;
			relay_flush(bt->rchan);
			ret = 0;
		}
	}

	return ret;
}

/**
 * blk_trace_ioctl: - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd:	the ioctl cmd
 * @arg:	the argument data, if any
 *
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	request_queue_t *q;
	int ret, start = 0;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		ret = blk_trace_setup(q, bdev, arg);
		break;
	case BLKTRACESTART:
		start = 1;
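		/* fall through: START and STOP share the same handler */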
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
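
/*
 * For reference, a user space consumer (such as the blktrace utility)
 * drives this interface roughly as follows - the fd and the setup
 * values here are illustrative only:
 *
 *	struct blk_user_trace_setup buts = {
 *		.buf_size = 512 * 1024,		(sub-buffer size)
 *		.buf_nr   = 4,			(number of sub-buffers)
 *	};
 *
 *	ioctl(dev_fd, BLKTRACESETUP, &buts);	(creates the debugfs files)
 *	ioctl(dev_fd, BLKTRACESTART);		(state -> Blktrace_running)
 *	... read per-cpu data from debugfs block/<dev>/trace<cpu> ...
 *	ioctl(dev_fd, BLKTRACESTOP);		(state -> Blktrace_stopped)
 *	ioctl(dev_fd, BLKTRACETEARDOWN);	(frees everything)
 */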
/**
 * blk_trace_shutdown: - stop and cleanup trace structures
 * @q:    the request queue associated with the device
 *
 **/
void blk_trace_shutdown(request_queue_t *q)
{
	blk_trace_startstop(q, 0);
	blk_trace_remove(q);
}
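
/*
 * Event timestamps come from sched_clock(), which is fast but not
 * synchronized across CPUs. The per-cpu blk_trace_cpu_offset values
 * computed below map it onto gettimeofday() time; they are subtracted
 * from the raw clock in __blk_add_trace().
 */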
/*
 * Average offset over two calls to sched_clock() with a gettimeofday()
 * in the middle
 */
static void blk_check_time(unsigned long long *t)
{
	unsigned long long a, b;
	struct timeval tv;

	a = sched_clock();
	do_gettimeofday(&tv);
	b = sched_clock();

	/*
	 * Promote tv_sec before multiplying, so the conversion to
	 * nanoseconds doesn't overflow on 32-bit
	 */
	*t = (unsigned long long) tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
	*t -= (a + b) / 2;
}
static void blk_trace_check_cpu_time(void *data)
{
	unsigned long long *t;
	int cpu = get_cpu();

	t = &per_cpu(blk_trace_cpu_offset, cpu);

	/*
	 * Just call it twice, hopefully the second call will be cache hot
	 * and a little more precise
	 */
	blk_check_time(t);
	blk_check_time(t);

	put_cpu();
}

/*
 * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
 * timings
 */
static void blk_trace_calibrate_offsets(void)
{
	unsigned long flags;

	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
	local_irq_save(flags);
	blk_trace_check_cpu_time(NULL);
	local_irq_restore(flags);
}
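
/*
 * HT siblings on one physical core run off the same underlying clock,
 * so any measured offset difference between them is just calibration
 * noise - copy one sibling's offset to the others so their timestamps
 * line up exactly.
 */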
static void blk_trace_set_ht_offsets(void)
{
#if defined(CONFIG_SCHED_SMT)
	int cpu, i;

	/*
	 * now make sure HT siblings have the same time offset
	 */
	preempt_disable();
	for_each_online_cpu(cpu) {
		unsigned long long *cpu_off, *sibling_off;

		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
			if (i == cpu)
				continue;

			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
			*sibling_off = *cpu_off;
		}
	}
	preempt_enable();
#endif
}

static __init int blk_trace_init(void)
{
	mutex_init(&blk_tree_mutex);
	blk_trace_calibrate_offsets();
	blk_trace_set_ht_offsets();

	return 0;
}

module_init(blk_trace_init);