blktrace.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. /*
  2. * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License version 2 as
  6. * published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public License
  14. * along with this program; if not, write to the Free Software
  15. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  16. *
  17. */
  18. #include <linux/kernel.h>
  19. #include <linux/blkdev.h>
  20. #include <linux/blktrace_api.h>
  21. #include <linux/percpu.h>
  22. #include <linux/init.h>
  23. #include <linux/mutex.h>
  24. #include <linux/debugfs.h>
  25. #include <linux/time.h>
  26. #include <asm/uaccess.h>
/*
 * Per-cpu value subtracted from cpu_clock() when time-stamping trace
 * records. Written by blk_check_time() and copied between sibling cpus
 * by blk_trace_set_ht_offsets().
 */
static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };

/*
 * Trace generation counter, incremented every time a trace is started.
 * Compared against tsk->btrace_seq to decide whether a process note
 * still has to be emitted for a task.
 */
static unsigned int blktrace_seq __read_mostly = 1;
  29. /*
  30. * Send out a notify message.
  31. */
  32. static void trace_note(struct blk_trace *bt, pid_t pid, int action,
  33. const void *data, size_t len)
  34. {
  35. struct blk_io_trace *t;
  36. t = relay_reserve(bt->rchan, sizeof(*t) + len);
  37. if (t) {
  38. const int cpu = smp_processor_id();
  39. t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  40. t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
  41. t->device = bt->dev;
  42. t->action = action;
  43. t->pid = pid;
  44. t->cpu = cpu;
  45. t->pdu_len = len;
  46. memcpy((void *) t + sizeof(*t), data, len);
  47. }
  48. }
  49. /*
  50. * Send out a notify for this process, if we haven't done so since a trace
  51. * started
  52. */
  53. static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
  54. {
  55. tsk->btrace_seq = blktrace_seq;
  56. trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
  57. }
  58. static void trace_note_time(struct blk_trace *bt)
  59. {
  60. struct timespec now;
  61. unsigned long flags;
  62. u32 words[2];
  63. getnstimeofday(&now);
  64. words[0] = now.tv_sec;
  65. words[1] = now.tv_nsec;
  66. local_irq_save(flags);
  67. trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
  68. local_irq_restore(flags);
  69. }
  70. static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  71. pid_t pid)
  72. {
  73. if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
  74. return 1;
  75. if (sector < bt->start_lba || sector > bt->end_lba)
  76. return 1;
  77. if (bt->pid && pid != bt->pid)
  78. return 1;
  79. return 0;
  80. }
  81. /*
  82. * Data direction bit lookup
  83. */
  84. static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
  85. /*
  86. * Bio action bits of interest
  87. */
  88. static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
  89. /*
  90. * More could be added as needed, taking care to increment the decrementer
  91. * to get correct indexing
  92. */
  93. #define trace_barrier_bit(rw) \
  94. (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
  95. #define trace_sync_bit(rw) \
  96. (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
  97. #define trace_ahead_bit(rw) \
  98. (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
  99. #define trace_meta_bit(rw) \
  100. (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
  101. /*
  102. * The worker for the various blk_add_trace*() types. Fills out a
  103. * blk_io_trace structure and places it in a per-cpu subbuffer.
  104. */
  105. void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
  106. int rw, u32 what, int error, int pdu_len, void *pdu_data)
  107. {
  108. struct task_struct *tsk = current;
  109. struct blk_io_trace *t;
  110. unsigned long flags;
  111. unsigned long *sequence;
  112. pid_t pid;
  113. int cpu;
  114. if (unlikely(bt->trace_state != Blktrace_running))
  115. return;
  116. what |= ddir_act[rw & WRITE];
  117. what |= bio_act[trace_barrier_bit(rw)];
  118. what |= bio_act[trace_sync_bit(rw)];
  119. what |= bio_act[trace_ahead_bit(rw)];
  120. what |= bio_act[trace_meta_bit(rw)];
  121. pid = tsk->pid;
  122. if (unlikely(act_log_check(bt, what, sector, pid)))
  123. return;
  124. /*
  125. * A word about the locking here - we disable interrupts to reserve
  126. * some space in the relay per-cpu buffer, to prevent an irq
  127. * from coming in and stepping on our toes. Once reserved, it's
  128. * enough to get preemption disabled to prevent read of this data
  129. * before we are through filling it. get_cpu()/put_cpu() does this
  130. * for us
  131. */
  132. local_irq_save(flags);
  133. if (unlikely(tsk->btrace_seq != blktrace_seq))
  134. trace_note_tsk(bt, tsk);
  135. t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
  136. if (t) {
  137. cpu = smp_processor_id();
  138. sequence = per_cpu_ptr(bt->sequence, cpu);
  139. t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  140. t->sequence = ++(*sequence);
  141. t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
  142. t->sector = sector;
  143. t->bytes = bytes;
  144. t->action = what;
  145. t->pid = pid;
  146. t->device = bt->dev;
  147. t->cpu = cpu;
  148. t->error = error;
  149. t->pdu_len = pdu_len;
  150. if (pdu_len)
  151. memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
  152. }
  153. local_irq_restore(flags);
  154. }
  155. EXPORT_SYMBOL_GPL(__blk_add_trace);
/* debugfs "block" directory shared by all traces; NULL while absent */
static struct dentry *blk_tree_root;
/* serialises changes to blk_tree_root and root_users */
static struct mutex blk_tree_mutex;
/* number of per-device directories currently living under blk_tree_root */
static unsigned int root_users;
  159. static inline void blk_remove_root(void)
  160. {
  161. if (blk_tree_root) {
  162. debugfs_remove(blk_tree_root);
  163. blk_tree_root = NULL;
  164. }
  165. }
  166. static void blk_remove_tree(struct dentry *dir)
  167. {
  168. mutex_lock(&blk_tree_mutex);
  169. debugfs_remove(dir);
  170. if (--root_users == 0)
  171. blk_remove_root();
  172. mutex_unlock(&blk_tree_mutex);
  173. }
  174. static struct dentry *blk_create_tree(const char *blk_name)
  175. {
  176. struct dentry *dir = NULL;
  177. mutex_lock(&blk_tree_mutex);
  178. if (!blk_tree_root) {
  179. blk_tree_root = debugfs_create_dir("block", NULL);
  180. if (!blk_tree_root)
  181. goto err;
  182. }
  183. dir = debugfs_create_dir(blk_name, blk_tree_root);
  184. if (dir)
  185. root_users++;
  186. else
  187. blk_remove_root();
  188. err:
  189. mutex_unlock(&blk_tree_mutex);
  190. return dir;
  191. }
  192. static void blk_trace_cleanup(struct blk_trace *bt)
  193. {
  194. relay_close(bt->rchan);
  195. debugfs_remove(bt->dropped_file);
  196. blk_remove_tree(bt->dir);
  197. free_percpu(bt->sequence);
  198. kfree(bt);
  199. }
  200. static int blk_trace_remove(struct request_queue *q)
  201. {
  202. struct blk_trace *bt;
  203. bt = xchg(&q->blk_trace, NULL);
  204. if (!bt)
  205. return -EINVAL;
  206. if (bt->trace_state == Blktrace_setup ||
  207. bt->trace_state == Blktrace_stopped)
  208. blk_trace_cleanup(bt);
  209. return 0;
  210. }
  211. static int blk_dropped_open(struct inode *inode, struct file *filp)
  212. {
  213. filp->private_data = inode->i_private;
  214. return 0;
  215. }
  216. static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
  217. size_t count, loff_t *ppos)
  218. {
  219. struct blk_trace *bt = filp->private_data;
  220. char buf[16];
  221. snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
  222. return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
  223. }
  224. static const struct file_operations blk_dropped_fops = {
  225. .owner = THIS_MODULE,
  226. .open = blk_dropped_open,
  227. .read = blk_dropped_read,
  228. };
  229. /*
  230. * Keep track of how many times we encountered a full subbuffer, to aid
  231. * the user space app in telling how many lost events there were.
  232. */
  233. static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
  234. void *prev_subbuf, size_t prev_padding)
  235. {
  236. struct blk_trace *bt;
  237. if (!relay_buf_full(buf))
  238. return 1;
  239. bt = buf->chan->private_data;
  240. atomic_inc(&bt->dropped);
  241. return 0;
  242. }
/*
 * remove_buf_file hook for the relay channel: removes the debugfs entry
 * that was created for a buffer file. Always returns 0.
 */
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}
  248. static struct dentry *blk_create_buf_file_callback(const char *filename,
  249. struct dentry *parent,
  250. int mode,
  251. struct rchan_buf *buf,
  252. int *is_global)
  253. {
  254. return debugfs_create_file(filename, mode, parent, buf,
  255. &relay_file_operations);
  256. }
  257. static struct rchan_callbacks blk_relay_callbacks = {
  258. .subbuf_start = blk_subbuf_start_callback,
  259. .create_buf_file = blk_create_buf_file_callback,
  260. .remove_buf_file = blk_remove_buf_file_callback,
  261. };
  262. /*
  263. * Setup everything required to start tracing
  264. */
  265. int do_blk_trace_setup(struct request_queue *q, struct block_device *bdev,
  266. struct blk_user_trace_setup *buts)
  267. {
  268. struct blk_trace *old_bt, *bt = NULL;
  269. struct dentry *dir = NULL;
  270. char b[BDEVNAME_SIZE];
  271. int ret, i;
  272. if (!buts->buf_size || !buts->buf_nr)
  273. return -EINVAL;
  274. strcpy(buts->name, bdevname(bdev, b));
  275. /*
  276. * some device names have larger paths - convert the slashes
  277. * to underscores for this to work as expected
  278. */
  279. for (i = 0; i < strlen(buts->name); i++)
  280. if (buts->name[i] == '/')
  281. buts->name[i] = '_';
  282. ret = -ENOMEM;
  283. bt = kzalloc(sizeof(*bt), GFP_KERNEL);
  284. if (!bt)
  285. goto err;
  286. bt->sequence = alloc_percpu(unsigned long);
  287. if (!bt->sequence)
  288. goto err;
  289. ret = -ENOENT;
  290. dir = blk_create_tree(buts->name);
  291. if (!dir)
  292. goto err;
  293. bt->dir = dir;
  294. bt->dev = bdev->bd_dev;
  295. atomic_set(&bt->dropped, 0);
  296. ret = -EIO;
  297. bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
  298. if (!bt->dropped_file)
  299. goto err;
  300. bt->rchan = relay_open("trace", dir, buts->buf_size,
  301. buts->buf_nr, &blk_relay_callbacks, bt);
  302. if (!bt->rchan)
  303. goto err;
  304. bt->act_mask = buts->act_mask;
  305. if (!bt->act_mask)
  306. bt->act_mask = (u16) -1;
  307. bt->start_lba = buts->start_lba;
  308. bt->end_lba = buts->end_lba;
  309. if (!bt->end_lba)
  310. bt->end_lba = -1ULL;
  311. bt->pid = buts->pid;
  312. bt->trace_state = Blktrace_setup;
  313. ret = -EBUSY;
  314. old_bt = xchg(&q->blk_trace, bt);
  315. if (old_bt) {
  316. (void) xchg(&q->blk_trace, old_bt);
  317. goto err;
  318. }
  319. return 0;
  320. err:
  321. if (dir)
  322. blk_remove_tree(dir);
  323. if (bt) {
  324. if (bt->dropped_file)
  325. debugfs_remove(bt->dropped_file);
  326. free_percpu(bt->sequence);
  327. if (bt->rchan)
  328. relay_close(bt->rchan);
  329. kfree(bt);
  330. }
  331. return ret;
  332. }
  333. static int blk_trace_setup(struct request_queue *q, struct block_device *bdev,
  334. char __user *arg)
  335. {
  336. struct blk_user_trace_setup buts;
  337. int ret;
  338. ret = copy_from_user(&buts, arg, sizeof(buts));
  339. if (ret)
  340. return -EFAULT;
  341. ret = do_blk_trace_setup(q, bdev, &buts);
  342. if (ret)
  343. return ret;
  344. if (copy_to_user(arg, &buts, sizeof(buts)))
  345. return -EFAULT;
  346. return 0;
  347. }
  348. static int blk_trace_startstop(struct request_queue *q, int start)
  349. {
  350. struct blk_trace *bt;
  351. int ret;
  352. if ((bt = q->blk_trace) == NULL)
  353. return -EINVAL;
  354. /*
  355. * For starting a trace, we can transition from a setup or stopped
  356. * trace. For stopping a trace, the state must be running
  357. */
  358. ret = -EINVAL;
  359. if (start) {
  360. if (bt->trace_state == Blktrace_setup ||
  361. bt->trace_state == Blktrace_stopped) {
  362. blktrace_seq++;
  363. smp_mb();
  364. bt->trace_state = Blktrace_running;
  365. trace_note_time(bt);
  366. ret = 0;
  367. }
  368. } else {
  369. if (bt->trace_state == Blktrace_running) {
  370. bt->trace_state = Blktrace_stopped;
  371. relay_flush(bt->rchan);
  372. ret = 0;
  373. }
  374. }
  375. return ret;
  376. }
  377. /**
  378. * blk_trace_ioctl: - handle the ioctls associated with tracing
  379. * @bdev: the block device
  380. * @cmd: the ioctl cmd
  381. * @arg: the argument data, if any
  382. *
  383. **/
  384. int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  385. {
  386. struct request_queue *q;
  387. int ret, start = 0;
  388. q = bdev_get_queue(bdev);
  389. if (!q)
  390. return -ENXIO;
  391. mutex_lock(&bdev->bd_mutex);
  392. switch (cmd) {
  393. case BLKTRACESETUP:
  394. ret = blk_trace_setup(q, bdev, arg);
  395. break;
  396. case BLKTRACESTART:
  397. start = 1;
  398. case BLKTRACESTOP:
  399. ret = blk_trace_startstop(q, start);
  400. break;
  401. case BLKTRACETEARDOWN:
  402. ret = blk_trace_remove(q);
  403. break;
  404. default:
  405. ret = -ENOTTY;
  406. break;
  407. }
  408. mutex_unlock(&bdev->bd_mutex);
  409. return ret;
  410. }
  411. /**
  412. * blk_trace_shutdown: - stop and cleanup trace structures
  413. * @q: the request queue associated with the device
  414. *
  415. **/
  416. void blk_trace_shutdown(struct request_queue *q)
  417. {
  418. if (q->blk_trace) {
  419. blk_trace_startstop(q, 0);
  420. blk_trace_remove(q);
  421. }
  422. }
  423. /*
  424. * Average offset over two calls to cpu_clock() with a gettimeofday()
  425. * in the middle
  426. */
  427. static void blk_check_time(unsigned long long *t, int this_cpu)
  428. {
  429. unsigned long long a, b;
  430. struct timeval tv;
  431. a = cpu_clock(this_cpu);
  432. do_gettimeofday(&tv);
  433. b = cpu_clock(this_cpu);
  434. *t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
  435. *t -= (a + b) / 2;
  436. }
  437. /*
  438. * calibrate our inter-CPU timings
  439. */
  440. static void blk_trace_check_cpu_time(void *data)
  441. {
  442. unsigned long long *t;
  443. int this_cpu = get_cpu();
  444. t = &per_cpu(blk_trace_cpu_offset, this_cpu);
  445. /*
  446. * Just call it twice, hopefully the second call will be cache hot
  447. * and a little more precise
  448. */
  449. blk_check_time(t, this_cpu);
  450. blk_check_time(t, this_cpu);
  451. put_cpu();
  452. }
  453. static void blk_trace_set_ht_offsets(void)
  454. {
  455. #if defined(CONFIG_SCHED_SMT)
  456. int cpu, i;
  457. /*
  458. * now make sure HT siblings have the same time offset
  459. */
  460. preempt_disable();
  461. for_each_online_cpu(cpu) {
  462. unsigned long long *cpu_off, *sibling_off;
  463. for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu)) {
  464. if (i == cpu)
  465. continue;
  466. cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
  467. sibling_off = &per_cpu(blk_trace_cpu_offset, i);
  468. *sibling_off = *cpu_off;
  469. }
  470. }
  471. preempt_enable();
  472. #endif
  473. }
/*
 * Init-time setup: initialise blk_tree_mutex, run
 * blk_trace_check_cpu_time() via on_each_cpu() to compute the per-cpu
 * time offsets, then let blk_trace_set_ht_offsets() align the offsets of
 * sibling cpus. Always returns 0.
 */
static __init int blk_trace_init(void)
{
	mutex_init(&blk_tree_mutex);
	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
	blk_trace_set_ht_offsets();
	return 0;
}
module_init(blk_trace_init);