blktrace.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. /*
  2. * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License version 2 as
  6. * published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public License
  14. * along with this program; if not, write to the Free Software
  15. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  16. *
  17. */
  18. #include <linux/kernel.h>
  19. #include <linux/blkdev.h>
  20. #include <linux/blktrace_api.h>
  21. #include <linux/percpu.h>
  22. #include <linux/init.h>
  23. #include <linux/mutex.h>
  24. #include <linux/debugfs.h>
  25. #include <linux/time.h>
  26. #include <asm/uaccess.h>
  27. static unsigned int blktrace_seq __read_mostly = 1;
  28. /*
  29. * Send out a notify message.
  30. */
  31. static void trace_note(struct blk_trace *bt, pid_t pid, int action,
  32. const void *data, size_t len)
  33. {
  34. struct blk_io_trace *t;
  35. t = relay_reserve(bt->rchan, sizeof(*t) + len);
  36. if (t) {
  37. const int cpu = smp_processor_id();
  38. t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  39. t->time = ktime_to_ns(ktime_get());
  40. t->device = bt->dev;
  41. t->action = action;
  42. t->pid = pid;
  43. t->cpu = cpu;
  44. t->pdu_len = len;
  45. memcpy((void *) t + sizeof(*t), data, len);
  46. }
  47. }
  48. /*
  49. * Send out a notify for this process, if we haven't done so since a trace
  50. * started
  51. */
  52. static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
  53. {
  54. tsk->btrace_seq = blktrace_seq;
  55. trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
  56. }
  57. static void trace_note_time(struct blk_trace *bt)
  58. {
  59. struct timespec now;
  60. unsigned long flags;
  61. u32 words[2];
  62. getnstimeofday(&now);
  63. words[0] = now.tv_sec;
  64. words[1] = now.tv_nsec;
  65. local_irq_save(flags);
  66. trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
  67. local_irq_restore(flags);
  68. }
  69. static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  70. pid_t pid)
  71. {
  72. if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
  73. return 1;
  74. if (sector < bt->start_lba || sector > bt->end_lba)
  75. return 1;
  76. if (bt->pid && pid != bt->pid)
  77. return 1;
  78. return 0;
  79. }
  80. /*
  81. * Data direction bit lookup
  82. */
  83. static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
  84. /*
  85. * Bio action bits of interest
  86. */
  87. static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
  88. /*
  89. * More could be added as needed, taking care to increment the decrementer
  90. * to get correct indexing
  91. */
  92. #define trace_barrier_bit(rw) \
  93. (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
  94. #define trace_sync_bit(rw) \
  95. (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
  96. #define trace_ahead_bit(rw) \
  97. (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
  98. #define trace_meta_bit(rw) \
  99. (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
  100. /*
  101. * The worker for the various blk_add_trace*() types. Fills out a
  102. * blk_io_trace structure and places it in a per-cpu subbuffer.
  103. */
  104. void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
  105. int rw, u32 what, int error, int pdu_len, void *pdu_data)
  106. {
  107. struct task_struct *tsk = current;
  108. struct blk_io_trace *t;
  109. unsigned long flags;
  110. unsigned long *sequence;
  111. pid_t pid;
  112. int cpu;
  113. if (unlikely(bt->trace_state != Blktrace_running))
  114. return;
  115. what |= ddir_act[rw & WRITE];
  116. what |= bio_act[trace_barrier_bit(rw)];
  117. what |= bio_act[trace_sync_bit(rw)];
  118. what |= bio_act[trace_ahead_bit(rw)];
  119. what |= bio_act[trace_meta_bit(rw)];
  120. pid = tsk->pid;
  121. if (unlikely(act_log_check(bt, what, sector, pid)))
  122. return;
  123. /*
  124. * A word about the locking here - we disable interrupts to reserve
  125. * some space in the relay per-cpu buffer, to prevent an irq
  126. * from coming in and stepping on our toes. Once reserved, it's
  127. * enough to get preemption disabled to prevent read of this data
  128. * before we are through filling it. get_cpu()/put_cpu() does this
  129. * for us
  130. */
  131. local_irq_save(flags);
  132. if (unlikely(tsk->btrace_seq != blktrace_seq))
  133. trace_note_tsk(bt, tsk);
  134. t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
  135. if (t) {
  136. cpu = smp_processor_id();
  137. sequence = per_cpu_ptr(bt->sequence, cpu);
  138. t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  139. t->sequence = ++(*sequence);
  140. t->time = ktime_to_ns(ktime_get());
  141. t->sector = sector;
  142. t->bytes = bytes;
  143. t->action = what;
  144. t->pid = pid;
  145. t->device = bt->dev;
  146. t->cpu = cpu;
  147. t->error = error;
  148. t->pdu_len = pdu_len;
  149. if (pdu_len)
  150. memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
  151. }
  152. local_irq_restore(flags);
  153. }
  154. EXPORT_SYMBOL_GPL(__blk_add_trace);
  155. static struct dentry *blk_tree_root;
  156. static DEFINE_MUTEX(blk_tree_mutex);
  157. static unsigned int root_users;
  158. static inline void blk_remove_root(void)
  159. {
  160. if (blk_tree_root) {
  161. debugfs_remove(blk_tree_root);
  162. blk_tree_root = NULL;
  163. }
  164. }
  165. static void blk_remove_tree(struct dentry *dir)
  166. {
  167. mutex_lock(&blk_tree_mutex);
  168. debugfs_remove(dir);
  169. if (--root_users == 0)
  170. blk_remove_root();
  171. mutex_unlock(&blk_tree_mutex);
  172. }
  173. static struct dentry *blk_create_tree(const char *blk_name)
  174. {
  175. struct dentry *dir = NULL;
  176. int created = 0;
  177. mutex_lock(&blk_tree_mutex);
  178. if (!blk_tree_root) {
  179. blk_tree_root = debugfs_create_dir("block", NULL);
  180. if (!blk_tree_root)
  181. goto err;
  182. created = 1;
  183. }
  184. dir = debugfs_create_dir(blk_name, blk_tree_root);
  185. if (dir)
  186. root_users++;
  187. else {
  188. /* Delete root only if we created it */
  189. if (created)
  190. blk_remove_root();
  191. }
  192. err:
  193. mutex_unlock(&blk_tree_mutex);
  194. return dir;
  195. }
  196. static void blk_trace_cleanup(struct blk_trace *bt)
  197. {
  198. relay_close(bt->rchan);
  199. debugfs_remove(bt->dropped_file);
  200. blk_remove_tree(bt->dir);
  201. free_percpu(bt->sequence);
  202. kfree(bt);
  203. }
  204. int blk_trace_remove(struct request_queue *q)
  205. {
  206. struct blk_trace *bt;
  207. bt = xchg(&q->blk_trace, NULL);
  208. if (!bt)
  209. return -EINVAL;
  210. if (bt->trace_state == Blktrace_setup ||
  211. bt->trace_state == Blktrace_stopped)
  212. blk_trace_cleanup(bt);
  213. return 0;
  214. }
  215. EXPORT_SYMBOL_GPL(blk_trace_remove);
  216. static int blk_dropped_open(struct inode *inode, struct file *filp)
  217. {
  218. filp->private_data = inode->i_private;
  219. return 0;
  220. }
  221. static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
  222. size_t count, loff_t *ppos)
  223. {
  224. struct blk_trace *bt = filp->private_data;
  225. char buf[16];
  226. snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
  227. return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
  228. }
  229. static const struct file_operations blk_dropped_fops = {
  230. .owner = THIS_MODULE,
  231. .open = blk_dropped_open,
  232. .read = blk_dropped_read,
  233. };
  234. /*
  235. * Keep track of how many times we encountered a full subbuffer, to aid
  236. * the user space app in telling how many lost events there were.
  237. */
  238. static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
  239. void *prev_subbuf, size_t prev_padding)
  240. {
  241. struct blk_trace *bt;
  242. if (!relay_buf_full(buf))
  243. return 1;
  244. bt = buf->chan->private_data;
  245. atomic_inc(&bt->dropped);
  246. return 0;
  247. }
  248. static int blk_remove_buf_file_callback(struct dentry *dentry)
  249. {
  250. debugfs_remove(dentry);
  251. return 0;
  252. }
  253. static struct dentry *blk_create_buf_file_callback(const char *filename,
  254. struct dentry *parent,
  255. int mode,
  256. struct rchan_buf *buf,
  257. int *is_global)
  258. {
  259. return debugfs_create_file(filename, mode, parent, buf,
  260. &relay_file_operations);
  261. }
  262. static struct rchan_callbacks blk_relay_callbacks = {
  263. .subbuf_start = blk_subbuf_start_callback,
  264. .create_buf_file = blk_create_buf_file_callback,
  265. .remove_buf_file = blk_remove_buf_file_callback,
  266. };
  267. /*
  268. * Setup everything required to start tracing
  269. */
  270. int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
  271. struct blk_user_trace_setup *buts)
  272. {
  273. struct blk_trace *old_bt, *bt = NULL;
  274. struct dentry *dir = NULL;
  275. int ret, i;
  276. if (!buts->buf_size || !buts->buf_nr)
  277. return -EINVAL;
  278. strcpy(buts->name, name);
  279. /*
  280. * some device names have larger paths - convert the slashes
  281. * to underscores for this to work as expected
  282. */
  283. for (i = 0; i < strlen(buts->name); i++)
  284. if (buts->name[i] == '/')
  285. buts->name[i] = '_';
  286. ret = -ENOMEM;
  287. bt = kzalloc(sizeof(*bt), GFP_KERNEL);
  288. if (!bt)
  289. goto err;
  290. bt->sequence = alloc_percpu(unsigned long);
  291. if (!bt->sequence)
  292. goto err;
  293. ret = -ENOENT;
  294. dir = blk_create_tree(buts->name);
  295. if (!dir)
  296. goto err;
  297. bt->dir = dir;
  298. bt->dev = dev;
  299. atomic_set(&bt->dropped, 0);
  300. ret = -EIO;
  301. bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
  302. if (!bt->dropped_file)
  303. goto err;
  304. bt->rchan = relay_open("trace", dir, buts->buf_size,
  305. buts->buf_nr, &blk_relay_callbacks, bt);
  306. if (!bt->rchan)
  307. goto err;
  308. bt->act_mask = buts->act_mask;
  309. if (!bt->act_mask)
  310. bt->act_mask = (u16) -1;
  311. bt->start_lba = buts->start_lba;
  312. bt->end_lba = buts->end_lba;
  313. if (!bt->end_lba)
  314. bt->end_lba = -1ULL;
  315. bt->pid = buts->pid;
  316. bt->trace_state = Blktrace_setup;
  317. ret = -EBUSY;
  318. old_bt = xchg(&q->blk_trace, bt);
  319. if (old_bt) {
  320. (void) xchg(&q->blk_trace, old_bt);
  321. goto err;
  322. }
  323. return 0;
  324. err:
  325. if (dir)
  326. blk_remove_tree(dir);
  327. if (bt) {
  328. if (bt->dropped_file)
  329. debugfs_remove(bt->dropped_file);
  330. free_percpu(bt->sequence);
  331. if (bt->rchan)
  332. relay_close(bt->rchan);
  333. kfree(bt);
  334. }
  335. return ret;
  336. }
  337. int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
  338. char __user *arg)
  339. {
  340. struct blk_user_trace_setup buts;
  341. int ret;
  342. ret = copy_from_user(&buts, arg, sizeof(buts));
  343. if (ret)
  344. return -EFAULT;
  345. ret = do_blk_trace_setup(q, name, dev, &buts);
  346. if (ret)
  347. return ret;
  348. if (copy_to_user(arg, &buts, sizeof(buts)))
  349. return -EFAULT;
  350. return 0;
  351. }
  352. EXPORT_SYMBOL_GPL(blk_trace_setup);
  353. int blk_trace_startstop(struct request_queue *q, int start)
  354. {
  355. struct blk_trace *bt;
  356. int ret;
  357. if ((bt = q->blk_trace) == NULL)
  358. return -EINVAL;
  359. /*
  360. * For starting a trace, we can transition from a setup or stopped
  361. * trace. For stopping a trace, the state must be running
  362. */
  363. ret = -EINVAL;
  364. if (start) {
  365. if (bt->trace_state == Blktrace_setup ||
  366. bt->trace_state == Blktrace_stopped) {
  367. blktrace_seq++;
  368. smp_mb();
  369. bt->trace_state = Blktrace_running;
  370. trace_note_time(bt);
  371. ret = 0;
  372. }
  373. } else {
  374. if (bt->trace_state == Blktrace_running) {
  375. bt->trace_state = Blktrace_stopped;
  376. relay_flush(bt->rchan);
  377. ret = 0;
  378. }
  379. }
  380. return ret;
  381. }
  382. EXPORT_SYMBOL_GPL(blk_trace_startstop);
  383. /**
  384. * blk_trace_ioctl: - handle the ioctls associated with tracing
  385. * @bdev: the block device
  386. * @cmd: the ioctl cmd
  387. * @arg: the argument data, if any
  388. *
  389. **/
  390. int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  391. {
  392. struct request_queue *q;
  393. int ret, start = 0;
  394. char b[BDEVNAME_SIZE];
  395. q = bdev_get_queue(bdev);
  396. if (!q)
  397. return -ENXIO;
  398. mutex_lock(&bdev->bd_mutex);
  399. switch (cmd) {
  400. case BLKTRACESETUP:
  401. bdevname(bdev, b);
  402. ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
  403. break;
  404. case BLKTRACESTART:
  405. start = 1;
  406. case BLKTRACESTOP:
  407. ret = blk_trace_startstop(q, start);
  408. break;
  409. case BLKTRACETEARDOWN:
  410. ret = blk_trace_remove(q);
  411. break;
  412. default:
  413. ret = -ENOTTY;
  414. break;
  415. }
  416. mutex_unlock(&bdev->bd_mutex);
  417. return ret;
  418. }
  419. /**
  420. * blk_trace_shutdown: - stop and cleanup trace structures
  421. * @q: the request queue associated with the device
  422. *
  423. **/
  424. void blk_trace_shutdown(struct request_queue *q)
  425. {
  426. if (q->blk_trace) {
  427. blk_trace_startstop(q, 0);
  428. blk_trace_remove(q);
  429. }
  430. }