/* lguest_blk.c */
  1. /*D:400
  2. * The Guest block driver
  3. *
  4. * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
  5. * The mechanism is simple: we place the information about the request in the
  6. * device page, then use SEND_DMA (containing the data for a write, or an empty
  7. * "ping" DMA for a read).
  8. :*/
  9. /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation; either version 2 of the License, or
  14. * (at your option) any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  24. */
  25. //#define DEBUG
  26. #include <linux/init.h>
  27. #include <linux/types.h>
  28. #include <linux/blkdev.h>
  29. #include <linux/interrupt.h>
  30. #include <linux/lguest_bus.h>
  /* Letter appended to "lgb" to name the next disk (lgba, lgbb, ...).
   * lguestblk_probe() advances it as disks are named. */
  static char next_block_index = 'a';
  32. /*D:420 Here is the structure which holds all the information we need about
  33. * each Guest block device.
  34. *
  35. * I'm sure at this stage, you're wondering "hey, where was the adventure I was
  36. * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
  37. * my blog". I think Real adventures have boring bits, too, and you're in the
  38. * middle of one. But it gets better. Just not quite yet. */
  39. struct blockdev
  40. {
  41. /* The block queue infrastructure wants a spinlock: it is held while it
  42. * calls our block request function. We grab it in our interrupt
  43. * handler so the responses don't mess with new requests. */
  44. spinlock_t lock;
  45. /* The disk structure registered with kernel. */
  46. struct gendisk *disk;
  47. /* The major device number for this disk, and the interrupt. We only
  48. * really keep them here for completeness; we'd need them if we
  49. * supported device unplugging. */
  50. int major;
  51. int irq;
  52. /* The physical address of this device's memory page */
  53. unsigned long phys_addr;
  54. /* The mapped memory page for convenient acces. */
  55. struct lguest_block_page *lb_page;
  56. /* We only have a single request outstanding at a time: this is it. */
  57. struct lguest_dma dma;
  58. struct request *req;
  59. };
  60. /*D:495 We originally used end_request() throughout the driver, but it turns
  61. * out that end_request() is deprecated, and doesn't actually end the request
  62. * (which seems like a good reason to deprecate it!). It simply ends the first
  63. * bio. So if we had 3 bios in a "struct request" we would do all 3,
  64. * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
  65. * work as we needed to do.
  66. *
  67. * This reinforced to me that I do not understand the block layer.
  68. *
  69. * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
  70. * request. This improved disk speed by 130%. */
  71. static void end_entire_request(struct request *req, int uptodate)
  72. {
  73. if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
  74. BUG();
  75. add_disk_randomness(req->rq_disk);
  76. blkdev_dequeue_request(req);
  77. end_that_request_last(req, uptodate);
  78. }
  79. /* I'm told there are only two stories in the world worth telling: love and
  80. * hate. So there used to be a love scene here like this:
  81. *
  82. * Launcher: We could make beautiful I/O together, you and I.
  83. * Guest: My, that's a big disk!
  84. *
  85. * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
  86. /*D:490 This is the interrupt handler, called when a block read or write has
  87. * been completed for us. */
  88. static irqreturn_t lgb_irq(int irq, void *_bd)
  89. {
  90. /* We handed our "struct blockdev" as the argument to request_irq(), so
  91. * it is passed through to us here. This tells us which device we're
  92. * dealing with in case we have more than one. */
  93. struct blockdev *bd = _bd;
  94. unsigned long flags;
  95. /* We weren't doing anything? Strange, but could happen if we shared
  96. * interrupts (we don't!). */
  97. if (!bd->req) {
  98. pr_debug("No work!\n");
  99. return IRQ_NONE;
  100. }
  101. /* Not done yet? That's equally strange. */
  102. if (!bd->lb_page->result) {
  103. pr_debug("No result!\n");
  104. return IRQ_NONE;
  105. }
  106. /* We have to grab the lock before ending the request. */
  107. spin_lock_irqsave(&bd->lock, flags);
  108. /* "result" is 1 for success, 2 for failure: end_entire_request() wants
  109. * to know whether this succeeded or not. */
  110. end_entire_request(bd->req, bd->lb_page->result == 1);
  111. /* Clear out request, it's done. */
  112. bd->req = NULL;
  113. /* Reset incoming DMA for next time. */
  114. bd->dma.used_len = 0;
  115. /* Ready for more reads or writes */
  116. blk_start_queue(bd->disk->queue);
  117. spin_unlock_irqrestore(&bd->lock, flags);
  118. /* The interrupt was for us, we dealt with it. */
  119. return IRQ_HANDLED;
  120. }
  121. /*D:480 The block layer's "struct request" contains a number of "struct bio"s,
  122. * each of which contains "struct bio_vec"s, each of which contains a page, an
  123. * offset and a length.
  124. *
  125. * Fortunately there are iterators to help us walk through the "struct
  126. * request". Even more fortunately, there were plenty of places to steal the
  127. * code from. We pack the "struct request" into our "struct lguest_dma" and
  128. * return the total length. */
  129. static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
  130. {
  131. unsigned int i = 0, len = 0;
  132. struct req_iterator iter;
  133. struct bio_vec *bvec;
  134. rq_for_each_segment(bvec, req, iter) {
  135. /* We told the block layer not to give us too many. */
  136. BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
  137. /* If we had a zero-length segment, it would look like
  138. * the end of the data referred to by the "struct
  139. * lguest_dma", so make sure that doesn't happen. */
  140. BUG_ON(!bvec->bv_len);
  141. /* Convert page & offset to a physical address */
  142. dma->addr[i] = page_to_phys(bvec->bv_page)
  143. + bvec->bv_offset;
  144. dma->len[i] = bvec->bv_len;
  145. len += bvec->bv_len;
  146. i++;
  147. }
  148. /* If the array isn't full, we mark the end with a 0 length */
  149. if (i < LGUEST_MAX_DMA_SECTIONS)
  150. dma->len[i] = 0;
  151. return len;
  152. }
  153. /* This creates an empty DMA, useful for prodding the Host without sending data
  154. * (ie. when we want to do a read) */
  155. static void empty_dma(struct lguest_dma *dma)
  156. {
  157. dma->len[0] = 0;
  158. }
  159. /*D:470 Setting up a request is fairly easy: */
  160. static void setup_req(struct blockdev *bd,
  161. int type, struct request *req, struct lguest_dma *dma)
  162. {
  163. /* The type is 1 (write) or 0 (read). */
  164. bd->lb_page->type = type;
  165. /* The sector on disk where the read or write starts. */
  166. bd->lb_page->sector = req->sector;
  167. /* The result is initialized to 0 (unfinished). */
  168. bd->lb_page->result = 0;
  169. /* The current request (so we can end it in the interrupt handler). */
  170. bd->req = req;
  171. /* The number of bytes: returned as a side-effect of req_to_dma(),
  172. * which packs the block layer's "struct request" into our "struct
  173. * lguest_dma" */
  174. bd->lb_page->bytes = req_to_dma(req, dma);
  175. }
  176. /*D:450 Write is pretty straightforward: we pack the request into a "struct
  177. * lguest_dma", then use SEND_DMA to send the request. */
  178. static void do_write(struct blockdev *bd, struct request *req)
  179. {
  180. struct lguest_dma send;
  181. pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
  182. setup_req(bd, 1, req, &send);
  183. lguest_send_dma(bd->phys_addr, &send);
  184. }
  185. /* Read is similar to write, except we pack the request into our receive
  186. * "struct lguest_dma" and send through an empty DMA just to tell the Host that
  187. * there's a request pending. */
  188. static void do_read(struct blockdev *bd, struct request *req)
  189. {
  190. struct lguest_dma ping;
  191. pr_debug("lgb: READ sector %li\n", (long)req->sector);
  192. setup_req(bd, 0, req, &bd->dma);
  193. empty_dma(&ping);
  194. lguest_send_dma(bd->phys_addr, &ping);
  195. }
  /*D:440 This is where requests come in: we get handed the request queue and
   * are expected to pull a "struct request" off it until we've finished them
   * or we're waiting for a reply: */
  199. static void do_lgb_request(struct request_queue *q)
  200. {
  201. struct blockdev *bd;
  202. struct request *req;
  203. again:
  204. /* This sometimes returns NULL even on the very first time around. I
  205. * wonder if it's something to do with letting elves handle the request
  206. * queue... */
  207. req = elv_next_request(q);
  208. if (!req)
  209. return;
  210. /* We attached the struct blockdev to the disk: get it back */
  211. bd = req->rq_disk->private_data;
  212. /* Sometimes we get repeated requests after blk_stop_queue(), but we
  213. * can only handle one at a time. */
  214. if (bd->req)
  215. return;
  216. /* We only do reads and writes: no tricky business! */
  217. if (!blk_fs_request(req)) {
  218. pr_debug("Got non-command 0x%08x\n", req->cmd_type);
  219. req->errors++;
  220. end_entire_request(req, 0);
  221. goto again;
  222. }
  223. if (rq_data_dir(req) == WRITE)
  224. do_write(bd, req);
  225. else
  226. do_read(bd, req);
  227. /* We've put out the request, so stop any more coming in until we get
  228. * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
  229. blk_stop_queue(q);
  230. }
  231. /*D:430 This is the "struct block_device_operations" we attach to the disk at
  232. * the end of lguestblk_probe(). It doesn't seem to want much. */
  233. static struct block_device_operations lguestblk_fops = {
  234. .owner = THIS_MODULE,
  235. };
  236. /*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
  237. * quite why. I do know that the IDE code sent two or three of the maintainers
  238. * insane, perhaps this is the fringe of the same disease?
  239. *
  240. * As in the console code, the probe function gets handed the generic
  241. * lguest_device from lguest_bus.c: */
  242. static int lguestblk_probe(struct lguest_device *lgdev)
  243. {
  244. struct blockdev *bd;
  245. int err;
  246. int irqflags = IRQF_SHARED;
  247. /* First we allocate our own "struct blockdev" and initialize the easy
  248. * fields. */
  249. bd = kmalloc(sizeof(*bd), GFP_KERNEL);
  250. if (!bd)
  251. return -ENOMEM;
  252. spin_lock_init(&bd->lock);
  253. bd->irq = lgdev_irq(lgdev);
  254. bd->req = NULL;
  255. bd->dma.used_len = 0;
  256. bd->dma.len[0] = 0;
  257. /* The descriptor in the lguest_devices array provided by the Host
  258. * gives the Guest the physical page number of the device's page. */
  259. bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
  260. /* We use lguest_map() to get a pointer to the device page */
  261. bd->lb_page = lguest_map(bd->phys_addr, 1);
  262. if (!bd->lb_page) {
  263. err = -ENOMEM;
  264. goto out_free_bd;
  265. }
  266. /* We need a major device number: 0 means "assign one dynamically". */
  267. bd->major = register_blkdev(0, "lguestblk");
  268. if (bd->major < 0) {
  269. err = bd->major;
  270. goto out_unmap;
  271. }
  272. /* This allocates a "struct gendisk" where we pack all the information
  273. * about the disk which the rest of Linux sees. The argument is the
  274. * number of minor devices desired: we need one minor for the main
  275. * disk, and one for each partition. Of course, we can't possibly know
  276. * how many partitions are on the disk (add_disk does that).
  277. */
  278. bd->disk = alloc_disk(16);
  279. if (!bd->disk) {
  280. err = -ENOMEM;
  281. goto out_unregister_blkdev;
  282. }
  283. /* Every disk needs a queue for requests to come in: we set up the
  284. * queue with a callback function (the core of our driver) and the lock
  285. * to use. */
  286. bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
  287. if (!bd->disk->queue) {
  288. err = -ENOMEM;
  289. goto out_put_disk;
  290. }
  291. /* We can only handle a certain number of pointers in our SEND_DMA
  292. * call, so we set that with blk_queue_max_hw_segments(). This is not
  293. * to be confused with blk_queue_max_phys_segments() of course! I
  294. * know, who could possibly confuse the two?
  295. *
  296. * Well, it's simple to tell them apart: this one seems to work and the
  297. * other one didn't. */
  298. blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
  299. /* Due to technical limitations of our Host (and simple coding) we
  300. * can't have a single buffer which crosses a page boundary. Tell it
  301. * here. This means that our maximum request size is 16
  302. * (LGUEST_MAX_DMA_SECTIONS) pages. */
  303. blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
  304. /* We name our disk: this becomes the device name when udev does its
  305. * magic thing and creates the device node, such as /dev/lgba.
  306. * next_block_index is a global which starts at 'a'. Unfortunately
  307. * this simple increment logic means that the 27th disk will be called
  308. * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
  309. * your /dev directory will be balanced. */
  310. sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
  311. /* We look to the device descriptor again to see if this device's
  312. * interrupts are expected to be random. If they are, we tell the irq
  313. * subsystem. At the moment this bit is always set. */
  314. if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
  315. irqflags |= IRQF_SAMPLE_RANDOM;
  316. /* Now we have the name and irqflags, we can request the interrupt; we
  317. * give it the "struct blockdev" we have set up to pass to lgb_irq()
  318. * when there is an interrupt. */
  319. err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
  320. if (err)
  321. goto out_cleanup_queue;
  322. /* We bind our one-entry DMA pool to the key for this block device so
  323. * the Host can reply to our requests. The key is equal to the
  324. * physical address of the device's page, which is conveniently
  325. * unique. */
  326. err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
  327. if (err)
  328. goto out_free_irq;
  329. /* We finish our disk initialization and add the disk to the system. */
  330. bd->disk->major = bd->major;
  331. bd->disk->first_minor = 0;
  332. bd->disk->private_data = bd;
  333. bd->disk->fops = &lguestblk_fops;
  334. /* This is initialized to the disk size by the Launcher. */
  335. set_capacity(bd->disk, bd->lb_page->num_sectors);
  336. add_disk(bd->disk);
  337. printk(KERN_INFO "%s: device %i at major %d\n",
  338. bd->disk->disk_name, lgdev->index, bd->major);
  339. /* We don't need to keep the "struct blockdev" around, but if we ever
  340. * implemented device removal, we'd need this. */
  341. lgdev->private = bd;
  342. return 0;
  343. out_free_irq:
  344. free_irq(bd->irq, bd);
  345. out_cleanup_queue:
  346. blk_cleanup_queue(bd->disk->queue);
  347. out_put_disk:
  348. put_disk(bd->disk);
  349. out_unregister_blkdev:
  350. unregister_blkdev(bd->major, "lguestblk");
  351. out_unmap:
  352. lguest_unmap(bd->lb_page);
  353. out_free_bd:
  354. kfree(bd);
  355. return err;
  356. }
  357. /*D:410 The boilerplate code for registering the lguest block driver is just
  358. * like the console: */
  359. static struct lguest_driver lguestblk_drv = {
  360. .name = "lguestblk",
  361. .owner = THIS_MODULE,
  362. .device_type = LGUEST_DEVICE_T_BLOCK,
  363. .probe = lguestblk_probe,
  364. };
  365. static __init int lguestblk_init(void)
  366. {
  367. return register_lguest_driver(&lguestblk_drv);
  368. }
  369. module_init(lguestblk_init);
  370. MODULE_DESCRIPTION("Lguest block driver");
  371. MODULE_LICENSE("GPL");