blkback.c

/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>

#include <xen/events.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int xen_blkif_reqs = 64;
module_param_named(reqs, xen_blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats;
module_param(log_stats, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
struct pending_req {
        struct xen_blkif *blkif;
        u64 id;
        int nr_pages;
        atomic_t pendcnt;
        unsigned short operation;
        int status;
        struct list_head free_list;
        DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
};

#define BLKBACK_INVALID_HANDLE (~0)

struct xen_blkbk {
        struct pending_req *pending_reqs;
        /* List of all 'pending_req' available */
        struct list_head pending_free;
        /* And its spinlock. */
        spinlock_t pending_free_lock;
        wait_queue_head_t pending_free_wq;
        /* The list of all pages that are available. */
        struct page **pending_pages;
        /* And the grant handles that are available. */
        grant_handle_t *pending_grant_handles;
};

static struct xen_blkbk *blkbk;

/*
 * Maximum number of grant pages that can be mapped in blkback.
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
 * pages that blkback will persistently map.
 * Currently, this is:
 * RING_SIZE = 32 (for all known ring types)
 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
 * sizeof(struct persistent_gnt) = 48
 * So the maximum memory used to store the grants is:
 * 32 * 11 * 48 = 16896 bytes
 */
static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
{
        switch (protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
                        BLKIF_MAX_SEGMENTS_PER_REQUEST;
        case BLKIF_PROTOCOL_X86_32:
                return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
                        BLKIF_MAX_SEGMENTS_PER_REQUEST;
        case BLKIF_PROTOCOL_X86_64:
                return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
                        BLKIF_MAX_SEGMENTS_PER_REQUEST;
        default:
                BUG();
        }
        return 0;
}
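
/*
 * Illustrative sketch (not part of the driver): worst-case bookkeeping
 * memory for persistent grants, using the example figures quoted in the
 * comment above (RING_SIZE = 32, BLKIF_MAX_SEGMENTS_PER_REQUEST = 11,
 * sizeof(struct persistent_gnt) = 48). The local names below are made up
 * for the illustration; note that the granted data pages themselves (one
 * page per persistent grant, allocated with alloc_page() in xen_blkbk_map())
 * come on top of this figure.
 */
#if 0
#include <stdio.h>

int main(void)
{
        const unsigned int ring_size = 32;       /* per the comment above */
        const unsigned int segs_per_req = 11;    /* BLKIF_MAX_SEGMENTS_PER_REQUEST */
        const unsigned int gnt_struct_size = 48; /* sizeof(struct persistent_gnt) */

        printf("max persistent grants: %u\n", ring_size * segs_per_req);
        printf("bookkeeping memory: %u bytes\n",
               ring_size * segs_per_req * gnt_struct_size); /* 16896 */
        return 0;
}
#endif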

/*
 * Little helpful macro to figure out the index and virtual address of the
 * pending_pages[..]. For each 'pending_req' we have up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
 * 10 and would index in the pending_pages[..].
 */
static inline int vaddr_pagenr(struct pending_req *req, int seg)
{
        return (req - blkbk->pending_reqs) *
                BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]

static inline unsigned long vaddr(struct pending_req *req, int seg)
{
        unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
        return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
        (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])


static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
                                struct blkif_request *req,
                                struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id,
                          unsigned short op, int st);
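
/*
 * Note on foreach_grant_safe() below: like list_for_each_entry_safe(), it
 * caches rb_next() in @n before the loop body runs, so the body is free to
 * rb_erase() and kfree() the current entry without breaking the walk.
 * free_persistent_gnts() relies on this.
 */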
#define foreach_grant_safe(pos, n, rbtree, node) \
        for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
             (n) = rb_next(&(pos)->node); \
             &(pos)->node != NULL; \
             (pos) = container_of(n, typeof(*(pos)), node), \
             (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)

static void add_persistent_gnt(struct rb_root *root,
                               struct persistent_gnt *persistent_gnt)
{
        struct rb_node **new = &(root->rb_node), *parent = NULL;
        struct persistent_gnt *this;

        /* Figure out where to put new node */
        while (*new) {
                this = container_of(*new, struct persistent_gnt, node);

                parent = *new;
                if (persistent_gnt->gnt < this->gnt)
                        new = &((*new)->rb_left);
                else if (persistent_gnt->gnt > this->gnt)
                        new = &((*new)->rb_right);
                else {
                        pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
                        BUG();
                }
        }

        /* Add new node and rebalance tree. */
        rb_link_node(&(persistent_gnt->node), parent, new);
        rb_insert_color(&(persistent_gnt->node), root);
}

static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
                                                 grant_ref_t gref)
{
        struct persistent_gnt *data;
        struct rb_node *node = root->rb_node;

        while (node) {
                data = container_of(node, struct persistent_gnt, node);

                if (gref < data->gnt)
                        node = node->rb_left;
                else if (gref > data->gnt)
                        node = node->rb_right;
                else
                        return data;
        }
        return NULL;
}

static void free_persistent_gnts(struct rb_root *root, unsigned int num)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
        int ret = 0;
        int segs_to_unmap = 0;

        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
                        BLKBACK_INVALID_HANDLE);
                gnttab_set_unmap_op(&unmap[segs_to_unmap],
                        (unsigned long) pfn_to_kaddr(page_to_pfn(
                                persistent_gnt->page)),
                        GNTMAP_host_map,
                        persistent_gnt->handle);

                pages[segs_to_unmap] = persistent_gnt->page;

                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
                    !rb_next(&persistent_gnt->node)) {
                        ret = gnttab_unmap_refs(unmap, NULL, pages,
                                segs_to_unmap);
                        BUG_ON(ret);
                        segs_to_unmap = 0;
                }

                rb_erase(&persistent_gnt->node, root);
                kfree(persistent_gnt);
                num--;
        }
        BUG_ON(num != 0);
}

/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 */
static struct pending_req *alloc_req(void)
{
        struct pending_req *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&blkbk->pending_free_lock, flags);
        if (!list_empty(&blkbk->pending_free)) {
                req = list_entry(blkbk->pending_free.next, struct pending_req,
                                 free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
        return req;
}

/*
 * Return the 'pending_req' structure back to the freepool. We also
 * wake up the thread if it was waiting for a free page.
 */
static void free_req(struct pending_req *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&blkbk->pending_free_lock, flags);
        was_empty = list_empty(&blkbk->pending_free);
        list_add(&req->free_list, &blkbk->pending_free);
        spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
        if (was_empty)
                wake_up(&blkbk->pending_free_wq);
}
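
/*
 * Illustrative userspace analogue (not part of the driver): alloc_req() and
 * free_req() above implement a fixed pool protected by a spinlock, with
 * pending_free_wq so that xen_blkif_schedule() can sleep until a request is
 * returned. A minimal pthread sketch of the same pattern is shown below,
 * with a mutex and condition variable standing in for the spinlock and wait
 * queue; all names are made up for the illustration.
 */
#if 0
#include <pthread.h>
#include <stddef.h>

struct pool_item {
        struct pool_item *next;
};

static struct pool_item *pool_head;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pool_nonempty = PTHREAD_COND_INITIALIZER;

/* Take an item, or return NULL if the pool is empty (cf. alloc_req()). */
static struct pool_item *pool_alloc(void)
{
        struct pool_item *item;

        pthread_mutex_lock(&pool_lock);
        item = pool_head;
        if (item)
                pool_head = item->next;
        pthread_mutex_unlock(&pool_lock);
        return item;
}

/* Return an item and wake a waiter if the pool was empty (cf. free_req()). */
static void pool_free(struct pool_item *item)
{
        int was_empty;

        pthread_mutex_lock(&pool_lock);
        was_empty = (pool_head == NULL);
        item->next = pool_head;
        pool_head = item;
        pthread_mutex_unlock(&pool_lock);
        if (was_empty)
                pthread_cond_signal(&pool_nonempty);
}

/* Sleep until at least one item is available (cf. pending_free_wq). */
static void pool_wait_nonempty(void)
{
        pthread_mutex_lock(&pool_lock);
        while (pool_head == NULL)
                pthread_cond_wait(&pool_nonempty, &pool_lock);
        pthread_mutex_unlock(&pool_lock);
}
#endif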

/*
 * Routines for managing virtual block devices (vbds).
 */
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
                             int operation)
{
        struct xen_vbd *vbd = &blkif->vbd;
        int rc = -EACCES;

        if ((operation != READ) && vbd->readonly)
                goto out;

        if (likely(req->nr_sects)) {
                blkif_sector_t end = req->sector_number + req->nr_sects;

                if (unlikely(end < req->sector_number))
                        goto out;
                if (unlikely(end > vbd_sz(vbd)))
                        goto out;
        }

        req->dev = vbd->pdevice;
        req->bdev = vbd->bdev;
        rc = 0;

out:
        return rc;
}

static void xen_vbd_resize(struct xen_blkif *blkif)
{
        struct xen_vbd *vbd = &blkif->vbd;
        struct xenbus_transaction xbt;
        int err;
        struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
        unsigned long long new_size = vbd_sz(vbd);

        pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
                blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
        pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
        vbd->size = new_size;
again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                pr_warn(DRV_PFX "Error starting transaction");
                return;
        }
        err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
                            (unsigned long long)vbd_sz(vbd));
        if (err) {
                pr_warn(DRV_PFX "Error writing new size");
                goto abort;
        }
        /*
         * Write the current state; we will use this to synchronize
         * the front-end. If the current state is "connected" the
         * front-end will get the new size information online.
         */
        err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
        if (err) {
                pr_warn(DRV_PFX "Error writing the state");
                goto abort;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err == -EAGAIN)
                goto again;
        if (err)
                pr_warn(DRV_PFX "Error ending transaction");
        return;
abort:
        xenbus_transaction_end(xbt, 1);
}

/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(struct xen_blkif *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

static void print_stats(struct xen_blkif *blkif)
{
        pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d"
                " | ds %4d\n",
                current->comm, blkif->st_oo_req,
                blkif->st_rd_req, blkif->st_wr_req,
                blkif->st_f_req, blkif->st_ds_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
        blkif->st_ds_req = 0;
}

int xen_blkif_schedule(void *arg)
{
        struct xen_blkif *blkif = arg;
        struct xen_vbd *vbd = &blkif->vbd;

        xen_blkif_get(blkif);

        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;
                if (unlikely(vbd->size != vbd_sz(vbd)))
                        xen_vbd_resize(blkif);

                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        blkbk->pending_free_wq,
                        !list_empty(&blkbk->pending_free) ||
                        kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        /* Free all persistent grant pages */
        if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
                free_persistent_gnts(&blkif->persistent_gnts,
                        blkif->persistent_gnt_c);

        BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
        blkif->persistent_gnt_c = 0;

        if (log_stats)
                print_stats(blkif);

        blkif->xenblkd = NULL;
        xen_blkif_put(blkif);

        return 0;
}

struct seg_buf {
        unsigned long buf;
        unsigned int nsec;
};

/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
 */
static void xen_blkbk_unmap(struct pending_req *req)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;

        for (i = 0; i < req->nr_pages; i++) {
                if (!test_bit(i, req->unmap_seg))
                        continue;
                handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
                gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
                                    GNTMAP_host_map, handle);
                pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                pages[invcount] = virt_to_page(vaddr(req, i));
                invcount++;
        }

        ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
        BUG_ON(ret);
}

static int xen_blkbk_map(struct blkif_request *req,
                         struct pending_req *pending_req,
                         struct seg_buf seg[],
                         struct page *pages[])
{
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt = NULL;
        struct xen_blkif *blkif = pending_req->blkif;
        phys_addr_t addr = 0;
        int i, j;
        bool new_map;
        int nseg = req->u.rw.nr_segments;
        int segs_to_map = 0;
        int ret = 0;
        int use_persistent_gnts;

        use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

        BUG_ON(blkif->persistent_gnt_c >
                max_mapped_grant_pages(pending_req->blkif->blk_protocol));
        /*
         * Fill out preq.nr_sects with proper amount of sectors, and set up
         * map[..] with the PFN of the page in our domain with the
         * corresponding grant reference for each page.
         */
        for (i = 0; i < nseg; i++) {
                uint32_t flags;

                if (use_persistent_gnts)
                        persistent_gnt = get_persistent_gnt(
                                &blkif->persistent_gnts,
                                req->u.rw.seg[i].gref);

                if (persistent_gnt) {
                        /*
                         * We are using persistent grants and
                         * the grant is already mapped
                         */
                        new_map = false;
                } else if (use_persistent_gnts &&
                           blkif->persistent_gnt_c <
                           max_mapped_grant_pages(blkif->blk_protocol)) {
                        /*
                         * We are using persistent grants, the grant is
                         * not mapped but we have room for it
                         */
                        new_map = true;
                        persistent_gnt = kmalloc(
                                sizeof(struct persistent_gnt),
                                GFP_KERNEL);
                        if (!persistent_gnt)
                                return -ENOMEM;
                        persistent_gnt->page = alloc_page(GFP_KERNEL);
                        if (!persistent_gnt->page) {
                                kfree(persistent_gnt);
                                return -ENOMEM;
                        }
                        persistent_gnt->gnt = req->u.rw.seg[i].gref;
                        persistent_gnt->handle = BLKBACK_INVALID_HANDLE;

                        pages_to_gnt[segs_to_map] =
                                persistent_gnt->page;
                        addr = (unsigned long) pfn_to_kaddr(
                                page_to_pfn(persistent_gnt->page));

                        add_persistent_gnt(&blkif->persistent_gnts,
                                persistent_gnt);
                        blkif->persistent_gnt_c++;
                        pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
                                 persistent_gnt->gnt, blkif->persistent_gnt_c,
                                 max_mapped_grant_pages(blkif->blk_protocol));
                } else {
                        /*
                         * We are either using persistent grants and
                         * hit the maximum limit of grants mapped,
                         * or we are not using persistent grants.
                         */
                        if (use_persistent_gnts &&
                            !blkif->vbd.overflow_max_grants) {
                                blkif->vbd.overflow_max_grants = 1;
                                pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
                                         blkif->domid, blkif->vbd.handle);
                        }
                        new_map = true;
                        pages[i] = blkbk->pending_page(pending_req, i);
                        addr = vaddr(pending_req, i);
                        pages_to_gnt[segs_to_map] =
                                blkbk->pending_page(pending_req, i);
                }

                if (persistent_gnt) {
                        pages[i] = persistent_gnt->page;
                        persistent_gnts[i] = persistent_gnt;
                } else {
                        persistent_gnts[i] = NULL;
                }

                if (new_map) {
                        flags = GNTMAP_host_map;
                        if (!persistent_gnt &&
                            (pending_req->operation != BLKIF_OP_READ))
                                flags |= GNTMAP_readonly;
                        gnttab_set_map_op(&map[segs_to_map++], addr,
                                          flags, req->u.rw.seg[i].gref,
                                          blkif->domid);
                }
        }

        if (segs_to_map) {
                ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
                BUG_ON(ret);
        }

        /*
         * Now swizzle the MFN in our domain with the MFN from the other domain
         * so that when we access vaddr(pending_req,i) it has the contents of
         * the page from the other domain.
         */
        bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
        for (i = 0, j = 0; i < nseg; i++) {
                if (!persistent_gnts[i] ||
                    persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
                        /* This is a newly mapped grant */
                        BUG_ON(j >= segs_to_map);
                        if (unlikely(map[j].status != 0)) {
                                pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
                                map[j].handle = BLKBACK_INVALID_HANDLE;
                                ret |= 1;
                                if (persistent_gnts[i]) {
                                        rb_erase(&persistent_gnts[i]->node,
                                                 &blkif->persistent_gnts);
                                        blkif->persistent_gnt_c--;
                                        kfree(persistent_gnts[i]);
                                        persistent_gnts[i] = NULL;
                                }
                        }
                }
                if (persistent_gnts[i]) {
                        if (persistent_gnts[i]->handle ==
                                        BLKBACK_INVALID_HANDLE) {
                                /*
                                 * If this is a new persistent grant
                                 * save the handle
                                 */
                                persistent_gnts[i]->handle = map[j].handle;
                                persistent_gnts[i]->dev_bus_addr =
                                        map[j++].dev_bus_addr;
                        }
                        pending_handle(pending_req, i) =
                                persistent_gnts[i]->handle;

                        if (ret)
                                continue;

                        seg[i].buf = persistent_gnts[i]->dev_bus_addr |
                                (req->u.rw.seg[i].first_sect << 9);
                } else {
                        pending_handle(pending_req, i) = map[j].handle;
                        bitmap_set(pending_req->unmap_seg, i, 1);

                        if (ret) {
                                j++;
                                continue;
                        }

                        seg[i].buf = map[j++].dev_bus_addr |
                                (req->u.rw.seg[i].first_sect << 9);
                }
        }
        return ret;
}

static int dispatch_discard_io(struct xen_blkif *blkif,
                               struct blkif_request *req)
{
        int err = 0;
        int status = BLKIF_RSP_OKAY;
        struct block_device *bdev = blkif->vbd.bdev;
        unsigned long secure;

        blkif->st_ds_req++;

        xen_blkif_get(blkif);
        secure = (blkif->vbd.discard_secure &&
                  (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
                 BLKDEV_DISCARD_SECURE : 0;

        err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
                                   req->u.discard.nr_sectors,
                                   GFP_KERNEL, secure);

        if (err == -EOPNOTSUPP) {
                pr_debug(DRV_PFX "discard op failed, not supported\n");
                status = BLKIF_RSP_EOPNOTSUPP;
        } else if (err)
                status = BLKIF_RSP_ERROR;

        make_response(blkif, req->u.discard.id, req->operation, status);
        xen_blkif_put(blkif);
        return err;
}
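
/*
 * xen_blk_drain_io() below waits for all in-flight I/O to finish before a
 * barrier write is issued. The refcnt <= 2 test works because the blkif
 * holds one base reference plus one taken by the xen_blkif_schedule()
 * thread; each in-flight request takes an extra reference in
 * dispatch_rw_block_io() and drops it in __end_block_io_op(), which also
 * completes drain_complete once only those two baseline references remain.
 */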
static void xen_blk_drain_io(struct xen_blkif *blkif)
{
        atomic_set(&blkif->drain, 1);
        do {
                /* The initial value is one, and one refcnt taken at the
                 * start of the xen_blkif_schedule thread. */
                if (atomic_read(&blkif->refcnt) <= 2)
                        break;
                wait_for_completion_interruptible_timeout(
                                &blkif->drain_complete, HZ);

                if (!atomic_read(&blkif->drain))
                        break;
        } while (!kthread_should_stop());
        atomic_set(&blkif->drain, 0);
}

/*
 * Completion callback on the bio's. Called as bh->b_end_io()
 */
static void __end_block_io_op(struct pending_req *pending_req, int error)
{
        /* An error fails the entire request. */
        if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
            (error == -EOPNOTSUPP)) {
                pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
                xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
                   (error == -EOPNOTSUPP)) {
                pr_debug(DRV_PFX "write barrier op failed, not supported\n");
                xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
                         " error=%d\n", error);
                pending_req->status = BLKIF_RSP_ERROR;
        }

        /*
         * If all of the bio's have completed it is time to unmap
         * the grant references associated with 'request' and provide
         * the proper response on the ring.
         */
        if (atomic_dec_and_test(&pending_req->pendcnt)) {
                xen_blkbk_unmap(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                xen_blkif_put(pending_req->blkif);
                if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
                        if (atomic_read(&pending_req->blkif->drain))
                                complete(&pending_req->blkif->drain_complete);
                }
                free_req(pending_req);
        }
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio, int error)
{
        __end_block_io_op(bio->bi_private, error);
        bio_put(bio);
}

/*
 * Function to copy the 'struct blkif_request' from the ring buffer
 * (which has the sectors we want, number of them, grant references, etc),
 * and transmute it to the block API to hand it over to the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        struct blkif_request req;
        struct pending_req *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        while (rc != rp) {

                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
                        break;

                if (kthread_should_stop()) {
                        more_to_do = 1;
                        break;
                }

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                switch (blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
                case BLKIF_PROTOCOL_X86_32:
                        blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
                        break;
                case BLKIF_PROTOCOL_X86_64:
                        blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
                        break;
                default:
                        BUG();
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */

                /* Apply all sanity checks to /private copy/ of request. */
                barrier();

                if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
                        free_req(pending_req);
                        if (dispatch_discard_io(blkif, &req))
                                break;
                } else if (dispatch_rw_block_io(blkif, &req, pending_req))
                        break;

                /* Yield point for this unbounded loop. */
                cond_resched();
        }

        return more_to_do;
}
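
/*
 * do_block_io_op() below wraps __do_block_io_op() with
 * RING_FINAL_CHECK_FOR_REQUESTS(), which re-arms the ring's request event
 * and then re-checks for requests the frontend may have queued after the
 * last pass. Without that final check a request could slip in between
 * "ring looks empty" and xen_blkif_schedule() going back to sleep, with no
 * further notification arriving for it.
 */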
static int
do_block_io_op(struct xen_blkif *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int more_to_do;

        do {
                more_to_do = __do_block_io_op(blkif);
                if (more_to_do)
                        break;

                RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
        } while (more_to_do);

        return more_to_do;
}

/*
 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
 * and call the 'submit_bio' to pass it to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif *blkif,
                                struct blkif_request *req,
                                struct pending_req *pending_req)
{
        struct phys_req preq;
        struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL;
        struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        int i, nbio = 0;
        int operation;
        struct blk_plug plug;
        bool drain = false;
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];

        switch (req->operation) {
        case BLKIF_OP_READ:
                blkif->st_rd_req++;
                operation = READ;
                break;
        case BLKIF_OP_WRITE:
                blkif->st_wr_req++;
                operation = WRITE_ODIRECT;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                drain = true;
        case BLKIF_OP_FLUSH_DISKCACHE:
                blkif->st_f_req++;
                operation = WRITE_FLUSH;
                break;
        default:
                operation = 0; /* make gcc happy */
                goto fail_response;
                break;
        }

        /* Check that the number of segments is sane. */
        nseg = req->u.rw.nr_segments;

        if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
                         nseg);
                /* Haven't submitted any bio's yet. */
                goto fail_response;
        }

        preq.dev = req->u.rw.handle;
        preq.sector_number = req->u.rw.sector_number;
        preq.nr_sects = 0;

        pending_req->blkif = blkif;
        pending_req->id = req->u.rw.id;
        pending_req->operation = req->operation;
        pending_req->status = BLKIF_RSP_OKAY;
        pending_req->nr_pages = nseg;

        for (i = 0; i < nseg; i++) {
                seg[i].nsec = req->u.rw.seg[i].last_sect -
                        req->u.rw.seg[i].first_sect + 1;
                if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
                        goto fail_response;
                preq.nr_sects += seg[i].nsec;
        }

        if (xen_vbd_translate(&preq, blkif, operation) != 0) {
                pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
                         operation == READ ? "read" : "write",
                         preq.sector_number,
                         preq.sector_number + preq.nr_sects, preq.dev);
                goto fail_response;
        }

        /*
         * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
         * is set there.
         */
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
                        pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
                                 blkif->domid);
                        goto fail_response;
                }
        }

        /* Wait on all outstanding I/O's and once that has been completed
         * issue the WRITE_FLUSH.
         */
        if (drain)
                xen_blk_drain_io(pending_req->blkif);

        /*
         * If we have failed at this point, we need to undo the M2P override,
         * set gnttab_set_unmap_op on all of the grant references and perform
         * the hypercall to unmap the grants - that is all done in
         * xen_blkbk_unmap.
         */
        if (xen_blkbk_map(req, pending_req, seg, pages))
                goto fail_flush;

        /*
         * This corresponding xen_blkif_put is done in __end_block_io_op, or
         * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
         */
        xen_blkif_get(blkif);

        for (i = 0; i < nseg; i++) {
                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     pages[i],
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {

                        bio = bio_alloc(GFP_KERNEL, nseg-i);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        biolist[nbio++] = bio;
                        bio->bi_bdev = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io = end_block_io_op;
                        bio->bi_sector = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

        /* This will be hit if the operation was a flush or discard. */
        if (!bio) {
                BUG_ON(operation != WRITE_FLUSH);

                bio = bio_alloc(GFP_KERNEL, 0);
                if (unlikely(bio == NULL))
                        goto fail_put_bio;

                biolist[nbio++] = bio;
                bio->bi_bdev = preq.bdev;
                bio->bi_private = pending_req;
                bio->bi_end_io = end_block_io_op;
        }

        /*
         * We set it one so that the last submit_bio does not have to call
         * atomic_inc.
         */
        atomic_set(&pending_req->pendcnt, nbio);

        /* Get a reference count for the disk queue and start sending I/O */
        blk_start_plug(&plug);

        for (i = 0; i < nbio; i++)
                submit_bio(operation, biolist[i]);

        /* Let the I/Os go.. */
        blk_finish_plug(&plug);

        if (operation == READ)
                blkif->st_rd_sect += preq.nr_sects;
        else if (operation & WRITE)
                blkif->st_wr_sect += preq.nr_sects;

        return 0;

fail_flush:
        xen_blkbk_unmap(pending_req);
fail_response:
        /* Haven't submitted any bio's yet. */
        make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
        msleep(1); /* back off a bit */
        return -EIO;

fail_put_bio:
        for (i = 0; i < nbio; i++)
                bio_put(biolist[i]);
        __end_block_io_op(pending_req, -EINVAL);
        msleep(1); /* back off a bit */
        return -EIO;
}

/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif *blkif, u64 id,
                          unsigned short op, int st)
{
        struct blkif_response resp;
        unsigned long flags;
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int notify;

        resp.id = id;
        resp.operation = op;
        resp.status = st;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        /* Place on the response ring for the relevant domain. */
        switch (blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_32:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_64:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        default:
                BUG();
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

static int __init xen_blkif_init(void)
{
        int i, mmap_pages;
        int rc = 0;

        if (!xen_domain())
                return -ENODEV;

        blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
        if (!blkbk) {
                pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
                return -ENOMEM;
        }

        mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

        blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
                                        xen_blkif_reqs, GFP_KERNEL);
        blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
                                        mmap_pages, GFP_KERNEL);
        blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
                                        mmap_pages, GFP_KERNEL);

        if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
            !blkbk->pending_pages) {
                rc = -ENOMEM;
                goto out_of_memory;
        }

        for (i = 0; i < mmap_pages; i++) {
                blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
                blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
                if (blkbk->pending_pages[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_of_memory;
                }
        }
        rc = xen_blkif_interface_init();
        if (rc)
                goto failed_init;

        INIT_LIST_HEAD(&blkbk->pending_free);
        spin_lock_init(&blkbk->pending_free_lock);
        init_waitqueue_head(&blkbk->pending_free_wq);

        for (i = 0; i < xen_blkif_reqs; i++)
                list_add_tail(&blkbk->pending_reqs[i].free_list,
                              &blkbk->pending_free);

        rc = xen_blkif_xenbus_init();
        if (rc)
                goto failed_init;

        return 0;

out_of_memory:
        pr_alert(DRV_PFX "%s: out of memory\n", __func__);
failed_init:
        kfree(blkbk->pending_reqs);
        kfree(blkbk->pending_grant_handles);
        if (blkbk->pending_pages) {
                for (i = 0; i < mmap_pages; i++) {
                        if (blkbk->pending_pages[i])
                                __free_page(blkbk->pending_pages[i]);
                }
                kfree(blkbk->pending_pages);
        }
        kfree(blkbk);
        blkbk = NULL;
        return rc;
}

module_init(xen_blkif_init);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");