dm-snap.c

/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/device-mapper.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "dm-snap.h"
#include "dm-bio-list.h"
#include "kcopyd.h"

#define DM_MSG_PREFIX "snapshots"

/*
 * The percentage increment we will wake up users at
 */
#define WAKE_UP_PERCENT 5

/*
 * kcopyd priority of snapshot operations
 */
#define SNAPSHOT_COPY_PRIORITY 2

/*
 * Each snapshot reserves this many pages for io
 */
#define SNAPSHOT_PAGES 256

struct pending_exception {
	struct exception e;

	/*
	 * Origin buffers waiting for this to complete are held
	 * in a bio list
	 */
	struct bio_list origin_bios;
	struct bio_list snapshot_bios;

	/*
	 * Short-term queue of pending exceptions prior to submission.
	 */
	struct list_head list;

	/*
	 * The primary pending_exception is the one that holds
	 * the sibling_count and the list of origin_bios for a
	 * group of pending_exceptions.  It is always last to get freed.
	 * These fields get set up when writing to the origin.
	 */
	struct pending_exception *primary_pe;

	/*
	 * Number of pending_exceptions processing this chunk.
	 * When this drops to zero we must complete the origin bios.
	 * If incrementing or decrementing this, hold pe->snap->lock for
	 * the sibling concerned and not pe->primary_pe->snap->lock unless
	 * they are the same.
	 */
	atomic_t sibling_count;

	/* Pointer back to snapshot context */
	struct dm_snapshot *snap;

	/*
	 * 1 indicates the exception has already been sent to
	 * kcopyd.
	 */
	int started;
};
/*
 * Slab caches for completed and pending exceptions, plus a mempool
 * of pre-allocated pending exceptions so the write path can still
 * make progress when memory is tight.
 */
static kmem_cache_t *exception_cache;
static kmem_cache_t *pending_cache;
static mempool_t *pending_pool;
/*
 * One of these per registered origin, held in the snapshot_origins hash
 */
struct origin {
	/* The origin device */
	struct block_device *bdev;

	struct list_head hash_list;

	/* List of snapshots for this origin */
	struct list_head snapshots;
};

/*
 * Size of the hash table for origin volumes. If we make this
 * the size of the minors list then it should be nearly perfect
 */
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK      0xFF
static struct list_head *_origins;
static struct rw_semaphore _origins_lock;

static int init_origin_hash(void)
{
	int i;

	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
			   GFP_KERNEL);
	if (!_origins) {
		DMERR("unable to allocate memory");
		return -ENOMEM;
	}

	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
		INIT_LIST_HEAD(_origins + i);
	init_rwsem(&_origins_lock);

	return 0;
}

static void exit_origin_hash(void)
{
	kfree(_origins);
}
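
/*
 * The origin hash keys on the low ORIGIN_MASK bits of the device
 * number, i.e. the low bits of its minor, which spreads origins evenly
 * across the buckets (ORIGIN_MASK == ORIGIN_HASH_SIZE - 1).
 */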
static inline unsigned int origin_hash(struct block_device *bdev)
{
	return bdev->bd_dev & ORIGIN_MASK;
}

static struct origin *__lookup_origin(struct block_device *origin)
{
	struct list_head *ol;
	struct origin *o;

	ol = &_origins[origin_hash(origin)];
	list_for_each_entry (o, ol, hash_list)
		if (bdev_equal(o->bdev, origin))
			return o;

	return NULL;
}

static void __insert_origin(struct origin *o)
{
	struct list_head *sl = &_origins[origin_hash(o->bdev)];
	list_add_tail(&o->hash_list, sl);
}

/*
 * Make a note of the snapshot and its origin so we can look it
 * up when the origin has a write on it.
 */
static int register_snapshot(struct dm_snapshot *snap)
{
	struct origin *o;
	struct block_device *bdev = snap->origin->bdev;

	down_write(&_origins_lock);
	o = __lookup_origin(bdev);

	if (!o) {
		/* New origin */
		o = kmalloc(sizeof(*o), GFP_KERNEL);
		if (!o) {
			up_write(&_origins_lock);
			return -ENOMEM;
		}

		/* Initialise the struct */
		INIT_LIST_HEAD(&o->snapshots);
		o->bdev = bdev;

		__insert_origin(o);
	}

	list_add_tail(&snap->list, &o->snapshots);

	up_write(&_origins_lock);
	return 0;
}

static void unregister_snapshot(struct dm_snapshot *s)
{
	struct origin *o;

	down_write(&_origins_lock);
	o = __lookup_origin(s->origin->bdev);

	list_del(&s->list);
	if (list_empty(&o->snapshots)) {
		list_del(&o->hash_list);
		kfree(o);
	}

	up_write(&_origins_lock);
}

/*
 * Implementation of the exception hash tables.
 */
static int init_exception_table(struct exception_table *et, uint32_t size)
{
	unsigned int i;

	et->hash_mask = size - 1;
	et->table = dm_vcalloc(size, sizeof(struct list_head));
	if (!et->table)
		return -ENOMEM;

	for (i = 0; i < size; i++)
		INIT_LIST_HEAD(et->table + i);

	return 0;
}

static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
{
	struct list_head *slot;
	struct exception *ex, *next;
	int i, size;

	size = et->hash_mask + 1;
	for (i = 0; i < size; i++) {
		slot = et->table + i;

		list_for_each_entry_safe (ex, next, slot, hash_list)
			kmem_cache_free(mem, ex);
	}

	vfree(et->table);
}

static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
{
	return chunk & et->hash_mask;
}

static void insert_exception(struct exception_table *eh, struct exception *e)
{
	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
	list_add(&e->hash_list, l);
}

static inline void remove_exception(struct exception *e)
{
	list_del(&e->hash_list);
}

/*
 * Return the exception data for a sector, or NULL if not
 * remapped.
 */
static struct exception *lookup_exception(struct exception_table *et,
					  chunk_t chunk)
{
	struct list_head *slot;
	struct exception *e;

	slot = &et->table[exception_hash(et, chunk)];
	list_for_each_entry (e, slot, hash_list)
		if (e->old_chunk == chunk)
			return e;

	return NULL;
}
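
/*
 * Allocation strategy: completed exceptions come from a slab cache,
 * trying GFP_NOIO first and falling back to GFP_ATOMIC, while pending
 * exceptions come from the mempool so that the I/O path is guaranteed
 * forward progress under memory pressure.
 */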
static inline struct exception *alloc_exception(void)
{
	struct exception *e;

	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
	if (!e)
		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

	return e;
}

static inline void free_exception(struct exception *e)
{
	kmem_cache_free(exception_cache, e);
}

static inline struct pending_exception *alloc_pending_exception(void)
{
	return mempool_alloc(pending_pool, GFP_NOIO);
}

static inline void free_pending_exception(struct pending_exception *pe)
{
	mempool_free(pe, pending_pool);
}

int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
{
	struct exception *e;

	e = alloc_exception();
	if (!e)
		return -ENOMEM;

	e->old_chunk = old;
	e->new_chunk = new;
	insert_exception(&s->complete, e);

	return 0;
}

/*
 * Hard coded magic.
 */
static int calc_max_buckets(void)
{
	/* use a fixed size of 2MB */
	unsigned long mem = 2 * 1024 * 1024;
	mem /= sizeof(struct list_head);

	return mem;
}
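
/*
 * The exception tables are sized to a power of two so that
 * exception_hash() can reduce a chunk number with a plain mask.
 * round_down() below produces that power of two by repeatedly
 * clearing the lowest set bit, e.g. 6 -> 4, while 32 stays 32.
 */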
/*
 * Rounds a number down to a power of 2.
 */
static inline uint32_t round_down(uint32_t n)
{
	while (n & (n - 1))
		n &= (n - 1);
	return n;
}

/*
 * Allocate room for a suitable hash table.
 */
static int init_hash_tables(struct dm_snapshot *s)
{
	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;

	/*
	 * Calculate based on the size of the original volume or
	 * the COW volume...
	 */
	cow_dev_size = get_dev_size(s->cow->bdev);
	origin_dev_size = get_dev_size(s->origin->bdev);
	max_buckets = calc_max_buckets();

	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
	hash_size = min(hash_size, max_buckets);

	/* Round it down to a power of 2 */
	hash_size = round_down(hash_size);
	if (init_exception_table(&s->complete, hash_size))
		return -ENOMEM;

	/*
	 * Allocate hash table for in-flight exceptions
	 * Make this smaller than the real hash table
	 */
	hash_size >>= 3;
	if (hash_size < 64)
		hash_size = 64;

	if (init_exception_table(&s->pending, hash_size)) {
		exit_exception_table(&s->complete, exception_cache);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Round a number up to the nearest 'size' boundary.  size must
 * be a power of 2.
 */
static inline ulong round_up(ulong n, ulong size)
{
	size--;
	return (n + size) & ~size;
}
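
/*
 * Note on units: the chunk size, like the device sizes used elsewhere
 * in this file, is expressed in 512-byte sectors, so PAGE_SIZE >> 9
 * below is simply the page size converted to sectors.
 */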
static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
			  char **error)
{
	unsigned long chunk_size;
	char *value;

	chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
	if (*chunk_size_arg == '\0' || *value != '\0') {
		*error = "Invalid chunk size";
		return -EINVAL;
	}

	if (!chunk_size) {
		s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
		return 0;
	}

	/*
	 * Chunk size must be multiple of page size.  Silently
	 * round up if it's not.
	 */
	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);

	/* Check chunk_size is a power of 2 */
	if (chunk_size & (chunk_size - 1)) {
		*error = "Chunk size is not a power of 2";
		return -EINVAL;
	}

	/* Validate the chunk size against the device block size */
	if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
		*error = "Chunk size is not a multiple of device blocksize";
		return -EINVAL;
	}

	s->chunk_size = chunk_size;
	s->chunk_mask = chunk_size - 1;
	s->chunk_shift = ffs(chunk_size) - 1;

	return 0;
}
/*
 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
 */
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_snapshot *s;
	int r = -EINVAL;
	char persistent;
	char *origin_path;
	char *cow_path;

	if (argc != 4) {
		ti->error = "requires exactly 4 arguments";
		r = -EINVAL;
		goto bad1;
	}

	origin_path = argv[0];
	cow_path = argv[1];
	persistent = toupper(*argv[2]);

	if (persistent != 'P' && persistent != 'N') {
		ti->error = "Persistent flag is not P or N";
		r = -EINVAL;
		goto bad1;
	}

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s == NULL) {
		ti->error = "Cannot allocate snapshot context private "
		    "structure";
		r = -ENOMEM;
		goto bad1;
	}

	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
	if (r) {
		ti->error = "Cannot get origin device";
		goto bad2;
	}

	r = dm_get_device(ti, cow_path, 0, 0,
			  FMODE_READ | FMODE_WRITE, &s->cow);
	if (r) {
		dm_put_device(ti, s->origin);
		ti->error = "Cannot get COW device";
		goto bad2;
	}

	r = set_chunk_size(s, argv[3], &ti->error);
	if (r)
		goto bad3;

	s->type = persistent;

	s->valid = 1;
	s->active = 0;
	s->last_percent = 0;
	init_rwsem(&s->lock);
	s->table = ti->table;

	/* Allocate hash table for COW data */
	if (init_hash_tables(s)) {
		ti->error = "Unable to allocate hash table space";
		r = -ENOMEM;
		goto bad3;
	}

	s->store.snap = s;

	if (persistent == 'P')
		r = dm_create_persistent(&s->store);
	else
		r = dm_create_transient(&s->store);

	if (r) {
		ti->error = "Couldn't create exception store";
		r = -EINVAL;
		goto bad4;
	}

	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
	if (r) {
		ti->error = "Could not create kcopyd client";
		goto bad5;
	}

	/* Metadata must only be loaded into one table at once */
	r = s->store.read_metadata(&s->store);
	if (r) {
		ti->error = "Failed to read snapshot metadata";
		goto bad6;
	}

	/* Add snapshot to the list of snapshots for this origin */
	/* Exceptions aren't triggered till snapshot_resume() is called */
	if (register_snapshot(s)) {
		r = -EINVAL;
		ti->error = "Cannot register snapshot origin";
		goto bad6;
	}

	ti->private = s;
	ti->split_io = s->chunk_size;

	return 0;

 bad6:
	kcopyd_client_destroy(s->kcopyd_client);

 bad5:
	s->store.destroy(&s->store);

 bad4:
	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

 bad3:
	dm_put_device(ti, s->cow);
	dm_put_device(ti, s->origin);

 bad2:
	kfree(s);

 bad1:
	return r;
}
static void snapshot_dtr(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	/* Prevent further origin writes from using this snapshot. */
	/* After this returns there can be no new kcopyd jobs. */
	unregister_snapshot(s);

	kcopyd_client_destroy(s->kcopyd_client);

	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

	/* Deallocate memory used */
	s->store.destroy(&s->store);

	dm_put_device(ti, s->origin);
	dm_put_device(ti, s->cow);

	kfree(s);
}

/*
 * Flush a list of buffers.
 */
static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		generic_make_request(bio);
		bio = n;
	}
}

/*
 * Error a list of buffers.
 */
static void error_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		bio_io_error(bio, bio->bi_size);
		bio = n;
	}
}
static void __invalidate_snapshot(struct dm_snapshot *s,
				  struct pending_exception *pe, int err)
{
	if (!s->valid)
		return;

	if (err == -EIO)
		DMERR("Invalidating snapshot: Error reading/writing.");
	else if (err == -ENOMEM)
		DMERR("Invalidating snapshot: Unable to allocate exception.");

	if (pe)
		remove_exception(&pe->e);

	if (s->store.drop_snapshot)
		s->store.drop_snapshot(&s->store);

	s->valid = 0;

	dm_table_event(s->table);
}

static void pending_complete(struct pending_exception *pe, int success)
{
	struct exception *e;
	struct pending_exception *primary_pe;
	struct dm_snapshot *s = pe->snap;
	struct bio *origin_bios = NULL;
	struct bio *snapshot_bios = NULL;
	int error = 0;

	if (!success) {
		/* Read/write error - snapshot is unusable */
		down_write(&s->lock);
		__invalidate_snapshot(s, pe, -EIO);
		error = 1;
		goto out;
	}

	e = alloc_exception();
	if (!e) {
		down_write(&s->lock);
		__invalidate_snapshot(s, pe, -ENOMEM);
		error = 1;
		goto out;
	}
	*e = pe->e;

	down_write(&s->lock);
	if (!s->valid) {
		free_exception(e);
		error = 1;
		goto out;
	}

	/*
	 * Add a proper exception, and remove the
	 * in-flight exception from the list.
	 */
	insert_exception(&s->complete, e);
	remove_exception(&pe->e);

 out:
	snapshot_bios = bio_list_get(&pe->snapshot_bios);
	primary_pe = pe->primary_pe;

	/*
	 * If this pe is involved in a write to the origin and
	 * it is the last sibling to complete then release
	 * the bios for the original write to the origin.
	 */
	if (primary_pe &&
	    atomic_dec_and_test(&primary_pe->sibling_count))
		origin_bios = bio_list_get(&primary_pe->origin_bios);

	/*
	 * Free the pe if it's not linked to an origin write or if
	 * it's not itself a primary pe.
	 */
	if (!primary_pe || primary_pe != pe)
		free_pending_exception(pe);

	/*
	 * Free the primary pe if nothing references it.
	 */
	if (primary_pe && !atomic_read(&primary_pe->sibling_count))
		free_pending_exception(primary_pe);

	up_write(&s->lock);

	/* Submit any pending write bios */
	if (error)
		error_bios(snapshot_bios);
	else
		flush_bios(snapshot_bios);

	flush_bios(origin_bios);
}
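
/*
 * Completion runs in two stages: copy_callback() below fires once
 * kcopyd has copied the chunk to the COW device, then the exception
 * store's commit_exception() persists the mapping and invokes
 * commit_callback(), which finally calls pending_complete() above.
 */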
static void commit_callback(void *context, int success)
{
	struct pending_exception *pe = (struct pending_exception *) context;
	pending_complete(pe, success);
}

/*
 * Called when the copy I/O has finished.  kcopyd actually runs
 * this code so don't block.
 */
static void copy_callback(int read_err, unsigned int write_err, void *context)
{
	struct pending_exception *pe = (struct pending_exception *) context;
	struct dm_snapshot *s = pe->snap;

	if (read_err || write_err)
		pending_complete(pe, 0);
	else
		/* Update the metadata if we are persistent */
		s->store.commit_exception(&s->store, &pe->e, commit_callback,
					  pe);
}

/*
 * Dispatches the copy operation to kcopyd.
 */
static void start_copy(struct pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;
	struct io_region src, dest;
	struct block_device *bdev = s->origin->bdev;
	sector_t dev_size;

	dev_size = get_dev_size(bdev);

	src.bdev = bdev;
	src.sector = chunk_to_sector(s, pe->e.old_chunk);
	/* Clamp so a final partial chunk doesn't read past end of device */
	src.count = min(s->chunk_size, dev_size - src.sector);

	dest.bdev = s->cow->bdev;
	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
	dest.count = src.count;

	/* Hand over to kcopyd */
	kcopyd_copy(s->kcopyd_client,
		    &src, 1, &dest, 0, copy_callback, pe);
}
/*
 * Looks to see if this snapshot already has a pending exception
 * for this chunk, otherwise it allocates a new one and inserts
 * it into the pending table.
 *
 * NOTE: a write lock must be held on snap->lock before calling
 * this.  The lock may be dropped and re-acquired internally around
 * the allocation, so any state the caller read under the lock may
 * be stale afterwards.
 */
static struct pending_exception *
__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
{
	struct exception *e;
	struct pending_exception *pe;
	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);

	/*
	 * Is there a pending exception for this already ?
	 */
	e = lookup_exception(&s->pending, chunk);
	if (e) {
		/* cast the exception to a pending exception */
		pe = container_of(e, struct pending_exception, e);
		goto out;
	}

	/*
	 * Create a new pending exception, we don't want
	 * to hold the lock while we do this.
	 */
	up_write(&s->lock);
	pe = alloc_pending_exception();
	down_write(&s->lock);

	if (!s->valid) {
		free_pending_exception(pe);
		return NULL;
	}

	/* Another thread may have raced us while the lock was dropped */
	e = lookup_exception(&s->pending, chunk);
	if (e) {
		free_pending_exception(pe);
		pe = container_of(e, struct pending_exception, e);
		goto out;
	}

	pe->e.old_chunk = chunk;
	bio_list_init(&pe->origin_bios);
	bio_list_init(&pe->snapshot_bios);
	pe->primary_pe = NULL;
	atomic_set(&pe->sibling_count, 1);
	pe->snap = s;
	pe->started = 0;

	if (s->store.prepare_exception(&s->store, &pe->e)) {
		free_pending_exception(pe);
		return NULL;
	}

	insert_exception(&s->pending, &pe->e);

 out:
	return pe;
}
static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
				   struct bio *bio)
{
	bio->bi_bdev = s->cow->bdev;
	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
		(bio->bi_sector & s->chunk_mask);
}
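
/*
 * Returns 1 if the bio has been remapped and dm should submit it,
 * 0 if it has been queued on a pending exception and will be
 * resubmitted by the target once the chunk copy completes, or a
 * negative errno on error.
 */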
static int snapshot_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct exception *e;
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
	int r = 1;
	chunk_t chunk;
	struct pending_exception *pe = NULL;

	chunk = sector_to_chunk(s, bio->bi_sector);

	/* Full snapshots are not usable */
	/* To get here the table must be live so s->active is always set. */
	if (!s->valid)
		return -EIO;

	if (unlikely(bio_barrier(bio)))
		return -EOPNOTSUPP;

	/* FIXME: should only take write lock if we need
	 * to copy an exception */
	down_write(&s->lock);

	if (!s->valid) {
		r = -EIO;
		goto out_unlock;
	}

	/* If the block is already remapped - use that, else remap it */
	e = lookup_exception(&s->complete, chunk);
	if (e) {
		remap_exception(s, e, bio);
		goto out_unlock;
	}

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writeable.
	 */
	if (bio_rw(bio) == WRITE) {
		pe = __find_pending_exception(s, bio);
		if (!pe) {
			__invalidate_snapshot(s, pe, -ENOMEM);
			r = -EIO;
			goto out_unlock;
		}

		remap_exception(s, &pe->e, bio);
		bio_list_add(&pe->snapshot_bios, bio);

		r = 0;

		if (!pe->started) {
			/* this is protected by snap->lock */
			pe->started = 1;
			up_write(&s->lock);
			start_copy(pe);
			goto out;
		}
	} else
		/*
		 * FIXME: this read path scares me because we
		 * always use the origin when we have a pending
		 * exception.  However I can't think of a
		 * situation where this is wrong - ejt.
		 */
		bio->bi_bdev = s->origin->bdev;

 out_unlock:
	up_write(&s->lock);
 out:
	return r;
}
static void snapshot_resume(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	down_write(&s->lock);
	s->active = 1;
	up_write(&s->lock);
}

static int snapshot_status(struct dm_target *ti, status_type_t type,
			   char *result, unsigned int maxlen)
{
	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (!snap->valid)
			snprintf(result, maxlen, "Invalid");
		else {
			if (snap->store.fraction_full) {
				sector_t numerator, denominator;
				snap->store.fraction_full(&snap->store,
							  &numerator,
							  &denominator);
				snprintf(result, maxlen, "%llu/%llu",
					 (unsigned long long)numerator,
					 (unsigned long long)denominator);
			} else
				snprintf(result, maxlen, "Unknown");
		}
		break;

	case STATUSTYPE_TABLE:
		/*
		 * kdevname returns a static pointer so we need
		 * to make private copies if the output is to
		 * make sense.
		 */
		snprintf(result, maxlen, "%s %s %c %llu",
			 snap->origin->name, snap->cow->name,
			 snap->type,
			 (unsigned long long)snap->chunk_size);
		break;
	}

	return 0;
}
/*-----------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------*/
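
/*
 * Returns 1 if the origin bio can be remapped and submitted straight
 * away, or 0 if it has been queued on a primary pending exception and
 * will only be released once every affected snapshot has copied the
 * chunk being overwritten.
 */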
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
	int r = 1, first = 0;
	struct dm_snapshot *snap;
	struct exception *e;
	struct pending_exception *pe, *next_pe, *primary_pe = NULL;
	chunk_t chunk;
	LIST_HEAD(pe_queue);

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {

		down_write(&snap->lock);

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			goto next_snapshot;

		/* Nothing to do if writing beyond end of snapshot */
		if (bio->bi_sector >= dm_table_get_size(snap->table))
			goto next_snapshot;

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap, bio->bi_sector);

		/*
		 * Check exception table to see if block
		 * is already remapped in this snapshot
		 * and trigger an exception if not.
		 *
		 * sibling_count is initialised to 1 so pending_complete()
		 * won't destroy the primary_pe while we're inside this loop.
		 */
		e = lookup_exception(&snap->complete, chunk);
		if (e)
			goto next_snapshot;

		pe = __find_pending_exception(snap, bio);
		if (!pe) {
			__invalidate_snapshot(snap, pe, -ENOMEM);
			goto next_snapshot;
		}

		if (!primary_pe) {
			/*
			 * Either every pe here has same
			 * primary_pe or none has one yet.
			 */
			if (pe->primary_pe)
				primary_pe = pe->primary_pe;
			else {
				primary_pe = pe;
				first = 1;
			}

			bio_list_add(&primary_pe->origin_bios, bio);

			r = 0;
		}

		if (!pe->primary_pe) {
			atomic_inc(&primary_pe->sibling_count);
			pe->primary_pe = primary_pe;
		}

		if (!pe->started) {
			pe->started = 1;
			list_add_tail(&pe->list, &pe_queue);
		}

 next_snapshot:
		up_write(&snap->lock);
	}

	if (!primary_pe)
		goto out;

	/*
	 * If this is the first time we're processing this chunk and
	 * sibling_count is now 1 it means all the pending exceptions
	 * got completed while we were in the loop above, so it falls to
	 * us here to remove the primary_pe and submit any origin_bios.
	 */
	if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
		flush_bios(bio_list_get(&primary_pe->origin_bios));
		free_pending_exception(primary_pe);
		/* If we got here, pe_queue is necessarily empty. */
		goto out;
	}

	/*
	 * Now that we have a complete pe list we can start the copying.
	 */
	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
		start_copy(pe);

 out:
	return r;
}
/*
 * Called on a write from the origin driver.
 */
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
	struct origin *o;
	int r = 1;

	down_read(&_origins_lock);
	o = __lookup_origin(origin->bdev);
	if (o)
		r = __origin_write(&o->snapshots, bio);
	up_read(&_origins_lock);

	return r;
}

/*
 * Origin: maps a linear range of a device, with hooks for snapshotting.
 */

/*
 * Construct an origin mapping: <dev_path>
 * The context for an origin is merely a 'struct dm_dev *'
 * pointing to the real device.
 */
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct dm_dev *dev;

	if (argc != 1) {
		ti->error = "origin: incorrect number of arguments";
		return -EINVAL;
	}

	r = dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &dev);
	if (r) {
		ti->error = "Cannot get target device";
		return r;
	}

	ti->private = dev;
	return 0;
}

static void origin_dtr(struct dm_target *ti)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	dm_put_device(ti, dev);
}

static int origin_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	bio->bi_bdev = dev->bdev;

	if (unlikely(bio_barrier(bio)))
		return -EOPNOTSUPP;

	/* Only tell snapshots if this is a write */
	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
}
#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
/*
 * Set the target "split_io" field to the minimum of all the snapshots'
 * chunk sizes.
 */
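/*
 * dm splits incoming bios on split_io boundaries, so no single bio can
 * straddle a chunk of any snapshot and an origin write triggers at most
 * one exception per snapshot.
 */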
static void origin_resume(struct dm_target *ti)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	struct dm_snapshot *snap;
	struct origin *o;
	chunk_t chunk_size = 0;

	down_read(&_origins_lock);
	o = __lookup_origin(dev->bdev);
	if (o)
		list_for_each_entry (snap, &o->snapshots, list)
			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
	up_read(&_origins_lock);

	ti->split_io = chunk_size;
}

static int origin_status(struct dm_target *ti, status_type_t type, char *result,
			 unsigned int maxlen)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s", dev->name);
		break;
	}

	return 0;
}
static struct target_type origin_target = {
	.name    = "snapshot-origin",
	.version = {1, 5, 0},
	.module  = THIS_MODULE,
	.ctr     = origin_ctr,
	.dtr     = origin_dtr,
	.map     = origin_map,
	.resume  = origin_resume,
	.status  = origin_status,
};

static struct target_type snapshot_target = {
	.name    = "snapshot",
	.version = {1, 5, 0},
	.module  = THIS_MODULE,
	.ctr     = snapshot_ctr,
	.dtr     = snapshot_dtr,
	.map     = snapshot_map,
	.resume  = snapshot_resume,
	.status  = snapshot_status,
};
static int __init dm_snapshot_init(void)
{
	int r;

	r = dm_register_target(&snapshot_target);
	if (r) {
		DMERR("snapshot target register failed %d", r);
		return r;
	}

	r = dm_register_target(&origin_target);
	if (r < 0) {
		DMERR("Origin target register failed %d", r);
		goto bad1;
	}

	r = init_origin_hash();
	if (r) {
		DMERR("init_origin_hash failed.");
		goto bad2;
	}

	exception_cache = kmem_cache_create("dm-snapshot-ex",
					    sizeof(struct exception),
					    __alignof__(struct exception),
					    0, NULL, NULL);
	if (!exception_cache) {
		DMERR("Couldn't create exception cache.");
		r = -ENOMEM;
		goto bad3;
	}

	pending_cache =
	    kmem_cache_create("dm-snapshot-in",
			      sizeof(struct pending_exception),
			      __alignof__(struct pending_exception),
			      0, NULL, NULL);
	if (!pending_cache) {
		DMERR("Couldn't create pending cache.");
		r = -ENOMEM;
		goto bad4;
	}

	pending_pool = mempool_create_slab_pool(128, pending_cache);
	if (!pending_pool) {
		DMERR("Couldn't create pending pool.");
		r = -ENOMEM;
		goto bad5;
	}

	return 0;

 bad5:
	kmem_cache_destroy(pending_cache);
 bad4:
	kmem_cache_destroy(exception_cache);
 bad3:
	exit_origin_hash();
 bad2:
	dm_unregister_target(&origin_target);
 bad1:
	dm_unregister_target(&snapshot_target);
	return r;
}
static void __exit dm_snapshot_exit(void)
{
	int r;

	r = dm_unregister_target(&snapshot_target);
	if (r)
		DMERR("snapshot unregister failed %d", r);

	r = dm_unregister_target(&origin_target);
	if (r)
		DMERR("origin unregister failed %d", r);

	exit_origin_hash();
	mempool_destroy(pending_pool);
	kmem_cache_destroy(pending_cache);
	kmem_cache_destroy(exception_cache);
}

/* Module hooks */
module_init(dm_snapshot_init);
module_exit(dm_snapshot_exit);

MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");