dm-snap.c

/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/device-mapper.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>

#include "dm-snap.h"
#include "dm-bio-list.h"

#define DM_MSG_PREFIX "snapshots"

/*
 * The percentage increment we will wake up users at
 */
#define WAKE_UP_PERCENT 5

/*
 * kcopyd priority of snapshot operations
 */
#define SNAPSHOT_COPY_PRIORITY 2

/*
 * Reserve 1MB for each snapshot initially (with minimum of 1 page).
 */
#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)

/*
 * The size of the mempool used to track chunks in use.
 */
#define MIN_IOS 256

static struct workqueue_struct *ksnapd;
static void flush_queued_bios(struct work_struct *work);

struct dm_snap_pending_exception {
	struct dm_snap_exception e;

	/*
	 * Origin buffers waiting for this to complete are held
	 * in a bio list
	 */
	struct bio_list origin_bios;
	struct bio_list snapshot_bios;

	/*
	 * Short-term queue of pending exceptions prior to submission.
	 */
	struct list_head list;

	/*
	 * The primary pending_exception is the one that holds
	 * the ref_count and the list of origin_bios for a
	 * group of pending_exceptions.  It is always last to get freed.
	 * These fields get set up when writing to the origin.
	 */
	struct dm_snap_pending_exception *primary_pe;

	/*
	 * Number of pending_exceptions processing this chunk.
	 * When this drops to zero we must complete the origin bios.
	 * If incrementing or decrementing this, hold pe->snap->lock for
	 * the sibling concerned and not pe->primary_pe->snap->lock unless
	 * they are the same.
	 */
	atomic_t ref_count;

	/* Pointer back to snapshot context */
	struct dm_snapshot *snap;

	/*
	 * 1 indicates the exception has already been sent to
	 * kcopyd.
	 */
	int started;
};

/*
 * Hash table mapping origin volumes to lists of snapshots and
 * a lock to protect it
 */
static struct kmem_cache *exception_cache;
static struct kmem_cache *pending_cache;
static mempool_t *pending_pool;

struct dm_snap_tracked_chunk {
	struct hlist_node node;
	chunk_t chunk;
};

static struct kmem_cache *tracked_chunk_cache;
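
/*
 * Reads that snapshot_map() remaps to the origin are recorded in
 * s->tracked_chunk_hash by track_chunk() below and removed again by
 * stop_tracking_chunk() from snapshot_end_io(), so chunks with reads
 * in flight can be identified while those bios are pending.
 */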
static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
						 chunk_t chunk)
{
	struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
							GFP_NOIO);
	unsigned long flags;

	c->chunk = chunk;

	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
	hlist_add_head(&c->node,
		       &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);

	return c;
}

static void stop_tracking_chunk(struct dm_snapshot *s,
				struct dm_snap_tracked_chunk *c)
{
	unsigned long flags;

	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
	hlist_del(&c->node);
	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);

	mempool_free(c, s->tracked_chunk_pool);
}

/*
 * One of these per registered origin, held in the snapshot_origins hash
 */
struct origin {
	/* The origin device */
	struct block_device *bdev;

	struct list_head hash_list;

	/* List of snapshots for this origin */
	struct list_head snapshots;
};

/*
 * Size of the hash table for origin volumes. If we make this
 * the size of the minors list then it should be nearly perfect
 */
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK      0xFF
static struct list_head *_origins;
static struct rw_semaphore _origins_lock;

static int init_origin_hash(void)
{
	int i;

	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
			   GFP_KERNEL);
	if (!_origins) {
		DMERR("unable to allocate memory");
		return -ENOMEM;
	}

	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
		INIT_LIST_HEAD(_origins + i);
	init_rwsem(&_origins_lock);

	return 0;
}

static void exit_origin_hash(void)
{
	kfree(_origins);
}

static unsigned origin_hash(struct block_device *bdev)
{
	return bdev->bd_dev & ORIGIN_MASK;
}

static struct origin *__lookup_origin(struct block_device *origin)
{
	struct list_head *ol;
	struct origin *o;

	ol = &_origins[origin_hash(origin)];
	list_for_each_entry (o, ol, hash_list)
		if (bdev_equal(o->bdev, origin))
			return o;

	return NULL;
}

static void __insert_origin(struct origin *o)
{
	struct list_head *sl = &_origins[origin_hash(o->bdev)];
	list_add_tail(&o->hash_list, sl);
}

/*
 * Make a note of the snapshot and its origin so we can look it
 * up when the origin has a write on it.
 */
static int register_snapshot(struct dm_snapshot *snap)
{
	struct origin *o;
	struct block_device *bdev = snap->origin->bdev;

	down_write(&_origins_lock);
	o = __lookup_origin(bdev);

	if (!o) {
		/* New origin */
		o = kmalloc(sizeof(*o), GFP_KERNEL);
		if (!o) {
			up_write(&_origins_lock);
			return -ENOMEM;
		}

		/* Initialise the struct */
		INIT_LIST_HEAD(&o->snapshots);
		o->bdev = bdev;

		__insert_origin(o);
	}

	list_add_tail(&snap->list, &o->snapshots);

	up_write(&_origins_lock);
	return 0;
}

static void unregister_snapshot(struct dm_snapshot *s)
{
	struct origin *o;

	down_write(&_origins_lock);
	o = __lookup_origin(s->origin->bdev);

	list_del(&s->list);
	if (list_empty(&o->snapshots)) {
		list_del(&o->hash_list);
		kfree(o);
	}

	up_write(&_origins_lock);
}

/*
 * Implementation of the exception hash tables.
 * The lowest hash_shift bits of the chunk number are ignored, allowing
 * some consecutive chunks to be grouped together.
 */
static int init_exception_table(struct exception_table *et, uint32_t size,
				unsigned hash_shift)
{
	unsigned int i;

	et->hash_shift = hash_shift;
	et->hash_mask = size - 1;
	et->table = dm_vcalloc(size, sizeof(struct list_head));
	if (!et->table)
		return -ENOMEM;

	for (i = 0; i < size; i++)
		INIT_LIST_HEAD(et->table + i);

	return 0;
}

static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem)
{
	struct list_head *slot;
	struct dm_snap_exception *ex, *next;
	int i, size;

	size = et->hash_mask + 1;
	for (i = 0; i < size; i++) {
		slot = et->table + i;

		list_for_each_entry_safe (ex, next, slot, hash_list)
			kmem_cache_free(mem, ex);
	}

	vfree(et->table);
}

static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
{
	return (chunk >> et->hash_shift) & et->hash_mask;
}
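
/*
 * For example, with hash_shift == DM_CHUNK_CONSECUTIVE_BITS the low
 * bits of the chunk number are discarded before masking, so a run of
 * consecutive chunks hashes to the same bucket and can later be merged
 * into a single exception entry by insert_completed_exception().
 */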
static void insert_exception(struct exception_table *eh,
			     struct dm_snap_exception *e)
{
	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
	list_add(&e->hash_list, l);
}

static void remove_exception(struct dm_snap_exception *e)
{
	list_del(&e->hash_list);
}

/*
 * Return the exception data for a sector, or NULL if not
 * remapped.
 */
static struct dm_snap_exception *lookup_exception(struct exception_table *et,
						  chunk_t chunk)
{
	struct list_head *slot;
	struct dm_snap_exception *e;

	slot = &et->table[exception_hash(et, chunk)];
	list_for_each_entry (e, slot, hash_list)
		if (chunk >= e->old_chunk &&
		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
			return e;

	return NULL;
}
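
/*
 * Note the range check above: an exception with old_chunk 10 and a
 * consecutive count of 3 covers chunks 10 through 13, so a lookup of
 * any chunk in that range returns the same entry.
 */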
static struct dm_snap_exception *alloc_exception(void)
{
	struct dm_snap_exception *e;

	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
	if (!e)
		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

	return e;
}

static void free_exception(struct dm_snap_exception *e)
{
	kmem_cache_free(exception_cache, e);
}

static struct dm_snap_pending_exception *alloc_pending_exception(void)
{
	return mempool_alloc(pending_pool, GFP_NOIO);
}

static void free_pending_exception(struct dm_snap_pending_exception *pe)
{
	mempool_free(pe, pending_pool);
}

static void insert_completed_exception(struct dm_snapshot *s,
				       struct dm_snap_exception *new_e)
{
	struct exception_table *eh = &s->complete;
	struct list_head *l;
	struct dm_snap_exception *e = NULL;

	l = &eh->table[exception_hash(eh, new_e->old_chunk)];

	/* Add immediately if this table doesn't support consecutive chunks */
	if (!eh->hash_shift)
		goto out;

	/* List is ordered by old_chunk */
	list_for_each_entry_reverse(e, l, hash_list) {
		/* Insert after an existing chunk? */
		if (new_e->old_chunk == (e->old_chunk +
					 dm_consecutive_chunk_count(e) + 1) &&
		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
					 dm_consecutive_chunk_count(e) + 1)) {
			dm_consecutive_chunk_count_inc(e);
			free_exception(new_e);
			return;
		}

		/* Insert before an existing chunk? */
		if (new_e->old_chunk == (e->old_chunk - 1) &&
		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
			dm_consecutive_chunk_count_inc(e);
			e->old_chunk--;
			e->new_chunk--;
			free_exception(new_e);
			return;
		}

		if (new_e->old_chunk > e->old_chunk)
			break;
	}

out:
	list_add(&new_e->hash_list, e ? &e->hash_list : l);
}

int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
{
	struct dm_snap_exception *e;

	e = alloc_exception();
	if (!e)
		return -ENOMEM;

	e->old_chunk = old;

	/* Consecutive_count is implicitly initialised to zero */
	e->new_chunk = new;

	insert_completed_exception(s, e);

	return 0;
}

/*
 * Hard coded magic.
 */
static int calc_max_buckets(void)
{
	/* use a fixed size of 2MB */
	unsigned long mem = 2 * 1024 * 1024;
	mem /= sizeof(struct list_head);

	return mem;
}

/*
 * Allocate room for a suitable hash table.
 */
static int init_hash_tables(struct dm_snapshot *s)
{
	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;

	/*
	 * Calculate based on the size of the original volume or
	 * the COW volume...
	 */
	cow_dev_size = get_dev_size(s->cow->bdev);
	origin_dev_size = get_dev_size(s->origin->bdev);
	max_buckets = calc_max_buckets();

	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
	hash_size = min(hash_size, max_buckets);
	hash_size = rounddown_pow_of_two(hash_size);

	if (init_exception_table(&s->complete, hash_size,
				 DM_CHUNK_CONSECUTIVE_BITS))
		return -ENOMEM;

	/*
	 * Allocate hash table for in-flight exceptions
	 * Make this smaller than the real hash table
	 */
	hash_size >>= 3;
	if (hash_size < 64)
		hash_size = 64;

	if (init_exception_table(&s->pending, hash_size, 0)) {
		exit_exception_table(&s->complete, exception_cache);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Round a number up to the nearest 'size' boundary.  size must
 * be a power of 2.
 */
static ulong round_up(ulong n, ulong size)
{
	size--;
	return (n + size) & ~size;
}
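
/*
 * e.g. round_up(9, 8) == 16 and round_up(16, 8) == 16: adding size - 1
 * and masking off the low bits rounds n up to the next multiple of size.
 */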
static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
			  char **error)
{
	unsigned long chunk_size;
	char *value;

	chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
	if (*chunk_size_arg == '\0' || *value != '\0') {
		*error = "Invalid chunk size";
		return -EINVAL;
	}

	if (!chunk_size) {
		s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
		return 0;
	}

	/*
	 * Chunk size must be multiple of page size.  Silently
	 * round up if it's not.
	 */
	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);

	/* Check chunk_size is a power of 2 */
	if (!is_power_of_2(chunk_size)) {
		*error = "Chunk size is not a power of 2";
		return -EINVAL;
	}

	/* Validate the chunk size against the device block size */
	if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
		*error = "Chunk size is not a multiple of device blocksize";
		return -EINVAL;
	}

	s->chunk_size = chunk_size;
	s->chunk_mask = chunk_size - 1;
	s->chunk_shift = ffs(chunk_size) - 1;

	return 0;
}
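
/*
 * chunk_size is measured in 512-byte sectors, which is why PAGE_SIZE
 * and the hardsect size are shifted right by 9 above.  For example a
 * chunk_size of 16 sectors (8KiB) yields chunk_mask 0xf and chunk_shift
 * ffs(16) - 1 == 4.
 */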
/*
 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
 */
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_snapshot *s;
	int i;
	int r = -EINVAL;
	char persistent;
	char *origin_path;
	char *cow_path;

	if (argc != 4) {
		ti->error = "requires exactly 4 arguments";
		r = -EINVAL;
		goto bad1;
	}

	origin_path = argv[0];
	cow_path = argv[1];
	persistent = toupper(*argv[2]);

	if (persistent != 'P' && persistent != 'N') {
		ti->error = "Persistent flag is not P or N";
		r = -EINVAL;
		goto bad1;
	}

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s == NULL) {
		ti->error = "Cannot allocate snapshot context private "
			    "structure";
		r = -ENOMEM;
		goto bad1;
	}

	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
	if (r) {
		ti->error = "Cannot get origin device";
		goto bad2;
	}

	r = dm_get_device(ti, cow_path, 0, 0,
			  FMODE_READ | FMODE_WRITE, &s->cow);
	if (r) {
		dm_put_device(ti, s->origin);
		ti->error = "Cannot get COW device";
		goto bad2;
	}

	r = set_chunk_size(s, argv[3], &ti->error);
	if (r)
		goto bad3;

	s->type = persistent;

	s->valid = 1;
	s->active = 0;
	s->last_percent = 0;
	init_rwsem(&s->lock);
	spin_lock_init(&s->pe_lock);
	s->ti = ti;

	/* Allocate hash table for COW data */
	if (init_hash_tables(s)) {
		ti->error = "Unable to allocate hash table space";
		r = -ENOMEM;
		goto bad3;
	}

	s->store.snap = s;

	if (persistent == 'P')
		r = dm_create_persistent(&s->store);
	else
		r = dm_create_transient(&s->store);

	if (r) {
		ti->error = "Couldn't create exception store";
		r = -EINVAL;
		goto bad4;
	}

	r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
	if (r) {
		ti->error = "Could not create kcopyd client";
		goto bad5;
	}

	s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
							 tracked_chunk_cache);
	if (!s->tracked_chunk_pool) {
		ti->error = "Could not allocate tracked_chunk mempool for "
			    "tracking reads";
		goto bad6;
	}

	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);

	spin_lock_init(&s->tracked_chunk_lock);

	/* Metadata must only be loaded into one table at once */
	r = s->store.read_metadata(&s->store);
	if (r < 0) {
		ti->error = "Failed to read snapshot metadata";
		goto bad_load_and_register;
	} else if (r > 0) {
		s->valid = 0;
		DMWARN("Snapshot is marked invalid.");
	}

	bio_list_init(&s->queued_bios);
	INIT_WORK(&s->queued_bios_work, flush_queued_bios);

	/* Add snapshot to the list of snapshots for this origin */
	/* Exceptions aren't triggered till snapshot_resume() is called */
	if (register_snapshot(s)) {
		r = -EINVAL;
		ti->error = "Cannot register snapshot origin";
		goto bad_load_and_register;
	}

	ti->private = s;
	ti->split_io = s->chunk_size;

	return 0;

bad_load_and_register:
	mempool_destroy(s->tracked_chunk_pool);

bad6:
	dm_kcopyd_client_destroy(s->kcopyd_client);

bad5:
	s->store.destroy(&s->store);

bad4:
	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

bad3:
	dm_put_device(ti, s->cow);
	dm_put_device(ti, s->origin);

bad2:
	kfree(s);

bad1:
	return r;
}

static void __free_exceptions(struct dm_snapshot *s)
{
	dm_kcopyd_client_destroy(s->kcopyd_client);
	s->kcopyd_client = NULL;

	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

	s->store.destroy(&s->store);
}

static void snapshot_dtr(struct dm_target *ti)
{
#ifdef CONFIG_DM_DEBUG
	int i;
#endif
	struct dm_snapshot *s = ti->private;

	flush_workqueue(ksnapd);

	/* Prevent further origin writes from using this snapshot. */
	/* After this returns there can be no new kcopyd jobs. */
	unregister_snapshot(s);

#ifdef CONFIG_DM_DEBUG
	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
#endif

	mempool_destroy(s->tracked_chunk_pool);

	__free_exceptions(s);

	dm_put_device(ti, s->origin);
	dm_put_device(ti, s->cow);

	kfree(s);
}

/*
 * Flush a list of buffers.
 */
static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		generic_make_request(bio);
		bio = n;
	}
}

static void flush_queued_bios(struct work_struct *work)
{
	struct dm_snapshot *s =
		container_of(work, struct dm_snapshot, queued_bios_work);
	struct bio *queued_bios;
	unsigned long flags;

	spin_lock_irqsave(&s->pe_lock, flags);
	queued_bios = bio_list_get(&s->queued_bios);
	spin_unlock_irqrestore(&s->pe_lock, flags);

	flush_bios(queued_bios);
}

/*
 * Error a list of buffers.
 */
static void error_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		bio_io_error(bio);
		bio = n;
	}
}

static void __invalidate_snapshot(struct dm_snapshot *s, int err)
{
	if (!s->valid)
		return;

	if (err == -EIO)
		DMERR("Invalidating snapshot: Error reading/writing.");
	else if (err == -ENOMEM)
		DMERR("Invalidating snapshot: Unable to allocate exception.");

	if (s->store.drop_snapshot)
		s->store.drop_snapshot(&s->store);

	s->valid = 0;

	dm_table_event(s->ti->table);
}

static void get_pending_exception(struct dm_snap_pending_exception *pe)
{
	atomic_inc(&pe->ref_count);
}

static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
{
	struct dm_snap_pending_exception *primary_pe;
	struct bio *origin_bios = NULL;

	primary_pe = pe->primary_pe;

	/*
	 * If this pe is involved in a write to the origin and
	 * it is the last sibling to complete then release
	 * the bios for the original write to the origin.
	 */
	if (primary_pe &&
	    atomic_dec_and_test(&primary_pe->ref_count))
		origin_bios = bio_list_get(&primary_pe->origin_bios);

	/*
	 * Free the pe if it's not linked to an origin write or if
	 * it's not itself a primary pe.
	 */
	if (!primary_pe || primary_pe != pe)
		free_pending_exception(pe);

	/*
	 * Free the primary pe if nothing references it.
	 */
	if (primary_pe && !atomic_read(&primary_pe->ref_count))
		free_pending_exception(primary_pe);

	return origin_bios;
}

static void pending_complete(struct dm_snap_pending_exception *pe, int success)
{
	struct dm_snap_exception *e;
	struct dm_snapshot *s = pe->snap;
	struct bio *origin_bios = NULL;
	struct bio *snapshot_bios = NULL;
	int error = 0;

	if (!success) {
		/* Read/write error - snapshot is unusable */
		down_write(&s->lock);
		__invalidate_snapshot(s, -EIO);
		error = 1;
		goto out;
	}

	e = alloc_exception();
	if (!e) {
		down_write(&s->lock);
		__invalidate_snapshot(s, -ENOMEM);
		error = 1;
		goto out;
	}
	*e = pe->e;

	down_write(&s->lock);
	if (!s->valid) {
		free_exception(e);
		error = 1;
		goto out;
	}

	/*
	 * Add a proper exception, and remove the
	 * in-flight exception from the list.
	 */
	insert_completed_exception(s, e);

out:
	remove_exception(&pe->e);
	snapshot_bios = bio_list_get(&pe->snapshot_bios);
	origin_bios = put_pending_exception(pe);

	up_write(&s->lock);

	/* Submit any pending write bios */
	if (error)
		error_bios(snapshot_bios);
	else
		flush_bios(snapshot_bios);

	flush_bios(origin_bios);
}

static void commit_callback(void *context, int success)
{
	struct dm_snap_pending_exception *pe = context;

	pending_complete(pe, success);
}

/*
 * Called when the copy I/O has finished.  kcopyd actually runs
 * this code so don't block.
 */
static void copy_callback(int read_err, unsigned long write_err, void *context)
{
	struct dm_snap_pending_exception *pe = context;
	struct dm_snapshot *s = pe->snap;

	if (read_err || write_err)
		pending_complete(pe, 0);
	else
		/* Update the metadata if we are persistent */
		s->store.commit_exception(&s->store, &pe->e, commit_callback,
					  pe);
}

/*
 * Dispatches the copy operation to kcopyd.
 */
static void start_copy(struct dm_snap_pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;
	struct dm_io_region src, dest;
	struct block_device *bdev = s->origin->bdev;
	sector_t dev_size;

	dev_size = get_dev_size(bdev);

	src.bdev = bdev;
	src.sector = chunk_to_sector(s, pe->e.old_chunk);
	src.count = min(s->chunk_size, dev_size - src.sector);

	dest.bdev = s->cow->bdev;
	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
	dest.count = src.count;

	/* Hand over to kcopyd */
	dm_kcopyd_copy(s->kcopyd_client,
		       &src, 1, &dest, 0, copy_callback, pe);
}

/*
 * Looks to see if this snapshot already has a pending exception
 * for this chunk, otherwise it allocates a new one and inserts
 * it into the pending table.
 *
 * NOTE: a write lock must be held on snap->lock before calling
 * this.
 */
static struct dm_snap_pending_exception *
__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
{
	struct dm_snap_exception *e;
	struct dm_snap_pending_exception *pe;
	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);

	/*
	 * Is there a pending exception for this already ?
	 */
	e = lookup_exception(&s->pending, chunk);
	if (e) {
		/* cast the exception to a pending exception */
		pe = container_of(e, struct dm_snap_pending_exception, e);
		goto out;
	}

	/*
	 * Create a new pending exception, we don't want
	 * to hold the lock while we do this.
	 */
	up_write(&s->lock);
	pe = alloc_pending_exception();
	down_write(&s->lock);

	if (!s->valid) {
		free_pending_exception(pe);
		return NULL;
	}

	e = lookup_exception(&s->pending, chunk);
	if (e) {
		free_pending_exception(pe);
		pe = container_of(e, struct dm_snap_pending_exception, e);
		goto out;
	}

	pe->e.old_chunk = chunk;
	bio_list_init(&pe->origin_bios);
	bio_list_init(&pe->snapshot_bios);
	pe->primary_pe = NULL;
	atomic_set(&pe->ref_count, 0);
	pe->snap = s;
	pe->started = 0;

	if (s->store.prepare_exception(&s->store, &pe->e)) {
		free_pending_exception(pe);
		return NULL;
	}

	get_pending_exception(pe);
	insert_exception(&s->pending, &pe->e);

out:
	return pe;
}

static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
			    struct bio *bio, chunk_t chunk)
{
	bio->bi_bdev = s->cow->bdev;
	bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
			 (chunk - e->old_chunk)) +
			 (bio->bi_sector & s->chunk_mask);
}
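
/*
 * The remap arithmetic above: take the exception's new_chunk, step
 * forward by the offset of 'chunk' within the consecutive run
 * (chunk - e->old_chunk), convert that chunk to a sector, then re-add
 * the bio's offset within the chunk (bio->bi_sector & s->chunk_mask).
 */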
static int snapshot_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct dm_snap_exception *e;
	struct dm_snapshot *s = ti->private;
	int r = DM_MAPIO_REMAPPED;
	chunk_t chunk;
	struct dm_snap_pending_exception *pe = NULL;

	chunk = sector_to_chunk(s, bio->bi_sector);

	/* Full snapshots are not usable */
	/* To get here the table must be live so s->active is always set. */
	if (!s->valid)
		return -EIO;

	/* FIXME: should only take write lock if we need
	 * to copy an exception */
	down_write(&s->lock);

	if (!s->valid) {
		r = -EIO;
		goto out_unlock;
	}

	/* If the block is already remapped - use that, else remap it */
	e = lookup_exception(&s->complete, chunk);
	if (e) {
		remap_exception(s, e, bio, chunk);
		goto out_unlock;
	}

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writeable.
	 */
	if (bio_rw(bio) == WRITE) {
		pe = __find_pending_exception(s, bio);
		if (!pe) {
			__invalidate_snapshot(s, -ENOMEM);
			r = -EIO;
			goto out_unlock;
		}

		remap_exception(s, &pe->e, bio, chunk);
		bio_list_add(&pe->snapshot_bios, bio);

		r = DM_MAPIO_SUBMITTED;

		if (!pe->started) {
			/* this is protected by snap->lock */
			pe->started = 1;
			up_write(&s->lock);
			start_copy(pe);
			goto out;
		}
	} else {
		bio->bi_bdev = s->origin->bdev;
		map_context->ptr = track_chunk(s, chunk);
	}

out_unlock:
	up_write(&s->lock);
out:
	return r;
}

static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
			   int error, union map_info *map_context)
{
	struct dm_snapshot *s = ti->private;
	struct dm_snap_tracked_chunk *c = map_context->ptr;

	if (c)
		stop_tracking_chunk(s, c);

	return 0;
}

static void snapshot_resume(struct dm_target *ti)
{
	struct dm_snapshot *s = ti->private;

	down_write(&s->lock);
	s->active = 1;
	up_write(&s->lock);
}

static int snapshot_status(struct dm_target *ti, status_type_t type,
			   char *result, unsigned int maxlen)
{
	struct dm_snapshot *snap = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (!snap->valid)
			snprintf(result, maxlen, "Invalid");
		else {
			if (snap->store.fraction_full) {
				sector_t numerator, denominator;
				snap->store.fraction_full(&snap->store,
							  &numerator,
							  &denominator);
				snprintf(result, maxlen, "%llu/%llu",
					 (unsigned long long)numerator,
					 (unsigned long long)denominator);
			} else
				snprintf(result, maxlen, "Unknown");
		}
		break;

	case STATUSTYPE_TABLE:
		/*
		 * kdevname returns a static pointer so we need
		 * to make private copies if the output is to
		 * make sense.
		 */
		snprintf(result, maxlen, "%s %s %c %llu",
			 snap->origin->name, snap->cow->name,
			 snap->type,
			 (unsigned long long)snap->chunk_size);
		break;
	}

	return 0;
}

/*-----------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------*/
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
	int r = DM_MAPIO_REMAPPED, first = 0;
	struct dm_snapshot *snap;
	struct dm_snap_exception *e;
	struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL;
	chunk_t chunk;
	LIST_HEAD(pe_queue);

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {

		down_write(&snap->lock);

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			goto next_snapshot;

		/* Nothing to do if writing beyond end of snapshot */
		if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
			goto next_snapshot;

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap, bio->bi_sector);

		/*
		 * Check exception table to see if block
		 * is already remapped in this snapshot
		 * and trigger an exception if not.
		 *
		 * ref_count is initialised to 1 so pending_complete()
		 * won't destroy the primary_pe while we're inside this loop.
		 */
		e = lookup_exception(&snap->complete, chunk);
		if (e)
			goto next_snapshot;

		pe = __find_pending_exception(snap, bio);
		if (!pe) {
			__invalidate_snapshot(snap, -ENOMEM);
			goto next_snapshot;
		}

		if (!primary_pe) {
			/*
			 * Either every pe here has same
			 * primary_pe or none has one yet.
			 */
			if (pe->primary_pe)
				primary_pe = pe->primary_pe;
			else {
				primary_pe = pe;
				first = 1;
			}

			bio_list_add(&primary_pe->origin_bios, bio);

			r = DM_MAPIO_SUBMITTED;
		}

		if (!pe->primary_pe) {
			pe->primary_pe = primary_pe;
			get_pending_exception(primary_pe);
		}

		if (!pe->started) {
			pe->started = 1;
			list_add_tail(&pe->list, &pe_queue);
		}

next_snapshot:
		up_write(&snap->lock);
	}

	if (!primary_pe)
		return r;

	/*
	 * If this is the first time we're processing this chunk and
	 * ref_count is now 1 it means all the pending exceptions
	 * got completed while we were in the loop above, so it falls to
	 * us here to remove the primary_pe and submit any origin_bios.
	 */
	if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
		flush_bios(bio_list_get(&primary_pe->origin_bios));
		free_pending_exception(primary_pe);
		/* If we got here, pe_queue is necessarily empty. */
		return r;
	}

	/*
	 * Now that we have a complete pe list we can start the copying.
	 */
	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
		start_copy(pe);

	return r;
}

/*
 * Called on a write from the origin driver.
 */
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
	struct origin *o;
	int r = DM_MAPIO_REMAPPED;

	down_read(&_origins_lock);
	o = __lookup_origin(origin->bdev);
	if (o)
		r = __origin_write(&o->snapshots, bio);
	up_read(&_origins_lock);

	return r;
}

/*
 * Origin: maps a linear range of a device, with hooks for snapshotting.
 */

/*
 * Construct an origin mapping: <dev_path>
 * The context for an origin is merely a 'struct dm_dev *'
 * pointing to the real device.
 */
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct dm_dev *dev;

	if (argc != 1) {
		ti->error = "origin: incorrect number of arguments";
		return -EINVAL;
	}

	r = dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &dev);
	if (r) {
		ti->error = "Cannot get target device";
		return r;
	}

	ti->private = dev;
	return 0;
}

static void origin_dtr(struct dm_target *ti)
{
	struct dm_dev *dev = ti->private;
	dm_put_device(ti, dev);
}

static int origin_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct dm_dev *dev = ti->private;
	bio->bi_bdev = dev->bdev;

	/* Only tell snapshots if this is a write */
	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
}

#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
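
/*
 * min_not_zero() picks the smaller of two values while treating zero as
 * "unset": min_not_zero(0, 8) == 8, min_not_zero(4, 8) == 4.  Note the
 * expansion is not parenthesised, so it is only safe in simple
 * expressions like the assignment in origin_resume() below.
 */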

/*
 * Set the target "split_io" field to the minimum of all the snapshots'
 * chunk sizes.
 */
static void origin_resume(struct dm_target *ti)
{
	struct dm_dev *dev = ti->private;
	struct dm_snapshot *snap;
	struct origin *o;
	chunk_t chunk_size = 0;

	down_read(&_origins_lock);
	o = __lookup_origin(dev->bdev);
	if (o)
		list_for_each_entry (snap, &o->snapshots, list)
			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
	up_read(&_origins_lock);

	ti->split_io = chunk_size;
}

static int origin_status(struct dm_target *ti, status_type_t type, char *result,
			 unsigned int maxlen)
{
	struct dm_dev *dev = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s", dev->name);
		break;
	}

	return 0;
}

static struct target_type origin_target = {
	.name    = "snapshot-origin",
	.version = {1, 6, 0},
	.module  = THIS_MODULE,
	.ctr     = origin_ctr,
	.dtr     = origin_dtr,
	.map     = origin_map,
	.resume  = origin_resume,
	.status  = origin_status,
};

static struct target_type snapshot_target = {
	.name    = "snapshot",
	.version = {1, 6, 0},
	.module  = THIS_MODULE,
	.ctr     = snapshot_ctr,
	.dtr     = snapshot_dtr,
	.map     = snapshot_map,
	.end_io  = snapshot_end_io,
	.resume  = snapshot_resume,
	.status  = snapshot_status,
};

static int __init dm_snapshot_init(void)
{
	int r;

	r = dm_register_target(&snapshot_target);
	if (r) {
		DMERR("snapshot target register failed %d", r);
		return r;
	}

	r = dm_register_target(&origin_target);
	if (r < 0) {
		DMERR("Origin target register failed %d", r);
		goto bad1;
	}

	r = init_origin_hash();
	if (r) {
		DMERR("init_origin_hash failed.");
		goto bad2;
	}

	exception_cache = KMEM_CACHE(dm_snap_exception, 0);
	if (!exception_cache) {
		DMERR("Couldn't create exception cache.");
		r = -ENOMEM;
		goto bad3;
	}

	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
	if (!pending_cache) {
		DMERR("Couldn't create pending cache.");
		r = -ENOMEM;
		goto bad4;
	}

	tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
	if (!tracked_chunk_cache) {
		DMERR("Couldn't create cache to track chunks in use.");
		r = -ENOMEM;
		goto bad5;
	}

	pending_pool = mempool_create_slab_pool(128, pending_cache);
	if (!pending_pool) {
		DMERR("Couldn't create pending pool.");
		r = -ENOMEM;
		goto bad_pending_pool;
	}

	ksnapd = create_singlethread_workqueue("ksnapd");
	if (!ksnapd) {
		DMERR("Failed to create ksnapd workqueue.");
		r = -ENOMEM;
		goto bad6;
	}

	return 0;

bad6:
	mempool_destroy(pending_pool);
bad_pending_pool:
	kmem_cache_destroy(tracked_chunk_cache);
bad5:
	kmem_cache_destroy(pending_cache);
bad4:
	kmem_cache_destroy(exception_cache);
bad3:
	exit_origin_hash();
bad2:
	dm_unregister_target(&origin_target);
bad1:
	dm_unregister_target(&snapshot_target);
	return r;
}

static void __exit dm_snapshot_exit(void)
{
	int r;

	destroy_workqueue(ksnapd);

	r = dm_unregister_target(&snapshot_target);
	if (r)
		DMERR("snapshot unregister failed %d", r);

	r = dm_unregister_target(&origin_target);
	if (r)
		DMERR("origin unregister failed %d", r);

	exit_origin_hash();
	mempool_destroy(pending_pool);
	kmem_cache_destroy(pending_cache);
	kmem_cache_destroy(exception_cache);
	kmem_cache_destroy(tracked_chunk_cache);
}

/* Module hooks */
module_init(dm_snapshot_init);
module_exit(dm_snapshot_exit);

MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");