@@ -0,0 +1,2428 @@
+/*
+ * Copyright (C) 2011 Red Hat UK.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-thin-metadata.h"
+
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX	"thin"
+
+/*
+ * Tunable constants
+ */
+#define ENDIO_HOOK_POOL_SIZE 10240
+#define DEFERRED_SET_SIZE 64
+#define MAPPING_POOL_SIZE 1024
+#define PRISON_CELLS 1024
+
+/*
+ * The block size of the device holding pool data must be
+ * between 64KB and 1GB.
+ */
+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
+#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
+
+/*
+ * The metadata device is currently limited in size. The limitation is
+ * checked lower down in dm-space-map-metadata, but we also check it here
+ * so we can fail early.
+ *
+ * We have one block of index, which can hold 255 index entries. Each
+ * index entry contains allocation info about 16k metadata blocks.
+ */
+#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+
+/*
+ * Device id is restricted to 24 bits.
+ */
+#define MAX_DEV_ID ((1 << 24) - 1)
+
+/*
+ * How do we handle breaking sharing of data blocks?
+ * =================================================
+ *
+ * We use a standard copy-on-write btree to store the mappings for the
+ * devices (note I'm talking about copy-on-write of the metadata here, not
+ * the data). When you take an internal snapshot you clone the root node
+ * of the origin btree. After this there is no concept of an origin or a
+ * snapshot. They are just two device trees that happen to point to the
+ * same data blocks.
+ *
+ * When we get a write in we decide if it's to a shared data block using
+ * some timestamp magic. If it is, we have to break sharing.
+ *
+ * Let's say we write to a shared block in what was the origin. The
+ * steps are:
+ *
+ * i) plug io further to this physical block. (see bio_prison code).
+ *
+ * ii) quiesce any read io to that shared data block. Obviously
+ * including all devices that share this block. (see deferred_set code)
+ *
+ * iii) copy the data block to a newly allocated block. This step can be
+ * skipped if the io covers the whole block. (schedule_copy).
+ *
+ * iv) insert the new mapping into the origin's btree
+ * (process_prepared_mappings). This act of inserting breaks some
+ * sharing of btree nodes between the two devices. Breaking sharing only
+ * affects the btree of that specific device. Btrees for the other
+ * devices that share the block never change. The btree for the origin
+ * device as it was after the last commit is untouched, i.e. we're using
+ * persistent data structures in the functional programming sense.
+ *
+ * v) unplug io to this physical block, including the io that triggered
+ * the breaking of sharing.
+ *
+ * Steps (ii) and (iii) occur in parallel.
+ *
+ * The metadata _doesn't_ need to be committed before the io continues. We
+ * get away with this because the io is always written to a _new_ block.
+ * If there's a crash, then:
+ *
+ * - The origin mapping will point to the old origin block (the shared
+ * one). This will contain the data as it was before the io that triggered
+ * the breaking of sharing came in.
+ *
+ * - The snap mapping still points to the old block. As it would after
+ * the commit.
+ *
+ * The downside of this scheme is that the timestamp magic isn't perfect,
+ * and will continue to think that the data block in the snapshot device is
+ * shared even after the write to the origin has broken sharing. I suspect
+ * data blocks will typically be shared by many different devices, so we're
+ * breaking sharing n + 1 times, rather than n, where n is the number of
+ * devices that reference this data block. At the moment I think the
+ * benefits far, far outweigh the disadvantages.
+ */
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Sometimes we can't deal with a bio straight away. We put it in prison
+ * where it can't cause any mischief. Bios are put in a cell identified
+ * by a key; multiple bios can be in the same cell. When the cell is
+ * subsequently unlocked the bios become available.
+ */
+struct bio_prison;
+
+struct cell_key {
+	int virtual;
+	dm_thin_id dev;
+	dm_block_t block;
+};
+
+struct cell {
+	struct hlist_node list;
+	struct bio_prison *prison;
+	struct cell_key key;
+	unsigned count;
+	struct bio_list bios;
+};
+
+struct bio_prison {
+	spinlock_t lock;
+	mempool_t *cell_pool;
+
+	unsigned nr_buckets;
+	unsigned hash_mask;
+	struct hlist_head *cells;
+};
+
+static uint32_t calc_nr_buckets(unsigned nr_cells)
+{
+	uint32_t n = 128;
+
+	nr_cells /= 4;
+	nr_cells = min(nr_cells, 8192u);
+
+	while (n < nr_cells)
+		n <<= 1;
+
+	return n;
+}
+
+/*
+ * @nr_cells should be the number of cells you want in use _concurrently_.
+ * Don't confuse it with the number of distinct keys.
+ */
+static struct bio_prison *prison_create(unsigned nr_cells)
+{
+	unsigned i;
+	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
+	size_t len = sizeof(struct bio_prison) +
+		(sizeof(struct hlist_head) * nr_buckets);
+	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
+
+	if (!prison)
+		return NULL;
+
+	spin_lock_init(&prison->lock);
+	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
+							sizeof(struct cell));
+	if (!prison->cell_pool) {
+		kfree(prison);
+		return NULL;
+	}
+
+	prison->nr_buckets = nr_buckets;
+	prison->hash_mask = nr_buckets - 1;
+	prison->cells = (struct hlist_head *) (prison + 1);
+	for (i = 0; i < nr_buckets; i++)
+		INIT_HLIST_HEAD(prison->cells + i);
+
+	return prison;
+}
+
+static void prison_destroy(struct bio_prison *prison)
+{
+	mempool_destroy(prison->cell_pool);
+	kfree(prison);
+}
+
+static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
+{
+	const unsigned long BIG_PRIME = 4294967291UL;
+	uint64_t hash = key->block * BIG_PRIME;
+
+	return (uint32_t) (hash & prison->hash_mask);
+}
+
+static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
+{
+	return (lhs->virtual == rhs->virtual) &&
+		(lhs->dev == rhs->dev) &&
+		(lhs->block == rhs->block);
+}
+
+static struct cell *__search_bucket(struct hlist_head *bucket,
+				    struct cell_key *key)
+{
+	struct cell *cell;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry(cell, tmp, bucket, list)
+		if (keys_equal(&cell->key, key))
+			return cell;
+
+	return NULL;
+}
+
+/*
+ * This may block if a new cell needs allocating. You must ensure that
+ * cells will be unlocked even if the calling thread is blocked.
+ *
+ * Returns the number of entries in the cell prior to the new addition
+ * or < 0 on failure.
+ */
+static int bio_detain(struct bio_prison *prison, struct cell_key *key,
+		      struct bio *inmate, struct cell **ref)
+{
+	int r;
+	unsigned long flags;
+	uint32_t hash = hash_key(prison, key);
+	struct cell *uninitialized_var(cell), *cell2 = NULL;
+
+	BUG_ON(hash > prison->nr_buckets);
+
+	spin_lock_irqsave(&prison->lock, flags);
+	cell = __search_bucket(prison->cells + hash, key);
+
+	if (!cell) {
+		/*
+		 * Allocate a new cell
+		 */
+		spin_unlock_irqrestore(&prison->lock, flags);
+		cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
+		spin_lock_irqsave(&prison->lock, flags);
+
+		/*
+		 * We've been unlocked, so we have to double check that
+		 * nobody else has inserted this cell in the meantime.
+		 */
+		cell = __search_bucket(prison->cells + hash, key);
+
+		if (!cell) {
+			cell = cell2;
+			cell2 = NULL;
+
+			cell->prison = prison;
+			memcpy(&cell->key, key, sizeof(cell->key));
+			cell->count = 0;
+			bio_list_init(&cell->bios);
+			hlist_add_head(&cell->list, prison->cells + hash);
+		}
+	}
+
+	r = cell->count++;
+	bio_list_add(&cell->bios, inmate);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	if (cell2)
+		mempool_free(cell2, prison->cell_pool);
+
+	*ref = cell;
+
+	return r;
+}
+
+/*
+ * @inmates must have been initialised prior to this call
+ */
+static void __cell_release(struct cell *cell, struct bio_list *inmates)
+{
+	struct bio_prison *prison = cell->prison;
+
+	hlist_del(&cell->list);
+
+	if (inmates)
+		bio_list_merge(inmates, &cell->bios);
+
+	mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release(struct cell *cell, struct bio_list *bios)
+{
+	unsigned long flags;
+	struct bio_prison *prison = cell->prison;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release(cell, bios);
+	spin_unlock_irqrestore(&prison->lock, flags);
+}
+
+/*
+ * There are a couple of places where we put a bio into a cell briefly
+ * before taking it out again. In these situations we know that no other
+ * bio may be in the cell. This function releases the cell, and also does
+ * a sanity check.
+ */
+static void cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+	struct bio_prison *prison = cell->prison;
+	struct bio_list bios;
+	struct bio *b;
+	unsigned long flags;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release(cell, &bios);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	b = bio_list_pop(&bios);
+	BUG_ON(b != bio);
+	BUG_ON(!bio_list_empty(&bios));
+}
+
+static void cell_error(struct cell *cell)
+{
+	struct bio_prison *prison = cell->prison;
+	struct bio_list bios;
+	struct bio *bio;
+	unsigned long flags;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release(cell, &bios);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We use the deferred set to keep track of pending reads to shared blocks.
+ * We do this to ensure the new mapping caused by a write isn't performed
+ * until these prior reads have completed. Otherwise the insertion of the
+ * new mapping could free the old block that the read bios are mapped to.
+ */
+
+struct deferred_set;
+struct deferred_entry {
+	struct deferred_set *ds;
+	unsigned count;
+	struct list_head work_items;
+};
+
+struct deferred_set {
+	spinlock_t lock;
+	unsigned current_entry;
+	unsigned sweeper;
+	struct deferred_entry entries[DEFERRED_SET_SIZE];
+};
+
+static void ds_init(struct deferred_set *ds)
+{
+	int i;
+
+	spin_lock_init(&ds->lock);
+	ds->current_entry = 0;
+	ds->sweeper = 0;
+	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
+		ds->entries[i].ds = ds;
+		ds->entries[i].count = 0;
+		INIT_LIST_HEAD(&ds->entries[i].work_items);
+	}
+}
+
+static struct deferred_entry *ds_inc(struct deferred_set *ds)
+{
+	unsigned long flags;
+	struct deferred_entry *entry;
+
+	spin_lock_irqsave(&ds->lock, flags);
+	entry = ds->entries + ds->current_entry;
+	entry->count++;
+	spin_unlock_irqrestore(&ds->lock, flags);
+
+	return entry;
+}
+
+static unsigned ds_next(unsigned index)
+{
+	return (index + 1) % DEFERRED_SET_SIZE;
+}
+
+static void __sweep(struct deferred_set *ds, struct list_head *head)
+{
+	while ((ds->sweeper != ds->current_entry) &&
+	       !ds->entries[ds->sweeper].count) {
+		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
+		ds->sweeper = ds_next(ds->sweeper);
+	}
+
+	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
+		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
+}
+
+static void ds_dec(struct deferred_entry *entry, struct list_head *head)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&entry->ds->lock, flags);
+	BUG_ON(!entry->count);
+	--entry->count;
+	__sweep(entry->ds, head);
+	spin_unlock_irqrestore(&entry->ds->lock, flags);
+}
+
+/*
+ * Returns 1 if the work was deferred, or 0 if there were no pending items
+ * to delay the job.
+ */
+static int ds_add_work(struct deferred_set *ds, struct list_head *work)
+{
+	int r = 1;
+	unsigned long flags;
+	unsigned next_entry;
+
+	spin_lock_irqsave(&ds->lock, flags);
+	if ((ds->sweeper == ds->current_entry) &&
+	    !ds->entries[ds->current_entry].count)
+		r = 0;
+	else {
+		list_add(work, &ds->entries[ds->current_entry].work_items);
+		next_entry = ds_next(ds->current_entry);
+		if (!ds->entries[next_entry].count)
+			ds->current_entry = next_entry;
+	}
+	spin_unlock_irqrestore(&ds->lock, flags);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Key building.
+ */
+static void build_data_key(struct dm_thin_device *td,
+			   dm_block_t b, struct cell_key *key)
+{
+	key->virtual = 0;
+	key->dev = dm_thin_dev_id(td);
+	key->block = b;
+}
+
+static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
+			      struct cell_key *key)
+{
+	key->virtual = 1;
+	key->dev = dm_thin_dev_id(td);
+	key->block = b;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * A pool device ties together a metadata device and a data device. It
+ * also provides the interface for creating and destroying internal
+ * devices.
+ */
+struct new_mapping;
+struct pool {
+	struct list_head list;
+	struct dm_target *ti;	/* Only set if a pool target is bound */
+
+	struct mapped_device *pool_md;
+	struct block_device *md_dev;
+	struct dm_pool_metadata *pmd;
+
+	uint32_t sectors_per_block;
+	unsigned block_shift;
+	dm_block_t offset_mask;
+	dm_block_t low_water_blocks;
+
+	unsigned zero_new_blocks:1;
+	unsigned low_water_triggered:1;	/* A dm event has been sent */
+	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */
+
+	struct bio_prison *prison;
+	struct dm_kcopyd_client *copier;
+
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	unsigned ref_count;
+
+	spinlock_t lock;
+	struct bio_list deferred_bios;
+	struct bio_list deferred_flush_bios;
+	struct list_head prepared_mappings;
+
+	struct bio_list retry_on_resume_list;
+
+	struct deferred_set ds;	/* FIXME: move to thin_c */
+
+	struct new_mapping *next_mapping;
+	mempool_t *mapping_pool;
+	mempool_t *endio_hook_pool;
+};
+
+/*
+ * Target context for a pool.
+ */
+struct pool_c {
+	struct dm_target *ti;
+	struct pool *pool;
+	struct dm_dev *data_dev;
+	struct dm_dev *metadata_dev;
+	struct dm_target_callbacks callbacks;
+
+	dm_block_t low_water_blocks;
+	unsigned zero_new_blocks:1;
+};
+
+/*
+ * Target context for a thin.
+ */
+struct thin_c {
+	struct dm_dev *pool_dev;
+	dm_thin_id dev_id;
+
+	struct pool *pool;
+	struct dm_thin_device *td;
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * A global list of pools that uses a struct mapped_device as a key.
+ */
+static struct dm_thin_pool_table {
+	struct mutex mutex;
+	struct list_head pools;
+} dm_thin_pool_table;
+
+static void pool_table_init(void)
+{
+	mutex_init(&dm_thin_pool_table.mutex);
+	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
+}
+
+static void __pool_table_insert(struct pool *pool)
+{
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+	list_add(&pool->list, &dm_thin_pool_table.pools);
+}
+
+static void __pool_table_remove(struct pool *pool)
+{
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+	list_del(&pool->list);
+}
+
+static struct pool *__pool_table_lookup(struct mapped_device *md)
+{
+	struct pool *pool = NULL, *tmp;
+
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+
+	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
+		if (tmp->pool_md == md) {
+			pool = tmp;
+			break;
+		}
+	}
+
+	return pool;
+}
+
+static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
+{
+	struct pool *pool = NULL, *tmp;
+
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+
+	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
+		if (tmp->md_dev == md_dev) {
+			pool = tmp;
+			break;
+		}
+	}
+
+	return pool;
+}
+
+/*----------------------------------------------------------------*/
+
+static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	bio_list_merge(&bios, master);
+	bio_list_init(master);
+
+	while ((bio = bio_list_pop(&bios))) {
+		if (dm_get_mapinfo(bio)->ptr == tc)
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_list_add(master, bio);
+	}
+}
+
+static void requeue_io(struct thin_c *tc)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	__requeue_bio_list(tc, &pool->deferred_bios);
+	__requeue_bio_list(tc, &pool->retry_on_resume_list);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+/*
+ * This section of code contains the logic for processing a thin device's IO.
+ * Much of the code depends on pool object resources (lists, workqueues, etc.)
+ * but most is exclusively called from the thin target rather than the
+ * thin-pool target.
+ */
+
+static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
+{
+	return bio->bi_sector >> tc->pool->block_shift;
+}
+
+static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
+{
+	struct pool *pool = tc->pool;
+
+	bio->bi_bdev = tc->pool_dev->bdev;
+	bio->bi_sector = (block << pool->block_shift) +
+		(bio->bi_sector & pool->offset_mask);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+			    dm_block_t block)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	remap(tc, bio, block);
+
+	/*
+	 * Batch together any FUA/FLUSH bios we find and then issue
+	 * a single commit for them in process_deferred_bios().
+	 */
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+		spin_lock_irqsave(&pool->lock, flags);
+		bio_list_add(&pool->deferred_flush_bios, bio);
+		spin_unlock_irqrestore(&pool->lock, flags);
+	} else
+		generic_make_request(bio);
+}
+
+/*
+ * wake_worker() is used when new work is queued and when pool_resume is
+ * ready to continue deferred IO processing.
+ */
+static void wake_worker(struct pool *pool)
+{
+	queue_work(pool->wq, &pool->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Bio endio functions.
+ */
+struct endio_hook {
+	struct thin_c *tc;
+	bio_end_io_t *saved_bi_end_io;
+	struct deferred_entry *entry;
+};
+
+struct new_mapping {
+	struct list_head list;
+
+	int prepared;
+
+	struct thin_c *tc;
+	dm_block_t virt_block;
+	dm_block_t data_block;
+	struct cell *cell;
+	int err;
+
+	/*
+	 * If the bio covers the whole area of a block then we can avoid
+	 * zeroing or copying. Instead this bio is hooked. The bio will
+	 * still be in the cell, so care has to be taken to avoid issuing
+	 * the bio twice.
+	 */
+	struct bio *bio;
+	bio_end_io_t *saved_bi_end_io;
+};
+
+static void __maybe_add_mapping(struct new_mapping *m)
+{
+	struct pool *pool = m->tc->pool;
+
+	if (list_empty(&m->list) && m->prepared) {
+		list_add(&m->list, &pool->prepared_mappings);
+		wake_worker(pool);
+	}
+}
+
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+	unsigned long flags;
+	struct new_mapping *m = context;
+	struct pool *pool = m->tc->pool;
+
+	m->err = read_err || write_err ? -EIO : 0;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	m->prepared = 1;
+	__maybe_add_mapping(m);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void overwrite_endio(struct bio *bio, int err)
+{
+	unsigned long flags;
+	struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+	struct pool *pool = m->tc->pool;
+
+	m->err = err;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	m->prepared = 1;
+	__maybe_add_mapping(m);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void shared_read_endio(struct bio *bio, int err)
+{
+	struct list_head mappings;
+	struct new_mapping *m, *tmp;
+	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+	unsigned long flags;
+	struct pool *pool = h->tc->pool;
+
+	bio->bi_end_io = h->saved_bi_end_io;
+	bio_endio(bio, err);
+
+	INIT_LIST_HEAD(&mappings);
+	ds_dec(h->entry, &mappings);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	list_for_each_entry_safe(m, tmp, &mappings, list) {
+		list_del(&m->list);
+		INIT_LIST_HEAD(&m->list);
+		__maybe_add_mapping(m);
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	mempool_free(h, pool->endio_hook_pool);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Workqueue.
+ */
+
+/*
+ * Prepared mapping jobs.
+ */
+
+/*
+ * This sends the bios in the cell back to the deferred_bios list.
+ */
+static void cell_defer(struct thin_c *tc, struct cell *cell,
+		       dm_block_t data_block)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	cell_release(cell, &pool->deferred_bios);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
+
+	wake_worker(pool);
+}
+
+/*
+ * Same as cell_defer above, except it omits one particular detainee,
+ * a write bio that covers the block and has already been processed.
+ */
+static void cell_defer_except(struct thin_c *tc, struct cell *cell,
+			      struct bio *exception)
+{
+	struct bio_list bios;
+	struct bio *bio;
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	bio_list_init(&bios);
+	cell_release(cell, &bios);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	while ((bio = bio_list_pop(&bios)))
+		if (bio != exception)
+			bio_list_add(&pool->deferred_bios, bio);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	wake_worker(pool);
+}
+
+static void process_prepared_mapping(struct new_mapping *m)
+{
+	struct thin_c *tc = m->tc;
+	struct bio *bio;
+	int r;
+
+	bio = m->bio;
+	if (bio)
+		bio->bi_end_io = m->saved_bi_end_io;
+
+	if (m->err) {
+		cell_error(m->cell);
+		return;
+	}
+
+	/*
+	 * Commit the prepared block into the mapping btree.
+	 * Any I/O for this block arriving after this point will get
+	 * remapped to it directly.
+	 */
+	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+	if (r) {
+		DMERR("dm_thin_insert_block() failed");
+		cell_error(m->cell);
+		return;
+	}
+
+	/*
+	 * Release any bios held while the block was being provisioned.
+	 * If we are processing a write bio that completely covers the block,
+	 * we already processed it so we can ignore it now when processing
+	 * the bios in the cell.
+	 */
+	if (bio) {
+		cell_defer_except(tc, m->cell, bio);
+		bio_endio(bio, 0);
+	} else
+		cell_defer(tc, m->cell, m->data_block);
+
+	list_del(&m->list);
+	mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared_mappings(struct pool *pool)
+{
+	unsigned long flags;
+	struct list_head maps;
+	struct new_mapping *m, *tmp;
+
+	INIT_LIST_HEAD(&maps);
+	spin_lock_irqsave(&pool->lock, flags);
+	list_splice_init(&pool->prepared_mappings, &maps);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	list_for_each_entry_safe(m, tmp, &maps, list)
+		process_prepared_mapping(m);
+}
+
+/*
+ * Deferred bio jobs.
+ */
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+	return ((bio_data_dir(bio) == WRITE) &&
+		!(bio->bi_sector & pool->offset_mask)) &&
+		(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+}
+
+static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
+			       bio_end_io_t *fn)
+{
+	*save = bio->bi_end_io;
+	bio->bi_end_io = fn;
+}
+
+static int ensure_next_mapping(struct pool *pool)
+{
+	if (pool->next_mapping)
+		return 0;
+
+	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
+
+	return pool->next_mapping ? 0 : -ENOMEM;
+}
+
+static struct new_mapping *get_next_mapping(struct pool *pool)
+{
+	struct new_mapping *r = pool->next_mapping;
+
+	BUG_ON(!pool->next_mapping);
+
+	pool->next_mapping = NULL;
+
+	return r;
+}
+
+static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
+			  dm_block_t data_origin, dm_block_t data_dest,
+			  struct cell *cell, struct bio *bio)
+{
+	int r;
+	struct pool *pool = tc->pool;
+	struct new_mapping *m = get_next_mapping(pool);
+
+	INIT_LIST_HEAD(&m->list);
+	m->prepared = 0;
+	m->tc = tc;
+	m->virt_block = virt_block;
+	m->data_block = data_dest;
+	m->cell = cell;
+	m->err = 0;
+	m->bio = NULL;
+
+	ds_add_work(&pool->ds, &m->list);
+
+	/*
+	 * IO to pool_dev remaps to the pool target's data_dev.
+	 *
+	 * If the whole block of data is being overwritten, we can issue the
+	 * bio immediately. Otherwise we use kcopyd to clone the data first.
+	 */
+	if (io_overwrites_block(pool, bio)) {
+		m->bio = bio;
+		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+		dm_get_mapinfo(bio)->ptr = m;
+		remap_and_issue(tc, bio, data_dest);
+	} else {
+		struct dm_io_region from, to;
+
+		from.bdev = tc->pool_dev->bdev;
+		from.sector = data_origin * pool->sectors_per_block;
+		from.count = pool->sectors_per_block;
+
+		to.bdev = tc->pool_dev->bdev;
+		to.sector = data_dest * pool->sectors_per_block;
+		to.count = pool->sectors_per_block;
+
+		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
+				   0, copy_complete, m);
+		if (r < 0) {
+			mempool_free(m, pool->mapping_pool);
+			DMERR("dm_kcopyd_copy() failed");
+			cell_error(cell);
+		}
+	}
+}
+
+static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
+			  dm_block_t data_block, struct cell *cell,
+			  struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+	struct new_mapping *m = get_next_mapping(pool);
+
+	INIT_LIST_HEAD(&m->list);
+	m->prepared = 0;
+	m->tc = tc;
+	m->virt_block = virt_block;
+	m->data_block = data_block;
+	m->cell = cell;
+	m->err = 0;
+	m->bio = NULL;
+
+	/*
+	 * If the whole block of data is being overwritten or we are not
+	 * zeroing pre-existing data, we can issue the bio immediately.
+	 * Otherwise we use kcopyd to zero the data first.
+	 */
+	if (!pool->zero_new_blocks)
+		process_prepared_mapping(m);
+
+	else if (io_overwrites_block(pool, bio)) {
+		m->bio = bio;
+		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+		dm_get_mapinfo(bio)->ptr = m;
+		remap_and_issue(tc, bio, data_block);
+
+	} else {
+		int r;
+		struct dm_io_region to;
+
+		to.bdev = tc->pool_dev->bdev;
+		to.sector = data_block * pool->sectors_per_block;
+		to.count = pool->sectors_per_block;
+
+		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
+		if (r < 0) {
+			mempool_free(m, pool->mapping_pool);
+			DMERR("dm_kcopyd_zero() failed");
+			cell_error(cell);
+		}
+	}
+}
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+{
+	int r;
+	dm_block_t free_blocks;
+	unsigned long flags;
+	struct pool *pool = tc->pool;
+
+	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+	if (r)
+		return r;
+
+	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
+		DMWARN("%s: reached low water mark, sending event.",
+		       dm_device_name(pool->pool_md));
+		spin_lock_irqsave(&pool->lock, flags);
+		pool->low_water_triggered = 1;
+		spin_unlock_irqrestore(&pool->lock, flags);
+		dm_table_event(pool->ti->table);
+	}
+
+	if (!free_blocks) {
+		if (pool->no_free_space)
+			return -ENOSPC;
+		else {
+			/*
+			 * Try to commit to see if that will free up some
+			 * more space.
+			 */
+			r = dm_pool_commit_metadata(pool->pmd);
+			if (r) {
+				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+				      __func__, r);
+				return r;
+			}
+
+			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+			if (r)
+				return r;
+
+			/*
+			 * If we still have no space we set a flag to avoid
+			 * doing all this checking and return -ENOSPC.
+			 */
+			if (!free_blocks) {
+				DMWARN("%s: no free space available.",
+				       dm_device_name(pool->pool_md));
+				spin_lock_irqsave(&pool->lock, flags);
+				pool->no_free_space = 1;
+				spin_unlock_irqrestore(&pool->lock, flags);
+				return -ENOSPC;
+			}
+		}
+	}
+
+	r = dm_pool_alloc_data_block(pool->pmd, result);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+/*
+ * If we have run out of space, queue bios until the device is
+ * resumed, presumably after having been reloaded with more space.
+ */
+static void retry_on_resume(struct bio *bio)
+{
+	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_add(&pool->retry_on_resume_list, bio);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void no_space(struct cell *cell)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	cell_release(cell, &bios);
+
+	while ((bio = bio_list_pop(&bios)))
+		retry_on_resume(bio);
+}
+
+static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
+			  struct cell_key *key,
+			  struct dm_thin_lookup_result *lookup_result,
+			  struct cell *cell)
+{
+	int r;
+	dm_block_t data_block;
+
+	r = alloc_data_block(tc, &data_block);
+	switch (r) {
+	case 0:
+		schedule_copy(tc, block, lookup_result->block,
+			      data_block, cell, bio);
+		break;
+
+	case -ENOSPC:
+		no_space(cell);
+		break;
+
+	default:
+		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+		cell_error(cell);
+		break;
+	}
+}
+
+static void process_shared_bio(struct thin_c *tc, struct bio *bio,
+			       dm_block_t block,
+			       struct dm_thin_lookup_result *lookup_result)
+{
+	struct cell *cell;
+	struct pool *pool = tc->pool;
+	struct cell_key key;
+
+	/*
+	 * If cell is already occupied, then sharing is already in the process
+	 * of being broken so we have nothing further to do here.
+	 */
+	build_data_key(tc->td, lookup_result->block, &key);
+	if (bio_detain(pool->prison, &key, bio, &cell))
+		return;
+
+	if (bio_data_dir(bio) == WRITE)
+		break_sharing(tc, bio, block, &key, lookup_result, cell);
+	else {
+		struct endio_hook *h;
+		h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+		h->tc = tc;
+		h->entry = ds_inc(&pool->ds);
+		save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
+		dm_get_mapinfo(bio)->ptr = h;
+
+		cell_release_singleton(cell, bio);
+		remap_and_issue(tc, bio, lookup_result->block);
+	}
+}
+
+static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
+			    struct cell *cell)
+{
+	int r;
+	dm_block_t data_block;
+
+	/*
+	 * Remap empty bios (flushes) immediately, without provisioning.
+	 */
+	if (!bio->bi_size) {
+		cell_release_singleton(cell, bio);
+		remap_and_issue(tc, bio, 0);
+		return;
+	}
+
+	/*
+	 * Fill read bios with zeroes and complete them immediately.
+	 */
+	if (bio_data_dir(bio) == READ) {
+		zero_fill_bio(bio);
+		cell_release_singleton(cell, bio);
+		bio_endio(bio, 0);
+		return;
+	}
+
+	r = alloc_data_block(tc, &data_block);
+	switch (r) {
+	case 0:
+		schedule_zero(tc, block, data_block, cell, bio);
+		break;
+
+	case -ENOSPC:
+		no_space(cell);
+		break;
+
+	default:
+		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+		cell_error(cell);
+		break;
+	}
+}
+
+static void process_bio(struct thin_c *tc, struct bio *bio)
+{
+	int r;
+	dm_block_t block = get_bio_block(tc, bio);
+	struct cell *cell;
+	struct cell_key key;
+	struct dm_thin_lookup_result lookup_result;
+
+	/*
+	 * If cell is already occupied, then the block is already
+	 * being provisioned so we have nothing further to do here.
+	 */
+	build_virtual_key(tc->td, block, &key);
+	if (bio_detain(tc->pool->prison, &key, bio, &cell))
+		return;
+
+	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+	switch (r) {
+	case 0:
+		/*
+		 * We can release this cell now. This thread is the only
+		 * one that puts bios into a cell, and we know there were
+		 * no preceding bios.
+		 */
+		/*
+		 * TODO: this will probably have to change when discard goes
+		 * back in.
+		 */
+		cell_release_singleton(cell, bio);
+
+		if (lookup_result.shared)
+			process_shared_bio(tc, bio, block, &lookup_result);
+		else
+			remap_and_issue(tc, bio, lookup_result.block);
+		break;
+
+	case -ENODATA:
+		provision_block(tc, bio, block, cell);
+		break;
+
+	default:
+		DMERR("dm_thin_find_block() failed, error = %d", r);
+		bio_io_error(bio);
+		break;
+	}
+}
+
+static void process_deferred_bios(struct pool *pool)
+{
+	unsigned long flags;
+	struct bio *bio;
+	struct bio_list bios;
+	int r;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_merge(&bios, &pool->deferred_bios);
+	bio_list_init(&pool->deferred_bios);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	while ((bio = bio_list_pop(&bios))) {
+		struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+		/*
+		 * If we've got no free new_mapping structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (ensure_next_mapping(pool)) {
+			spin_lock_irqsave(&pool->lock, flags);
+			bio_list_merge(&pool->deferred_bios, &bios);
+			spin_unlock_irqrestore(&pool->lock, flags);
+
+			break;
+		}
+		process_bio(tc, bio);
+	}
+
+	/*
+	 * If there are any deferred flush bios, we must commit
+	 * the metadata before issuing them.
+	 */
+	bio_list_init(&bios);
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_merge(&bios, &pool->deferred_flush_bios);
+	bio_list_init(&pool->deferred_flush_bios);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	if (bio_list_empty(&bios))
+		return;
+
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+		      __func__, r);
+		while ((bio = bio_list_pop(&bios)))
+			bio_io_error(bio);
+		return;
+	}
+
+	while ((bio = bio_list_pop(&bios)))
+		generic_make_request(bio);
+}
+
+static void do_worker(struct work_struct *ws)
+{
+	struct pool *pool = container_of(ws, struct pool, worker);
+
+	process_prepared_mappings(pool);
+	process_deferred_bios(pool);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Mapping functions.
+ */
+
+/*
+ * Called only while mapping a thin bio to hand it over to the workqueue.
+ */
+static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
+{
+	unsigned long flags;
+	struct pool *pool = tc->pool;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_add(&pool->deferred_bios, bio);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	wake_worker(pool);
+}
+
+/*
+ * Non-blocking function called from the thin target's map function.
+ */
+static int thin_bio_map(struct dm_target *ti, struct bio *bio,
+			union map_info *map_context)
+{
+	int r;
+	struct thin_c *tc = ti->private;
+	dm_block_t block = get_bio_block(tc, bio);
+	struct dm_thin_device *td = tc->td;
+	struct dm_thin_lookup_result result;
+
+	/*
+	 * Save the thin context for easy access from the deferred bio later.
+	 */
+	map_context->ptr = tc;
+
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+		thin_defer_bio(tc, bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	r = dm_thin_find_block(td, block, 0, &result);
+
+	/*
+	 * Note that we defer readahead too.
+	 */
+	switch (r) {
+	case 0:
+		if (unlikely(result.shared)) {
+			/*
+			 * We have a race condition here between the
+			 * result.shared value returned by the lookup and
+			 * snapshot creation, which may cause new
+			 * sharing.
+			 *
+			 * To avoid this always quiesce the origin before
+			 * taking the snap. You want to do this anyway to
+			 * ensure a consistent application view
+			 * (i.e. lockfs).
+			 *
+			 * More distant ancestors are irrelevant. The
+			 * shared flag will be set in their case.
+			 */
+			thin_defer_bio(tc, bio);
+			r = DM_MAPIO_SUBMITTED;
+		} else {
+			remap(tc, bio, result.block);
+			r = DM_MAPIO_REMAPPED;
+		}
+		break;
+
+	case -ENODATA:
+		/*
+		 * In future, the failed dm_thin_find_block above could
+		 * provide the hint to load the metadata into cache.
+		 */
+	case -EWOULDBLOCK:
+		thin_defer_bio(tc, bio);
+		r = DM_MAPIO_SUBMITTED;
+		break;
+	}
+
+	return r;
+}
+
+static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+	int r;
+	unsigned long flags;
+	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
+
+	spin_lock_irqsave(&pt->pool->lock, flags);
+	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
+	spin_unlock_irqrestore(&pt->pool->lock, flags);
+
+	if (!r) {
+		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+		r = bdi_congested(&q->backing_dev_info, bdi_bits);
+	}
+
+	return r;
+}
+
+static void __requeue_bios(struct pool *pool)
+{
+	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
+	bio_list_init(&pool->retry_on_resume_list);
+}
+
+/*----------------------------------------------------------------
+ * Binding of control targets to a pool object
+ *--------------------------------------------------------------*/
+static int bind_control_target(struct pool *pool, struct dm_target *ti)
+{
+	struct pool_c *pt = ti->private;
+
+	pool->ti = ti;
+	pool->low_water_blocks = pt->low_water_blocks;
+	pool->zero_new_blocks = pt->zero_new_blocks;
+
+	return 0;
+}
+
+static void unbind_control_target(struct pool *pool, struct dm_target *ti)
+{
+	if (pool->ti == ti)
+		pool->ti = NULL;
+}
+
+/*----------------------------------------------------------------
+ * Pool creation
+ *--------------------------------------------------------------*/
+static void __pool_destroy(struct pool *pool)
+{
+	__pool_table_remove(pool);
+
+	if (dm_pool_metadata_close(pool->pmd) < 0)
+		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
+
+	prison_destroy(pool->prison);
+	dm_kcopyd_client_destroy(pool->copier);
+
+	if (pool->wq)
+		destroy_workqueue(pool->wq);
+
+	if (pool->next_mapping)
+		mempool_free(pool->next_mapping, pool->mapping_pool);
+	mempool_destroy(pool->mapping_pool);
+	mempool_destroy(pool->endio_hook_pool);
+	kfree(pool);
+}
+
+static struct pool *pool_create(struct mapped_device *pool_md,
+				struct block_device *metadata_dev,
+				unsigned long block_size, char **error)
+{
+	int r;
+	void *err_p;
+	struct pool *pool;
+	struct dm_pool_metadata *pmd;
+
+	pmd = dm_pool_metadata_open(metadata_dev, block_size);
+	if (IS_ERR(pmd)) {
+		*error = "Error creating metadata object";
+		return (struct pool *)pmd;
+	}
+
+	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool) {
+		*error = "Error allocating memory for pool";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_pool;
+	}
+
+	pool->pmd = pmd;
+	pool->sectors_per_block = block_size;
+	pool->block_shift = ffs(block_size) - 1;
+	pool->offset_mask = block_size - 1;
+	pool->low_water_blocks = 0;
+	pool->zero_new_blocks = 1;
+	pool->prison = prison_create(PRISON_CELLS);
+	if (!pool->prison) {
+		*error = "Error creating pool's bio prison";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_prison;
+	}
+
+	pool->copier = dm_kcopyd_client_create();
+	if (IS_ERR(pool->copier)) {
+		r = PTR_ERR(pool->copier);
+		*error = "Error creating pool's kcopyd client";
+		err_p = ERR_PTR(r);
+		goto bad_kcopyd_client;
+	}
+
+	/*
+	 * Create singlethreaded workqueue that will service all devices
+	 * that use this metadata.
+	 */
+	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+	if (!pool->wq) {
+		*error = "Error creating pool's workqueue";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_wq;
+	}
+
+	INIT_WORK(&pool->worker, do_worker);
+	spin_lock_init(&pool->lock);
+	bio_list_init(&pool->deferred_bios);
+	bio_list_init(&pool->deferred_flush_bios);
+	INIT_LIST_HEAD(&pool->prepared_mappings);
+	pool->low_water_triggered = 0;
+	pool->no_free_space = 0;
+	bio_list_init(&pool->retry_on_resume_list);
+	ds_init(&pool->ds);
+
+	pool->next_mapping = NULL;
+	pool->mapping_pool =
+		mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
+	if (!pool->mapping_pool) {
+		*error = "Error creating pool's mapping mempool";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_mapping_pool;
+	}
+
+	pool->endio_hook_pool =
+		mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
+	if (!pool->endio_hook_pool) {
+		*error = "Error creating pool's endio_hook mempool";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_endio_hook_pool;
+	}
+	pool->ref_count = 1;
+	pool->pool_md = pool_md;
+	pool->md_dev = metadata_dev;
+	__pool_table_insert(pool);
+
+	return pool;
+
+bad_endio_hook_pool:
+	mempool_destroy(pool->mapping_pool);
+bad_mapping_pool:
+	destroy_workqueue(pool->wq);
+bad_wq:
+	dm_kcopyd_client_destroy(pool->copier);
+bad_kcopyd_client:
+	prison_destroy(pool->prison);
+bad_prison:
+	kfree(pool);
+bad_pool:
+	if (dm_pool_metadata_close(pmd))
+		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
+
+	return err_p;
+}
+
+static void __pool_inc(struct pool *pool)
+{
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+	pool->ref_count++;
+}
+
+static void __pool_dec(struct pool *pool)
+{
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+	BUG_ON(!pool->ref_count);
+	if (!--pool->ref_count)
+		__pool_destroy(pool);
+}
+
+static struct pool *__pool_find(struct mapped_device *pool_md,
+				struct block_device *metadata_dev,
+				unsigned long block_size, char **error)
+{
+	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
+
+	if (pool) {
+		if (pool->pool_md != pool_md)
+			return ERR_PTR(-EBUSY);
+		__pool_inc(pool);
+
+	} else {
+		pool = __pool_table_lookup(pool_md);
+		if (pool) {
+			if (pool->md_dev != metadata_dev)
+				return ERR_PTR(-EINVAL);
+			__pool_inc(pool);
+
+		} else
+			pool = pool_create(pool_md, metadata_dev, block_size, error);
+	}
+
+	return pool;
+}
+
+/*----------------------------------------------------------------
+ * Pool target methods
+ *--------------------------------------------------------------*/
+static void pool_dtr(struct dm_target *ti)
+{
+	struct pool_c *pt = ti->private;
+
+	mutex_lock(&dm_thin_pool_table.mutex);
+
+	unbind_control_target(pt->pool, ti);
+	__pool_dec(pt->pool);
+	dm_put_device(ti, pt->metadata_dev);
+	dm_put_device(ti, pt->data_dev);
+	kfree(pt);
+
+	mutex_unlock(&dm_thin_pool_table.mutex);
+}
+
+struct pool_features {
+	unsigned zero_new_blocks:1;
+};
+
+static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
+			       struct dm_target *ti)
+{
+	int r;
+	unsigned argc;
+	const char *arg_name;
+
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of pool feature arguments"},
+	};
+
+	/*
+	 * No feature arguments supplied.
+	 */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	while (argc && !r) {
+		arg_name = dm_shift_arg(as);
+		argc--;
+
+		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
+			pf->zero_new_blocks = 0;
+			continue;
+		}
+
+		ti->error = "Unrecognised pool feature requested";
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
+/*
+ * thin-pool <metadata dev> <data dev>
+ *	     <data block size (sectors)>
+ *	     <low water mark (blocks)>
+ *	     [<#feature args> [<arg>]*]
+ *
+ * Optional feature arguments are:
+ *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ */
+static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	struct pool_c *pt;
+	struct pool *pool;
+	struct pool_features pf;
+	struct dm_arg_set as;
+	struct dm_dev *data_dev;
+	unsigned long block_size;
+	dm_block_t low_water_blocks;
+	struct dm_dev *metadata_dev;
+	sector_t metadata_dev_size;
+
+	/*
+	 * FIXME Remove validation from scope of lock.
+	 */
+	mutex_lock(&dm_thin_pool_table.mutex);
+
+	if (argc < 4) {
+		ti->error = "Invalid argument count";
+		r = -EINVAL;
+		goto out_unlock;
+	}
+	as.argc = argc;
+	as.argv = argv;
+
+	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
+	if (r) {
+		ti->error = "Error opening metadata block device";
+		goto out_unlock;
+	}
+
+	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
+	if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
+		ti->error = "Metadata device is too large";
+		r = -EINVAL;
+		goto out_metadata;
+	}
+
+	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
+	if (r) {
+		ti->error = "Error getting data device";
+		goto out_metadata;
+	}
+
+	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
+	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
+	    !is_power_of_2(block_size)) {
+		ti->error = "Invalid block size";
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
+		ti->error = "Invalid low water mark";
+		r = -EINVAL;
+		goto out;
+	}
+
|
|
|
|
+ /*
|
|
|
|
+ * Set default pool features.
|
|
|
|
+ */
|
|
|
|
+ memset(&pf, 0, sizeof(pf));
|
|
|
|
+ pf.zero_new_blocks = 1;
|
|
|
|
+
|
|
|
|
+ dm_consume_args(&as, 4);
|
|
|
|
+ r = parse_pool_features(&as, &pf, ti);
|
|
|
|
+ if (r)
|
|
|
|
+ goto out;
|
|
|
|
+
|
|
|
|
+ pt = kzalloc(sizeof(*pt), GFP_KERNEL);
|
|
|
|
+ if (!pt) {
|
|
|
|
+ r = -ENOMEM;
|
|
|
|
+ goto out;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
|
|
|
|
+ block_size, &ti->error);
|
|
|
|
+ if (IS_ERR(pool)) {
|
|
|
|
+ r = PTR_ERR(pool);
|
|
|
|
+ goto out_free_pt;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ pt->pool = pool;
|
|
|
|
+ pt->ti = ti;
|
|
|
|
+ pt->metadata_dev = metadata_dev;
|
|
|
|
+ pt->data_dev = data_dev;
|
|
|
|
+ pt->low_water_blocks = low_water_blocks;
|
|
|
|
+ pt->zero_new_blocks = pf.zero_new_blocks;
|
|
|
|
+ ti->num_flush_requests = 1;
|
|
|
|
+ ti->num_discard_requests = 0;
|
|
|
|
+ ti->private = pt;
|
|
|
|
+
|
|
|
|
+ pt->callbacks.congested_fn = pool_is_congested;
|
|
|
|
+ dm_table_add_target_callbacks(ti->table, &pt->callbacks);
|
|
|
|
+
|
|
|
|
+ mutex_unlock(&dm_thin_pool_table.mutex);
|
|
|
|
+
|
|
|
|
+ return 0;
|
|
|
|
+
|
|
|
|
+out_free_pt:
|
|
|
|
+ kfree(pt);
|
|
|
|
+out:
|
|
|
|
+ dm_put_device(ti, data_dev);
|
|
|
|
+out_metadata:
|
|
|
|
+ dm_put_device(ti, metadata_dev);
|
|
|
|
+out_unlock:
|
|
|
|
+ mutex_unlock(&dm_thin_pool_table.mutex);
|
|
|
|
+
|
|
|
|
+ return r;
|
|
|
|
+}
+
+static int pool_map(struct dm_target *ti, struct bio *bio,
+		    union map_info *map_context)
+{
+	int r;
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	unsigned long flags;
+
+	/*
+	 * As this is a singleton target, ti->begin is always zero.
+	 */
+	spin_lock_irqsave(&pool->lock, flags);
+	bio->bi_bdev = pt->data_dev->bdev;
+	r = DM_MAPIO_REMAPPED;
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	return r;
+}
+
+/*
+ * Retrieves the number of blocks of the data device from
+ * the superblock and compares it to the actual device size,
+ * thus resizing the data device in case it has grown.
+ *
+ * This both copes with opening preallocated data devices in the ctr
+ * being followed by a resume
+ * -and-
+ * calling the resume method individually after userspace has
+ * grown the data device in reaction to a table event.
+ */
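+/*
+ * Worked example (illustrative only, assuming block_shift is the log2 of a
+ * 128-sector block size): reloading the pool table with ti->len grown from
+ * 1048576 to 2097152 sectors changes data_size below from 8192 to 16384
+ * blocks, so dm_pool_resize_data_dev() is called to extend the metadata to
+ * cover the new blocks before the result is committed.
+ */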
+static int pool_preresume(struct dm_target *ti)
+{
+	int r;
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	dm_block_t data_size, sb_data_size;
+
+	/*
+	 * Take control of the pool object.
+	 */
+	r = bind_control_target(pool, ti);
+	if (r)
+		return r;
+
+	data_size = ti->len >> pool->block_shift;
+	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
+	if (r) {
+		DMERR("failed to retrieve data device size");
+		return r;
+	}
+
+	if (data_size < sb_data_size) {
+		DMERR("pool target too small, is %llu blocks (expected %llu)",
+		      data_size, sb_data_size);
+		return -EINVAL;
+
+	} else if (data_size > sb_data_size) {
+		r = dm_pool_resize_data_dev(pool->pmd, data_size);
+		if (r) {
+			DMERR("failed to resize data device");
+			return r;
+		}
+
+		r = dm_pool_commit_metadata(pool->pmd);
+		if (r) {
+			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+			      __func__, r);
+			return r;
+		}
+	}
+
+	return 0;
+}
+
+static void pool_resume(struct dm_target *ti)
+{
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	pool->low_water_triggered = 0;
+	pool->no_free_space = 0;
+	__requeue_bios(pool);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	wake_worker(pool);
+}
+
+static void pool_postsuspend(struct dm_target *ti)
+{
+	int r;
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+
+	flush_workqueue(pool->wq);
+
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r < 0) {
+		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+		      __func__, r);
+		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
+	}
+}
+
+static int check_arg_count(unsigned argc, unsigned args_required)
+{
+	if (argc != args_required) {
+		DMWARN("Message received with %u arguments instead of %u.",
+		       argc, args_required);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
+{
+	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
+	    *dev_id <= MAX_DEV_ID)
+		return 0;
+
+	if (warning)
+		DMWARN("Message received with invalid device id: %s", arg);
+
+	return -EINVAL;
+}
+
+static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+	dm_thin_id dev_id;
+	int r;
+
+	r = check_arg_count(argc, 2);
+	if (r)
+		return r;
+
+	r = read_dev_id(argv[1], &dev_id, 1);
+	if (r)
+		return r;
+
+	r = dm_pool_create_thin(pool->pmd, dev_id);
+	if (r) {
+		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
+		       argv[1]);
+		return r;
+	}
+
+	return 0;
+}
+
+static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+	dm_thin_id dev_id;
+	dm_thin_id origin_dev_id;
+	int r;
+
+	r = check_arg_count(argc, 3);
+	if (r)
+		return r;
+
+	r = read_dev_id(argv[1], &dev_id, 1);
+	if (r)
+		return r;
+
+	r = read_dev_id(argv[2], &origin_dev_id, 1);
+	if (r)
+		return r;
+
+	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
+	if (r) {
+		DMWARN("Creation of new snapshot %s of device %s failed.",
+		       argv[1], argv[2]);
+		return r;
+	}
+
+	return 0;
+}
+
+static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+	dm_thin_id dev_id;
+	int r;
+
+	r = check_arg_count(argc, 2);
+	if (r)
+		return r;
+
+	r = read_dev_id(argv[1], &dev_id, 1);
+	if (r)
+		return r;
+
+	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
+	if (r)
+		DMWARN("Deletion of thin device %s failed.", argv[1]);
+
+	return r;
+}
+
+static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+	dm_thin_id old_id, new_id;
+	int r;
+
+	r = check_arg_count(argc, 3);
+	if (r)
+		return r;
+
+	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
+		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
+		return -EINVAL;
+	}
+
+	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
+		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
+		return -EINVAL;
+	}
+
+	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
+	if (r) {
+		DMWARN("Failed to change transaction id from %s to %s.",
+		       argv[1], argv[2]);
+		return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Messages supported:
+ *   create_thin	<dev_id>
+ *   create_snap	<dev_id> <origin_id>
+ *   delete		<dev_id>
+ *   set_transaction_id <current_trans_id> <new_trans_id>
+ */
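+/*
+ * Illustrative only (device name and ids are hypothetical): messages are
+ * sent to the pool device with dmsetup, e.g.
+ *
+ *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
+ *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
+ *   dmsetup message /dev/mapper/pool 0 "delete 1"
+ */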
+static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r = -EINVAL;
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+
+	if (!strcasecmp(argv[0], "create_thin"))
+		r = process_create_thin_mesg(argc, argv, pool);
+
+	else if (!strcasecmp(argv[0], "create_snap"))
+		r = process_create_snap_mesg(argc, argv, pool);
+
+	else if (!strcasecmp(argv[0], "delete"))
+		r = process_delete_mesg(argc, argv, pool);
+
+	else if (!strcasecmp(argv[0], "set_transaction_id"))
+		r = process_set_transaction_id_mesg(argc, argv, pool);
+
+	else
+		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
+
+	if (!r) {
+		r = dm_pool_commit_metadata(pool->pmd);
+		if (r)
+			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
+			      argv[0], r);
+	}
+
+	return r;
+}
+
+/*
+ * Status line is:
+ *    <transaction id> <used metadata blocks>/<total metadata blocks>
+ *    <used data blocks>/<total data blocks> <held metadata root>
+ */
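+/*
+ * For example (values are hypothetical), "dmsetup status pool" might show:
+ *
+ *   0 20971520 thin-pool 1 164/4096 5120/163840 -
+ *
+ * i.e. transaction id 1, 164 of 4096 metadata blocks used, 5120 of 163840
+ * data blocks used, and no held metadata root.
+ */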
+static int pool_status(struct dm_target *ti, status_type_t type,
+		       char *result, unsigned maxlen)
+{
+	int r;
+	unsigned sz = 0;
+	uint64_t transaction_id;
+	dm_block_t nr_free_blocks_data;
+	dm_block_t nr_free_blocks_metadata;
+	dm_block_t nr_blocks_data;
+	dm_block_t nr_blocks_metadata;
+	dm_block_t held_root;
+	char buf[BDEVNAME_SIZE];
+	char buf2[BDEVNAME_SIZE];
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		r = dm_pool_get_metadata_transaction_id(pool->pmd,
+							&transaction_id);
+		if (r)
+			return r;
+
+		r = dm_pool_get_free_metadata_block_count(pool->pmd,
+							  &nr_free_blocks_metadata);
+		if (r)
+			return r;
+
+		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
+		if (r)
+			return r;
+
+		r = dm_pool_get_free_block_count(pool->pmd,
+						 &nr_free_blocks_data);
+		if (r)
+			return r;
+
+		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
+		if (r)
+			return r;
+
+		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
+		if (r)
+			return r;
+
+		DMEMIT("%llu %llu/%llu %llu/%llu ",
+		       (unsigned long long)transaction_id,
+		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+		       (unsigned long long)nr_blocks_metadata,
+		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
+		       (unsigned long long)nr_blocks_data);
+
+		if (held_root)
+			DMEMIT("%llu", held_root);
+		else
+			DMEMIT("-");
+
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %s %lu %llu ",
+		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
+		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
+		       (unsigned long)pool->sectors_per_block,
+		       (unsigned long long)pt->low_water_blocks);
+
+		DMEMIT("%u ", !pool->zero_new_blocks);
+
+		if (!pool->zero_new_blocks)
+			DMEMIT("skip_block_zeroing ");
+		break;
+	}
+
+	return 0;
+}
+
+static int pool_iterate_devices(struct dm_target *ti,
+				iterate_devices_callout_fn fn, void *data)
+{
+	struct pool_c *pt = ti->private;
+
+	return fn(ti, pt->data_dev, 0, ti->len, data);
+}
+
+static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct pool_c *pt = ti->private;
+	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = pt->data_dev->bdev;
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+
+	blk_limits_io_min(limits, 0);
+	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+}
+
+static struct target_type pool_target = {
+	.name = "thin-pool",
+	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
+		    DM_TARGET_IMMUTABLE,
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = pool_ctr,
+	.dtr = pool_dtr,
+	.map = pool_map,
+	.postsuspend = pool_postsuspend,
+	.preresume = pool_preresume,
+	.resume = pool_resume,
+	.message = pool_message,
+	.status = pool_status,
+	.merge = pool_merge,
+	.iterate_devices = pool_iterate_devices,
+	.io_hints = pool_io_hints,
+};
+
+/*----------------------------------------------------------------
+ * Thin target methods
+ *--------------------------------------------------------------*/
+static void thin_dtr(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	mutex_lock(&dm_thin_pool_table.mutex);
+
+	__pool_dec(tc->pool);
+	dm_pool_close_thin_device(tc->td);
+	dm_put_device(ti, tc->pool_dev);
+	kfree(tc);
+
+	mutex_unlock(&dm_thin_pool_table.mutex);
+}
+
+/*
+ * Thin target parameters:
+ *
+ * <pool_dev> <dev_id>
+ *
+ * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
+ * dev_id: the internal device identifier
+ */
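+/*
+ * Illustrative only (names and sizes are hypothetical): after creating
+ * device id 0 with the pool's "create_thin 0" message, a 1 GiB thin volume
+ * could be activated with:
+ *
+ *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
+ */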
+static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	struct thin_c *tc;
+	struct dm_dev *pool_dev;
+	struct mapped_device *pool_md;
+
+	mutex_lock(&dm_thin_pool_table.mutex);
+
+	if (argc != 2) {
+		ti->error = "Invalid argument count";
+		r = -EINVAL;
+		goto out_unlock;
+	}
+
+	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
+	if (!tc) {
+		ti->error = "Out of memory";
+		r = -ENOMEM;
+		goto out_unlock;
+	}
+
+	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
+	if (r) {
+		ti->error = "Error opening pool device";
+		goto bad_pool_dev;
+	}
+	tc->pool_dev = pool_dev;
+
+	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
+		ti->error = "Invalid device id";
+		r = -EINVAL;
+		goto bad_common;
+	}
+
+	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
+	if (!pool_md) {
+		ti->error = "Couldn't get pool mapped device";
+		r = -EINVAL;
+		goto bad_common;
+	}
+
+	tc->pool = __pool_table_lookup(pool_md);
+	if (!tc->pool) {
+		ti->error = "Couldn't find pool object";
+		r = -EINVAL;
+		goto bad_pool_lookup;
+	}
+	__pool_inc(tc->pool);
+
+	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
+	if (r) {
+		ti->error = "Couldn't open thin internal device";
+		goto bad_thin_open;
+	}
+
+	ti->split_io = tc->pool->sectors_per_block;
+	ti->num_flush_requests = 1;
+	ti->num_discard_requests = 0;
+	ti->discards_supported = 0;
+
+	dm_put(pool_md);
+
+	mutex_unlock(&dm_thin_pool_table.mutex);
+
+	return 0;
+
+bad_thin_open:
+	__pool_dec(tc->pool);
+bad_pool_lookup:
+	dm_put(pool_md);
+bad_common:
+	dm_put_device(ti, tc->pool_dev);
+bad_pool_dev:
+	kfree(tc);
+out_unlock:
+	mutex_unlock(&dm_thin_pool_table.mutex);
+
+	return r;
+}
+
+static int thin_map(struct dm_target *ti, struct bio *bio,
+		    union map_info *map_context)
+{
+	bio->bi_sector -= ti->begin;
+
+	return thin_bio_map(ti, bio, map_context);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	if (dm_noflush_suspending(ti))
+		requeue_io((struct thin_c *)ti->private);
+}
+
+/*
+ * <nr mapped sectors> <highest mapped sector>
+ */
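+/*
+ * For example (values are hypothetical), "dmsetup status thin0" might show
+ * "0 2097152 thin 131072 131071": 131072 sectors are mapped and the highest
+ * mapped sector is 131071; "-" is emitted for the latter when nothing is
+ * mapped yet.
+ */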
+static int thin_status(struct dm_target *ti, status_type_t type,
+		       char *result, unsigned maxlen)
+{
+	int r;
+	ssize_t sz = 0;
+	dm_block_t mapped, highest;
+	char buf[BDEVNAME_SIZE];
+	struct thin_c *tc = ti->private;
+
+	if (!tc->td)
+		DMEMIT("-");
+	else {
+		switch (type) {
+		case STATUSTYPE_INFO:
+			r = dm_thin_get_mapped_count(tc->td, &mapped);
+			if (r)
+				return r;
+
+			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
+			if (r < 0)
+				return r;
+
+			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
+			if (r)
+				DMEMIT("%llu", ((highest + 1) *
+						tc->pool->sectors_per_block) - 1);
+			else
+				DMEMIT("-");
+			break;
+
+		case STATUSTYPE_TABLE:
+			DMEMIT("%s %lu",
+			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
+			       (unsigned long) tc->dev_id);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int thin_iterate_devices(struct dm_target *ti,
+				iterate_devices_callout_fn fn, void *data)
+{
+	dm_block_t blocks;
+	struct thin_c *tc = ti->private;
+
+	/*
+	 * We can't call dm_pool_get_data_dev_size() since that blocks. So
+	 * we follow a more convoluted path through to the pool's target.
+	 */
+	if (!tc->pool->ti)
+		return 0;	/* nothing is bound */
+
+	blocks = tc->pool->ti->len >> tc->pool->block_shift;
+	if (blocks)
+		return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
+
+	return 0;
+}
+
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct thin_c *tc = ti->private;
+
+	blk_limits_io_min(limits, 0);
+	blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+}
+
+static struct target_type thin_target = {
+	.name = "thin",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = thin_ctr,
+	.dtr = thin_dtr,
+	.map = thin_map,
+	.postsuspend = thin_postsuspend,
+	.status = thin_status,
+	.iterate_devices = thin_iterate_devices,
+	.io_hints = thin_io_hints,
+};
+
+/*----------------------------------------------------------------*/
+
+static int __init dm_thin_init(void)
+{
+	int r;
+
+	pool_table_init();
+
+	r = dm_register_target(&thin_target);
+	if (r)
+		return r;
+
+	r = dm_register_target(&pool_target);
+	if (r)
+		dm_unregister_target(&thin_target);
+
+	return r;
+}
+
+static void dm_thin_exit(void)
+{
+	dm_unregister_target(&thin_target);
+	dm_unregister_target(&pool_target);
+}
+
+module_init(dm_thin_init);
+module_exit(dm_thin_exit);
+
+MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");