12 years ago · 37cae6ad4c
--- a/Documentation/device-mapper/cache-policies.txt
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -0,0 +1,77 @@
 
				+Guidance for writing policies
			
 
				+=============================
			
 
				+
			
 
				+Try to keep transactionality out of it.  The core is careful to
			
 
				+avoid asking about anything that is migrating.  This is a pain, but
			
 
				+makes it easier to write the policies.
			
 
				+
			
 
				+Mappings are loaded into the policy at construction time.
			
 
				+
			
 
				+Every bio that is mapped by the target is referred to the policy.
			
 
				+The policy can return a simple HIT or MISS or issue a migration.
			
 
				+
			
 
				+Currently there's no way for the policy to issue background work,
			
 
				+e.g. to start writing back dirty blocks that are going to be evicted
			
 
				+soon.
			
 
				+
			
 
				+Because we map bios, rather than requests, it's easy for the policy
			
 
				+to get fooled by many small bios.  For this reason the core target
			
 
				+issues periodic ticks to the policy.  It's suggested that the policy
			
 
				+doesn't update states (eg, hit counts) for a block more than once
			
 
				+for each tick.  The core ticks by watching bios complete, and so
			
 
				+trying to see when the io scheduler has let the ios run.
			
 
				+
			
 
				+
			
 
				+Overview of supplied cache replacement policies
			
 
				+===============================================
			
 
				+
			
 
				+multiqueue
			
 
				+----------
			
 
				+
			
 
				+This policy is the default.
			
 
				+
			
 
				+The multiqueue policy has two sets of 16 queues: one set for entries
			
 
				+waiting for the cache and another one for those in the cache.
			
 
				+Cache entries in the queues are aged based on logical time. Entry into
			
 
				+the cache is based on variable thresholds and queue selection is based
			
 
				+on hit count on entry. The policy aims to take different cache miss
			
 
				+costs into account and to adjust to varying load patterns automatically.
			
 
				+
			
 
				+Message and constructor argument pairs are:
			
 
				+	'sequential_threshold <#nr_sequential_ios>' and
			
 
				+	'random_threshold <#nr_random_ios>'.
			
 
				+
			
 
				+The sequential threshold indicates the number of contiguous I/Os
			
 
				+required before a stream is treated as sequential.  The random threshold
			
 
				+is the number of intervening non-contiguous I/Os that must be seen
			
 
				+before the stream is treated as random again.
			
 
				+
			
 
				+The sequential and random thresholds default to 512 and 4 respectively.
			
 
				+
			
 
				+Large, sequential ios are probably better left on the origin device
			
 
				+since spindles tend to have good bandwidth. The io_tracker counts
			
 
				+contiguous I/Os to try to spot when the io is in one of these sequential
			
 
				+modes.
			
 
				+
			
 
				+cleaner
			
 
				+-------
			
 
				+
			
 
				+The cleaner writes back all dirty blocks in a cache to decommission it.
			
 
				+
			
 
				+Examples
			
 
				+========
			
 
				+
			
 
				+The syntax for a table is:
			
 
				+	cache <metadata dev> <cache dev> <origin dev> <block size>
			
 
				+	<#feature_args> [<feature arg>]*
			
 
				+	<policy> <#policy_args> [<policy arg>]*
			
 
				+
			
 
				+The syntax to send a message using the dmsetup command is:
			
 
				+	dmsetup message <mapped device> 0 sequential_threshold 1024
			
 
				+	dmsetup message <mapped device> 0 random_threshold 8
			
 
				+
			
 
				+Using dmsetup:
			
 
				+	dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \
			
 
				+	    /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8"
			
 
				+	creates a 128GB large mapped device named 'blah' with the
			
 
				+	sequential threshold set to 1024 and the random_threshold set to 8.
			
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -0,0 +1,243 @@
 
				+Introduction
			
 
				+============
			
 
				+
			
 
				+dm-cache is a device mapper target written by Joe Thornber, Heinz
			
 
				+Mauelshagen, and Mike Snitzer.
			
 
				+
			
 
				+It aims to improve performance of a block device (eg, a spindle) by
			
 
				+dynamically migrating some of its data to a faster, smaller device
			
 
				+(eg, an SSD).
			
 
				+
			
 
				+This device-mapper solution allows us to insert this caching at
			
 
				+different levels of the dm stack, for instance above the data device for
			
 
				+a thin-provisioning pool.  Caching solutions that are integrated more
			
 
				+closely with the virtual memory system should give better performance.
			
 
				+
			
 
				+The target reuses the metadata library used in the thin-provisioning
			
 
				+library.
			
 
				+
			
 
				+The decision as to what data to migrate and when is left to a plug-in
			
 
				+policy module.  Several of these have been written as we experiment,
			
 
				+and we hope other people will contribute others for specific io
			
 
				+scenarios (eg. a vm image server).
			
 
				+
			
 
				+Glossary
			
 
				+========
			
 
				+
			
 
				+  Migration -  Movement of the primary copy of a logical block from one
			
 
				+	       device to the other.
			
 
				+  Promotion -  Migration from slow device to fast device.
			
 
				+  Demotion  -  Migration from fast device to slow device.
			
 
				+
			
 
				+The origin device always contains a copy of the logical block, which
			
 
				+may be out of date or kept in sync with the copy on the cache device
			
 
				+(depending on policy).
			
 
				+
			
 
				+Design
			
 
				+======
			
 
				+
			
 
				+Sub-devices
			
 
				+-----------
			
 
				+
			
 
				+The target is constructed by passing three devices to it (along with
			
 
				+other parameters detailed later):
			
 
				+
			
 
				+1. An origin device - the big, slow one.
			
 
				+
			
 
				+2. A cache device - the small, fast one.
			
 
				+
			
 
				+3. A small metadata device - records which blocks are in the cache,
			
 
				+   which are dirty, and extra hints for use by the policy object.
			
 
				+   This information could be put on the cache device, but having it
			
 
				+   separate allows the volume manager to configure it differently,
			
 
				+   e.g. as a mirror for extra robustness.
			
 
				+
			
 
				+Fixed block size
			
 
				+----------------
			
 
				+
			
 
				+The origin is divided up into blocks of a fixed size.  This block size
			
 
				+is configurable when you first create the cache.  Typically we've been
			
 
				+using block sizes of 256k - 1024k.
			
 
				+
			
 
				+Having a fixed block size simplifies the target a lot.  But it is
			
 
				+something of a compromise.  For instance, a small part of a block may be
			
 
				+getting hit a lot, yet the whole block will be promoted to the cache.
			
 
				+So large block sizes are bad because they waste cache space.  And small
			
 
				+block sizes are bad because they increase the amount of metadata (both
			
 
				+in core and on disk).
			
 
				+
			
 
				+Writeback/writethrough
			
 
				+----------------------
			
 
				+
			
 
				+The cache has two modes, writeback and writethrough.
			
 
				+
			
 
				+If writeback, the default, is selected then a write to a block that is
			
 
				+cached will go only to the cache and the block will be marked dirty in
			
 
				+the metadata.
			
 
				+
			
 
				+If writethrough is selected then a write to a cached block will not
			
 
				+complete until it has hit both the origin and cache devices.  Clean
			
 
				+blocks should remain clean.
			
 
				+
			
 
				+A simple cleaner policy is provided, which will clean (write back) all
			
 
				+dirty blocks in a cache.  Useful for decommissioning a cache.
			
 
				+
			
 
				+Migration throttling
			
 
				+--------------------
			
 
				+
			
 
				+Migrating data between the origin and cache device uses bandwidth.
			
 
				+The user can set a throttle to prevent more than a certain amount of
			
 
				+migration occurring at any one time.  Currently we're not taking any
			
 
				+account of normal io traffic going to the devices.  More work needs
			
 
				+doing here to avoid migrating during those peak io moments.
			
 
				+
			
 
				+For the time being, a message "migration_threshold <#sectors>"
			
 
				+can be used to set the maximum number of sectors being migrated,
			
 
				+the default being 204800 sectors (or 100MB).
			
 
				+
			
 
				+Updating on-disk metadata
			
 
				+-------------------------
			
 
				+
			
 
				+On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is
			
 
				+written.  If no such requests are made then commits will occur every
			
 
				+second.  This means the cache behaves like a physical disk that has a
			
 
				+write cache (the same is true of the thin-provisioning target).  If
			
 
				+power is lost you may lose some recent writes.  The metadata should
			
 
				+always be consistent in spite of any crash.
			
 
				+
			
 
				+The 'dirty' state for a cache block changes far too frequently for us
			
 
				+to keep updating it on the fly.  So we treat it as a hint.  In normal
			
 
				+operation it will be written when the dm device is suspended.  If the
			
 
				+system crashes all cache blocks will be assumed dirty when restarted.
			
 
				+
			
 
				+Per-block policy hints
			
 
				+----------------------
			
 
				+
			
 
				+Policy plug-ins can store a chunk of data per cache block.  It's up to
			
 
				+the policy how big this chunk is, but it should be kept small.  Like the
			
 
				+dirty flags this data is lost if there's a crash so a safe fallback
			
 
				+value should always be possible.
			
 
				+
			
 
				+For instance, the 'mq' policy, which is currently the default policy,
			
 
				+uses this facility to store the hit count of the cache blocks.  If
			
 
				+there's a crash this information will be lost, which means the cache
			
 
				+may be less efficient until those hit counts are regenerated.
			
 
				+
			
 
				+Policy hints affect performance, not correctness.
			
 
				+
			
 
				+Policy messaging
			
 
				+----------------
			
 
				+
			
 
				+Policies will have different tunables, specific to each one, so we
			
 
				+need a generic way of getting and setting these.  Device-mapper
			
 
				+messages are used.  Refer to cache-policies.txt.
			
 
				+
			
 
				+Discard bitset resolution
			
 
				+-------------------------
			
 
				+
			
 
				+We can avoid copying data during migration if we know the block has
			
 
				+been discarded.  A prime example of this is when mkfs discards the
			
 
				+whole block device.  We store a bitset tracking the discard state of
			
 
				+blocks.  However, we allow this bitset to have a different block size
			
 
				+from the cache blocks.  This is because we need to track the discard
			
 
				+state for all of the origin device (compare with the dirty bitset
			
 
				+which is just for the smaller cache device).
			
 
				+
			
 
				+Target interface
			
 
				+================
			
 
				+
			
 
				+Constructor
			
 
				+-----------
			
 
				+
			
 
				+ cache <metadata dev> <cache dev> <origin dev> <block size>
			
 
				+       <#feature args> [<feature arg>]*
			
 
				+       <policy> <#policy args> [<policy arg>]*
			
 
				+
			
 
				+ metadata dev    : fast device holding the persistent metadata
			
 
				+ cache dev	 : fast device holding cached data blocks
			
 
				+ origin dev	 : slow device holding original data blocks
			
 
				+ block size      : cache unit size in sectors
			
 
				+
			
 
				+ #feature args   : number of feature arguments passed
			
 
				+ feature args    : writethrough.  (The default is writeback.)
			
 
				+
			
 
				+ policy          : the replacement policy to use
			
 
				+ #policy args    : an even number of arguments corresponding to
			
 
				+                   key/value pairs passed to the policy
			
 
				+ policy args     : key/value pairs passed to the policy
			
 
				+		   E.g. 'sequential_threshold 1024'
			
 
				+		   See cache-policies.txt for details.
			
 
				+
			
 
				+Optional feature arguments are:
			
 
				+   writethrough  : write through caching that prohibits cache block
			
 
				+		   content from being different from origin block content.
			
 
				+		   Without this argument, the default behaviour is to write
			
 
				+		   back cache block contents later for performance reasons,
			
 
				+		   so they may differ from the corresponding origin blocks.
			
 
				+
			
 
				+A policy called 'default' is always registered.  This is an alias for
			
 
				+the policy we currently think is giving best all round performance.
			
 
				+
			
 
				+As the default policy could vary between kernels, if you are relying on
			
 
				+the characteristics of a specific policy, always request it by name.
			
 
				+
			
 
				+Status
			
 
				+------
			
 
				+
			
 
				+<#used metadata blocks>/<#total metadata blocks> <#read hits> <#read misses>
			
 
				+<#write hits> <#write misses> <#demotions> <#promotions> <#blocks in cache>
			
 
				+<#dirty> <#features> <features>* <#core args> <core args>* <#policy args>
			
 
				+<policy args>*
			
 
				+
			
 
				+#used metadata blocks    : Number of metadata blocks used
			
 
				+#total metadata blocks   : Total number of metadata blocks
			
 
				+#read hits               : Number of times a READ bio has been mapped
			
 
				+			     to the cache
			
 
				+#read misses             : Number of times a READ bio has been mapped
			
 
				+			     to the origin
			
 
				+#write hits              : Number of times a WRITE bio has been mapped
			
 
				+			     to the cache
			
 
				+#write misses            : Number of times a WRITE bio has been
			
 
				+			     mapped to the origin
			
 
				+#demotions               : Number of times a block has been removed
			
 
				+			     from the cache
			
 
				+#promotions              : Number of times a block has been moved to
			
 
				+			     the cache
			
 
				+#blocks in cache         : Number of blocks resident in the cache
			
 
				+#dirty                   : Number of blocks in the cache that differ
			
 
				+			     from the origin
			
 
				+#feature args            : Number of feature args to follow
			
 
				+feature args             : 'writethrough' (optional)
			
 
				+#core args               : Number of core arguments (must be even)
			
 
				+core args                : Key/value pairs for tuning the core
			
 
				+			     e.g. migration_threshold
			
 
				+#policy args             : Number of policy arguments to follow (must be even)
			
 
				+policy args              : Key/value pairs
			
 
				+			     e.g. 'sequential_threshold 1024'
			
 
				+
			
 
				+Messages
			
 
				+--------
			
 
				+
			
 
				+Policies will have different tunables, specific to each one, so we
			
 
				+need a generic way of getting and setting these.  Device-mapper
			
 
				+messages are used.  (A sysfs interface would also be possible.)
			
 
				+
			
 
				+The message format is:
			
 
				+
			
 
				+   <key> <value>
			
 
				+
			
 
				+E.g.
			
 
				+   dmsetup message my_cache 0 sequential_threshold 1024
			
 
				+
			
 
				+Examples
			
 
				+========
			
 
				+
			
 
				+The test suite can be found here:
			
 
				+
			
 
				+https://github.com/jthornber/thinp-test-suite
			
 
				+
			
 
				+dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
			
 
				+	/dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0'
			
 
				+dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
			
 
				+	/dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \
			
 
				+	mq 4 sequential_threshold 1024 random_threshold 8'
			
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -210,7 +210,7 @@ config DM_DEBUG
 
				 
			
 
				 config DM_BUFIO
			
 
				        tristate
			
 
				-       depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+       depends on BLK_DEV_DM
			
 
				        ---help---
			
 
				 	 This interface allows you to do buffered I/O on a device and acts
			
 
				 	 as a cache, holding recently-read blocks in memory and performing
			
@@ -218,7 +218,7 @@ config DM_BUFIO
 
				 
			
 
				 config DM_BIO_PRISON
			
 
				        tristate
			
 
				-       depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+       depends on BLK_DEV_DM
			
 
				        ---help---
			
 
				 	 Some bio locking schemes used by other device-mapper targets
			
 
				 	 including thin provisioning.
			
@@ -251,8 +251,8 @@ config DM_SNAPSHOT
 
				          Allow volume managers to take writable snapshots of a device.
			
 
				 
			
 
				 config DM_THIN_PROVISIONING
			
 
				-       tristate "Thin provisioning target (EXPERIMENTAL)"
			
 
				-       depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+       tristate "Thin provisioning target"
			
 
				+       depends on BLK_DEV_DM
			
 
				        select DM_PERSISTENT_DATA
			
 
				        select DM_BIO_PRISON
			
 
				        ---help---
			
@@ -268,6 +268,37 @@ config DM_DEBUG_BLOCK_STACK_TRACING
 
				 
			
 
				 	  If unsure, say N.
			
 
				 
			
 
				+config DM_CACHE
			
 
				+       tristate "Cache target (EXPERIMENTAL)"
			
 
				+       depends on BLK_DEV_DM
			
 
				+       default n
			
 
				+       select DM_PERSISTENT_DATA
			
 
				+       select DM_BIO_PRISON
			
 
				+       ---help---
			
 
				+         dm-cache attempts to improve performance of a block device by
			
 
				+         moving frequently used data to a smaller, higher performance
			
 
				+         device.  Different 'policy' plugins can be used to change the
			
 
				+         algorithms used to select which blocks are promoted, demoted,
			
 
				+         cleaned etc.  It supports writeback and writethrough modes.
			
 
				+
			
 
				+config DM_CACHE_MQ
			
 
				+       tristate "MQ Cache Policy (EXPERIMENTAL)"
			
 
				+       depends on DM_CACHE
			
 
				+       default y
			
 
				+       ---help---
			
 
				+         A cache policy that uses a multiqueue ordered by recent hit
			
 
				+         count to select which blocks should be promoted and demoted.
			
 
				+         This is meant to be a general purpose policy.  It prioritises
			
 
				+         reads over writes.
			
 
				+
			
 
				+config DM_CACHE_CLEANER
			
 
				+       tristate "Cleaner Cache Policy (EXPERIMENTAL)"
			
 
				+       depends on DM_CACHE
			
 
				+       default y
			
 
				+       ---help---
			
 
				+         A simple cache policy that writes back all data to the
			
 
				+         origin.  Used when decommissioning a dm-cache.
			
 
				+
			
 
				 config DM_MIRROR
			
 
				        tristate "Mirror target"
			
 
				        depends on BLK_DEV_DM
			
@@ -302,8 +333,8 @@ config DM_RAID
 
				 	 in one of the available parity distribution methods.
			
 
				 
			
 
				 config DM_LOG_USERSPACE
			
 
				-	tristate "Mirror userspace logging (EXPERIMENTAL)"
			
 
				-	depends on DM_MIRROR && EXPERIMENTAL && NET
			
 
				+	tristate "Mirror userspace logging"
			
 
				+	depends on DM_MIRROR && NET
			
 
				 	select CONNECTOR
			
 
				 	---help---
			
 
				 	  The userspace logging module provides a mechanism for
			
@@ -350,8 +381,8 @@ config DM_MULTIPATH_ST
 
				 	  If unsure, say N.
			
 
				 
			
 
				 config DM_DELAY
			
 
				-	tristate "I/O delaying target (EXPERIMENTAL)"
			
 
				-	depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+	tristate "I/O delaying target"
			
 
				+	depends on BLK_DEV_DM
			
 
				 	---help---
			
 
				 	A target that delays reads and/or writes and can send
			
 
				 	them to different devices.  Useful for testing.
			
@@ -365,14 +396,14 @@ config DM_UEVENT
 
				 	Generate udev events for DM events.
			
 
				 
			
 
				 config DM_FLAKEY
			
 
				-       tristate "Flakey target (EXPERIMENTAL)"
			
 
				-       depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+       tristate "Flakey target"
			
 
				+       depends on BLK_DEV_DM
			
 
				        ---help---
			
 
				          A target that intermittently fails I/O for debugging purposes.
			
 
				 
			
 
				 config DM_VERITY
			
 
				-	tristate "Verity target support (EXPERIMENTAL)"
			
 
				-	depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+	tristate "Verity target support"
			
 
				+	depends on BLK_DEV_DM
			
 
				 	select CRYPTO
			
 
				 	select CRYPTO_HASH
			
 
				 	select DM_BUFIO
			
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,6 +11,9 @@ dm-mirror-y	+= dm-raid1.o
 
				 dm-log-userspace-y \
			
 
				 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
			
 
				 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
			
 
				+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
			
 
				+dm-cache-mq-y   += dm-cache-policy-mq.o
			
 
				+dm-cache-cleaner-y += dm-cache-policy-cleaner.o
			
 
				 md-mod-y	+= md.o bitmap.o
			
 
				 raid456-y	+= raid5.o
			
 
				 
			
@@ -44,6 +47,9 @@ obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
				 obj-$(CONFIG_DM_RAID)	+= dm-raid.o
			
 
				 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
			
 
				 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
			
 
				+obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
			
 
				+obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
			
 
				+obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
			
 
				 
			
 
				 ifeq ($(CONFIG_DM_UEVENT),y)
			
 
				 dm-mod-objs			+= dm-uevent.o
			
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,14 +14,6 @@
 
				 
			
 
				 /*----------------------------------------------------------------*/
			
 
				 
			
 
				-struct dm_bio_prison_cell {
			
 
				-	struct hlist_node list;
			
 
				-	struct dm_bio_prison *prison;
			
 
				-	struct dm_cell_key key;
			
 
				-	struct bio *holder;
			
 
				-	struct bio_list bios;
			
 
				-};
			
 
				-
			
 
				 struct dm_bio_prison {
			
 
				 	spinlock_t lock;
			
 
				 	mempool_t *cell_pool;
			
@@ -87,6 +79,19 @@ void dm_bio_prison_destroy(struct dm_bio_prison *prison)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
			
 
				 
			
 
				+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
			
 
				+{
			
 
				+	return mempool_alloc(prison->cell_pool, gfp);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
			
 
				+
			
 
				+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
			
 
				+			     struct dm_bio_prison_cell *cell)
			
 
				+{
			
 
				+	mempool_free(cell, prison->cell_pool);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
			
 
				+
			
 
				 static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
			
 
				 {
			
 
				 	const unsigned long BIG_PRIME = 4294967291UL;
			
@@ -114,91 +119,95 @@ static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * This may block if a new cell needs allocating.  You must ensure that
			
 
				- * cells will be unlocked even if the calling thread is blocked.
			
 
				- *
			
 
				- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
			
 
				- */
			
 
				-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
			
 
				-		  struct bio *inmate, struct dm_bio_prison_cell **ref)
			
 
				+static void __setup_new_cell(struct dm_bio_prison *prison,
			
 
				+			     struct dm_cell_key *key,
			
 
				+			     struct bio *holder,
			
 
				+			     uint32_t hash,
			
 
				+			     struct dm_bio_prison_cell *cell)
			
 
				 {
			
 
				-	int r = 1;
			
 
				-	unsigned long flags;
			
 
				-	uint32_t hash = hash_key(prison, key);
			
 
				-	struct dm_bio_prison_cell *cell, *cell2;
			
 
				-
			
 
				-	BUG_ON(hash > prison->nr_buckets);
			
 
				-
			
 
				-	spin_lock_irqsave(&prison->lock, flags);
			
 
				-
			
 
				-	cell = __search_bucket(prison->cells + hash, key);
			
 
				-	if (cell) {
			
 
				-		bio_list_add(&cell->bios, inmate);
			
 
				-		goto out;
			
 
				-	}
			
 
				+	memcpy(&cell->key, key, sizeof(cell->key));
			
 
				+	cell->holder = holder;
			
 
				+	bio_list_init(&cell->bios);
			
 
				+	hlist_add_head(&cell->list, prison->cells + hash);
			
 
				+}
			
 
				 
			
 
				-	/*
			
 
				-	 * Allocate a new cell
			
 
				-	 */
			
 
				-	spin_unlock_irqrestore(&prison->lock, flags);
			
 
				-	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
			
 
				-	spin_lock_irqsave(&prison->lock, flags);
			
 
				+static int __bio_detain(struct dm_bio_prison *prison,
			
 
				+			struct dm_cell_key *key,
			
 
				+			struct bio *inmate,
			
 
				+			struct dm_bio_prison_cell *cell_prealloc,
			
 
				+			struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	uint32_t hash = hash_key(prison, key);
			
 
				+	struct dm_bio_prison_cell *cell;
			
 
				 
			
 
				-	/*
			
 
				-	 * We've been unlocked, so we have to double check that
			
 
				-	 * nobody else has inserted this cell in the meantime.
			
 
				-	 */
			
 
				 	cell = __search_bucket(prison->cells + hash, key);
			
 
				 	if (cell) {
			
 
				-		mempool_free(cell2, prison->cell_pool);
			
 
				-		bio_list_add(&cell->bios, inmate);
			
 
				-		goto out;
			
 
				+		if (inmate)
			
 
				+			bio_list_add(&cell->bios, inmate);
			
 
				+		*cell_result = cell;
			
 
				+		return 1;
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * Use new cell.
			
 
				-	 */
			
 
				-	cell = cell2;
			
 
				-
			
 
				-	cell->prison = prison;
			
 
				-	memcpy(&cell->key, key, sizeof(cell->key));
			
 
				-	cell->holder = inmate;
			
 
				-	bio_list_init(&cell->bios);
			
 
				-	hlist_add_head(&cell->list, prison->cells + hash);
			
 
				+	__setup_new_cell(prison, key, inmate, hash, cell_prealloc);
			
 
				+	*cell_result = cell_prealloc;
			
 
				+	return 0;
			
 
				+}
			
 
				 
			
 
				-	r = 0;
			
 
				+static int bio_detain(struct dm_bio_prison *prison,
			
 
				+		      struct dm_cell_key *key,
			
 
				+		      struct bio *inmate,
			
 
				+		      struct dm_bio_prison_cell *cell_prealloc,
			
 
				+		      struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned long flags;
			
 
				 
			
 
				-out:
			
 
				+	spin_lock_irqsave(&prison->lock, flags);
			
 
				+	r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
			
 
				 	spin_unlock_irqrestore(&prison->lock, flags);
			
 
				 
			
 
				-	*ref = cell;
			
 
				-
			
 
				 	return r;
			
 
				 }
			
 
				+
			
 
				+int dm_bio_detain(struct dm_bio_prison *prison,
			
 
				+		  struct dm_cell_key *key,
			
 
				+		  struct bio *inmate,
			
 
				+		  struct dm_bio_prison_cell *cell_prealloc,
			
 
				+		  struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	return bio_detain(prison, key, inmate, cell_prealloc, cell_result);
			
 
				+}
			
 
				 EXPORT_SYMBOL_GPL(dm_bio_detain);
			
 
				 
			
 
				+int dm_get_cell(struct dm_bio_prison *prison,
			
 
				+		struct dm_cell_key *key,
			
 
				+		struct dm_bio_prison_cell *cell_prealloc,
			
 
				+		struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_get_cell);
			
 
				+
			
 
				 /*
			
 
				  * @inmates must have been initialised prior to this call
			
 
				  */
			
 
				-static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
			
 
				+static void __cell_release(struct dm_bio_prison_cell *cell,
			
 
				+			   struct bio_list *inmates)
			
 
				 {
			
 
				-	struct dm_bio_prison *prison = cell->prison;
			
 
				-
			
 
				 	hlist_del(&cell->list);
			
 
				 
			
 
				 	if (inmates) {
			
 
				-		bio_list_add(inmates, cell->holder);
			
 
				+		if (cell->holder)
			
 
				+			bio_list_add(inmates, cell->holder);
			
 
				 		bio_list_merge(inmates, &cell->bios);
			
 
				 	}
			
 
				-
			
 
				-	mempool_free(cell, prison->cell_pool);
			
 
				 }
			
 
				 
			
 
				-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
			
 
				+void dm_cell_release(struct dm_bio_prison *prison,
			
 
				+		     struct dm_bio_prison_cell *cell,
			
 
				+		     struct bio_list *bios)
			
 
				 {
			
 
				 	unsigned long flags;
			
 
				-	struct dm_bio_prison *prison = cell->prison;
			
 
				 
			
 
				 	spin_lock_irqsave(&prison->lock, flags);
			
 
				 	__cell_release(cell, bios);
			
@@ -209,20 +218,18 @@ EXPORT_SYMBOL_GPL(dm_cell_release);
 
				 /*
			
 
				  * Sometimes we don't want the holder, just the additional bios.
			
 
				  */
			
 
				-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
			
 
				+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
			
 
				+				     struct bio_list *inmates)
			
 
				 {
			
 
				-	struct dm_bio_prison *prison = cell->prison;
			
 
				-
			
 
				 	hlist_del(&cell->list);
			
 
				 	bio_list_merge(inmates, &cell->bios);
			
 
				-
			
 
				-	mempool_free(cell, prison->cell_pool);
			
 
				 }
			
 
				 
			
 
				-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
			
 
				+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
			
 
				+			       struct dm_bio_prison_cell *cell,
			
 
				+			       struct bio_list *inmates)
			
 
				 {
			
 
				 	unsigned long flags;
			
 
				-	struct dm_bio_prison *prison = cell->prison;
			
 
				 
			
 
				 	spin_lock_irqsave(&prison->lock, flags);
			
 
				 	__cell_release_no_holder(cell, inmates);
			
@@ -230,9 +237,9 @@ void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
			
 
				 
			
 
				-void dm_cell_error(struct dm_bio_prison_cell *cell)
			
 
				+void dm_cell_error(struct dm_bio_prison *prison,
			
 
				+		   struct dm_bio_prison_cell *cell)
			
 
				 {
			
 
				-	struct dm_bio_prison *prison = cell->prison;
			
 
				 	struct bio_list bios;
			
 
				 	struct bio *bio;
			
 
				 	unsigned long flags;
			
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -22,7 +22,6 @@
 
				  * subsequently unlocked the bios become available.
			
 
				  */
			
 
				 struct dm_bio_prison;
			
 
				-struct dm_bio_prison_cell;
			
 
				 
			
 
				 /* FIXME: this needs to be more abstract */
			
 
				 struct dm_cell_key {
			
@@ -31,21 +30,62 @@ struct dm_cell_key {
 
				 	dm_block_t block;
			
 
				 };
			
 
				 
			
 
				+/*
			
 
				+ * Treat this as opaque, only in header so callers can manage allocation
			
 
				+ * themselves.
			
 
				+ */
			
 
				+struct dm_bio_prison_cell {
			
 
				+	struct hlist_node list;
			
 
				+	struct dm_cell_key key;
			
 
				+	struct bio *holder;
			
 
				+	struct bio_list bios;
			
 
				+};
			
 
				+
			
 
				 struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
			
 
				 void dm_bio_prison_destroy(struct dm_bio_prison *prison);
			
 
				 
			
 
				 /*
			
 
				- * This may block if a new cell needs allocating.  You must ensure that
			
 
				- * cells will be unlocked even if the calling thread is blocked.
			
 
				+ * These two functions just wrap a mempool.  This is a transitory step:
			
 
				+ * Eventually all bio prison clients should manage their own cell memory.
			
 
				  *
			
 
				- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
			
 
				+ * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called
			
 
				+ * in interrupt context or passed GFP_NOWAIT.
			
 
				  */
			
 
				-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
			
 
				-		  struct bio *inmate, struct dm_bio_prison_cell **ref);
			
 
				+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison,
			
 
				+						    gfp_t gfp);
			
 
				+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
			
 
				+			     struct dm_bio_prison_cell *cell);
			
 
				 
			
 
				-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
			
 
				-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
			
 
				-void dm_cell_error(struct dm_bio_prison_cell *cell);
			
 
				+/*
			
 
				+ * Creates, or retrieves a cell for the given key.
			
 
				+ *
			
 
				+ * Returns 1 if pre-existing cell returned, zero if new cell created using
			
 
				+ * @cell_prealloc.
			
 
				+ */
			
 
				+int dm_get_cell(struct dm_bio_prison *prison,
			
 
				+		struct dm_cell_key *key,
			
 
				+		struct dm_bio_prison_cell *cell_prealloc,
			
 
				+		struct dm_bio_prison_cell **cell_result);
			
 
				+
			
 
				+/*
			
 
				+ * An atomic op that combines retrieving a cell, and adding a bio to it.
			
 
				+ *
			
 
				+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
			
 
				+ */
			
 
				+int dm_bio_detain(struct dm_bio_prison *prison,
			
 
				+		  struct dm_cell_key *key,
			
 
				+		  struct bio *inmate,
			
 
				+		  struct dm_bio_prison_cell *cell_prealloc,
			
 
				+		  struct dm_bio_prison_cell **cell_result);
			
 
				+
			
 
				+void dm_cell_release(struct dm_bio_prison *prison,
			
 
				+		     struct dm_bio_prison_cell *cell,
			
 
				+		     struct bio_list *bios);
			
 
				+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
			
 
				+			       struct dm_bio_prison_cell *cell,
			
 
				+			       struct bio_list *inmates);
			
 
				+void dm_cell_error(struct dm_bio_prison *prison,
			
 
				+		   struct dm_bio_prison_cell *cell);
			
 
				 
			
 
				 /*----------------------------------------------------------------*/
			
 
				 
			
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
 
				 int dm_bufio_issue_flush(struct dm_bufio_client *c)
			
 
				 {
			
 
				 	struct dm_io_request io_req = {
			
 
				-		.bi_rw = REQ_FLUSH,
			
 
				+		.bi_rw = WRITE_FLUSH,
			
 
				 		.mem.type = DM_IO_KMEM,
			
 
				 		.mem.ptr.addr = NULL,
			
 
				 		.client = c->dm_io,
			
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#ifndef DM_CACHE_BLOCK_TYPES_H
			
 
				+#define DM_CACHE_BLOCK_TYPES_H
			
 
				+
			
 
				+#include "persistent-data/dm-block-manager.h"
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * It's helpful to get sparse to differentiate between indexes into the
			
 
				+ * origin device, indexes into the cache device, and indexes into the
			
 
				+ * discard bitset.
			
 
				+ */
			
 
				+
			
 
				+typedef dm_block_t __bitwise__ dm_oblock_t;
			
 
				+typedef uint32_t __bitwise__ dm_cblock_t;
			
 
				+typedef dm_block_t __bitwise__ dm_dblock_t;
			
 
				+
			
 
				+static inline dm_oblock_t to_oblock(dm_block_t b)
			
 
				+{
			
 
				+	return (__force dm_oblock_t) b;
			
 
				+}
			
 
				+
			
 
				+static inline dm_block_t from_oblock(dm_oblock_t b)
			
 
				+{
			
 
				+	return (__force dm_block_t) b;
			
 
				+}
			
 
				+
			
 
				+static inline dm_cblock_t to_cblock(uint32_t b)
			
 
				+{
			
 
				+	return (__force dm_cblock_t) b;
			
 
				+}
			
 
				+
			
 
				+static inline uint32_t from_cblock(dm_cblock_t b)
			
 
				+{
			
 
				+	return (__force uint32_t) b;
			
 
				+}
			
 
				+
			
 
				+static inline dm_dblock_t to_dblock(dm_block_t b)
			
 
				+{
			
 
				+	return (__force dm_dblock_t) b;
			
 
				+}
			
 
				+
			
 
				+static inline dm_block_t from_dblock(dm_dblock_t b)
			
 
				+{
			
 
				+	return (__force dm_block_t) b;
			
 
				+}
			
 
				+
			
 
				+#endif /* DM_CACHE_BLOCK_TYPES_H */
			
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1146 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm-cache-metadata.h"
			
 
				+
			
 
				+#include "persistent-data/dm-array.h"
			
 
				+#include "persistent-data/dm-bitset.h"
			
 
				+#include "persistent-data/dm-space-map.h"
			
 
				+#include "persistent-data/dm-space-map-disk.h"
			
 
				+#include "persistent-data/dm-transaction-manager.h"
			
 
				+
			
 
				+#include <linux/device-mapper.h>
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#define DM_MSG_PREFIX   "cache metadata"
			
 
				+
			
 
				+#define CACHE_SUPERBLOCK_MAGIC 06142003
			
 
				+#define CACHE_SUPERBLOCK_LOCATION 0
			
 
				+#define CACHE_VERSION 1
			
 
				+#define CACHE_METADATA_CACHE_SIZE 64
			
 
				+
			
 
				+/*
			
 
				+ *  3 for btree insert +
			
 
				+ *  2 for btree lookup used within space map
			
 
				+ */
			
 
				+#define CACHE_MAX_CONCURRENT_LOCKS 5
			
 
				+#define SPACE_MAP_ROOT_SIZE 128
			
 
				+
			
 
				+enum superblock_flag_bits {
			
 
				+	/* for spotting crashes that would invalidate the dirty bitset */
			
 
				+	CLEAN_SHUTDOWN,
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Each mapping from cache block -> origin block carries a set of flags.
			
 
				+ */
			
 
				+enum mapping_bits {
			
 
				+	/*
			
 
				+	 * A valid mapping.  Because we're using an array we clear this
			
 
				+	 * flag for a non-existent mapping.
			
 
				+	 */
			
 
				+	M_VALID = 1,
			
 
				+
			
 
				+	/*
			
 
				+	 * The data on the cache is different from that on the origin.
			
 
				+	 */
			
 
				+	M_DIRTY = 2
			
 
				+};
			
 
				+
			
 
				+struct cache_disk_superblock {
			
 
				+	__le32 csum;
			
 
				+	__le32 flags;
			
 
				+	__le64 blocknr;
			
 
				+
			
 
				+	__u8 uuid[16];
			
 
				+	__le64 magic;
			
 
				+	__le32 version;
			
 
				+
			
 
				+	__u8 policy_name[CACHE_POLICY_NAME_SIZE];
			
 
				+	__le32 policy_hint_size;
			
 
				+
			
 
				+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
			
 
				+	__le64 mapping_root;
			
 
				+	__le64 hint_root;
			
 
				+
			
 
				+	__le64 discard_root;
			
 
				+	__le64 discard_block_size;
			
 
				+	__le64 discard_nr_blocks;
			
 
				+
			
 
				+	__le32 data_block_size;
			
 
				+	__le32 metadata_block_size;
			
 
				+	__le32 cache_blocks;
			
 
				+
			
 
				+	__le32 compat_flags;
			
 
				+	__le32 compat_ro_flags;
			
 
				+	__le32 incompat_flags;
			
 
				+
			
 
				+	__le32 read_hits;
			
 
				+	__le32 read_misses;
			
 
				+	__le32 write_hits;
			
 
				+	__le32 write_misses;
			
 
				+} __packed;
			
 
				+
			
 
				+struct dm_cache_metadata {
			
 
				+	struct block_device *bdev;
			
 
				+	struct dm_block_manager *bm;
			
 
				+	struct dm_space_map *metadata_sm;
			
 
				+	struct dm_transaction_manager *tm;
			
 
				+
			
 
				+	struct dm_array_info info;
			
 
				+	struct dm_array_info hint_info;
			
 
				+	struct dm_disk_bitset discard_info;
			
 
				+
			
 
				+	struct rw_semaphore root_lock;
			
 
				+	dm_block_t root;
			
 
				+	dm_block_t hint_root;
			
 
				+	dm_block_t discard_root;
			
 
				+
			
 
				+	sector_t discard_block_size;
			
 
				+	dm_dblock_t discard_nr_blocks;
			
 
				+
			
 
				+	sector_t data_block_size;
			
 
				+	dm_cblock_t cache_blocks;
			
 
				+	bool changed:1;
			
 
				+	bool clean_when_opened:1;
			
 
				+
			
 
				+	char policy_name[CACHE_POLICY_NAME_SIZE];
			
 
				+	size_t policy_hint_size;
			
 
				+	struct dm_cache_statistics stats;
			
 
				+};
			
 
				+
			
 
				+/*-------------------------------------------------------------------
			
 
				+ * superblock validator
			
 
				+ *-----------------------------------------------------------------*/
			
 
				+
			
 
				+#define SUPERBLOCK_CSUM_XOR 9031977
			
 
				+
			
 
				+/*
				+ * Installed as sb_validator's prepare_for_write hook.
				+ *
				+ * Stores the block's own location in the superblock and then checksums
				+ * everything that follows the leading csum field.  blocknr lies inside the
				+ * checksummed region, so it has to be stored before the csum is computed.
				+ */
				+static void sb_prepare_for_write(struct dm_block_validator *v,
				+				 struct dm_block *b,
				+				 size_t sb_block_size)
				+{
				+	struct cache_disk_superblock *sb = dm_block_data(b);
				+	size_t covered = sb_block_size - sizeof(__le32);
				+
				+	sb->blocknr = cpu_to_le64(dm_block_location(b));
				+	sb->csum = cpu_to_le32(dm_bm_checksum(&sb->flags, covered,
				+					      SUPERBLOCK_CSUM_XOR));
				+}
			
 
				+
			
 
				+/*
				+ * Installed as sb_validator's check hook.
				+ *
				+ * Returns -ENOTBLK if the block number stored in the superblock differs
				+ * from the block's actual location, -EILSEQ if the magic or the checksum
				+ * does not match, and 0 when all three tests pass.
				+ */
				+static int sb_check(struct dm_block_validator *v,
				+		    struct dm_block *b,
				+		    size_t sb_block_size)
				+{
				+	struct cache_disk_superblock *sb = dm_block_data(b);
				+	__le32 computed;
				+
				+	if (le64_to_cpu(sb->blocknr) != dm_block_location(b)) {
				+		DMERR("sb_check failed: blocknr %llu: wanted %llu",
				+		      le64_to_cpu(sb->blocknr),
				+		      (unsigned long long)dm_block_location(b));
				+		return -ENOTBLK;
				+	}
				+
				+	if (CACHE_SUPERBLOCK_MAGIC != le64_to_cpu(sb->magic)) {
				+		DMERR("sb_check failed: magic %llu: wanted %llu",
				+		      le64_to_cpu(sb->magic),
				+		      (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
				+		return -EILSEQ;
				+	}
				+
				+	/* Same region and seed as sb_prepare_for_write(). */
				+	computed = cpu_to_le32(dm_bm_checksum(&sb->flags,
				+					      sb_block_size - sizeof(__le32),
				+					      SUPERBLOCK_CSUM_XOR));
				+	if (sb->csum == computed)
				+		return 0;
				+
				+	DMERR("sb_check failed: csum %u: wanted %u",
				+	      le32_to_cpu(computed), le32_to_cpu(sb->csum));
				+	return -EILSEQ;
				+}
			
 
				+
			
 
				+/*
				+ * Validator handed to the block manager by the superblock_*lock() helpers:
				+ * sb_prepare_for_write() stamps blocknr/csum, sb_check() verifies them.
				+ */
				+static struct dm_block_validator sb_validator = {
			
 
				+	.name = "superblock",
			
 
				+	.prepare_for_write = sb_prepare_for_write,
			
 
				+	.check = sb_check
			
 
				+};
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Read-locks the superblock block through sb_validator and hands the
				+ * locked block back in *sblock.  Returns dm_bm_read_lock()'s result.
				+ */
				+static int superblock_read_lock(struct dm_cache_metadata *cmd,
				+				struct dm_block **sblock)
				+{
				+	int r;
				+
				+	r = dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
				+			    &sb_validator, sblock);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Write-locks the superblock block via dm_bm_write_lock_zero() with
				+ * sb_validator attached and hands the locked block back in *sblock.
				+ * Returns dm_bm_write_lock_zero()'s result.
				+ */
				+static int superblock_lock_zero(struct dm_cache_metadata *cmd,
				+				struct dm_block **sblock)
				+{
				+	int r;
				+
				+	r = dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
				+				  &sb_validator, sblock);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Write-locks the superblock block through sb_validator and hands the
				+ * locked block back in *sblock.  Returns dm_bm_write_lock()'s result.
				+ */
				+static int superblock_lock(struct dm_cache_metadata *cmd,
				+			   struct dm_block **sblock)
				+{
				+	int r;
				+
				+	r = dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
				+			     &sb_validator, sblock);
				+	return r;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Reads the superblock block without a validator and reports through
				+ * *result whether every 64-bit word in it is zero (1) or not (0).
				+ *
				+ * Returns the read-lock error if the block cannot be locked, in which case
				+ * *result is left untouched; otherwise returns dm_bm_unlock()'s result.
				+ */
				+static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
				+{
				+	int r;
				+	unsigned word;
				+	unsigned nr_words = dm_bm_block_size(bm) / sizeof(__le64);
				+	struct dm_block *b;
				+	__le64 *words;
				+
				+	/*
				+	 * We can't use a validator here - it may be all zeroes.
				+	 */
				+	r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
				+	if (r)
				+		return r;
				+
				+	words = dm_block_data(b);
				+
				+	/* Stop at the first non-zero word, if there is one. */
				+	for (word = 0; word < nr_words; word++)
				+		if (words[word] != cpu_to_le64(0))
				+			break;
				+
				+	*result = (word == nr_words) ? 1 : 0;
				+
				+	return dm_bm_unlock(b);
				+}
			
 
				+
			
 
				+/*
				+ * Initialises the dm-array descriptors held in @cmd.
				+ *
				+ * The mapping array stores __le64 values.  The hint array is only set up
				+ * when a non-zero policy_hint_size was requested and stores __le32 values.
				+ * Neither value type needs inc/dec/equal callbacks.
				+ */
				+static void __setup_mapping_info(struct dm_cache_metadata *cmd)
				+{
				+	struct dm_btree_value_type vt = {
				+		.context = NULL,
				+		.size = sizeof(__le64),
				+		.inc = NULL,
				+		.dec = NULL,
				+		.equal = NULL,
				+	};
				+
				+	dm_array_info_init(&cmd->info, cmd->tm, &vt);
				+
				+	if (!cmd->policy_hint_size)
				+		return;
				+
				+	vt.size = sizeof(__le32);
				+	dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
				+}
			
 
				+
			
 
				+/*
				+ * Fills in the first superblock for freshly formatted metadata and commits
				+ * the transaction with it.
				+ *
				+ * The superblock is obtained through superblock_lock_zero(), so fields not
				+ * assigned below (e.g. the compat/incompat flag words) are presumably left
				+ * zero-filled by dm_bm_write_lock_zero() - confirm against the block manager.
				+ *
				+ * Returns 0 or a negative errno.  If copying the space map root fails the
				+ * superblock is unlocked without being committed.
				+ */
				+static int __write_initial_superblock(struct dm_cache_metadata *cmd)
				+{
				+	int r;
				+	struct dm_block *sblock;
				+	size_t metadata_len;
				+	struct cache_disk_superblock *disk_super;
				+
				+	r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
				+	if (r < 0)
				+		return r;
				+
				+	r = dm_tm_pre_commit(cmd->tm);
				+	if (r < 0)
				+		return r;
				+
				+	r = superblock_lock_zero(cmd, &sblock);
				+	if (r)
				+		return r;
				+
				+	disk_super = dm_block_data(sblock);
				+	disk_super->flags = 0;
				+	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
				+	disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
				+	disk_super->version = cpu_to_le32(CACHE_VERSION);
				+	memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
				+	disk_super->policy_hint_size = 0;
				+
				+	r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
				+			    metadata_len);
				+	if (r < 0)
				+		goto bad_locked;
				+
				+	disk_super->mapping_root = cpu_to_le64(cmd->root);
				+	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
				+	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
				+	disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
				+	disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
				+	disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
				+	disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
				+	/* A newly formatted cache has no blocks until dm_cache_resize(). */
				+	disk_super->cache_blocks = cpu_to_le32(0);
				+
				+	disk_super->read_hits = cpu_to_le32(0);
				+	disk_super->read_misses = cpu_to_le32(0);
				+	disk_super->write_hits = cpu_to_le32(0);
				+	disk_super->write_misses = cpu_to_le32(0);
				+
				+	return dm_tm_commit(cmd->tm, sblock);
				+
				+bad_locked:
				+	dm_bm_unlock(sblock);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Formats a blank metadata device: creates the transaction manager and
				+ * space map, an empty mapping array and an empty discard bitset, then
				+ * writes the initial superblock.
				+ *
				+ * On success clean_when_opened is set and 0 is returned.  If anything fails
				+ * after the transaction manager has been created, both it and the space map
				+ * are destroyed before the error is returned.
				+ */
				+static int __format_metadata(struct dm_cache_metadata *cmd)
				+{
				+	int r;
				+
				+	r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
				+				 &cmd->tm, &cmd->metadata_sm);
				+	if (r < 0) {
				+		DMERR("tm_create_with_sm failed");
				+		return r;
				+	}
				+
				+	__setup_mapping_info(cmd);
				+
				+	r = dm_array_empty(&cmd->info, &cmd->root);
				+	if (r < 0)
				+		goto destroy;
				+
				+	dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
				+
				+	r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
				+	if (r < 0)
				+		goto destroy;
				+
				+	cmd->discard_block_size = 0;
				+	cmd->discard_nr_blocks = 0;
				+
				+	r = __write_initial_superblock(cmd);
				+	if (!r) {
				+		cmd->clean_when_opened = true;
				+		return 0;
				+	}
				+
				+destroy:
				+	dm_tm_destroy(cmd->tm);
				+	dm_sm_destroy(cmd->metadata_sm);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Refuses metadata that uses feature bits this code does not support.
				+ *
				+ * Any incompat_flags bit outside DM_CACHE_FEATURE_INCOMPAT_SUPP yields
				+ * -EINVAL.  The compat_ro_flags are checked against
				+ * DM_CACHE_FEATURE_COMPAT_RO_SUPP only when get_disk_ro() reports the
				+ * metadata disk as writable.  Returns 0 otherwise.
				+ */
				+static int __check_incompat_features(struct cache_disk_superblock *disk_super,
				+				     struct dm_cache_metadata *cmd)
				+{
				+	uint32_t unsupported;
				+
				+	unsupported = le32_to_cpu(disk_super->incompat_flags);
				+	unsupported &= ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
				+	if (unsupported) {
				+		DMERR("could not access metadata due to unsupported optional features (%lx).",
				+		      (unsigned long)unsupported);
				+		return -EINVAL;
				+	}
				+
				+	/*
				+	 * Check for read-only metadata to skip the following RDWR checks.
				+	 */
				+	if (get_disk_ro(cmd->bdev->bd_disk))
				+		return 0;
				+
				+	unsupported = le32_to_cpu(disk_super->compat_ro_flags);
				+	unsupported &= ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
				+	if (!unsupported)
				+		return 0;
				+
				+	DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
				+	      (unsigned long)unsupported);
				+	return -EINVAL;
				+}
			
 
				+
			
 
				+/*
				+ * Opens already formatted metadata: validates the superblock's feature
				+ * flags, opens the transaction manager and space map from the stored space
				+ * map root, sets up the array/bitset descriptors and records whether the
				+ * CLEAN_SHUTDOWN flag was set on disk.
				+ *
				+ * Returns a negative errno on failure (the superblock is unlocked first),
				+ * otherwise the result of unlocking the superblock.
				+ */
				+static int __open_metadata(struct dm_cache_metadata *cmd)
				+{
				+	int r;
				+	unsigned long sb_flags;
				+	struct dm_block *sblock;
				+	struct cache_disk_superblock *disk_super;
				+
				+	r = superblock_read_lock(cmd, &sblock);
				+	if (r < 0) {
				+		DMERR("couldn't read lock superblock");
				+		return r;
				+	}
				+
				+	disk_super = dm_block_data(sblock);
				+
				+	r = __check_incompat_features(disk_super, cmd);
				+	if (r >= 0) {
				+		r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
				+				       disk_super->metadata_space_map_root,
				+				       sizeof(disk_super->metadata_space_map_root),
				+				       &cmd->tm, &cmd->metadata_sm);
				+		if (r < 0)
				+			DMERR("tm_open_with_sm failed");
				+	}
				+
				+	if (r < 0) {
				+		dm_bm_unlock(sblock);
				+		return r;
				+	}
				+
				+	__setup_mapping_info(cmd);
				+	dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
				+
				+	/* test_bit() wants an unsigned long, hence the widened copy. */
				+	sb_flags = le32_to_cpu(disk_super->flags);
				+	cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
				+
				+	return dm_bm_unlock(sblock);
				+}
			
 
				+
			
 
				+/*
				+ * Opens the metadata if the superblock holds anything other than zeroes.
				+ * An all-zero superblock is formatted when @format_device allows it and
				+ * rejected with -EPERM otherwise.
				+ */
				+static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
				+				     bool format_device)
				+{
				+	int r, unformatted;
				+
				+	r = __superblock_all_zeroes(cmd->bm, &unformatted);
				+	if (r)
				+		return r;
				+
				+	if (!unformatted)
				+		return __open_metadata(cmd);
				+
				+	if (!format_device)
				+		return -EPERM;
				+
				+	return __format_metadata(cmd);
				+}
			
 
				+
			
 
				+/*
				+ * Creates the block manager for cmd->bdev and then opens (or, if permitted,
				+ * formats) the metadata on it.  The block manager is destroyed again if the
				+ * open/format step fails.  Returns 0 or an error code.
				+ */
				+static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
				+					    bool may_format_device)
				+{
				+	int r;
				+
				+	cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
				+					  CACHE_METADATA_CACHE_SIZE,
				+					  CACHE_MAX_CONCURRENT_LOCKS);
				+	if (IS_ERR(cmd->bm)) {
				+		DMERR("could not create block manager");
				+		return PTR_ERR(cmd->bm);
				+	}
				+
				+	r = __open_or_format_metadata(cmd, may_format_device);
				+	if (!r)
				+		return 0;
				+
				+	dm_block_manager_destroy(cmd->bm);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Tears down what __create_persistent_data_objects() set up: the space
				+ * map, then the transaction manager, then the block manager.
				+ */
				+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
			
 
				+{
			
 
				+	dm_sm_destroy(cmd->metadata_sm);
			
 
				+	dm_tm_destroy(cmd->tm);
			
 
				+	dm_block_manager_destroy(cmd->bm);
			
 
				+}
			
 
				+
			
 
				+typedef unsigned long (*flags_mutator)(unsigned long);
			
 
				+
			
 
				+/*
				+ * Runs @mutator over the CPU-endian superblock flags and stores the result
				+ * back in little-endian form.
				+ */
				+static void update_flags(struct cache_disk_superblock *disk_super,
				+			 flags_mutator mutator)
				+{
				+	unsigned long before = le32_to_cpu(disk_super->flags);
				+	uint32_t after = mutator(before);
				+
				+	disk_super->flags = cpu_to_le32(after);
				+}
			
 
				+
			
 
				+/* flags_mutator: returns @flags with the CLEAN_SHUTDOWN bit set. */
				+static unsigned long set_clean_shutdown(unsigned long flags)
				+{
				+	return flags | (1UL << CLEAN_SHUTDOWN);
				+}
			
 
				+
			
 
				+/* flags_mutator: returns @flags with the CLEAN_SHUTDOWN bit cleared. */
				+static unsigned long clear_clean_shutdown(unsigned long flags)
				+{
				+	return flags & ~(1UL << CLEAN_SHUTDOWN);
				+}
			
 
				+
			
 
				+/*
				+ * Copies the on-disk superblock fields into their in-core counterparts in
				+ * @cmd (roots, discard geometry, block size, cache size, policy name and
				+ * hint size, hit/miss statistics) and clears cmd->changed.
				+ */
				+static void read_superblock_fields(struct dm_cache_metadata *cmd,
				+				   struct cache_disk_superblock *disk_super)
				+{
				+	cmd->root = le64_to_cpu(disk_super->mapping_root);
				+	cmd->hint_root = le64_to_cpu(disk_super->hint_root);
				+	cmd->discard_root = le64_to_cpu(disk_super->discard_root);
				+	cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
				+	cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
				+	cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
				+	cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
				+
				+	/*
				+	 * The on-disk name is not trusted to be NUL-terminated, and strncpy()
				+	 * does not terminate the destination when the source fills it, so
				+	 * force a terminator to keep cmd->policy_name a valid C string.
				+	 */
				+	strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
				+	cmd->policy_name[sizeof(cmd->policy_name) - 1] = '\0';
				+	cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
				+
				+	cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
				+	cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
				+	cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
				+	cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
				+
				+	cmd->changed = false;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * The mutator updates the superblock flags.
			
 
				+ */
			
 
				+/*
				+ * Write-locks the superblock, applies @mutator to its flags, reloads the
				+ * in-core fields from it, then flushes and unlocks it.  Returns the lock
				+ * error, or otherwise dm_bm_flush_and_unlock()'s result.
				+ */
				+static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
				+				     flags_mutator mutator)
				+{
				+	struct dm_block *sblock;
				+	int r = superblock_lock(cmd, &sblock);
				+
				+	if (!r) {
				+		struct cache_disk_superblock *disk_super = dm_block_data(sblock);
				+
				+		update_flags(disk_super, mutator);
				+		read_superblock_fields(cmd, disk_super);
				+
				+		r = dm_bm_flush_and_unlock(cmd->bm, sblock);
				+	}
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Reloads the in-core fields from the on-disk superblock.  Returns the
				+ * read-lock error, or 0; the result of the unlock is not reported.
				+ */
				+static int __begin_transaction(struct dm_cache_metadata *cmd)
				+{
				+	int r;
				+	struct dm_block *sblock;
				+
				+	/*
				+	 * We re-read the superblock every time.  Shouldn't need to do this
				+	 * really.
				+	 */
				+	r = superblock_read_lock(cmd, &sblock);
				+	if (r)
				+		return r;
				+
				+	read_superblock_fields(cmd, dm_block_data(sblock));
				+	dm_bm_unlock(sblock);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Commits the current transaction.
				+ *
				+ * Flushes the discard bitset, pre-commits the transaction manager, then
				+ * rewrites the superblock from the in-core state (optionally running
				+ * @mutator over its flags first; NULL leaves the flags alone) and commits
				+ * with it.  If the space map root cannot be copied the superblock is
				+ * unlocked and the error is returned without committing.
				+ */
				+static int __commit_transaction(struct dm_cache_metadata *cmd,
				+				flags_mutator mutator)
				+{
				+	int r;
				+	size_t metadata_len;
				+	struct dm_block *sblock;
				+	struct cache_disk_superblock *sb;
				+
				+	/*
				+	 * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
				+	 */
				+	BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
				+
				+	r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
				+			    &cmd->discard_root);
				+	if (r)
				+		return r;
				+
				+	r = dm_tm_pre_commit(cmd->tm);
				+	if (r < 0)
				+		return r;
				+
				+	r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
				+	if (r < 0)
				+		return r;
				+
				+	r = superblock_lock(cmd, &sblock);
				+	if (r)
				+		return r;
				+
				+	sb = dm_block_data(sblock);
				+
				+	if (mutator)
				+		update_flags(sb, mutator);
				+
				+	/* Roots and geometry. */
				+	sb->mapping_root = cpu_to_le64(cmd->root);
				+	sb->hint_root = cpu_to_le64(cmd->hint_root);
				+	sb->discard_root = cpu_to_le64(cmd->discard_root);
				+	sb->discard_block_size = cpu_to_le64(cmd->discard_block_size);
				+	sb->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
				+	sb->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
				+	strncpy(sb->policy_name, cmd->policy_name, sizeof(sb->policy_name));
				+
				+	/* Statistics. */
				+	sb->read_hits = cpu_to_le32(cmd->stats.read_hits);
				+	sb->read_misses = cpu_to_le32(cmd->stats.read_misses);
				+	sb->write_hits = cpu_to_le32(cmd->stats.write_hits);
				+	sb->write_misses = cpu_to_le32(cmd->stats.write_misses);
				+
				+	r = dm_sm_copy_root(cmd->metadata_sm, &sb->metadata_space_map_root,
				+			    metadata_len);
				+	if (r < 0)
				+		goto unlock;
				+
				+	return dm_tm_commit(cmd->tm, sblock);
				+
				+unlock:
				+	dm_bm_unlock(sblock);
				+	return r;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * The mappings are held in a dm-array that has 64-bit values stored in
			
 
				+ * little-endian format.  The index is the cblock, the high 48 bits of the
			
 
				+ * value are the oblock and the low 16 bits the flags.
			
 
				+ */
			
 
				+#define FLAGS_MASK ((1 << 16) - 1)
			
 
				+
			
 
				+/*
				+ * Builds an on-disk mapping value: the oblock shifted up by 16 bits with
				+ * the low 16 bits of @flags underneath, converted to little-endian.
				+ */
				+static __le64 pack_value(dm_oblock_t block, unsigned flags)
				+{
				+	uint64_t oblock_bits = from_oblock(block);
				+	uint64_t flag_bits = flags & FLAGS_MASK;
				+
				+	oblock_bits <<= 16;
				+
				+	return cpu_to_le64(oblock_bits | flag_bits);
				+}
			
 
				+
			
 
				+/*
				+ * Inverse of pack_value(): splits an on-disk mapping value into the oblock
				+ * (everything above bit 15) and the flags (low 16 bits).
				+ */
				+static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
				+{
				+	uint64_t raw = le64_to_cpu(value_le);
				+
				+	*flags = raw & FLAGS_MASK;
				+	*block = to_oblock(raw >> 16);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Allocates a metadata handle for @bdev, opens the metadata on it (or
				+ * formats it when the superblock is all zeroes and @may_format_device is
				+ * set) and begins a transaction with the CLEAN_SHUTDOWN flag cleared.
				+ *
				+ * Returns the new handle, or an ERR_PTR() on failure.  Every failure path
				+ * uses ERR_PTR() - including allocation failure, which yields
				+ * ERR_PTR(-ENOMEM) - so callers can rely on IS_ERR() alone.
				+ */
				+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
				+						 sector_t data_block_size,
				+						 bool may_format_device,
				+						 size_t policy_hint_size)
				+{
				+	int r;
				+	struct dm_cache_metadata *cmd;
				+
				+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
				+	if (!cmd) {
				+		DMERR("could not allocate metadata struct");
				+		return ERR_PTR(-ENOMEM);
				+	}
				+
				+	init_rwsem(&cmd->root_lock);
				+	cmd->bdev = bdev;
				+	cmd->data_block_size = data_block_size;
				+	cmd->cache_blocks = 0;
				+	cmd->policy_hint_size = policy_hint_size;
				+	cmd->changed = true;
				+
				+	r = __create_persistent_data_objects(cmd, may_format_device);
				+	if (r) {
				+		/* Nothing persistent was left behind; just drop the handle. */
				+		kfree(cmd);
				+		return ERR_PTR(r);
				+	}
				+
				+	r = __begin_transaction_flags(cmd, clear_clean_shutdown);
				+	if (r < 0) {
				+		dm_cache_metadata_close(cmd);
				+		return ERR_PTR(r);
				+	}
				+
				+	return cmd;
				+}
			
 
				+
			
 
				+/*
				+ * Destroys the persistent-data objects owned by @cmd and frees the handle.
				+ * No commit is issued here.
				+ */
				+void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
			
 
				+{
			
 
				+	__destroy_persistent_data_objects(cmd);
			
 
				+	kfree(cmd);
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Resizes the mapping array to @new_cache_size entries under the root
				+ * write lock, filling new entries with a mapping that has no flags set.
				+ * cache_blocks is updated only on success; changed is set either way.
				+ */
				+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
				+{
				+	int r;
				+	uint32_t old_size, new_size;
				+	__le64 null_mapping = pack_value(0, 0);
				+
				+	down_write(&cmd->root_lock);
				+
				+	__dm_bless_for_disk(&null_mapping);
				+
				+	old_size = from_cblock(cmd->cache_blocks);
				+	new_size = from_cblock(new_cache_size);
				+	r = dm_array_resize(&cmd->info, cmd->root, old_size, new_size,
				+			    &null_mapping, &cmd->root);
				+	if (!r)
				+		cmd->cache_blocks = new_cache_size;
				+	cmd->changed = true;
				+
				+	up_write(&cmd->root_lock);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Resizes the discard bitset to @new_nr_entries bits under the root write
				+ * lock; new bits default to false.  The discard block size and entry count
				+ * are recorded only on success; changed is set either way.
				+ */
				+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
				+				   sector_t discard_block_size,
				+				   dm_dblock_t new_nr_entries)
				+{
				+	int r;
				+	dm_block_t old_nr, new_nr;
				+
				+	down_write(&cmd->root_lock);
				+
				+	old_nr = from_dblock(cmd->discard_nr_blocks);
				+	new_nr = from_dblock(new_nr_entries);
				+	r = dm_bitset_resize(&cmd->discard_info, cmd->discard_root,
				+			     old_nr, new_nr,
				+			     false, &cmd->discard_root);
				+	if (!r) {
				+		cmd->discard_block_size = discard_block_size;
				+		cmd->discard_nr_blocks = new_nr_entries;
				+	}
				+	cmd->changed = true;
				+
				+	up_write(&cmd->root_lock);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Sets bit @b in the discard bitset, updating cmd->discard_root. */
				+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
				+{
				+	int r;
				+
				+	r = dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
				+			      from_dblock(b), &cmd->discard_root);
				+	return r;
				+}
			
 
				+
			
 
				+/* Clears bit @b in the discard bitset, updating cmd->discard_root. */
				+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
				+{
				+	int r;
				+
				+	r = dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
				+				from_dblock(b), &cmd->discard_root);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Tests bit @b of the discard bitset; the answer is delivered through
				+ * *is_discarded and the return value is dm_bitset_test_bit()'s result.
				+ */
				+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
				+			  bool *is_discarded)
				+{
				+	int r;
				+
				+	r = dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
				+			       from_dblock(b), &cmd->discard_root,
				+			       is_discarded);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Sets (@discard true) or clears (@discard false) the discard bit for
				+ * @dblock.  cmd->changed is raised only if the bitset update succeeded.
				+ */
				+static int __discard(struct dm_cache_metadata *cmd,
				+		     dm_dblock_t dblock, bool discard)
				+{
				+	int r;
				+
				+	if (discard)
				+		r = __set_discard(cmd, dblock);
				+	else
				+		r = __clear_discard(cmd, dblock);
				+
				+	if (!r)
				+		cmd->changed = true;
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs __discard() with root_lock held for writing. */
				+int dm_cache_set_discard(struct dm_cache_metadata *cmd,
				+			 dm_dblock_t dblock, bool discard)
				+{
				+	int err;
				+
				+	down_write(&cmd->root_lock);
				+	err = __discard(cmd, dblock, discard);
				+	up_write(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Call fn once for every discard block, in ascending order.  The stored
				+ * discard bit is only consulted when the metadata was clean when opened;
				+ * otherwise every block is reported as not discarded.  Stops at, and
				+ * returns, the first non-zero result from the bitset lookup or from fn.
				+ */
				+static int __load_discards(struct dm_cache_metadata *cmd,
				+			   load_discard_fn fn, void *context)
				+{
				+	int r = 0;
				+	dm_block_t i = 0;
				+
				+	while (i < from_dblock(cmd->discard_nr_blocks)) {
				+		bool discarded = false;
				+		dm_dblock_t dblock = to_dblock(i);
				+
				+		if (cmd->clean_when_opened) {
				+			r = __is_discarded(cmd, dblock, &discarded);
				+			if (r)
				+				return r;
				+		}
				+
				+		r = fn(context, cmd->discard_block_size, dblock, discarded);
				+		if (r)
				+			return r;
				+
				+		i++;
				+	}
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs __load_discards() with root_lock held for reading. */
				+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
				+			   load_discard_fn fn, void *context)
				+{
				+	int err;
				+
				+	down_read(&cmd->root_lock);
				+	err = __load_discards(cmd, fn, context);
				+	up_read(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/* Return the recorded number of cache blocks, sampled under the read lock. */
				+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
				+{
				+	dm_cblock_t nr_blocks;
				+
				+	down_read(&cmd->root_lock);
				+	nr_blocks = cmd->cache_blocks;
				+	up_read(&cmd->root_lock);
				+
				+	return nr_blocks;
				+}
			
 
				+
			
 
				+/*
				+ * Overwrite the mapping array slot for cblock with pack_value(0, 0), i.e.
				+ * a value carrying no flags.  Marks the transaction changed on success.
				+ */
				+static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
				+{
				+	int r;
				+	__le64 unmapped = pack_value(0, 0);
				+
				+	__dm_bless_for_disk(&unmapped);
				+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
				+			       &unmapped, &cmd->root);
				+	if (!r)
				+		cmd->changed = true;
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs __remove() with root_lock held for writing. */
				+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
				+{
				+	int err;
				+
				+	down_write(&cmd->root_lock);
				+	err = __remove(cmd, cblock);
				+	up_write(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Store an oblock mapping flagged M_VALID in the array slot for cblock.
				+ * Marks the transaction changed on success.
				+ */
				+static int __insert(struct dm_cache_metadata *cmd,
				+		    dm_cblock_t cblock, dm_oblock_t oblock)
				+{
				+	int r;
				+	__le64 mapping = pack_value(oblock, M_VALID);
				+
				+	__dm_bless_for_disk(&mapping);
				+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
				+			       &mapping, &cmd->root);
				+	if (!r)
				+		cmd->changed = true;
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs __insert() with root_lock held for writing. */
				+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
				+			    dm_cblock_t cblock, dm_oblock_t oblock)
				+{
				+	int err;
				+
				+	down_write(&cmd->root_lock);
				+	err = __insert(cmd, cblock, oblock);
				+	up_write(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Context handed to __load_mapping() for each entry visited while walking
				+ * the mapping array.
				+ */
				+struct thunk {
				+	load_mapping_fn fn;	/* caller's per-mapping callback */
				+	void *context;		/* opaque argument passed through to fn */
				+
				+	struct dm_cache_metadata *cmd;
				+	bool respect_dirty_flags;	/* false: report every mapping as dirty */
				+	bool hints_valid;	/* true: read the stored hint and pass it to fn */
				+};
			
 
				+
			
 
				+/* True when both a hint array root and a non-zero hint size are recorded. */
				+static bool hints_array_initialized(struct dm_cache_metadata *cmd)
				+{
				+	if (!cmd->hint_root)
				+		return false;
				+
				+	return cmd->policy_hint_size != 0;
				+}
			
 
				+
			
 
				+/*
				+ * Stored hints may be used only if the metadata was clean when opened,
				+ * the stored policy name equals policy_name, and the hint array has been
				+ * initialized.
				+ */
				+static bool hints_array_available(struct dm_cache_metadata *cmd,
				+				  const char *policy_name)
				+{
				+	if (!cmd->clean_when_opened)
				+		return false;
				+
				+	if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name)))
				+		return false;
				+
				+	return hints_array_initialized(cmd);
				+}
			
 
				+
			
 
				+/*
				+ * dm_array_walk() callback: decode one mapping array entry and, if it is
				+ * flagged M_VALID, report it to the caller's load_mapping_fn.
				+ *
				+ * When hints are valid the stored hint is fetched first; -ENODATA from
				+ * that lookup is tolerated and leaves the hint at zero.  Entries without
				+ * M_VALID are skipped and yield 0.
				+ */
				+static int __load_mapping(void *context, uint64_t cblock, void *leaf)
				+{
				+	int r;
				+	bool dirty;
				+	__le64 value;
				+	__le32 hint_value = 0;
				+	dm_oblock_t oblock;
				+	unsigned flags;
				+	struct thunk *thunk = context;
				+	struct dm_cache_metadata *cmd = thunk->cmd;
				+
				+	memcpy(&value, leaf, sizeof(value));
				+	unpack_value(value, &oblock, &flags);
				+
				+	if (!(flags & M_VALID))
				+		return 0;
				+
				+	if (thunk->hints_valid) {
				+		r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
				+				       cblock, &hint_value);
				+		if (r && r != -ENODATA)
				+			return r;
				+	}
				+
				+	/* Without trustworthy dirty flags every mapping counts as dirty. */
				+	if (thunk->respect_dirty_flags)
				+		dirty = !!(flags & M_DIRTY);
				+	else
				+		dirty = true;
				+
				+	return thunk->fn(thunk->context, oblock, to_cblock(cblock),
				+			 dirty, le32_to_cpu(hint_value), thunk->hints_valid);
				+}
			
 
				+
			
 
				+/*
				+ * Walk the whole mapping array, feeding each valid entry to fn through
				+ * __load_mapping().  Dirty flags are honoured only if the metadata was
				+ * clean when opened; hints only if hints_array_available() agrees.
				+ */
				+static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
				+			   load_mapping_fn fn, void *context)
				+{
				+	struct thunk thunk = {
				+		.fn = fn,
				+		.context = context,
				+		.cmd = cmd,
				+		.respect_dirty_flags = cmd->clean_when_opened,
				+		.hints_valid = hints_array_available(cmd, policy_name),
				+	};
				+
				+	return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs __load_mappings() with root_lock held for reading. */
				+int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
				+			   load_mapping_fn fn, void *context)
				+{
				+	int err;
				+
				+	down_read(&cmd->root_lock);
				+	err = __load_mappings(cmd, policy_name, fn, context);
				+	up_read(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * dm_array_walk() callback used by dm_cache_dump().  It only decodes the
				+ * entry; the decoded oblock and flags are not used further.  Always
				+ * returns 0.
				+ */
				+static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
				+{
				+	__le64 value;
				+	dm_oblock_t oblock;
				+	unsigned flags;
				+
				+	memcpy(&value, leaf, sizeof(value));
				+	unpack_value(value, &oblock, &flags);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Run __dump_mapping() over every entry of the mapping array. */
				+static int __dump_mappings(struct dm_cache_metadata *cmd)
				+{
				+	int r;
				+
				+	r = dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Walk every entry of the mapping array under the read lock.  The result
				+ * of the walk is discarded, so nothing is reported back to the caller.
				+ */
				+void dm_cache_dump(struct dm_cache_metadata *cmd)
				+{
				+	down_read(&cmd->root_lock);
				+	__dump_mappings(cmd);
				+	up_read(&cmd->root_lock);
				+}
			
 
				+
			
 
				+/* Return the 'changed' flag of the metadata, sampled under the read lock. */
				+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
				+{
				+	int changed;
				+
				+	down_read(&cmd->root_lock);
				+	changed = cmd->changed;
				+	up_read(&cmd->root_lock);
				+
				+	return changed;
				+}
			
 
				+
			
 
				+/*
				+ * Set or clear the M_DIRTY flag stored with the mapping for cblock.
				+ *
				+ * The current value is read back first; if its dirty flag already matches
				+ * the request nothing is written and the transaction is not marked as
				+ * changed.  Otherwise the value is repacked with the same oblock and
				+ * flags, M_DIRTY being forced to the requested state.
				+ *
				+ * Returns 0 on success or the error from the array get/set.
				+ */
				+static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty)
				+{
				+	int r;
				+	unsigned flags;
				+	dm_oblock_t oblock;
				+	__le64 value;
				+
				+	r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value);
				+	if (r)
				+		return r;
				+
				+	unpack_value(value, &oblock, &flags);
				+
				+	if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty))
				+		/* nothing to be done */
				+		return 0;
				+
				+	/*
				+	 * Mask M_DIRTY out before OR-ing the requested state back in;
				+	 * OR-ing alone can set the bit but never clear it.
				+	 */
				+	value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0));
				+	__dm_bless_for_disk(&value);
				+
				+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
				+			       &value, &cmd->root);
				+	if (r)
				+		return r;
				+
				+	cmd->changed = true;
				+	return 0;
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs __dirty() with root_lock held for writing. */
				+int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
				+		       dm_cblock_t cblock, bool dirty)
				+{
				+	int err;
				+
				+	down_write(&cmd->root_lock);
				+	err = __dirty(cmd, cblock, dirty);
				+	up_write(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/* Copy the in-core statistics into *stats under the read lock. */
				+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
				+				 struct dm_cache_statistics *stats)
				+{
				+	down_read(&cmd->root_lock);
				+	memcpy(stats, &cmd->stats, sizeof(*stats));
				+	up_read(&cmd->root_lock);
				+}
			
 
				+
			
 
				+/*
				+ * Replace the in-core statistics with *stats under the write lock.  Note
				+ * that this does not set cmd->changed.
				+ */
				+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
				+				 struct dm_cache_statistics *stats)
				+{
				+	down_write(&cmd->root_lock);
				+	memcpy(&cmd->stats, stats, sizeof(*stats));
				+	up_write(&cmd->root_lock);
				+}
			
 
				+
			
 
				+/*
				+ * With root_lock held for writing, call __commit_transaction() using the
				+ * flags mutator selected by clean_shutdown and, if that succeeds, call
				+ * __begin_transaction().  Returns the first non-zero result, else 0.
				+ */
				+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
				+{
				+	int r;
				+	flags_mutator mutator;
				+
				+	if (clean_shutdown)
				+		mutator = set_clean_shutdown;
				+	else
				+		mutator = clear_clean_shutdown;
				+
				+	down_write(&cmd->root_lock);
				+	r = __commit_transaction(cmd, mutator);
				+	if (!r)
				+		r = __begin_transaction(cmd);
				+	up_write(&cmd->root_lock);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Query the metadata space map for its number of free blocks, storing it
				+ * in *result.  Runs under the read lock and returns the result of
				+ * dm_sm_get_nr_free().
				+ */
				+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
				+					   dm_block_t *result)
				+{
				+	int err;
				+
				+	down_read(&cmd->root_lock);
				+	err = dm_sm_get_nr_free(cmd->metadata_sm, result);
				+	up_read(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Query the metadata space map for its total number of blocks, storing it
				+ * in *result.  Runs under the read lock and returns the result of
				+ * dm_sm_get_nr_blocks().
				+ */
				+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
				+				   dm_block_t *result)
				+{
				+	int err;
				+
				+	down_read(&cmd->root_lock);
				+	err = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
				+	up_read(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Prepare the hint array for a policy.
				+ *
				+ * Rejects an empty policy name, or one too long for cmd->policy_name,
				+ * with -EINVAL.  If the name equals the stored one nothing is done.
				+ * Otherwise the new name is recorded and, for a policy with a non-zero
				+ * hint size, any existing hint array is deleted and replaced by a fresh
				+ * one holding a zero hint for every cache block.
				+ *
				+ * NOTE(review): when the new policy reports a hint size of zero the
				+ * function returns right after recording the name, leaving hint_root and
				+ * policy_hint_size as they were - confirm that is intended.
				+ */
				+static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
				+{
				+	int r;
				+	__le32 zero_hint;
				+	size_t hint_size;
				+	const char *policy_name = dm_cache_policy_get_name(policy);
				+
				+	if (!policy_name[0] ||
				+	    (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
				+		return -EINVAL;
				+
				+	/* Same policy as before: keep the existing hint array. */
				+	if (!strcmp(cmd->policy_name, policy_name))
				+		return 0;
				+
				+	strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
				+
				+	hint_size = dm_cache_policy_get_hint_size(policy);
				+	if (!hint_size)
				+		return 0; /* short-circuit hints initialization */
				+	cmd->policy_hint_size = hint_size;
				+
				+	if (cmd->hint_root) {
				+		r = dm_array_del(&cmd->hint_info, cmd->hint_root);
				+		if (r)
				+			return r;
				+	}
				+
				+	r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
				+	if (r)
				+		return r;
				+
				+	zero_hint = cpu_to_le32(0);
				+	__dm_bless_for_disk(&zero_hint);
				+
				+	return dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
				+			       from_cblock(cmd->cache_blocks),
				+			       &zero_hint, &cmd->hint_root);
				+}
			
 
				+
			
 
				+/* Locked wrapper: runs begin_hints() with root_lock held for writing. */
				+int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
				+{
				+	int err;
				+
				+	down_write(&cmd->root_lock);
				+	err = begin_hints(cmd, policy);
				+	up_write(&cmd->root_lock);
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Store hint, converted to little-endian, in the hint array slot for
				+ * cblock.  The transaction is marked as changed whether or not the
				+ * store succeeds.
				+ */
				+static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
				+		     uint32_t hint)
				+{
				+	int err;
				+	__le32 disk_hint = cpu_to_le32(hint);
				+
				+	__dm_bless_for_disk(&disk_hint);
				+	err = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
				+				 from_cblock(cblock), &disk_hint,
				+				 &cmd->hint_root);
				+	cmd->changed = true;
				+
				+	return err;
				+}
			
 
				+
			
 
				+/*
				+ * Save the hint for one cblock.  A no-op returning 0 when the hint array
				+ * has not been initialized.
				+ *
				+ * The initialization check is made with root_lock held: hint_root and
				+ * policy_hint_size are written by begin_hints() under that lock, so they
				+ * must not be read outside it.
				+ */
				+int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
				+		       uint32_t hint)
				+{
				+	int r = 0;
				+
				+	down_write(&cmd->root_lock);
				+	if (hints_array_initialized(cmd))
				+		r = save_hint(cmd, cblock, hint);
				+	up_write(&cmd->root_lock);
				+
				+	return r;
				+}
			
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,142 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#ifndef DM_CACHE_METADATA_H
			
 
				+#define DM_CACHE_METADATA_H
			
 
				+
			
 
				+#include "dm-cache-block-types.h"
			
 
				+#include "dm-cache-policy-internal.h"
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#define DM_CACHE_METADATA_BLOCK_SIZE 4096
			
 
				+
			
 
				+/* FIXME: remove this restriction */
			
 
				+/*
			
 
				+ * The metadata device is currently limited in size.
			
 
				+ *
			
 
				+ * We have one block of index, which can hold 255 index entries.  Each
			
 
				+ * index entry contains allocation info about 16k metadata blocks.
			
 
				+ */
			
 
				+#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
			
 
				+
			
 
				+/*
			
 
				+ * A metadata device larger than 16GB triggers a warning.
			
 
				+ */
			
 
				+#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Ext[234]-style compat feature flags.
			
 
				+ *
			
 
				+ * A new feature which old metadata will still be compatible with should
			
 
				+ * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful).
			
 
				+ *
			
 
				+ * A new feature that is not compatible with old code should define a
			
 
				+ * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with
			
 
				+ * that flag.
			
 
				+ *
			
 
				+ * A new feature that is not compatible with old code accessing the
			
 
				+ * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and
			
 
				+ * guard the relevant code with that flag.
			
 
				+ *
			
 
				+ * As these various flags are defined they should be added to the
			
 
				+ * following masks.
			
 
				+ */
			
 
				+#define DM_CACHE_FEATURE_COMPAT_SUPP	  0UL
			
 
				+#define DM_CACHE_FEATURE_COMPAT_RO_SUPP	  0UL
			
 
				+#define DM_CACHE_FEATURE_INCOMPAT_SUPP	  0UL
			
 
				+
			
 
				+/*
			
 
				+ * Reopens or creates a new, empty metadata volume.
			
 
				+ * Returns an ERR_PTR on failure.
			
 
				+ */
			
 
				+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
			
 
				+						 sector_t data_block_size,
			
 
				+						 bool may_format_device,
			
 
				+						 size_t policy_hint_size);
			
 
				+
			
 
				+void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
			
 
				+
			
 
				+/*
			
 
				+ * The metadata needs to know how many cache blocks there are.  We don't
			
 
				+ * care about the origin, assuming the core target is giving us valid
			
 
				+ * origin blocks to map to.
			
 
				+ */
			
 
				+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
			
 
				+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
			
 
				+
			
 
				+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
			
 
				+				   sector_t discard_block_size,
			
 
				+				   dm_dblock_t new_nr_entries);
			
 
				+
			
 
				+typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
			
 
				+			       dm_dblock_t dblock, bool discarded);
			
 
				+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
			
 
				+			   load_discard_fn fn, void *context);
			
 
				+
			
 
				+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
			
 
				+
			
 
				+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
			
 
				+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
			
 
				+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd);
			
 
				+
			
 
				+typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
			
 
				+			       dm_cblock_t cblock, bool dirty,
			
 
				+			       uint32_t hint, bool hint_valid);
			
 
				+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
			
 
				+			   const char *policy_name,
			
 
				+			   load_mapping_fn fn,
			
 
				+			   void *context);
			
 
				+
			
 
				+int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
			
 
				+
			
 
				+/*
				+ * Hit/miss counters exchanged with the metadata through
				+ * dm_cache_metadata_get_stats() and dm_cache_metadata_set_stats().
				+ */
				+struct dm_cache_statistics {
				+	uint32_t read_hits;
				+	uint32_t read_misses;
				+	uint32_t write_hits;
				+	uint32_t write_misses;
				+};
			
 
				+
			
 
				+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
			
 
				+				 struct dm_cache_statistics *stats);
			
 
				+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
			
 
				+				 struct dm_cache_statistics *stats);
			
 
				+
			
 
				+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown);
			
 
				+
			
 
				+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
			
 
				+					   dm_block_t *result);
			
 
				+
			
 
				+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
			
 
				+				   dm_block_t *result);
			
 
				+
			
 
				+void dm_cache_dump(struct dm_cache_metadata *cmd);
			
 
				+
			
 
				+/*
			
 
				+ * The policy is invited to save a 32bit hint value for every cblock (eg,
			
 
				+ * for a hit count).  These are stored against the policy name.  If
			
 
				+ * policies are changed, then hints will be lost.  If the machine crashes,
			
 
				+ * hints will be lost.
			
 
				+ *
			
 
				+ * The hints are indexed by the cblock, but many policies will not
			
 
				+ * necessarily have a fast way of accessing efficiently via cblock.  So
			
 
				+ * rather than querying the policy for each cblock, we let it walk its data
			
 
				+ * structures and fill in the hints in whatever order it wishes.
			
 
				+ */
			
 
				+
			
 
				+int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
			
 
				+
			
 
				+/*
			
 
				+ * Stores the hint for a single cblock in the metadata device.
			
 
				+ */
			
 
				+int dm_cache_save_hint(struct dm_cache_metadata *cmd,
			
 
				+		       dm_cblock_t cblock, uint32_t hint);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#endif /* DM_CACHE_METADATA_H */
			
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -0,0 +1,464 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat. All rights reserved.
			
 
				+ *
			
 
				+ * writeback cache policy supporting flushing out dirty cache blocks.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm-cache-policy.h"
			
 
				+#include "dm.h"
			
 
				+
			
 
				+#include <linux/hash.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/vmalloc.h>
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#define DM_MSG_PREFIX "cache cleaner"
			
 
				+#define CLEANER_VERSION "1.0.0"
			
 
				+
			
 
				+/*
				+ * Cache entry struct.
				+ *
				+ * One entry exists per cache block.  At any time it is linked through
				+ * 'list' onto one of the policy's free, clean, clean_pending or dirty
				+ * lists and, while mapped, through 'hlist' into the oblock hash table.
				+ */
				+struct wb_cache_entry {
				+	struct list_head list;
				+	struct hlist_node hlist;
				+
				+	dm_oblock_t oblock;	/* hash key */
				+	dm_cblock_t cblock;
				+	bool dirty:1;
				+	bool pending:1;	/* NOTE(review): only ever cleared in this file */
				+};
			
 
				+
			
 
				+/* Bucket table sized by alloc_hash(); hash_bits is log2 of nr_buckets. */
				+struct hash {
				+	struct hlist_head *table;
				+	dm_block_t hash_bits;
				+	unsigned nr_buckets;
				+};
			
 
				+
			
 
				+/* Private state of the cleaner policy, wrapping the generic policy object. */
				+struct policy {
				+	struct dm_cache_policy policy;	/* recovered via to_policy() */
				+	spinlock_t lock;
				+
				+	struct list_head free;
				+	struct list_head clean;
				+	struct list_head clean_pending;	/* handed out by writeback_work */
				+	struct list_head dirty;
				+
				+	/*
				+	 * We know exactly how many cblocks will be needed,
				+	 * so we can allocate them up front.
				+	 */
				+	dm_cblock_t cache_size, nr_cblocks_allocated;
				+	struct wb_cache_entry *cblocks;
				+	struct hash chash;	/* oblock -> entry lookup */
				+};
			
 
				+
			
 
				+/*----------------------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Low-level functions.
			
 
				+ */
			
 
				+/* Round the larger of n and min up to a power of two. */
				+static unsigned next_power(unsigned n, unsigned min)
				+{
				+	unsigned bounded = max(n, min);
				+
				+	return roundup_pow_of_two(bounded);
				+}
			
 
				+
			
 
				+/* Map the embedded generic policy object back to its enclosing struct policy. */
				+static struct policy *to_policy(struct dm_cache_policy *p)
				+{
				+	return container_of(p, struct policy, policy);
				+}
			
 
				+
			
 
				+/*
				+ * Unlink and return the first element of q.  There is no emptiness check
				+ * here; callers are expected to make sure q is not empty.
				+ */
				+static struct list_head *list_pop(struct list_head *q)
				+{
				+	struct list_head *first = q->next;
				+
				+	list_del(first);
				+
				+	return first;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------------------*/
			
 
				+
			
 
				+/* Allocate/free various resources. */
			
 
				+/*
				+ * Size the table at elts / 16 buckets, rounded up to a power of two and
				+ * never below 16, then allocate it zeroed.  Returns 0 or -ENOMEM.
				+ */
				+static int alloc_hash(struct hash *hash, unsigned elts)
				+{
				+	unsigned buckets = next_power(elts >> 4, 16);
				+
				+	hash->nr_buckets = buckets;
				+	hash->hash_bits = ffs(buckets) - 1;
				+	hash->table = vzalloc(sizeof(*hash->table) * buckets);
				+	if (!hash->table)
				+		return -ENOMEM;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Release the bucket table allocated by alloc_hash(). */
				+static void free_hash(struct hash *hash)
				+{
				+	vfree(hash->table);
				+}
			
 
				+
			
 
				+/*
				+ * Allocate one zeroed entry per cache block, put them all on the free
				+ * list, reset the allocation count and create the lookup hash.  If the
				+ * hash cannot be allocated the entry array is freed again.
				+ * Returns 0 on success, -ENOMEM otherwise.
				+ */
				+static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
				+{
				+	int r;
				+	unsigned u;
				+
				+	p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
				+	if (!p->cblocks)
				+		return -ENOMEM;
				+
				+	/* Adding at the head from the top down leaves entry 0 first. */
				+	u = from_cblock(cache_size);
				+	while (u--)
				+		list_add(&p->cblocks[u].list, &p->free);
				+
				+	p->nr_cblocks_allocated = 0;
				+
				+	/* Cache entries hash. */
				+	r = alloc_hash(&p->chash, from_cblock(cache_size));
				+	if (r)
				+		vfree(p->cblocks);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Release the hash table and the entry array of the policy. */
				+static void free_cache_blocks_and_hash(struct policy *p)
				+{
				+	free_hash(&p->chash);
				+	vfree(p->cblocks);
				+}
			
 
				+
			
 
				+/*
				+ * Take the first entry off the free list and bump the allocation count.
				+ * BUGs if every cache block is already allocated, so this never returns
				+ * NULL.
				+ */
				+static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
				+{
				+	struct list_head *l;
				+
				+	BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
				+
				+	l = list_pop(&p->free);
				+	p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
				+
				+	return list_entry(l, struct wb_cache_entry, list);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------------------*/
			
 
				+
			
 
				+/* Hash functions (lookup, insert, remove). */
			
 
				+/*
				+ * Find the entry mapped to oblock, or NULL.  A hit is moved to the front
				+ * of its bucket, so even a lookup modifies the hash table.
				+ */
				+static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
				+{
				+	struct hash *hash = &p->chash;
				+	unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
				+	struct hlist_head *bucket = &hash->table[h];
				+	struct wb_cache_entry *cur;
				+
				+	hlist_for_each_entry(cur, bucket, hlist) {
				+		if (cur->oblock != oblock)
				+			continue;
				+
				+		/* Move upfront bucket for faster access. */
				+		hlist_del(&cur->hlist);
				+		hlist_add_head(&cur->hlist, bucket);
				+		return cur;
				+	}
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/* Add e at the head of the bucket selected by hashing e->oblock. */
				+static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
				+{
				+	struct hash *hash = &p->chash;
				+	unsigned h = hash_64(from_oblock(e->oblock), hash->hash_bits);
				+	struct hlist_head *bucket = &hash->table[h];
				+
				+	hlist_add_head(&e->hlist, bucket);
				+}
			
 
				+
			
 
				+/* Unlink e from its hash bucket. */
				+static void remove_cache_hash_entry(struct wb_cache_entry *e)
				+{
				+	hlist_del(&e->hlist);
				+}
			
 
				+
			
 
				+/* Public interface (see dm-cache-policy.h) */
			
 
				+/*
				+ * Report whether oblock is cached: POLICY_HIT with the cblock, or
				+ * POLICY_MISS.  No migration is ever requested; can_migrate,
				+ * discarded_oblock and bio are unused.
				+ *
				+ * If can_block is false the lock is only tried and -EWOULDBLOCK is
				+ * returned on contention (result->op is POLICY_MISS by then).
				+ */
				+static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
				+		  bool can_block, bool can_migrate, bool discarded_oblock,
				+		  struct bio *bio, struct policy_result *result)
				+{
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *hit;
				+	unsigned long irq_flags;
				+
				+	result->op = POLICY_MISS;
				+
				+	if (!can_block) {
				+		if (!spin_trylock_irqsave(&p->lock, irq_flags))
				+			return -EWOULDBLOCK;
				+	} else {
				+		spin_lock_irqsave(&p->lock, irq_flags);
				+	}
				+
				+	hit = lookup_cache_entry(p, oblock);
				+	if (hit) {
				+		result->op = POLICY_HIT;
				+		result->cblock = hit->cblock;
				+	}
				+
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Non-blocking lookup of oblock.  Returns 0 and fills *cblock on a hit,
				+ * -ENOENT on a miss, or -EWOULDBLOCK if the lock is contended.
				+ */
				+static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
				+{
				+	int r = -ENOENT;
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *hit;
				+	unsigned long irq_flags;
				+
				+	if (!spin_trylock_irqsave(&p->lock, irq_flags))
				+		return -EWOULDBLOCK;
				+
				+	hit = lookup_cache_entry(p, oblock);
				+	if (hit) {
				+		*cblock = hit->cblock;
				+		r = 0;
				+	}
				+
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Flip the dirty state of the entry for oblock and move it to the
				+ * matching list.  Nothing happens if the entry is already in the
				+ * requested state.  BUGs if oblock is not mapped.  Both callers in this
				+ * file hold p->lock.
				+ */
				+static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
				+{
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *e = lookup_cache_entry(p, oblock);
				+
				+	BUG_ON(!e);
				+
				+	if (set && !e->dirty) {
				+		e->dirty = true;
				+		list_move(&e->list, &p->dirty);
				+	} else if (!set && e->dirty) {
				+		e->pending = false;
				+		e->dirty = false;
				+		list_move(&e->list, &p->clean);
				+	}
				+}
			
 
				+
			
 
				+/* Mark the entry for oblock dirty, under the policy lock. */
				+static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
				+{
				+	unsigned long irq_flags;
				+	struct policy *p = to_policy(pe);
				+
				+	spin_lock_irqsave(&p->lock, irq_flags);
				+	__set_clear_dirty(pe, oblock, true);
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+}
			
 
				+
			
 
				+/* Mark the entry for oblock clean, under the policy lock. */
				+static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
				+{
				+	unsigned long irq_flags;
				+	struct policy *p = to_policy(pe);
				+
				+	spin_lock_irqsave(&p->lock, irq_flags);
				+	__set_clear_dirty(pe, oblock, false);
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+}
			
 
				+
			
 
				+/* Hash e by its oblock and queue it on the dirty or clean list. */
				+static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
				+{
				+	struct list_head *target;
				+
				+	insert_cache_hash_entry(p, e);
				+
				+	target = e->dirty ? &p->dirty : &p->clean;
				+	list_add(&e->list, target);
				+}
			
 
				+
			
 
				+/*
				+ * Record an existing oblock -> cblock mapping.  The entry starts out
				+ * clean; hint and hint_valid are ignored.  The policy lock is not taken
				+ * here.
				+ */
				+static int wb_load_mapping(struct dm_cache_policy *pe,
				+			   dm_oblock_t oblock, dm_cblock_t cblock,
				+			   uint32_t hint, bool hint_valid)
				+{
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *e = alloc_cache_entry(p);
				+
				+	if (!e)
				+		return -ENOMEM;
				+
				+	e->cblock = cblock;
				+	e->oblock = oblock;
				+	e->dirty = false; /* blocks default to clean */
				+	add_cache_entry(p, e);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Free the entry array, the hash table and the policy object itself. */
				+static void wb_destroy(struct dm_cache_policy *pe)
				+{
				+	struct policy *p = to_policy(pe);
				+
				+	free_cache_blocks_and_hash(p);
				+	kfree(p);
				+}
			
 
				+
			
 
				+/*
				+ * Detach the entry for oblock from both the hash table and whichever
				+ * list it is on, and hand it back to the caller.  BUGs if oblock is not
				+ * mapped.
				+ */
				+static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
				+{
				+	struct wb_cache_entry *e;
				+
				+	e = lookup_cache_entry(p, oblock);
				+	BUG_ON(!e);
				+
				+	remove_cache_hash_entry(e);
				+	list_del(&e->list);
				+
				+	return e;
				+}
			
 
				+
			
 
				+/*
				+ * Drop the mapping for oblock: the entry goes to the tail of the free
				+ * list and the allocation count is decremented.
				+ */
				+static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
				+{
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *victim;
				+	unsigned long irq_flags;
				+
				+	spin_lock_irqsave(&p->lock, irq_flags);
				+
				+	victim = __wb_force_remove_mapping(p, oblock);
				+	list_add_tail(&victim->list, &p->free);
				+
				+	BUG_ON(!from_cblock(p->nr_cblocks_allocated));
				+	p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
				+
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+}
			
 
				+
			
 
				+/*
				+ * Re-key the entry currently mapped to current_oblock so that it maps
				+ * oblock instead.  The entry keeps its cblock and dirty state and is
				+ * re-queued on the list matching that state.
				+ */
				+static void wb_force_mapping(struct dm_cache_policy *pe,
				+				dm_oblock_t current_oblock, dm_oblock_t oblock)
				+{
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *moved;
				+	unsigned long irq_flags;
				+
				+	spin_lock_irqsave(&p->lock, irq_flags);
				+
				+	moved = __wb_force_remove_mapping(p, current_oblock);
				+	moved->oblock = oblock;
				+	add_cache_entry(p, moved);
				+
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+}
			
 
				+
			
 
				+/*
				+ * Move the first entry of the dirty list to the clean_pending list and
				+ * return it, or return NULL if nothing is dirty.  The entry's dirty and
				+ * pending bits are left untouched.
				+ */
				+static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
				+{
				+	struct list_head *l;
				+
				+	if (list_empty(&p->dirty))
				+		return NULL;
				+
				+	l = list_pop(&p->dirty);
				+	list_add(l, &p->clean_pending);
				+
				+	return container_of(l, struct wb_cache_entry, list);
				+}
			
 
				+
			
 
				+/*
				+ * Hand out the next dirty block for writeback.  Returns 0 and fills in
				+ * *oblock and *cblock, or -ENOENT when no dirty entry is left.
				+ */
				+static int wb_writeback_work(struct dm_cache_policy *pe,
				+			     dm_oblock_t *oblock,
				+			     dm_cblock_t *cblock)
				+{
				+	int r = 0;
				+	struct policy *p = to_policy(pe);
				+	struct wb_cache_entry *next;
				+	unsigned long irq_flags;
				+
				+	spin_lock_irqsave(&p->lock, irq_flags);
				+
				+	next = get_next_dirty_entry(p);
				+	if (!next) {
				+		r = -ENOENT;
				+	} else {
				+		*oblock = next->oblock;
				+		*cblock = next->cblock;
				+	}
				+
				+	spin_unlock_irqrestore(&p->lock, irq_flags);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Number of cache blocks currently allocated; read without the lock. */
				+static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
				+{
				+	struct policy *p = to_policy(pe);
				+
				+	return p->nr_cblocks_allocated;
				+}
			
 
				+
			
 
				+/*
				+ * Init the policy plugin interface function pointers.  walk_mappings and
				+ * tick are left unimplemented (NULL).
				+ */
				+static void init_policy_functions(struct policy *p)
				+{
				+	struct dm_cache_policy *ops = &p->policy;
				+
				+	ops->destroy = wb_destroy;
				+	ops->map = wb_map;
				+	ops->lookup = wb_lookup;
				+	ops->set_dirty = wb_set_dirty;
				+	ops->clear_dirty = wb_clear_dirty;
				+	ops->load_mapping = wb_load_mapping;
				+	ops->walk_mappings = NULL;
				+	ops->remove_mapping = wb_remove_mapping;
				+	ops->writeback_work = wb_writeback_work;
				+	ops->force_mapping = wb_force_mapping;
				+	ops->residency = wb_residency;
				+	ops->tick = NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Create a cleaner policy instance for cache_size cache blocks.
				+ * origin_size and cache_block_size are not used.  Returns the embedded
				+ * generic policy object, or NULL if any allocation fails.
				+ */
				+static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
				+					 sector_t origin_size,
				+					 sector_t cache_block_size)
				+{
				+	struct policy *p;
				+
				+	p = kzalloc(sizeof(*p), GFP_KERNEL);
				+	if (!p)
				+		return NULL;
				+
				+	init_policy_functions(p);
				+	INIT_LIST_HEAD(&p->free);
				+	INIT_LIST_HEAD(&p->clean);
				+	INIT_LIST_HEAD(&p->clean_pending);
				+	INIT_LIST_HEAD(&p->dirty);
				+
				+	p->cache_size = cache_size;
				+	spin_lock_init(&p->lock);
				+
				+	/* Allocate cache entry structs and add them to free list. */
				+	if (alloc_cache_blocks_with_hash(p, cache_size)) {
				+		kfree(p);
				+		return NULL;
				+	}
				+
				+	return &p->policy;
				+}
			
 
				+/*----------------------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Registration record for the "cleaner" policy.  It declares a hint size
				+ * of zero and creates instances through wb_create().
				+ */
				+static struct dm_cache_policy_type wb_policy_type = {
				+	.name = "cleaner",
				+	.hint_size = 0,
				+	.owner = THIS_MODULE,
				+	.create = wb_create
				+};
			
 
				+
			
 
				+/* Module init: register the policy type and log the outcome. */
				+static int __init wb_init(void)
				+{
				+	int r;
				+
				+	r = dm_cache_policy_register(&wb_policy_type);
				+	if (r < 0) {
				+		DMERR("register failed %d", r);
				+		return r;
				+	}
				+
				+	DMINFO("version " CLEANER_VERSION " loaded");
				+
				+	return r;
				+}
			
 
				+
			
 
				+/* Module exit: unregister the policy type registered by wb_init(). */
				+static void __exit wb_exit(void)
				+{
				+	dm_cache_policy_unregister(&wb_policy_type);
				+}
			
 
				+
			
 
				+module_init(wb_init);
			
 
				+module_exit(wb_exit);
			
 
				+
			
 
				+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
			
 
				+MODULE_LICENSE("GPL");
			
 
				+MODULE_DESCRIPTION("cleaner cache policy");
			
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,124 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat. All rights reserved.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#ifndef DM_CACHE_POLICY_INTERNAL_H
			
 
				+#define DM_CACHE_POLICY_INTERNAL_H
			
 
				+
			
 
				+#include "dm-cache-policy.h"
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Little inline functions that simplify calling the policy methods.
			
 
				+ */
			
 
				+/*
				+ * Forwards every argument unchanged to the policy's map method and returns
				+ * its result.  The method pointer is not checked, so it must be set.
				+ */
				+static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
				+			     bool can_block, bool can_migrate, bool discarded_oblock,
				+			     struct bio *bio, struct policy_result *result)
				+{
				+	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
				+}
			
 
				+
			
 
				+/*
				+ * Calls the policy's lookup method for oblock and returns its result;
				+ * cblock is passed through for the method to fill in.  A policy without a
				+ * lookup method is treated as a bug (BUG_ON).
				+ */
				+static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
				+{
				+	BUG_ON(!p->lookup);
				+	return p->lookup(p, oblock, cblock);
				+}
			
 
				+
			
 
				+/* Calls the policy's set_dirty method for oblock; a no-op if it has none. */
				+static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
				+{
				+	if (!p->set_dirty)
				+		return;
				+
				+	p->set_dirty(p, oblock);
				+}
			
 
				+
			
 
				+/* Calls the policy's clear_dirty method for oblock; a no-op if it has none. */
				+static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
				+{
				+	if (!p->clear_dirty)
				+		return;
				+
				+	p->clear_dirty(p, oblock);
				+}
			
 
				+
			
 
				+/*
				+ * Forwards an oblock -> cblock mapping, plus its hint and hint_valid flag,
				+ * to the policy's load_mapping method and returns its result.  The method
				+ * pointer is not checked, so it must be set.
				+ */
				+static inline int policy_load_mapping(struct dm_cache_policy *p,
				+				      dm_oblock_t oblock, dm_cblock_t cblock,
				+				      uint32_t hint, bool hint_valid)
				+{
				+	return p->load_mapping(p, oblock, cblock, hint, hint_valid);
				+}
			
 
				+
			
 
				+/*
				+ * Calls the policy's walk_mappings method with fn and context and returns
				+ * its result.  A policy without the method yields 0.
				+ */
				+static inline int policy_walk_mappings(struct dm_cache_policy *p,
				+				      policy_walk_fn fn, void *context)
				+{
				+	if (!p->walk_mappings)
				+		return 0;
				+
				+	return p->walk_mappings(p, fn, context);
				+}
			
 
				+
			
 
				+/*
				+ * Calls the policy's writeback_work method, passing oblock and cblock
				+ * through, and returns its result.  A policy without the method yields
				+ * -ENOENT.
				+ */
				+static inline int policy_writeback_work(struct dm_cache_policy *p,
				+					dm_oblock_t *oblock,
				+					dm_cblock_t *cblock)
				+{
				+	if (!p->writeback_work)
				+		return -ENOENT;
				+
				+	return p->writeback_work(p, oblock, cblock);
				+}
			
 
				+
			
 
				+/*
				+ * Calls the policy's remove_mapping method for oblock.  The method
				+ * pointer is not checked, so it must be set.
				+ */
				+static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
				+{
				+	/* Plain call: a void function must not 'return <expression>'. */
				+	p->remove_mapping(p, oblock);
				+}
			
 
				+
			
 
				+/*
				+ * Calls the policy's force_mapping method with current_oblock and
				+ * new_oblock.  The method pointer is not checked, so it must be set.
				+ */
				+static inline void policy_force_mapping(struct dm_cache_policy *p,
				+					dm_oblock_t current_oblock, dm_oblock_t new_oblock)
				+{
				+	/* Plain call: a void function must not 'return <expression>'. */
				+	p->force_mapping(p, current_oblock, new_oblock);
				+}
			
 
				+
			
 
				+/*
				+ * Returns the value of the policy's residency method.  The method pointer
				+ * is not checked, so it must be set.
				+ */
				+static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
				+{
				+	return p->residency(p);
				+}
			
 
				+
			
 
				+/* Calls the policy's tick method; a no-op if the policy has none. */
				+static inline void policy_tick(struct dm_cache_policy *p)
				+{
				+	/* Plain call: a void function must not 'return <expression>'. */
				+	if (p->tick)
				+		p->tick(p);
				+}
			
 
				+
			
 
				+/*
				+ * Lets the policy emit its config values into result (maxlen is passed
				+ * through) and returns the method's result.  A policy without the method
				+ * emits "0" via DMEMIT() and returns 0.
				+ */
				+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
				+{
				+	/* NOTE(review): DMEMIT() presumably expands to code using sz, result and maxlen. */
				+	ssize_t sz = 0;
				+
				+	if (!p->emit_config_values) {
				+		DMEMIT("0");
				+		return 0;
				+	}
				+
				+	return p->emit_config_values(p, result, maxlen);
				+}
			
 
				+
			
 
				+/*
				+ * Passes a key/value pair to the policy's set_config_value method and
				+ * returns its result.  A policy without the method yields -EINVAL.
				+ */
				+static inline int policy_set_config_value(struct dm_cache_policy *p,
				+					  const char *key, const char *value)
				+{
				+	if (!p->set_config_value)
				+		return -EINVAL;
				+
				+	return p->set_config_value(p, key, value);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
			
 
				+ */
			
 
				+struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
			
 
				+					       sector_t origin_size, sector_t block_size);
			
 
				+
			
 
				+/*
			
 
				+ * Destroys the policy.  This drops references to the policy module as well
			
 
				+ * as calling its destroy method.  So always use this rather than calling
			
 
				+ * the policy->destroy method directly.
			
 
				+ */
			
 
				+void dm_cache_policy_destroy(struct dm_cache_policy *p);
			
 
				+
			
 
				+/*
			
 
				+ * In case we've forgotten.
			
 
				+ */
			
 
				+const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
			
 
				+
			
 
				+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#endif /* DM_CACHE_POLICY_INTERNAL_H */
			
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -0,0 +1,1195 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat. All rights reserved.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm-cache-policy.h"
			
 
				+#include "dm.h"
			
 
				+
			
 
				+#include <linux/hash.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/mutex.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/vmalloc.h>
			
 
				+
			
 
				+#define DM_MSG_PREFIX "cache-policy-mq"
			
 
				+#define MQ_VERSION	"1.0.0"
			
 
				+
			
 
				+static struct kmem_cache *mq_entry_cache;
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/* Rounds the larger of n and min up to a power of two. */
				+static unsigned next_power(unsigned n, unsigned min)
				+{
				+	unsigned at_least = max(n, min);
				+
				+	return roundup_pow_of_two(at_least);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Allocates a zeroed bitset with room for nr_entries bits, rounded up to
				+ * whole unsigned longs.  Returns vzalloc()'s result (NULL on failure).
				+ */
				+static unsigned long *alloc_bitset(unsigned nr_entries)
				+{
				+	size_t nr_words = dm_div_up(nr_entries, BITS_PER_LONG);
				+
				+	return vzalloc(sizeof(unsigned long) * nr_words);
				+}
			
 
				+
			
 
				+/* Releases a bitset obtained from alloc_bitset(). */
				+static void free_bitset(unsigned long *bits)
				+{
				+	vfree(bits);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Large, sequential ios are probably better left on the origin device since
			
 
				+ * spindles tend to have good bandwidth.
			
 
				+ *
			
 
				+ * The io_tracker tries to spot when the io is in one of these sequential
			
 
				+ * modes.
			
 
				+ *
			
 
				+ * The two thresholds used to switch between random and sequential io mode
				+ * default to the values below and can be adjusted via the constructor and
				+ * message interfaces.
			
 
				+ */
			
 
				+#define RANDOM_THRESHOLD_DEFAULT 4
			
 
				+#define SEQUENTIAL_THRESHOLD_DEFAULT 512
			
 
				+
			
 
				+/*
				+ * The io pattern the tracker currently believes it is seeing.  The values
				+ * are also used as indices into io_tracker.thresholds[].
				+ */
				+enum io_pattern {
				+	PATTERN_SEQUENTIAL,
				+	PATTERN_RANDOM
				+};
			
 
				+
			
 
				+/* State used to classify the incoming io stream as sequential or random. */
				+struct io_tracker {
				+	/* Current classification. */
				+	enum io_pattern pattern;
				+
				+	/* Bios that started right after the previous bio ended. */
				+	unsigned nr_seq_samples;
				+	/* Bios that did not. */
				+	unsigned nr_rand_samples;
				+	/* Indexed by enum io_pattern: samples needed to switch to that pattern. */
				+	unsigned thresholds[2];
				+
				+	/*
				+	 * Last sector of the most recent bio.  It is computed from
				+	 * bi_sector/bio_sectors() and merely stored in the oblock type.
				+	 */
				+	dm_oblock_t last_end_oblock;
				+};
			
 
				+
			
 
				+/*
				+ * Resets the tracker: random mode, zeroed sample counters and end
				+ * position, and the two switch thresholds as supplied.
				+ */
				+static void iot_init(struct io_tracker *t,
				+		     int sequential_threshold, int random_threshold)
				+{
				+	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
				+	t->thresholds[PATTERN_RANDOM] = random_threshold;
				+
				+	t->pattern = PATTERN_RANDOM;
				+	t->last_end_oblock = 0;
				+	t->nr_rand_samples = 0;
				+	t->nr_seq_samples = 0;
				+}
			
 
				+
			
 
				+/* Returns the pattern the tracker currently reports. */
				+static enum io_pattern iot_pattern(struct io_tracker *t)
				+{
				+	return t->pattern;
				+}
			
 
				+
			
 
				+/*
				+ * Accounts one bio.  A bio starting on the sector immediately after the
				+ * previous bio's last sector counts as a sequential sample; any other bio
				+ * counts as a random sample and, if sequential samples had been collected,
				+ * first resets both counters.  Finally records this bio's last sector.
				+ */
				+static void iot_update_stats(struct io_tracker *t, struct bio *bio)
				+{
				+	bool contiguous = bio->bi_sector == from_oblock(t->last_end_oblock) + 1;
				+
				+	if (!contiguous) {
				+		/* A single non-sequential io is enough to restart counting. */
				+		if (t->nr_seq_samples) {
				+			t->nr_seq_samples = 0;
				+			t->nr_rand_samples = 0;
				+		}
				+
				+		t->nr_rand_samples++;
				+	} else
				+		t->nr_seq_samples++;
				+
				+	t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
				+}
			
 
				+
			
 
				+/*
				+ * Flips the tracked pattern once enough samples of the other kind have
				+ * been collected, i.e. the count reaches the threshold of the pattern
				+ * being switched to.  Both counters are zeroed on a switch.
				+ */
				+static void iot_check_for_pattern_switch(struct io_tracker *t)
				+{
				+	if (t->pattern == PATTERN_SEQUENTIAL) {
				+		if (t->nr_rand_samples < t->thresholds[PATTERN_RANDOM])
				+			return;
				+
				+		t->pattern = PATTERN_RANDOM;
				+	} else if (t->pattern == PATTERN_RANDOM) {
				+		if (t->nr_seq_samples < t->thresholds[PATTERN_SEQUENTIAL])
				+			return;
				+
				+		t->pattern = PATTERN_SEQUENTIAL;
				+	} else
				+		return;
				+
				+	t->nr_seq_samples = t->nr_rand_samples = 0;
				+}
			
 
				+
			
 
				+/*
				+ * Feeds one bio to the tracker: updates the sample counters, then checks
				+ * whether they justify switching pattern.
				+ */
				+static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
				+{
				+	iot_update_stats(t, bio);
				+	iot_check_for_pattern_switch(t);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * This queue is divided up into different levels.  Allowing us to push
			
 
				+ * entries to the back of any of the levels.  Think of it as a partially
			
 
				+ * sorted queue.
			
 
				+ */
			
 
				+#define NR_QUEUE_LEVELS 16u
			
 
				+
			
 
				+/* One list per level; queue_pop() scans the levels from index 0 upwards. */
				+struct queue {
				+	struct list_head qs[NR_QUEUE_LEVELS];
				+};
			
 
				+
			
 
				+/* Initialises every level of the queue to an empty list. */
				+static void queue_init(struct queue *q)
				+{
				+	unsigned level = 0;
				+
				+	while (level < NR_QUEUE_LEVELS) {
				+		INIT_LIST_HEAD(&q->qs[level]);
				+		level++;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Appends elt to the tail of the given level.  level is not range
				+ * checked here.
				+ */
				+static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
				+{
				+	struct list_head *head = &q->qs[level];
				+
				+	list_add_tail(elt, head);
				+}
			
 
				+
			
 
				+/* Unlinks elt from whichever list it is currently on. */
				+static void queue_remove(struct list_head *elt)
				+{
				+	list_del(elt);
				+}
			
 
				+
			
 
				+/*
				+ * Moves the contents of every level into the level below it, working
				+ * upwards from level 1; each source level is left empty.  The overall
				+ * order of the queue is unaffected.
				+ */
				+static void queue_shift_down(struct queue *q)
				+{
				+	unsigned from;
				+
				+	for (from = 1; from < NR_QUEUE_LEVELS; from++) {
				+		struct list_head *src = &q->qs[from];
				+		struct list_head *dst = &q->qs[from - 1];
				+
				+		list_splice_init(src, dst);
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Removes and returns the oldest entry of the lowest populated level, or
				+ * NULL if every level is empty.  If the pop empties level 0, all levels
				+ * are shifted down by one.
				+ */
				+static struct list_head *queue_pop(struct queue *q)
				+{
				+	unsigned level;
				+
				+	for (level = 0; level < NR_QUEUE_LEVELS; level++) {
				+		struct list_head *head = &q->qs[level];
				+		struct list_head *oldest;
				+
				+		if (list_empty(head))
				+			continue;
				+
				+		oldest = head->next;
				+		list_del(oldest);
				+
				+		/* have we just emptied the bottom level? */
				+		if (level == 0 && list_empty(&q->qs[0]))
				+			queue_shift_down(q);
				+
				+		return oldest;
				+	}
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Detaches and returns the first element of lh, left re-initialised
				+ * (list_del_init).  The list must not be empty.
				+ */
				+static struct list_head *list_pop(struct list_head *lh)
				+{
				+	struct list_head *r;
				+
				+	/*
				+	 * An empty list points back at its own head, so a NULL check on
				+	 * lh->next can never fire; without this check the head itself would
				+	 * be handed out as if it were an element.
				+	 */
				+	BUG_ON(list_empty(lh));
				+
				+	r = lh->next;
				+	list_del_init(r);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Describes a cache entry.  Used in both the cache and the pre_cache.
				+ */
				+struct entry {
				+	/* Link in a bucket of the mq_policy hash table. */
				+	struct hlist_node hlist;
				+	/* Link in a queue level, or in the free list. */
				+	struct list_head list;
				+	/* Origin block; the key used by the hash table. */
				+	dm_oblock_t oblock;
				+	dm_cblock_t cblock;	/* valid iff in_cache */
				+
				+	/*
				+	 * FIXME: pack these better
				+	 */
				+	/* Selects the queue (cache vs pre_cache) in push(). */
				+	bool in_cache:1;
				+	/* Determines the queue level, see queue_level(). */
				+	unsigned hit_count;
				+	/* Value of mq->generation when last inserted or requeued. */
				+	unsigned generation;
				+	/* Value of mq->tick at the last push(). */
				+	unsigned tick;
				+};
			
 
				+
			
 
				+/* Per-instance state of the multiqueue policy. */
				+struct mq_policy {
				+	struct dm_cache_policy policy;
				+
				+	/* protects everything */
				+	struct mutex lock;
				+	/* Number of cache blocks; cblocks handed out are below this. */
				+	dm_cblock_t cache_size;
				+	/* Sequential/random io detection, see iot_*(). */
				+	struct io_tracker tracker;
				+
				+	/*
				+	 * We maintain two queues of entries.  The cache proper contains
				+	 * the currently active mappings.  Whereas the pre_cache tracks
				+	 * blocks that are being hit frequently and potential candidates
				+	 * for promotion to the cache.
				+	 */
				+	struct queue pre_cache;
				+	struct queue cache;
				+
				+	/*
				+	 * Keeps track of time, incremented by the core.  We use this to
				+	 * avoid attributing multiple hits within the same tick.
				+	 *
				+	 * Access to tick_protected should be done with the spin lock held.
				+	 * It's copied to tick at the start of the map function (within the
				+	 * mutex).
				+	 */
				+	spinlock_t tick_lock;
				+	unsigned tick_protected;
				+	unsigned tick;
				+
				+	/*
				+	 * A count of the number of times the map function has been called
				+	 * and found an entry in the pre_cache or cache.  Currently used to
				+	 * calculate the generation.  Bumped in requeue_and_update_tick()
				+	 * (at most once per entry per tick) and reset by
				+	 * check_generation().
				+	 */
				+	unsigned hit_count;
				+
				+	/*
				+	 * A generation is a longish period that is used to trigger some
				+	 * book keeping effects.  eg, decrementing hit counts on entries.
				+	 * This is needed to allow the cache to evolve as io patterns
				+	 * change.
				+	 */
				+	unsigned generation;
				+	unsigned generation_period; /* in lookups (will probably change) */
				+
				+	/*
				+	 * Entries in the pre_cache whose hit count passes the promotion
				+	 * threshold move to the cache proper.  Working out the correct
				+	 * value for the promotion_threshold is crucial to this policy.
				+	 */
				+	unsigned promote_threshold;
				+
				+	/*
				+	 * We need cache_size entries for the cache, and choose to have
				+	 * cache_size entries for the pre_cache too.  One motivation for
				+	 * using the same size is to make the hit counts directly
				+	 * comparable between pre_cache and cache.
				+	 *
				+	 * nr_entries_allocated counts the entries handed out by
				+	 * alloc_entry(); free holds the ones not handed out yet.
				+	 */
				+	unsigned nr_entries;
				+	unsigned nr_entries_allocated;
				+	struct list_head free;
				+
				+	/*
				+	 * Cache blocks may be unallocated.  We store this info in a
				+	 * bitset.
				+	 *
				+	 * find_free_cblock() searches the words from find_free_last_word
				+	 * up to find_free_nr_words, then wraps round to word 0;
				+	 * find_free_last_word is updated to the word where a search
				+	 * stopped.
				+	 */
				+	unsigned long *allocation_bitset;
				+	unsigned nr_cblocks_allocated;
				+	unsigned find_free_nr_words;
				+	unsigned find_free_last_word;
				+
				+	/*
				+	 * The hash table allows us to quickly find an entry by origin
				+	 * block.  Both pre_cache and cache entries are in here.
				+	 * hash_bits is the bit count passed to hash_64() to pick a bucket.
				+	 */
				+	unsigned nr_buckets;
				+	dm_block_t hash_bits;
				+	struct hlist_head *table;
				+};
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+/* Free/alloc mq cache entry structures. */
			
 
				+/*
				+ * Splices every level of q onto lh.  list_splice() (not the _init
				+ * variant) is used, so q's level heads are not reinitialised afterwards.
				+ */
				+static void takeout_queue(struct list_head *lh, struct queue *q)
				+{
				+	unsigned i = 0;
				+
				+	while (i < NR_QUEUE_LEVELS) {
				+		list_splice(&q->qs[i], lh);
				+		i++;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Gathers all entries from the pre_cache and cache queues onto the free
				+ * list and hands every entry on it back to mq_entry_cache.  The list
				+ * heads are not reset afterwards.
				+ */
				+static void free_entries(struct mq_policy *mq)
				+{
				+	struct entry *cur, *next;
				+	struct list_head *all = &mq->free;
				+
				+	takeout_queue(all, &mq->pre_cache);
				+	takeout_queue(all, &mq->cache);
				+
				+	list_for_each_entry_safe(cur, next, all, list) {
				+		kmem_cache_free(mq_entry_cache, cur);
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Allocates mq->nr_entries zeroed entry structs from mq_entry_cache and
				+ * puts them on the (freshly initialised) free list.  Note that the count
				+ * is taken from mq->nr_entries; the elts argument is not consulted.
				+ *
				+ * Returns 0 on success.  If an allocation fails, free_entries() is called
				+ * and -ENOMEM is returned.
				+ */
				+static int alloc_entries(struct mq_policy *mq, unsigned elts)
				+{
				+	unsigned remaining;
				+
				+	INIT_LIST_HEAD(&mq->free);
				+	mq->nr_entries_allocated = 0;
				+
				+	for (remaining = mq->nr_entries; remaining; remaining--) {
				+		struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
				+
				+		if (e) {
				+			list_add(&e->list, &mq->free);
				+			continue;
				+		}
				+
				+		free_entries(mq);
				+		return -ENOMEM;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Simple hash table implementation.  Should replace with the standard hash
			
 
				+ * table that's making its way upstream.
			
 
				+ */
			
 
				+/* Adds e at the head of the bucket selected by hashing e->oblock. */
				+static void hash_insert(struct mq_policy *mq, struct entry *e)
				+{
				+	unsigned bucket_nr = hash_64(from_oblock(e->oblock), mq->hash_bits);
				+	struct hlist_head *bucket = &mq->table[bucket_nr];
				+
				+	hlist_add_head(&e->hlist, bucket);
				+}
			
 
				+
			
 
				+/*
				+ * Finds the entry for oblock, or returns NULL.  A hit is moved to the
				+ * front of its bucket before being returned.
				+ */
				+static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
				+{
				+	struct hlist_head *bucket;
				+	struct entry *e;
				+
				+	bucket = &mq->table[hash_64(from_oblock(oblock), mq->hash_bits)];
				+
				+	hlist_for_each_entry(e, bucket, hlist) {
				+		if (e->oblock != oblock)
				+			continue;
				+
				+		/* Safe while iterating: we return straight away. */
				+		hlist_del(&e->hlist);
				+		hlist_add_head(&e->hlist, bucket);
				+		return e;
				+	}
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/* Unlinks e from the hash bucket it is on. */
				+static void hash_remove(struct entry *e)
				+{
				+	hlist_del(&e->hlist);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Hands out an entry structure from the free list, with its list and
				+ * hlist nodes initialised.  Returns NULL if nr_entries entries have
				+ * already been handed out (the free list must then be empty).  Cannot
				+ * fail otherwise.
				+ */
				+static struct entry *alloc_entry(struct mq_policy *mq)
				+{
				+	struct list_head *node;
				+	struct entry *e;
				+
				+	if (mq->nr_entries_allocated >= mq->nr_entries) {
				+		BUG_ON(!list_empty(&mq->free));
				+		return NULL;
				+	}
				+
				+	node = list_pop(&mq->free);
				+	e = list_entry(node, struct entry, list);
				+
				+	INIT_HLIST_NODE(&e->hlist);
				+	INIT_LIST_HEAD(&e->list);
				+	mq->nr_entries_allocated++;
				+
				+	return e;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Marks a cache block as allocated in the bitset and bumps the allocated
				+ * count.  It is a bug to pass a cblock outside the cache or one that is
				+ * already allocated.
				+ */
				+static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
				+{
				+	/* Valid cblocks are 0 .. cache_size - 1, so cache_size itself is out of range. */
				+	BUG_ON(from_cblock(cblock) >= from_cblock(mq->cache_size));
				+	BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
				+
				+	set_bit(from_cblock(cblock), mq->allocation_bitset);
				+	mq->nr_cblocks_allocated++;
				+}
			
 
				+
			
 
				+/*
				+ * Marks a cache block as unallocated in the bitset and drops the
				+ * allocated count.  It is a bug to pass a cblock outside the cache or one
				+ * that is not currently allocated.
				+ */
				+static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
				+{
				+	/* Valid cblocks are 0 .. cache_size - 1, so cache_size itself is out of range. */
				+	BUG_ON(from_cblock(cblock) >= from_cblock(mq->cache_size));
				+	BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
				+
				+	clear_bit(from_cblock(cblock), mq->allocation_bitset);
				+	mq->nr_cblocks_allocated--;
				+}
			
 
				+
			
 
				+/* True while fewer than cache_size cache blocks are marked allocated. */
				+static bool any_free_cblocks(struct mq_policy *mq)
				+{
				+	return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
				+}
			
 
				+
			
 
				+/*
				+ * Scans bitset words [begin, end) for the first word with a clear bit.
				+ * If one is found, *last_word is set to its index and *result to the
				+ * cache block of its first clear bit; the return value is 0 when that
				+ * block lies inside the cache and -ENOSPC when it lies beyond cache_size.
				+ * Returns -ENOSPC with nothing written if every word is full.
				+ *
				+ * This does _not_ mark the cblock as allocated, the caller is
				+ * responsible for that.
				+ */
				+static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
				+			      dm_cblock_t *result, unsigned *last_word)
				+{
				+	unsigned w;
				+
				+	for (w = begin; w < end; w++) {
				+		unsigned long word = mq->allocation_bitset[w];
				+
				+		/* ffz is undefined if no zero exists */
				+		if (word == ~0UL)
				+			continue;
				+
				+		*last_word = w;
				+		*result = to_cblock((w * BITS_PER_LONG) + ffz(word));
				+
				+		if (from_cblock(*result) < from_cblock(mq->cache_size))
				+			return 0;
				+
				+		return -ENOSPC;
				+	}
				+
				+	return -ENOSPC;
				+}
			
 
				+
			
 
				+/*
				+ * Finds an unallocated cache block.  The search starts at the word where
				+ * the previous search stopped and, if that yields -ENOSPC, wraps round to
				+ * cover the words before it.  Returns 0 with *result filled in, or
				+ * -ENOSPC.  The block is not marked allocated.
				+ */
				+static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
				+{
				+	unsigned *last = &mq->find_free_last_word;
				+	int r;
				+
				+	if (!any_free_cblocks(mq))
				+		return -ENOSPC;
				+
				+	r = __find_free_cblock(mq, *last, mq->find_free_nr_words, result, last);
				+	if (r != -ENOSPC || !*last)
				+		return r;
				+
				+	return __find_free_cblock(mq, 0, *last, result, last);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Now we get to the meat of the policy.  This section deals with deciding
			
 
				+ * when to add entries to the pre_cache and cache, and move between
			
 
				+ * them.
			
 
				+ */
			
 
				+
			
 
				+/*
				+ * Maps an entry to a queue level: log2 of its hit count, capped at the
				+ * top level.
				+ */
				+static unsigned queue_level(struct entry *e)
				+{
				+	unsigned level = ilog2(e->hit_count);
				+
				+	return min(level, NR_QUEUE_LEVELS - 1u);
				+}
			
 
				+
			
 
				+/*
				+ * Inserts the entry into the pre_cache or the cache, at the level given
				+ * by its hit count.  Stamps the entry with the current tick, adds it to
				+ * the hash table and, for cache entries, marks the cache block allocated.
				+ */
				+static void push(struct mq_policy *mq, struct entry *e)
				+{
				+	e->tick = mq->tick;
				+	hash_insert(mq, e);
				+
				+	if (!e->in_cache) {
				+		queue_push(&mq->pre_cache, queue_level(e), &e->list);
				+		return;
				+	}
				+
				+	alloc_cblock(mq, e->cblock);
				+	queue_push(&mq->cache, queue_level(e), &e->list);
				+}
			
 
				+
			
 
				+/*
				+ * Takes an entry off its queue and out of the hash table.  For cache
				+ * entries the cache block is marked unallocated as well.
				+ */
				+static void del(struct mq_policy *mq, struct entry *e)
				+{
				+	queue_remove(&e->list);
				+	hash_remove(e);
				+
				+	if (!e->in_cache)
				+		return;
				+
				+	free_cblock(mq, e->cblock);
				+}
			
 
				+
			
 
				+/*
				+ * Like del, except it removes the first entry in the queue (ie. the least
				+ * recently used).  Returns NULL if the queue is empty.
				+ */
				+static struct entry *pop(struct mq_policy *mq, struct queue *q)
				+{
				+	struct entry *e;
				+	struct list_head *h = queue_pop(q);
				+
				+	/*
				+	 * Test the list_head before converting it: 'list' is not the first
				+	 * member of struct entry, so container_of() on NULL would yield a
				+	 * non-NULL bogus pointer.
				+	 */
				+	if (!h)
				+		return NULL;
				+
				+	e = container_of(h, struct entry, list);
				+	hash_remove(e);
				+
				+	if (e->in_cache)
				+		free_cblock(mq, e->cblock);
				+
				+	return e;
				+}
			
 
				+
			
 
				+/*
				+ * True if the entry's tick stamp equals the policy's current tick, ie. it
				+ * has already been pushed during this tick.
				+ */
				+static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
				+{
				+	return e->tick == mq->tick;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * The promotion threshold is adjusted every generation.  As are the counts
			
 
				+ * of the entries.
			
 
				+ *
			
 
				+ * At the moment the threshold is taken by averaging the hit counts of some
				+ * of the entries in the cache (the first MAX_TO_AVERAGE entries found when
				+ * scanning the cache queue from its lowest level upwards).
			
 
				+ *
			
 
				+ * We can be much cleverer than this though.  For example, each promotion
			
 
				+ * could bump up the threshold helping to prevent churn.  Much more to do
			
 
				+ * here.
			
 
				+ */
			
 
				+
			
 
				+#define MAX_TO_AVERAGE 20
			
 
				+
			
 
				+/*
				+ * Starts a new generation once generation_period hits have accumulated
				+ * and every cache block is allocated: resets the hit counter, bumps the
				+ * generation and recomputes promote_threshold as the average hit count
				+ * (rounded up) of up to MAX_TO_AVERAGE cache entries, taken from the
				+ * lowest levels first.  With no entries sampled the threshold becomes 1.
				+ */
				+static void check_generation(struct mq_policy *mq)
				+{
				+	unsigned total = 0, sampled = 0, level, threshold;
				+	struct entry *e;
				+
				+	if (mq->hit_count < mq->generation_period)
				+		return;
				+
				+	if (mq->nr_cblocks_allocated != from_cblock(mq->cache_size))
				+		return;
				+
				+	mq->hit_count = 0;
				+	mq->generation++;
				+
				+	for (level = 0; level < NR_QUEUE_LEVELS && sampled < MAX_TO_AVERAGE; level++) {
				+		list_for_each_entry(e, &mq->cache.qs[level], list) {
				+			total += e->hit_count;
				+
				+			if (++sampled >= MAX_TO_AVERAGE)
				+				break;
				+		}
				+	}
				+
				+	threshold = sampled ? total / sampled : 1;
				+
				+	/* Round the average up rather than down. */
				+	if (threshold * sampled < total)
				+		threshold++;
				+
				+	mq->promote_threshold = threshold;
				+}
			
 
				+
			
 
				+/*
				+ * Whenever we use an entry we bump up its hit counter, and push it to the
				+ * back of the level its new hit count selects.  This is done at most
				+ * once per entry per tick.
				+ */
				+static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
				+{
				+	if (!updated_this_tick(mq, e)) {
				+		e->hit_count++;
				+		mq->hit_count++;
				+		check_generation(mq);
				+
				+		/*
				+		 * FIXME: a generation adjustment, to stop the counts
				+		 * increasing forever, is not applied yet (divide?).  Only
				+		 * the generation stamp is refreshed.
				+		 */
				+		e->generation = mq->generation;
				+
				+		/* Re-insert; push() also refreshes the entry's tick. */
				+		del(mq, e);
				+		push(mq, e);
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Demote the least recently used entry from the cache to the pre_cache.
				+ * Returns the cache block that was freed up, and stores the origin block
				+ * it was mapped to in *oblock.  An empty cache is a bug.
				+ *
				+ * We drop the hit count on the demoted entry back to 1 to stop it bouncing
				+ * straight back into the cache if it's subsequently hit.  There are
				+ * various options here, and more experimentation would be good:
				+ *
				+ * - just forget about the demoted entry completely (ie. don't insert it
				+ *   into the pre_cache).
				+ * - divide the hit count rather than setting it to some hard coded value.
				+ * - set the hit count to a hard coded value other than 1, eg, is it better
				+ *   if it goes in at level 2?
				+ */
				+static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
				+{
				+	struct entry *victim = pop(mq, &mq->cache);
				+	dm_cblock_t freed;
				+
				+	BUG_ON(!victim);
				+
				+	*oblock = victim->oblock;
				+	freed = victim->cblock;
				+
				+	victim->hit_count = 1;
				+	victim->in_cache = false;
				+	push(mq, victim);
				+
				+	return freed;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * We modify the basic promotion_threshold depending on the specific io.
			
 
				+ *
			
 
				+ * If the origin block has been discarded then there's no cost to copy it
			
 
				+ * to the cache.
			
 
				+ *
			
 
				+ * We bias towards reads, since they can be demoted at no cost if they
			
 
				+ * haven't been dirtied.
			
 
				+ */
			
 
				+#define DISCARDED_PROMOTE_THRESHOLD 1
			
 
				+#define READ_PROMOTE_THRESHOLD 4
			
 
				+#define WRITE_PROMOTE_THRESHOLD 8
			
 
				+
			
 
				+/*
				+ * Returns the hit count an entry needs to be promoted for this io:
				+ * DISCARDED_PROMOTE_THRESHOLD for a write to a discarded origin block
				+ * while free cache blocks remain, otherwise promote_threshold plus the
				+ * read or write bias.
				+ */
				+static unsigned adjusted_promote_threshold(struct mq_policy *mq,
				+					   bool discarded_oblock, int data_dir)
				+{
				+	unsigned bias;
				+
				+	/*
				+	 * We don't need to do any copying at all, so give this a
				+	 * very low threshold.  In practice this only triggers
				+	 * during initial population after a format.
				+	 */
				+	if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
				+		return DISCARDED_PROMOTE_THRESHOLD;
				+
				+	if (data_dir == READ)
				+		bias = READ_PROMOTE_THRESHOLD;
				+	else
				+		bias = WRITE_PROMOTE_THRESHOLD;
				+
				+	return mq->promote_threshold + bias;
				+}
			
 
				+
			
 
				+/* True if the entry's hit count has reached the threshold for this io. */
				+static bool should_promote(struct mq_policy *mq, struct entry *e,
				+			   bool discarded_oblock, int data_dir)
				+{
				+	unsigned needed = adjusted_promote_threshold(mq, discarded_oblock, data_dir);
				+
				+	return e->hit_count >= needed;
				+}
			
 
				+
			
 
				+/*
				+ * Handles a lookup that found entry e: requeues it and, if it is in the
				+ * cache, reports POLICY_HIT with its cache block.  result is left
				+ * untouched for an entry that is not in the cache.  Always returns 0.
				+ */
				+static int cache_entry_found(struct mq_policy *mq,
				+			     struct entry *e,
				+			     struct policy_result *result)
				+{
				+	requeue_and_update_tick(mq, e);
				+
				+	if (!e->in_cache)
				+		return 0;
				+
				+	result->op = POLICY_HIT;
				+	result->cblock = e->cblock;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Moves an entry from the pre_cache to the cache.  The main work is
				+ * finding which cache block to use: a free one (POLICY_NEW) or, if there
				+ * is none, the block of a demoted cache entry (POLICY_REPLACE, with the
				+ * evicted origin block in result->old_oblock).  Always returns 0.
				+ */
				+static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
				+			      struct policy_result *result)
				+{
				+	dm_cblock_t cblock;
				+
				+	if (find_free_cblock(mq, &cblock) != -ENOSPC)
				+		result->op = POLICY_NEW;
				+	else {
				+		result->op = POLICY_REPLACE;
				+		cblock = demote_cblock(mq, &result->old_oblock);
				+	}
				+
				+	e->cblock = cblock;
				+	result->cblock = cblock;
				+
				+	/* Leave the pre_cache as a non-cache entry, re-enter as a cache entry. */
				+	del(mq, e);
				+	e->in_cache = true;
				+	push(mq, e);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Handles a hit on a pre_cache entry.  The entry is requeued, then:
				+ *  - POLICY_MISS if it was already updated this tick (and the origin
				+ *    block is not discarded) or it does not qualify for promotion;
				+ *  - -EWOULDBLOCK if it qualifies but migration is not allowed;
				+ *  - otherwise it is promoted via pre_cache_to_cache().
				+ */
				+static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
				+				 bool can_migrate, bool discarded_oblock,
				+				 int data_dir, struct policy_result *result)
				+{
				+	/* Must be sampled before requeueing, which refreshes the tick. */
				+	bool updated = updated_this_tick(mq, e);
				+
				+	requeue_and_update_tick(mq, e);
				+
				+	if ((!discarded_oblock && updated) ||
				+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
				+		result->op = POLICY_MISS;
				+		return 0;
				+	}
				+
				+	if (!can_migrate)
				+		return -EWOULDBLOCK;
				+
				+	return pre_cache_to_cache(mq, e, result);
				+}
			
 
				+
			
 
				+/*
				+ * Starts tracking oblock in the pre_cache with a hit count of 1.  Uses a
				+ * spare entry structure if there is one, otherwise recycles the entry
				+ * popped from the pre_cache.  Warns and does nothing if neither yields
				+ * an entry.
				+ */
				+static void insert_in_pre_cache(struct mq_policy *mq,
				+				dm_oblock_t oblock)
				+{
				+	struct entry *e;
				+
				+	e = alloc_entry(mq);
				+	if (!e) {
				+		/*
				+		 * There's no spare entry structure, so we grab the least
				+		 * used one from the pre_cache.
				+		 */
				+		e = pop(mq, &mq->pre_cache);
				+		if (unlikely(!e)) {
				+			DMWARN("couldn't pop from pre cache");
				+			return;
				+		}
				+	}
				+
				+	e->oblock = oblock;
				+	e->in_cache = false;
				+	e->generation = mq->generation;
				+	e->hit_count = 1;
				+	push(mq, e);
				+}
			
 
				+
			
 
				+/*
				+ * Puts oblock straight into the cache if a free cache block exists,
				+ * reporting POLICY_NEW and the chosen block.  With no free cache block
				+ * the oblock goes to the pre_cache instead and POLICY_MISS is reported;
				+ * POLICY_MISS is also reported if no entry structure is available.
				+ */
				+static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
				+			    struct policy_result *result)
				+{
				+	dm_cblock_t cblock;
				+	struct entry *e;
				+
				+	if (find_free_cblock(mq, &cblock) == -ENOSPC) {
				+		result->op = POLICY_MISS;
				+		insert_in_pre_cache(mq, oblock);
				+		return;
				+	}
				+
				+	e = alloc_entry(mq);
				+	if (unlikely(!e)) {
				+		result->op = POLICY_MISS;
				+		return;
				+	}
				+
				+	e->in_cache = true;
				+	e->cblock = cblock;
				+	e->oblock = oblock;
				+	e->generation = mq->generation;
				+	e->hit_count = 1;
				+	push(mq, e);
				+
				+	result->cblock = cblock;
				+	result->op = POLICY_NEW;
				+}
			
 
				+
			
 
				+/*
				+ * Handles a bio for an origin block the policy has no entry for.  When the
				+ * adjusted promote threshold is exactly 1 the block goes straight into the
				+ * cache (or -EWOULDBLOCK is returned if !can_migrate); otherwise it is
				+ * added to the pre_cache and reported as a miss.
				+ */
				+static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
				+			  bool can_migrate, bool discarded_oblock,
				+			  int data_dir, struct policy_result *result)
				+{
				+	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) != 1) {
				+		insert_in_pre_cache(mq, oblock);
				+		result->op = POLICY_MISS;
				+		return 0;
				+	}
				+
				+	if (!can_migrate)
				+		return -EWOULDBLOCK;
				+
				+	insert_in_cache(mq, oblock, result);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Core mapping decision.  Looks oblock up in the hash table and dispatches:
				+ * entries already in the cache go to cache_entry_found(); otherwise a
				+ * sequential io pattern is always a miss; otherwise the pre_cache entry
				+ * (if any) or the no-entry path decides.  Whenever -EWOULDBLOCK is
				+ * returned result->op is forced to POLICY_MISS.
				+ */
				+static int map(struct mq_policy *mq, dm_oblock_t oblock,
				+	       bool can_migrate, bool discarded_oblock,
				+	       int data_dir, struct policy_result *result)
				+{
				+	struct entry *e = hash_lookup(mq, oblock);
				+	int r = 0;
				+
				+	if (e && e->in_cache) {
				+		r = cache_entry_found(mq, e, result);
				+	} else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) {
				+		result->op = POLICY_MISS;
				+	} else if (!e) {
				+		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
				+				   data_dir, result);
				+	} else {
				+		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
				+					  data_dir, result);
				+	}
				+
				+	if (r == -EWOULDBLOCK)
				+		result->op = POLICY_MISS;
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
			
 
				+ * description of these.
			
 
				+ */
			
 
				+
			
 
				+/* Recovers the enclosing mq_policy from its embedded generic policy. */
				+static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
				+{
				+	struct mq_policy *mq = container_of(p, struct mq_policy, policy);
				+
				+	return mq;
				+}
			
 
				+
			
 
				+/*
				+ * destroy method: releases the allocation bitset, the hash table, the
				+ * entries and finally the mq_policy itself.
				+ */
				+static void mq_destroy(struct dm_cache_policy *p)
				+{
				+	struct mq_policy *mq = container_of(p, struct mq_policy, policy);
				+
				+	free_bitset(mq->allocation_bitset);
				+	kfree(mq->table);
				+	free_entries(mq);
				+
				+	/* mq embeds *p, so p is invalid from here on. */
				+	kfree(mq);
				+}
			
 
				+
			
 
				+/*
				+ * Snapshots tick_protected (written by mq_tick() under tick_lock) into
				+ * mq->tick.
				+ */
				+static void copy_tick(struct mq_policy *mq)
				+{
				+	unsigned long irq_flags;
				+
				+	spin_lock_irqsave(&mq->tick_lock, irq_flags);
				+	mq->tick = mq->tick_protected;
				+	spin_unlock_irqrestore(&mq->tick_lock, irq_flags);
				+}
			
 
				+
			
 
				+/*
				+ * map method.  result->op defaults to POLICY_MISS.  Takes mq->lock,
				+ * sleeping for it only when can_block is set; otherwise a contended lock
				+ * yields -EWOULDBLOCK.  With the lock held the tick is refreshed, the bio
				+ * is fed to the io tracker and map() makes the decision.
				+ */
				+static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
				+		  bool can_block, bool can_migrate, bool discarded_oblock,
				+		  struct bio *bio, struct policy_result *result)
				+{
				+	struct mq_policy *mq = to_mq_policy(p);
				+	int r;
				+
				+	result->op = POLICY_MISS;
				+
				+	if (!can_block) {
				+		if (!mutex_trylock(&mq->lock))
				+			return -EWOULDBLOCK;
				+	} else {
				+		mutex_lock(&mq->lock);
				+	}
				+
				+	copy_tick(mq);
				+	iot_examine_bio(&mq->tracker, bio);
				+
				+	r = map(mq, oblock, can_migrate, discarded_oblock,
				+		bio_data_dir(bio), result);
				+
				+	mutex_unlock(&mq->lock);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * lookup method.  Never sleeps: returns -EWOULDBLOCK if mq->lock is
				+ * contended.  Returns 0 and fills in *cblock when oblock is in the cache,
				+ * -ENOENT otherwise.  No statistics are updated.
				+ */
				+static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
				+{
				+	struct mq_policy *mq = to_mq_policy(p);
				+	struct entry *e;
				+	int r = -ENOENT;
				+
				+	if (!mutex_trylock(&mq->lock))
				+		return -EWOULDBLOCK;
				+
				+	e = hash_lookup(mq, oblock);
				+	if (e && e->in_cache) {
				+		*cblock = e->cblock;
				+		r = 0;
				+	}
				+
				+	mutex_unlock(&mq->lock);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * load_mapping method: records an existing oblock -> cblock mapping as an
				+ * in-cache entry.  The hit count is seeded from hint when hint_valid is
				+ * set, else 1.  Returns -ENOMEM if no entry can be allocated.
				+ */
				+static int mq_load_mapping(struct dm_cache_policy *p,
				+			   dm_oblock_t oblock, dm_cblock_t cblock,
				+			   uint32_t hint, bool hint_valid)
				+{
				+	struct mq_policy *mq = to_mq_policy(p);
				+	struct entry *loaded = alloc_entry(mq);
				+
				+	if (!loaded)
				+		return -ENOMEM;
				+
				+	loaded->oblock = oblock;
				+	loaded->cblock = cblock;
				+	loaded->in_cache = true;
				+	loaded->generation = mq->generation;
				+
				+	if (hint_valid)
				+		loaded->hit_count = hint;
				+	else
				+		loaded->hit_count = 1;
				+
				+	push(mq, loaded);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * walk_mappings method: calls fn for every entry on the cache queues,
				+ * level by level, passing the entry's hit_count as the hint.  Stops at the
				+ * first non-zero return from fn and returns it; returns 0 otherwise.
				+ * Runs with mq->lock held.
				+ */
				+static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
				+			    void *context)
				+{
				+	struct mq_policy *mq = to_mq_policy(p);
				+	struct entry *e;
				+	unsigned level;
				+	int r = 0;
				+
				+	mutex_lock(&mq->lock);
				+
				+	/* A non-zero r ends the inner loop and fails the outer condition. */
				+	for (level = 0; level < NR_QUEUE_LEVELS && !r; level++) {
				+		list_for_each_entry(e, &mq->cache.qs[level], list) {
				+			r = fn(context, e->cblock, e->oblock, e->hit_count);
				+			if (r)
				+				break;
				+		}
				+	}
				+
				+	mutex_unlock(&mq->lock);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Marks the entry for oblock as no longer in the cache and requeues it.
				+ * BUGs if oblock has no entry or the entry is not in the cache.
				+ */
				+static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
				+{
				+	struct entry *mapped = hash_lookup(mq, oblock);
				+
				+	BUG_ON(!mapped || !mapped->in_cache);
				+
				+	/* Unlink first: push() files the entry according to in_cache. */
				+	del(mq, mapped);
				+	mapped->in_cache = false;
				+	push(mq, mapped);
				+}
			
 
				+
			
 
				+/* remove_mapping method: remove_mapping() run under mq->lock. */
				+static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
				+{
				+	struct mq_policy *mq = container_of(p, struct mq_policy, policy);
				+
				+	mutex_lock(&mq->lock);
				+	remove_mapping(mq, oblock);
				+	mutex_unlock(&mq->lock);
				+}
			
 
				+
			
 
				+/*
				+ * Re-keys the in-cache entry for current_oblock so that it maps
				+ * new_oblock.  BUGs if current_oblock has no in-cache entry.
				+ */
				+static void force_mapping(struct mq_policy *mq,
				+			  dm_oblock_t current_oblock, dm_oblock_t new_oblock)
				+{
				+	struct entry *mapped = hash_lookup(mq, current_oblock);
				+
				+	BUG_ON(!mapped || !mapped->in_cache);
				+
				+	/* Remove and re-add around the oblock change. */
				+	del(mq, mapped);
				+	mapped->oblock = new_oblock;
				+	push(mq, mapped);
				+}
			
 
				+
			
 
				+/* force_mapping method: force_mapping() run under mq->lock. */
				+static void mq_force_mapping(struct dm_cache_policy *p,
				+			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
				+{
				+	struct mq_policy *mq = container_of(p, struct mq_policy, policy);
				+
				+	mutex_lock(&mq->lock);
				+	force_mapping(mq, current_oblock, new_oblock);
				+	mutex_unlock(&mq->lock);
				+}
			
 
				+
			
 
				+/*
				+ * residency method: reports nr_cblocks_allocated as a dm_cblock_t.
				+ * The counter is read without taking mq->lock.
				+ */
				+static dm_cblock_t mq_residency(struct dm_cache_policy *p)
				+{
				+	struct mq_policy *mq = container_of(p, struct mq_policy, policy);
				+
				+	/* FIXME: lock mutex, not sure we can block here */
				+	return to_cblock(mq->nr_cblocks_allocated);
				+}
			
 
				+
			
 
				+/*
				+ * tick method: bumps tick_protected under tick_lock.  copy_tick() later
				+ * transfers the value to mq->tick.
				+ */
				+static void mq_tick(struct dm_cache_policy *p)
				+{
				+	unsigned long irq_flags;
				+	struct mq_policy *mq = to_mq_policy(p);
				+
				+	spin_lock_irqsave(&mq->tick_lock, irq_flags);
				+	mq->tick_protected++;
				+	spin_unlock_irqrestore(&mq->tick_lock, irq_flags);
				+}
			
 
				+
			
 
				+/*
				+ * set_config_value method.  Accepts the keys "random_threshold" and
				+ * "sequential_threshold" (matched case-insensitively) with a base-10
				+ * unsigned value.  Returns -EINVAL for any other key or an unparsable
				+ * value, 0 on success.
				+ */
				+static int mq_set_config_value(struct dm_cache_policy *p,
				+			       const char *key, const char *value)
				+{
				+	struct mq_policy *mq = to_mq_policy(p);
				+	enum io_pattern pattern;
				+	unsigned long threshold;
				+
				+	if (strcasecmp(key, "random_threshold") == 0)
				+		pattern = PATTERN_RANDOM;
				+	else if (strcasecmp(key, "sequential_threshold") == 0)
				+		pattern = PATTERN_SEQUENTIAL;
				+	else
				+		return -EINVAL;
				+
				+	if (kstrtoul(value, 10, &threshold) != 0)
				+		return -EINVAL;
				+
				+	/*
				+	 * NOTE(review): thresholds[] is emitted with %u elsewhere, so it is
				+	 * presumably narrower than unsigned long and large values would be
				+	 * truncated here - confirm against the io_tracker definition.
				+	 */
				+	mq->tracker.thresholds[pattern] = threshold;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * emit_config_values method: emits "4 random_threshold <n>
				+ * sequential_threshold <n>" into result via DMEMIT.  Always returns 0.
				+ */
				+static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
				+{
				+	struct mq_policy *mq = container_of(p, struct mq_policy, policy);
				+	/* DMEMIT expects locals/parameters named sz, result and maxlen. */
				+	ssize_t sz = 0;
				+
				+	DMEMIT("4 random_threshold %u sequential_threshold %u",
				+	       mq->tracker.thresholds[PATTERN_RANDOM],
				+	       mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Fills in the policy method table embedded in mq.  writeback_work is
				+ * explicitly NULL; set_dirty and clear_dirty are not assigned here.
				+ */
				+static void init_policy_functions(struct mq_policy *mq)
				+{
				+	struct dm_cache_policy *ops = &mq->policy;
				+
				+	ops->destroy = mq_destroy;
				+	ops->map = mq_map;
				+	ops->lookup = mq_lookup;
				+	ops->load_mapping = mq_load_mapping;
				+	ops->walk_mappings = mq_walk_mappings;
				+	ops->remove_mapping = mq_remove_mapping;
				+	ops->force_mapping = mq_force_mapping;
				+	ops->writeback_work = NULL;
				+	ops->residency = mq_residency;
				+	ops->tick = mq_tick;
				+	ops->emit_config_values = mq_emit_config_values;
				+	ops->set_config_value = mq_set_config_value;
				+}
			
 
				+
			
 
				+/*
				+ * create method: allocates and initialises an mq policy for a cache of
				+ * cache_size blocks.  origin_size and cache_block_size are not used.
				+ *
				+ * Preallocates 2 * cache_size entries, a hash table of nr_buckets slots
				+ * and the cblock allocation bitset.  Returns the embedded dm_cache_policy,
				+ * or NULL if any allocation fails (everything allocated so far is freed).
				+ */
				+static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
				+					 sector_t origin_size,
				+					 sector_t cache_block_size)
				+{
				+	int r;
				+	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
				+
				+	if (!mq)
				+		return NULL;
				+
				+	init_policy_functions(mq);
				+	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
				+
				+	mq->cache_size = cache_size;
				+	mq->tick_protected = 0;
				+	mq->tick = 0;
				+	mq->hit_count = 0;
				+	mq->generation = 0;
				+	mq->promote_threshold = 0;
				+	mutex_init(&mq->lock);
				+	spin_lock_init(&mq->tick_lock);
				+	mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
				+	mq->find_free_last_word = 0;
				+
				+	queue_init(&mq->pre_cache);
				+	queue_init(&mq->cache);
				+	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
				+
				+	mq->nr_entries = 2 * from_cblock(cache_size);
				+	r = alloc_entries(mq, mq->nr_entries);
				+	if (r)
				+		goto bad_cache_alloc;
				+
				+	mq->nr_entries_allocated = 0;
				+	mq->nr_cblocks_allocated = 0;
				+
				+	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
				+	mq->hash_bits = ffs(mq->nr_buckets) - 1;
				+	/*
				+	 * kcalloc() returns zeroed memory like kzalloc() but also fails
				+	 * cleanly if nr_buckets * sizeof(*mq->table) would overflow.
				+	 */
				+	mq->table = kcalloc(mq->nr_buckets, sizeof(*mq->table), GFP_KERNEL);
				+	if (!mq->table)
				+		goto bad_alloc_table;
				+
				+	mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
				+	if (!mq->allocation_bitset)
				+		goto bad_alloc_bitset;
				+
				+	return &mq->policy;
				+
				+bad_alloc_bitset:
				+	kfree(mq->table);
				+bad_alloc_table:
				+	free_entries(mq);
				+bad_cache_alloc:
				+	kfree(mq);
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/* Registration record for the policy under its own name, "mq". */
				+static struct dm_cache_policy_type mq_policy_type = {
				+	.name		= "mq",
				+	.hint_size	= 4,
				+	.owner		= THIS_MODULE,
				+	.create		= mq_create,
				+};
			
 
				+
			
 
				+/* Second registration of the same policy under the name "default". */
				+static struct dm_cache_policy_type default_policy_type = {
				+	.name		= "default",
				+	.hint_size	= 4,
				+	.owner		= THIS_MODULE,
				+	.create		= mq_create,
				+};
			
 
				+
			
 
				+/*
				+ * Module init: creates the slab cache for entries, then registers the
				+ * policy as both "mq" and "default".
				+ *
				+ * Returns 0 on success.  On failure everything set up so far is undone
				+ * and the real cause is returned: -ENOMEM if the slab cache cannot be
				+ * created, otherwise the error from dm_cache_policy_register().
				+ */
				+static int __init mq_init(void)
				+{
				+	int r;
				+
				+	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
				+					   sizeof(struct entry),
				+					   __alignof__(struct entry),
				+					   0, NULL);
				+	if (!mq_entry_cache)
				+		return -ENOMEM;
				+
				+	r = dm_cache_policy_register(&mq_policy_type);
				+	if (r) {
				+		DMERR("register failed %d", r);
				+		goto bad_register_mq;
				+	}
				+
				+	r = dm_cache_policy_register(&default_policy_type);
				+	if (r) {
				+		DMERR("register failed (as default) %d", r);
				+		goto bad_register_default;
				+	}
				+
				+	DMINFO("version " MQ_VERSION " loaded");
				+
				+	return 0;
				+
				+bad_register_default:
				+	dm_cache_policy_unregister(&mq_policy_type);
				+bad_register_mq:
				+	kmem_cache_destroy(mq_entry_cache);
				+
				+	/* Propagate the registration error rather than a blanket -ENOMEM. */
				+	return r;
				+}
			
 
				+
			
 
				+/* Module exit: unregisters both policy names and destroys the slab cache. */
				+static void __exit mq_exit(void)
				+{
				+	/* Reverse of the registration order used at init. */
				+	dm_cache_policy_unregister(&default_policy_type);
				+	dm_cache_policy_unregister(&mq_policy_type);
				+
				+	kmem_cache_destroy(mq_entry_cache);
				+}
			
 
				+
			
 
				+module_init(mq_init);
			
 
				+module_exit(mq_exit);
			
 
				+
			
 
				+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
			
 
				+MODULE_LICENSE("GPL");
			
 
				+MODULE_DESCRIPTION("mq cache policy");
			
 
				+
			
 
				+MODULE_ALIAS("dm-cache-default");
			
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,161 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat. All rights reserved.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm-cache-policy-internal.h"
			
 
				+#include "dm.h"
			
 
				+
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/slab.h>
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#define DM_MSG_PREFIX "cache-policy"
			
 
				+
			
 
				+static DEFINE_SPINLOCK(register_lock);
			
 
				+static LIST_HEAD(register_list);
			
 
				+
			
 
				+/*
				+ * Returns the registered policy type whose name equals name exactly, or
				+ * NULL.  Walks register_list; the callers in this file hold register_lock.
				+ */
				+static struct dm_cache_policy_type *__find_policy(const char *name)
				+{
				+	struct dm_cache_policy_type *candidate;
				+
				+	list_for_each_entry(candidate, &register_list, list) {
				+		if (strcmp(candidate->name, name) == 0)
				+			return candidate;
				+	}
				+
				+	return NULL;
				+}
			
 
				+
			
 
				+/*
				+ * Finds the named policy type and takes a reference on its owning module.
				+ * Returns NULL if no such type is registered, ERR_PTR(-EINVAL) if the
				+ * module reference cannot be taken, otherwise the type.
				+ */
				+static struct dm_cache_policy_type *__get_policy_once(const char *name)
				+{
				+	struct dm_cache_policy_type *found = __find_policy(name);
				+
				+	if (!found)
				+		return NULL;
				+
				+	if (try_module_get(found->owner))
				+		return found;
				+
				+	DMWARN("couldn't get module %s", name);
				+
				+	return ERR_PTR(-EINVAL);
				+}
			
 
				+
			
 
				+/* __get_policy_once() run under register_lock. */
				+static struct dm_cache_policy_type *get_policy_once(const char *name)
				+{
				+	struct dm_cache_policy_type *found;
				+
				+	spin_lock(&register_lock);
				+	found = __get_policy_once(name);
				+	spin_unlock(&register_lock);
				+
				+	return found;
				+}
			
 
				+
			
 
				+/*
				+ * Looks up a policy type by name.  If it is not registered yet, requests
				+ * the module "dm-cache-<name>" and tries once more.  Returns the type
				+ * with a module reference held, or NULL on any failure.
				+ */
				+static struct dm_cache_policy_type *get_policy(const char *name)
				+{
				+	struct dm_cache_policy_type *t = get_policy_once(name);
				+
				+	/* Only a plain "not found" triggers the module load and retry. */
				+	if (!t) {
				+		request_module("dm-cache-%s", name);
				+		t = get_policy_once(name);
				+	}
				+
				+	return IS_ERR(t) ? NULL : t;
				+}
			
 
				+
			
 
				+/* Drops the module reference taken by get_policy(). */
				+static void put_policy(struct dm_cache_policy_type *t)
				+{
				+	struct module *owner = t->owner;
				+
				+	module_put(owner);
				+}
			
 
				+
			
 
				+/*
				+ * Adds a policy type to the register.  Returns -EINVAL if hint_size is
				+ * neither 0 nor 4, or if a type with the same name is already registered;
				+ * 0 on success.
				+ */
				+int dm_cache_policy_register(struct dm_cache_policy_type *type)
				+{
				+	int r = 0;
				+
				+	/* One size fits all for now */
				+	if (type->hint_size != 0 && type->hint_size != 4) {
				+		DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size);
				+		return -EINVAL;
				+	}
				+
				+	spin_lock(&register_lock);
				+	if (!__find_policy(type->name)) {
				+		list_add(&type->list, &register_list);
				+	} else {
				+		DMWARN("attempt to register policy under duplicate name %s", type->name);
				+		r = -EINVAL;
				+	}
				+	spin_unlock(&register_lock);
				+
				+	return r;
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_cache_policy_register);
			
 
				+
			
 
				+/* Removes a policy type from the register, under register_lock. */
				+void dm_cache_policy_unregister(struct dm_cache_policy_type *type)
				+{
				+	struct list_head *node = &type->list;
				+
				+	spin_lock(&register_lock);
				+	list_del_init(node);
				+	spin_unlock(&register_lock);
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_cache_policy_unregister);
			
 
				+
			
 
				+/*
				+ * Instantiates the named policy.  Returns NULL if the type cannot be
				+ * found or its create method fails.  On success the type is stored in
				+ * p->private and its module reference is kept until
				+ * dm_cache_policy_destroy().
				+ */
				+struct dm_cache_policy *dm_cache_policy_create(const char *name,
				+					       dm_cblock_t cache_size,
				+					       sector_t origin_size,
				+					       sector_t cache_block_size)
				+{
				+	struct dm_cache_policy_type *type = get_policy(name);
				+	struct dm_cache_policy *p;
				+
				+	if (!type) {
				+		DMWARN("unknown policy type");
				+		return NULL;
				+	}
				+
				+	p = type->create(cache_size, origin_size, cache_block_size);
				+	if (p)
				+		p->private = type;
				+	else
				+		put_policy(type);
				+
				+	return p;
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_cache_policy_create);
			
 
				+
			
 
				+/*
				+ * Destroys a policy made by dm_cache_policy_create() and drops the module
				+ * reference held on its type.
				+ */
				+void dm_cache_policy_destroy(struct dm_cache_policy *p)
				+{
				+	/* Fetch the type before calling destroy; p is not touched afterwards. */
				+	struct dm_cache_policy_type *type = p->private;
				+
				+	p->destroy(p);
				+	put_policy(type);
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_cache_policy_destroy);
			
 
				+
			
 
				+/* Returns the registered name of the type this policy was created from. */
				+const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
				+{
				+	const struct dm_cache_policy_type *type = p->private;
				+
				+	return type->name;
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
			
 
				+
			
 
				+/* Returns the hint_size of the type this policy was created from. */
				+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
				+{
				+	const struct dm_cache_policy_type *type = p->private;
				+
				+	return type->hint_size;
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -0,0 +1,228 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat. All rights reserved.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#ifndef DM_CACHE_POLICY_H
			
 
				+#define DM_CACHE_POLICY_H
			
 
				+
			
 
				+#include "dm-cache-block-types.h"
			
 
				+
			
 
				+#include <linux/device-mapper.h>
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/* FIXME: make it clear which methods are optional.  Get debug policy to
			
 
				+ * double check this at start.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * The cache policy makes the important decisions about which blocks get to
			
 
				+ * live on the faster cache device.
			
 
				+ *
			
 
				+ * When the core target has to remap a bio it calls the 'map' method of the
			
 
				+ * policy.  This returns an instruction telling the core target what to do.
			
 
				+ *
			
 
				+ * POLICY_HIT:
			
 
				+ *   That block is in the cache.  Remap to the cache and carry on.
			
 
				+ *
			
 
				+ * POLICY_MISS:
			
 
				+ *   This block is on the origin device.  Remap and carry on.
			
 
				+ *
			
 
				+ * POLICY_NEW:
			
 
				+ *   This block is currently on the origin device, but the policy wants to
			
 
				+ *   move it.  The core should:
			
 
				+ *
			
 
				+ *   - hold any further io to this origin block
			
 
				+ *   - copy the origin to the given cache block
			
 
				+ *   - release all the held blocks
			
 
				+ *   - remap the original block to the cache
			
 
				+ *
			
 
				+ * POLICY_REPLACE:
			
 
				+ *   This block is currently on the origin device.  The policy wants to
			
 
				+ *   move it to the cache, with the added complication that the destination
			
 
				+ *   cache block needs a writeback first.  The core should:
			
 
				+ *
			
 
				+ *   - hold any further io to this origin block
			
 
				+ *   - hold any further io to the origin block that's being written back
			
 
				+ *   - writeback
			
 
				+ *   - copy new block to cache
			
 
				+ *   - release held blocks
			
 
				+ *   - remap bio to cache and reissue.
			
 
				+ *
			
 
				+ * Should the core run into trouble while processing a POLICY_NEW or
			
 
				+ * POLICY_REPLACE instruction it will roll back the policy's mapping using
			
 
				+ * remove_mapping() or force_mapping().  These methods must not fail.  This
			
 
				+ * approach avoids having transactional semantics in the policy (ie, the
			
 
				+ * core informing the policy when a migration is complete), and hence makes
			
 
				+ * it easier to write new policies.
			
 
				+ *
			
 
				+ * In general policy methods should never block, except in the case of the
			
 
				+ * map function when can_migrate is set.  So be careful to implement using
			
 
				+ * bounded, preallocated memory.
			
 
				+ */
			
 
				+enum policy_operation {
			
 
				+	POLICY_HIT,	/* block is in the cache: remap to the cache */
			
 
				+	POLICY_MISS,	/* block is on the origin device: remap to the origin */
			
 
				+	POLICY_NEW,	/* copy the origin block to the given cache block */
			
 
				+	POLICY_REPLACE	/* as POLICY_NEW, but the cache block needs a writeback first */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * This is the instruction passed back to the core target.  Which fields
				+ * are meaningful depends on op, as noted against each field.
			
 
				+ */
			
 
				+struct policy_result {
			
 
				+	enum policy_operation op;
			
 
				+	dm_oblock_t old_oblock;	/* POLICY_REPLACE */
			
 
				+	dm_cblock_t cblock;	/* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
			
 
				+};
			
 
				+
			
 
				+typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
			
 
				+			      dm_oblock_t oblock, uint32_t hint);	/* called once per mapping by walk_mappings */
			
 
				+
			
 
				+/*
			
 
				+ * The cache policy object.  Just a bunch of methods.  It is envisaged that
			
 
				+ * this structure will be embedded in a bigger, policy specific structure
			
 
				+ * (ie. use container_of()).
			
 
				+ */
			
 
				+struct dm_cache_policy {
			
 
				+
			
 
				+	/*
			
 
				+	 * FIXME: make it clear which methods are optional, and which may
			
 
				+	 * block.  (The mq policy, for one, sets writeback_work to NULL and
				+	 * never assigns set_dirty or clear_dirty.)
			
 
				+	 */
			
 
				+
			
 
				+	/*
			
 
				+	 * Destroys this object.
			
 
				+	 */
			
 
				+	void (*destroy)(struct dm_cache_policy *p);
			
 
				+
			
 
				+	/*
			
 
				+	 * See large comment above.
			
 
				+	 *
			
 
				+	 * oblock      - the origin block we're interested in.
			
 
				+	 *
			
 
				+	 * can_block - indicates whether the current thread is allowed to
			
 
				+	 *             block.  -EWOULDBLOCK returned if it can't and would.
			
 
				+	 *
			
 
				+	 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
			
 
				+	 *               instructions.  If denied and the policy would have
			
 
				+	 *               returned one of these instructions it should
			
 
				+	 *               return -EWOULDBLOCK.
			
 
				+	 *
			
 
				+	 * discarded_oblock - indicates whether the whole origin block is
			
 
				+	 *               in a discarded state (FIXME: better to tell the
			
 
				+	 *               policy about this sooner, so it can recycle that
			
 
				+	 *               cache block if it wants.)
			
 
				+	 * bio         - the bio that triggered this call.
			
 
				+	 * result      - gets filled in with the instruction.
			
 
				+	 *
			
 
				+	 * May only return 0, or -EWOULDBLOCK (if !can_block or !can_migrate)
			
 
				+	 */
			
 
				+	int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
			
 
				+		   bool can_block, bool can_migrate, bool discarded_oblock,
			
 
				+		   struct bio *bio, struct policy_result *result);
			
 
				+
			
 
				+	/*
			
 
				+	 * Sometimes we want to see if a block is in the cache, without
			
 
				+	 * triggering any update of stats.  (ie. it's not a real hit).
			
 
				+	 *
			
 
				+	 * Must not block.
			
 
				+	 *
			
 
				+	 * Returns 0 iff in cache (with *cblock filled in), -ENOENT iff not,
			
 
				+	 * or another negative errno on error (-EWOULDBLOCK would be typical).
			
 
				+	 */
			
 
				+	int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
			
 
				+
			
 
				+	/*
			
 
				+	 * oblock must be a mapped block.  Must not block.
			
 
				+	 */
			
 
				+	void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
			
 
				+	void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
			
 
				+
			
 
				+	/*
			
 
				+	 * Called when a cache target is first created.  Used to load a
			
 
				+	 * mapping from the metadata device into the policy.
			
 
				+	 */
			
 
				+	int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
			
 
				+			    dm_cblock_t cblock, uint32_t hint, bool hint_valid);
			
 
				+
			
 
				+	int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
			
 
				+			     void *context);
			
 
				+
			
 
				+	/*
			
 
				+	 * Override functions used on the error paths of the core target.
			
 
				+	 * They must succeed.
			
 
				+	 */
			
 
				+	void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
			
 
				+	void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
			
 
				+			      dm_oblock_t new_oblock);
			
 
				+
			
 
				+	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
			
 
				+
			
 
				+
			
 
				+	/*
			
 
				+	 * How full is the cache?
			
 
				+	 */
			
 
				+	dm_cblock_t (*residency)(struct dm_cache_policy *p);
			
 
				+
			
 
				+	/*
			
 
				+	 * Because of where we sit in the block layer, we can be asked to
			
 
				+	 * map a lot of little bios that are all in the same block (no
			
 
				+	 * queue merging has occurred).  To stop the policy being fooled by
			
 
				+	 * these the core target sends regular tick() calls to the policy.
			
 
				+	 * The policy should only count an entry as hit once per tick.
			
 
				+	 */
			
 
				+	void (*tick)(struct dm_cache_policy *p);
			
 
				+
			
 
				+	/*
			
 
				+	 * Configuration.
			
 
				+	 */
			
 
				+	int (*emit_config_values)(struct dm_cache_policy *p,
			
 
				+				  char *result, unsigned maxlen);
			
 
				+	int (*set_config_value)(struct dm_cache_policy *p,
			
 
				+				const char *key, const char *value);
			
 
				+
			
 
				+	/*
			
 
				+	 * Book keeping ptr for the policy register, not for general use.
			
 
				+	 */
			
 
				+	void *private;
			
 
				+};
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * We maintain a little register of the different policy types.
			
 
				+ */
			
 
				+#define CACHE_POLICY_NAME_SIZE 16
			
 
				+
			
 
				+struct dm_cache_policy_type {
			
 
				+	/* For use by the register code only. */
			
 
				+	struct list_head list;
			
 
				+
			
 
				+	/*
			
 
				+	 * Policy writers should fill in these fields.  The name field is
			
 
				+	 * what gets passed on the target line to select your policy.
			
 
				+	 */
			
 
				+	char name[CACHE_POLICY_NAME_SIZE];
			
 
				+
			
 
				+	/*
			
 
				+	 * Policies may store a hint for each cache block.
			
 
				+	 * Currently the size of this hint must be 0 or 4 bytes but we
			
 
				+	 * expect to relax this in future.
			
 
				+	 */
			
 
				+	size_t hint_size;
			
 
				+
			
 
				+	struct module *owner;
			
 
				+	struct dm_cache_policy *(*create)(dm_cblock_t cache_size,
			
 
				+					  sector_t origin_size,
			
 
				+					  sector_t block_size);
			
 
				+};
			
 
				+
			
 
				+int dm_cache_policy_register(struct dm_cache_policy_type *type);
			
 
				+void dm_cache_policy_unregister(struct dm_cache_policy_type *type);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#endif	/* DM_CACHE_POLICY_H */
			
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,2584 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat. All rights reserved.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm.h"
			
 
				+#include "dm-bio-prison.h"
			
 
				+#include "dm-cache-metadata.h"
			
 
				+
			
 
				+#include <linux/dm-io.h>
			
 
				+#include <linux/dm-kcopyd.h>
			
 
				+#include <linux/init.h>
			
 
				+#include <linux/mempool.h>
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/vmalloc.h>
			
 
				+
			
 
				+#define DM_MSG_PREFIX "cache"
			
 
				+
			
 
				+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
			
 
				+	"A percentage of time allocated for copying to and/or from cache");
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Glossary:
			
 
				+ *
			
 
				+ * oblock: index of an origin block
			
 
				+ * cblock: index of a cache block
			
 
				+ * promotion: movement of a block from origin to cache
			
 
				+ * demotion: movement of a block from cache to origin
			
 
				+ * migration: movement of a block between the origin and cache device,
			
 
				+ *	      either direction
			
 
				+ */
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static size_t bitset_size_in_bytes(unsigned nr_entries)
			
 
				+{
			
 
				+	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
			
 
				+}
			
 
				+
			
 
				+static unsigned long *alloc_bitset(unsigned nr_entries)
			
 
				+{
			
 
				+	size_t s = bitset_size_in_bytes(nr_entries);
			
 
				+	return vzalloc(s);
			
 
				+}
			
 
				+
			
 
				+static void clear_bitset(void *bitset, unsigned nr_entries)
			
 
				+{
			
 
				+	size_t s = bitset_size_in_bytes(nr_entries);
			
 
				+	memset(bitset, 0, s);
			
 
				+}
			
 
				+
			
 
				+static void free_bitset(unsigned long *bits)
			
 
				+{
			
 
				+	vfree(bits);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#define PRISON_CELLS 1024
			
 
				+#define MIGRATION_POOL_SIZE 128
			
 
				+#define COMMIT_PERIOD HZ
			
 
				+#define MIGRATION_COUNT_WINDOW 10
			
 
				+
			
 
				+/*
			
 
				+ * The block size of the device holding cache data must be >= 32KB
			
 
				+ */
			
 
				+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
			
 
				+
			
 
				+/*
			
 
				+ * FIXME: the cache is read/write for the time being.
			
 
				+ */
			
 
				+enum cache_mode {
			
 
				+	CM_WRITE,		/* metadata may be changed */
			
 
				+	CM_READ_ONLY,		/* metadata may not be changed */
			
 
				+};
			
 
				+
			
 
				+struct cache_features {
			
 
				+	enum cache_mode mode;
			
 
				+	bool write_through:1;
			
 
				+};
			
 
				+
			
 
				+struct cache_stats {
			
 
				+	atomic_t read_hit;
			
 
				+	atomic_t read_miss;
			
 
				+	atomic_t write_hit;
			
 
				+	atomic_t write_miss;
			
 
				+	atomic_t demotion;
			
 
				+	atomic_t promotion;
			
 
				+	atomic_t copies_avoided;
			
 
				+	atomic_t cache_cell_clash;
			
 
				+	atomic_t commit_count;
			
 
				+	atomic_t discard_count;
			
 
				+};
			
 
				+
			
 
				+struct cache {
			
 
				+	struct dm_target *ti;
			
 
				+	struct dm_target_callbacks callbacks;
			
 
				+
			
 
				+	/*
			
 
				+	 * Metadata is written to this device.
			
 
				+	 */
			
 
				+	struct dm_dev *metadata_dev;
			
 
				+
			
 
				+	/*
			
 
				+	 * The slower of the two data devices.  Typically a spindle.
			
 
				+	 */
			
 
				+	struct dm_dev *origin_dev;
			
 
				+
			
 
				+	/*
			
 
				+	 * The faster of the two data devices.  Typically an SSD.
			
 
				+	 */
			
 
				+	struct dm_dev *cache_dev;
			
 
				+
			
 
				+	/*
			
 
				+	 * Cache features such as write-through.
			
 
				+	 */
			
 
				+	struct cache_features features;
			
 
				+
			
 
				+	/*
			
 
				+	 * Size of the origin device in _complete_ blocks and native sectors.
			
 
				+	 */
			
 
				+	dm_oblock_t origin_blocks;
			
 
				+	sector_t origin_sectors;
			
 
				+
			
 
				+	/*
			
 
				+	 * Size of the cache device in blocks.
			
 
				+	 */
			
 
				+	dm_cblock_t cache_size;
			
 
				+
			
 
				+	/*
			
 
				+	 * Fields for converting from sectors to blocks.
			
 
				+	 */
			
 
				+	uint32_t sectors_per_block;
			
 
				+	int sectors_per_block_shift;
			
 
				+
			
 
				+	struct dm_cache_metadata *cmd;
			
 
				+
			
 
				+	spinlock_t lock;
			
 
				+	struct bio_list deferred_bios;
			
 
				+	struct bio_list deferred_flush_bios;
			
 
				+	struct list_head quiesced_migrations;
			
 
				+	struct list_head completed_migrations;
			
 
				+	struct list_head need_commit_migrations;
			
 
				+	sector_t migration_threshold;
			
 
				+	atomic_t nr_migrations;
			
 
				+	wait_queue_head_t migration_wait;
			
 
				+
			
 
				+	/*
			
 
				+	 * cache_size entries, dirty if set
			
 
				+	 */
			
 
				+	dm_cblock_t nr_dirty;
			
 
				+	unsigned long *dirty_bitset;
			
 
				+
			
 
				+	/*
			
 
				+	 * origin_blocks entries, discarded if set.
			
 
				+	 */
			
 
				+	sector_t discard_block_size; /* a power of 2 times sectors per block */
			
 
				+	dm_dblock_t discard_nr_blocks;
			
 
				+	unsigned long *discard_bitset;
			
 
				+
			
 
				+	struct dm_kcopyd_client *copier;
			
 
				+	struct workqueue_struct *wq;
			
 
				+	struct work_struct worker;
			
 
				+
			
 
				+	struct delayed_work waker;
			
 
				+	unsigned long last_commit_jiffies;
			
 
				+
			
 
				+	struct dm_bio_prison *prison;
			
 
				+	struct dm_deferred_set *all_io_ds;
			
 
				+
			
 
				+	mempool_t *migration_pool;
			
 
				+	struct dm_cache_migration *next_migration;
			
 
				+
			
 
				+	struct dm_cache_policy *policy;
			
 
				+	unsigned policy_nr_args;
			
 
				+
			
 
				+	bool need_tick_bio:1;
			
 
				+	bool sized:1;
			
 
				+	bool quiescing:1;
			
 
				+	bool commit_requested:1;
			
 
				+	bool loaded_mappings:1;
			
 
				+	bool loaded_discards:1;
			
 
				+
			
 
				+	struct cache_stats stats;
			
 
				+
			
 
				+	/*
			
 
				+	 * Rather than reconstructing the table line for the status we just
			
 
				+	 * save it and regurgitate.
			
 
				+	 */
			
 
				+	unsigned nr_ctr_args;
			
 
				+	const char **ctr_args;
			
 
				+};
			
 
				+
			
 
				+struct per_bio_data {
			
 
				+	bool tick:1;
			
 
				+	unsigned req_nr:2;
			
 
				+	struct dm_deferred_entry *all_io_entry;
			
 
				+};
			
 
				+
			
 
				+struct dm_cache_migration {
			
 
				+	struct list_head list;
			
 
				+	struct cache *cache;
			
 
				+
			
 
				+	unsigned long start_jiffies;
			
 
				+	dm_oblock_t old_oblock;
			
 
				+	dm_oblock_t new_oblock;
			
 
				+	dm_cblock_t cblock;
			
 
				+
			
 
				+	bool err:1;
			
 
				+	bool writeback:1;
			
 
				+	bool demote:1;
			
 
				+	bool promote:1;
			
 
				+
			
 
				+	struct dm_bio_prison_cell *old_ocell;
			
 
				+	struct dm_bio_prison_cell *new_ocell;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Processing a bio in the worker thread may require these memory
			
 
				+ * allocations.  We prealloc to avoid deadlocks (the same worker thread
			
 
				+ * frees them back to the mempool).
			
 
				+ */
			
 
				+struct prealloc {
			
 
				+	struct dm_cache_migration *mg;
			
 
				+	struct dm_bio_prison_cell *cell1;
			
 
				+	struct dm_bio_prison_cell *cell2;
			
 
				+};
			
 
				+
			
 
				+static void wake_worker(struct cache *cache)
			
 
				+{
			
 
				+	queue_work(cache->wq, &cache->worker);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
			
 
				+{
			
 
				+	/* FIXME: change to use a local slab. */
			
 
				+	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
			
 
				+}
			
 
				+
			
 
				+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
			
 
				+{
			
 
				+	dm_bio_prison_free_cell(cache->prison, cell);
			
 
				+}
			
 
				+
			
 
				+static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
			
 
				+{
			
 
				+	if (!p->mg) {
			
 
				+		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
			
 
				+		if (!p->mg)
			
 
				+			return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	if (!p->cell1) {
			
 
				+		p->cell1 = alloc_prison_cell(cache);
			
 
				+		if (!p->cell1)
			
 
				+			return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	if (!p->cell2) {
			
 
				+		p->cell2 = alloc_prison_cell(cache);
			
 
				+		if (!p->cell2)
			
 
				+			return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
			
 
				+{
			
 
				+	if (p->cell2)
			
 
				+		free_prison_cell(cache, p->cell2);
			
 
				+
			
 
				+	if (p->cell1)
			
 
				+		free_prison_cell(cache, p->cell1);
			
 
				+
			
 
				+	if (p->mg)
			
 
				+		mempool_free(p->mg, cache->migration_pool);
			
 
				+}
			
 
				+
			
 
				+static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
			
 
				+{
			
 
				+	struct dm_cache_migration *mg = p->mg;
			
 
				+
			
 
				+	BUG_ON(!mg);
			
 
				+	p->mg = NULL;
			
 
				+
			
 
				+	return mg;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * You must have a cell within the prealloc struct to return.  If not this
			
 
				+ * function will BUG() rather than returning NULL.
			
 
				+ */
			
 
				+static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
			
 
				+{
			
 
				+	struct dm_bio_prison_cell *r = NULL;
			
 
				+
			
 
				+	if (p->cell1) {
			
 
				+		r = p->cell1;
			
 
				+		p->cell1 = NULL;
			
 
				+
			
 
				+	} else if (p->cell2) {
			
 
				+		r = p->cell2;
			
 
				+		p->cell2 = NULL;
			
 
				+	} else
			
 
				+		BUG();
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * You can't have more than two cells in a prealloc struct.  BUG() will be
			
 
				+ * called if you try and overfill.
			
 
				+ */
			
 
				+static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
			
 
				+{
			
 
				+	if (!p->cell2)
			
 
				+		p->cell2 = cell;
			
 
				+
			
 
				+	else if (!p->cell1)
			
 
				+		p->cell1 = cell;
			
 
				+
			
 
				+	else
			
 
				+		BUG();
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
			
 
				+{
			
 
				+	key->virtual = 0;
			
 
				+	key->dev = 0;
			
 
				+	key->block = from_oblock(oblock);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * The caller hands in a preallocated cell, and a free function for it.
			
 
				+ * The cell will be freed if there's an error, or if it wasn't used because
			
 
				+ * a cell with that key already exists.
			
 
				+ */
			
 
				+typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
			
 
				+
			
 
				+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
			
 
				+		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
			
 
				+		      cell_free_fn free_fn, void *free_context,
			
 
				+		      struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_cell_key key;
			
 
				+
			
 
				+	build_key(oblock, &key);
			
 
				+	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
			
 
				+	if (r)
			
 
				+		free_fn(free_context, cell_prealloc);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+static int get_cell(struct cache *cache,
			
 
				+		    dm_oblock_t oblock,
			
 
				+		    struct prealloc *structs,
			
 
				+		    struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_cell_key key;
			
 
				+	struct dm_bio_prison_cell *cell_prealloc;
			
 
				+
			
 
				+	cell_prealloc = prealloc_get_cell(structs);
			
 
				+
			
 
				+	build_key(oblock, &key);
			
 
				+	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
			
 
				+	if (r)
			
 
				+		prealloc_put_cell(structs, cell_prealloc);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+ /*----------------------------------------------------------------*/
			
 
				+
			
 
				+static bool is_dirty(struct cache *cache, dm_cblock_t b)
			
 
				+{
			
 
				+	return test_bit(from_cblock(b), cache->dirty_bitset);
			
 
				+}
			
 
				+
			
 
				+static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
			
 
				+{
			
 
				+	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
			
 
				+		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
			
 
				+		policy_set_dirty(cache->policy, oblock);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
			
 
				+{
			
 
				+	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
			
 
				+		policy_clear_dirty(cache->policy, oblock);
			
 
				+		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
			
 
				+		if (!from_cblock(cache->nr_dirty))
			
 
				+			dm_table_event(cache->ti->table);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+static bool block_size_is_power_of_two(struct cache *cache)
			
 
				+{
			
 
				+	return cache->sectors_per_block_shift >= 0;
			
 
				+}
			
 
				+
			
 
				+static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
			
 
				+{
			
 
				+	sector_t discard_blocks = cache->discard_block_size;
			
 
				+	dm_block_t b = from_oblock(oblock);
			
 
				+
			
 
				+	if (!block_size_is_power_of_two(cache))
			
 
				+		(void) sector_div(discard_blocks, cache->sectors_per_block);
			
 
				+	else
			
 
				+		discard_blocks >>= cache->sectors_per_block_shift;
			
 
				+
			
 
				+	(void) sector_div(b, discard_blocks);
			
 
				+
			
 
				+	return to_dblock(b);
			
 
				+}
			
 
				+
			
 
				+static void set_discard(struct cache *cache, dm_dblock_t b)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	atomic_inc(&cache->stats.discard_count);
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	set_bit(from_dblock(b), cache->discard_bitset);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+}
			
 
				+
			
 
				+static void clear_discard(struct cache *cache, dm_dblock_t b)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	clear_bit(from_dblock(b), cache->discard_bitset);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+}
			
 
				+
			
 
				+static bool is_discarded(struct cache *cache, dm_dblock_t b)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	r = test_bit(from_dblock(b), cache->discard_bitset);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
			
 
				+		     cache->discard_bitset);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static void load_stats(struct cache *cache)
			
 
				+{
			
 
				+	struct dm_cache_statistics stats;
			
 
				+
			
 
				+	dm_cache_metadata_get_stats(cache->cmd, &stats);
			
 
				+	atomic_set(&cache->stats.read_hit, stats.read_hits);
			
 
				+	atomic_set(&cache->stats.read_miss, stats.read_misses);
			
 
				+	atomic_set(&cache->stats.write_hit, stats.write_hits);
			
 
				+	atomic_set(&cache->stats.write_miss, stats.write_misses);
			
 
				+}
			
 
				+
			
 
				+static void save_stats(struct cache *cache)
			
 
				+{
			
 
				+	struct dm_cache_statistics stats;
			
 
				+
			
 
				+	stats.read_hits = atomic_read(&cache->stats.read_hit);
			
 
				+	stats.read_misses = atomic_read(&cache->stats.read_miss);
			
 
				+	stats.write_hits = atomic_read(&cache->stats.write_hit);
			
 
				+	stats.write_misses = atomic_read(&cache->stats.write_miss);
			
 
				+
			
 
				+	dm_cache_metadata_set_stats(cache->cmd, &stats);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------
			
 
				+ * Per bio data
			
 
				+ *--------------------------------------------------------------*/
			
 
				+static struct per_bio_data *get_per_bio_data(struct bio *bio)
			
 
				+{
			
 
				+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
			
 
				+	BUG_ON(!pb);
			
 
				+	return pb;
			
 
				+}
			
 
				+
			
 
				+static struct per_bio_data *init_per_bio_data(struct bio *bio)
			
 
				+{
			
 
				+	struct per_bio_data *pb = get_per_bio_data(bio);
			
 
				+
			
 
				+	pb->tick = false;
			
 
				+	pb->req_nr = dm_bio_get_target_bio_nr(bio);
			
 
				+	pb->all_io_entry = NULL;
			
 
				+
			
 
				+	return pb;
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------
			
 
				+ * Remapping
			
 
				+ *--------------------------------------------------------------*/
			
 
				+static void remap_to_origin(struct cache *cache, struct bio *bio)
			
 
				+{
			
 
				+	bio->bi_bdev = cache->origin_dev->bdev;
			
 
				+}
			
 
				+
			
 
				+static void remap_to_cache(struct cache *cache, struct bio *bio,
			
 
				+			   dm_cblock_t cblock)
			
 
				+{
			
 
				+	sector_t bi_sector = bio->bi_sector;
			
 
				+
			
 
				+	bio->bi_bdev = cache->cache_dev->bdev;
			
 
				+	if (!block_size_is_power_of_two(cache))
			
 
				+		bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
			
 
				+				sector_div(bi_sector, cache->sectors_per_block);
			
 
				+	else
			
 
				+		bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
			
 
				+				(bi_sector & (cache->sectors_per_block - 1));
			
 
				+}
			
 
				+
			
 
				+static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct per_bio_data *pb = get_per_bio_data(bio);
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	if (cache->need_tick_bio &&
			
 
				+	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
			
 
				+		pb->tick = true;
			
 
				+		cache->need_tick_bio = false;
			
 
				+	}
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+}
			
 
				+
			
 
				+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
			
 
				+				  dm_oblock_t oblock)
			
 
				+{
			
 
				+	check_if_tick_bio_needed(cache, bio);
			
 
				+	remap_to_origin(cache, bio);
			
 
				+	if (bio_data_dir(bio) == WRITE)
			
 
				+		clear_discard(cache, oblock_to_dblock(cache, oblock));
			
 
				+}
			
 
				+
			
 
				+static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
			
 
				+				 dm_oblock_t oblock, dm_cblock_t cblock)
			
 
				+{
			
 
				+	remap_to_cache(cache, bio, cblock);
			
 
				+	if (bio_data_dir(bio) == WRITE) {
			
 
				+		set_dirty(cache, oblock, cblock);
			
 
				+		clear_discard(cache, oblock_to_dblock(cache, oblock));
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
			
 
				+{
			
 
				+	sector_t block_nr = bio->bi_sector;
			
 
				+
			
 
				+	if (!block_size_is_power_of_two(cache))
			
 
				+		(void) sector_div(block_nr, cache->sectors_per_block);
			
 
				+	else
			
 
				+		block_nr >>= cache->sectors_per_block_shift;
			
 
				+
			
 
				+	return to_oblock(block_nr);
			
 
				+}
			
 
				+
			
 
				+static int bio_triggers_commit(struct cache *cache, struct bio *bio)
			
 
				+{
			
 
				+	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
			
 
				+}
			
 
				+
			
 
				+static void issue(struct cache *cache, struct bio *bio)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	if (!bio_triggers_commit(cache, bio)) {
			
 
				+		generic_make_request(bio);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Batch together any bios that trigger commits and then issue a
			
 
				+	 * single commit for them in do_worker().
			
 
				+	 */
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	cache->commit_requested = true;
			
 
				+	bio_list_add(&cache->deferred_flush_bios, bio);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------
			
 
				+ * Migration processing
			
 
				+ *
			
 
				+ * Migration covers moving data from the origin device to the cache, or
			
 
				+ * vice versa.
			
 
				+ *--------------------------------------------------------------*/
			
 
				+static void free_migration(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	mempool_free(mg, mg->cache->migration_pool);
			
 
				+}
			
 
				+
			
 
				+static void inc_nr_migrations(struct cache *cache)
			
 
				+{
			
 
				+	atomic_inc(&cache->nr_migrations);
			
 
				+}
			
 
				+
			
 
				+static void dec_nr_migrations(struct cache *cache)
			
 
				+{
			
 
				+	atomic_dec(&cache->nr_migrations);
			
 
				+
			
 
				+	/*
			
 
				+	 * Wake the worker in case we're suspending the target.
			
 
				+	 */
			
 
				+	wake_up(&cache->migration_wait);
			
 
				+}
			
 
				+
			
 
				+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
			
 
				+			 bool holder)
			
 
				+{
			
 
				+	(holder ? dm_cell_release : dm_cell_release_no_holder)
			
 
				+		(cache->prison, cell, &cache->deferred_bios);
			
 
				+	free_prison_cell(cache, cell);
			
 
				+}
			
 
				+
			
 
				+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
			
 
				+		       bool holder)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	__cell_defer(cache, cell, holder);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	wake_worker(cache);
			
 
				+}
			
 
				+
			
 
				+static void cleanup_migration(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	dec_nr_migrations(mg->cache);
			
 
				+	free_migration(mg);
			
 
				+}
			
 
				+
			
 
				+static void migration_failure(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	if (mg->writeback) {
			
 
				+		DMWARN_LIMIT("writeback failed; couldn't copy block");
			
 
				+		set_dirty(cache, mg->old_oblock, mg->cblock);
			
 
				+		cell_defer(cache, mg->old_ocell, false);
			
 
				+
			
 
				+	} else if (mg->demote) {
			
 
				+		DMWARN_LIMIT("demotion failed; couldn't copy block");
			
 
				+		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
			
 
				+
			
 
				+		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
			
 
				+		if (mg->promote)
			
 
				+			cell_defer(cache, mg->new_ocell, 1);
			
 
				+	} else {
			
 
				+		DMWARN_LIMIT("promotion failed; couldn't copy block");
			
 
				+		policy_remove_mapping(cache->policy, mg->new_oblock);
			
 
				+		cell_defer(cache, mg->new_ocell, 1);
			
 
				+	}
			
 
				+
			
 
				+	cleanup_migration(mg);
			
 
				+}
			
 
				+
			
 
				+static void migration_success_pre_commit(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	if (mg->writeback) {
			
 
				+		cell_defer(cache, mg->old_ocell, false);
			
 
				+		clear_dirty(cache, mg->old_oblock, mg->cblock);
			
 
				+		cleanup_migration(mg);
			
 
				+		return;
			
 
				+
			
 
				+	} else if (mg->demote) {
			
 
				+		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
			
 
				+			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
			
 
				+			policy_force_mapping(cache->policy, mg->new_oblock,
			
 
				+					     mg->old_oblock);
			
 
				+			if (mg->promote)
			
 
				+				cell_defer(cache, mg->new_ocell, true);
			
 
				+			cleanup_migration(mg);
			
 
				+			return;
			
 
				+		}
			
 
				+	} else {
			
 
				+		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
			
 
				+			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
			
 
				+			policy_remove_mapping(cache->policy, mg->new_oblock);
			
 
				+			cleanup_migration(mg);
			
 
				+			return;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	list_add_tail(&mg->list, &cache->need_commit_migrations);
			
 
				+	cache->commit_requested = true;
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+}
			
 
				+
			
 
				+static void migration_success_post_commit(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	if (mg->writeback) {
			
 
				+		DMWARN("writeback unexpectedly triggered commit");
			
 
				+		return;
			
 
				+
			
 
				+	} else if (mg->demote) {
			
 
				+		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
			
 
				+
			
 
				+		if (mg->promote) {
			
 
				+			mg->demote = false;
			
 
				+
			
 
				+			spin_lock_irqsave(&cache->lock, flags);
			
 
				+			list_add_tail(&mg->list, &cache->quiesced_migrations);
			
 
				+			spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+		} else
			
 
				+			cleanup_migration(mg);
			
 
				+
			
 
				+	} else {
			
 
				+		cell_defer(cache, mg->new_ocell, true);
			
 
				+		clear_dirty(cache, mg->new_oblock, mg->cblock);
			
 
				+		cleanup_migration(mg);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void copy_complete(int read_err, unsigned long write_err, void *context)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	if (read_err || write_err)
			
 
				+		mg->err = true;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	list_add_tail(&mg->list, &cache->completed_migrations);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	wake_worker(cache);
			
 
				+}
			
 
				+
			
 
				+static void issue_copy_real(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_io_region o_region, c_region;
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	o_region.bdev = cache->origin_dev->bdev;
			
 
				+	o_region.count = cache->sectors_per_block;
			
 
				+
			
 
				+	c_region.bdev = cache->cache_dev->bdev;
			
 
				+	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
			
 
				+	c_region.count = cache->sectors_per_block;
			
 
				+
			
 
				+	if (mg->writeback || mg->demote) {
			
 
				+		/* demote */
			
 
				+		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
			
 
				+		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
			
 
				+	} else {
			
 
				+		/* promote */
			
 
				+		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
			
 
				+		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
			
 
				+	}
			
 
				+
			
 
				+	if (r < 0)
			
 
				+		migration_failure(mg);
			
 
				+}
			
 
				+
			
 
				+static void avoid_copy(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	atomic_inc(&mg->cache->stats.copies_avoided);
			
 
				+	migration_success_pre_commit(mg);
			
 
				+}
			
 
				+
			
 
				+static void issue_copy(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	bool avoid;
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	if (mg->writeback || mg->demote)
			
 
				+		avoid = !is_dirty(cache, mg->cblock) ||
			
 
				+			is_discarded_oblock(cache, mg->old_oblock);
			
 
				+	else
			
 
				+		avoid = is_discarded_oblock(cache, mg->new_oblock);
			
 
				+
			
 
				+	avoid ? avoid_copy(mg) : issue_copy_real(mg);
			
 
				+}
			
 
				+
			
 
				+static void complete_migration(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	if (mg->err)
			
 
				+		migration_failure(mg);
			
 
				+	else
			
 
				+		migration_success_pre_commit(mg);
			
 
				+}
			
 
				+
			
 
				+static void process_migrations(struct cache *cache, struct list_head *head,
			
 
				+			       void (*fn)(struct dm_cache_migration *))
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct list_head list;
			
 
				+	struct dm_cache_migration *mg, *tmp;
			
 
				+
			
 
				+	INIT_LIST_HEAD(&list);
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	list_splice_init(head, &list);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	list_for_each_entry_safe(mg, tmp, &list, list)
			
 
				+		fn(mg);
			
 
				+}
			
 
				+
			
 
				+static void __queue_quiesced_migration(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
			
 
				+}
			
 
				+
			
 
				+static void queue_quiesced_migration(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct cache *cache = mg->cache;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	__queue_quiesced_migration(mg);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	wake_worker(cache);
			
 
				+}
			
 
				+
			
 
				+static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	struct dm_cache_migration *mg, *tmp;
			
 
				+
			
 
				+	spin_lock_irqsave(&cache->lock, flags);
			
 
				+	list_for_each_entry_safe(mg, tmp, work, list)
			
 
				+		__queue_quiesced_migration(mg);
			
 
				+	spin_unlock_irqrestore(&cache->lock, flags);
			
 
				+
			
 
				+	wake_worker(cache);
			
 
				+}
			
 
				+
			
 
				+static void check_for_quiesced_migrations(struct cache *cache,
			
 
				+					  struct per_bio_data *pb)
			
 
				+{
			
 
				+	struct list_head work;
			
 
				+
			
 
				+	if (!pb->all_io_entry)
			
 
				+		return;
			
 
				+
			
 
				+	INIT_LIST_HEAD(&work);
			
 
				+	if (pb->all_io_entry)
			
 
				+		dm_deferred_entry_dec(pb->all_io_entry, &work);
			
 
				+
			
 
				+	if (!list_empty(&work))
			
 
				+		queue_quiesced_migrations(cache, &work);
			
 
				+}
			
 
				+
			
 
				+static void quiesce_migration(struct dm_cache_migration *mg)
			
 
				+{
			
 
				+	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
			
 
				+		queue_quiesced_migration(mg);
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Set up a promote-only migration of @oblock into @cblock, count it and
				+ * start quiescing it.  @cell becomes the migration's new_ocell.
				+ */
				+static void promote(struct cache *cache, struct prealloc *structs,
				+		    dm_oblock_t oblock, dm_cblock_t cblock,
				+		    struct dm_bio_prison_cell *cell)
				+{
				+	struct dm_cache_migration *mg;
				+
				+	mg = prealloc_get_migration(structs);
				+
				+	/* migration kind: promotion only */
				+	mg->promote = true;
				+	mg->demote = false;
				+	mg->writeback = false;
				+	mg->err = false;
				+
				+	/* blocks and cells involved */
				+	mg->cache = cache;
				+	mg->cblock = cblock;
				+	mg->new_oblock = oblock;
				+	mg->new_ocell = cell;
				+	mg->old_ocell = NULL;
				+
				+	mg->start_jiffies = jiffies;
				+
				+	inc_nr_migrations(cache);
				+	quiesce_migration(mg);
				+}
			
 
				+
			
 
				+/*
				+ * Set up a writeback-only migration for the (@oblock, @cblock) pair,
				+ * count it and start quiescing it.  @cell becomes the migration's
				+ * old_ocell.
				+ */
				+static void writeback(struct cache *cache, struct prealloc *structs,
				+		      dm_oblock_t oblock, dm_cblock_t cblock,
				+		      struct dm_bio_prison_cell *cell)
				+{
				+	struct dm_cache_migration *mg;
				+
				+	mg = prealloc_get_migration(structs);
				+
				+	/* migration kind: writeback only */
				+	mg->writeback = true;
				+	mg->promote = false;
				+	mg->demote = false;
				+	mg->err = false;
				+
				+	/* blocks and cells involved */
				+	mg->cache = cache;
				+	mg->cblock = cblock;
				+	mg->old_oblock = oblock;
				+	mg->old_ocell = cell;
				+	mg->new_ocell = NULL;
				+
				+	mg->start_jiffies = jiffies;
				+
				+	inc_nr_migrations(cache);
				+	quiesce_migration(mg);
				+}
			
 
				+
			
 
				+/*
				+ * Set up a combined migration on @cblock that demotes @old_oblock and
				+ * promotes @new_oblock, count it and start quiescing it.  Both prison
				+ * cells are recorded in the migration.
				+ */
				+static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				+				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				+				dm_cblock_t cblock,
				+				struct dm_bio_prison_cell *old_ocell,
				+				struct dm_bio_prison_cell *new_ocell)
				+{
				+	struct dm_cache_migration *mg;
				+
				+	mg = prealloc_get_migration(structs);
				+
				+	/* migration kind: demote, then promote */
				+	mg->demote = true;
				+	mg->promote = true;
				+	mg->writeback = false;
				+	mg->err = false;
				+
				+	/* blocks and cells involved */
				+	mg->cache = cache;
				+	mg->cblock = cblock;
				+	mg->old_oblock = old_oblock;
				+	mg->old_ocell = old_ocell;
				+	mg->new_oblock = new_oblock;
				+	mg->new_ocell = new_ocell;
				+
				+	mg->start_jiffies = jiffies;
				+
				+	inc_nr_migrations(cache);
				+	quiesce_migration(mg);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------
			
 
				+ * bio processing
			
 
				+ *--------------------------------------------------------------*/
			
 
				+/*
				+ * Append @bio to the cache's deferred list under cache->lock and wake
				+ * the worker.
				+ */
				+static void defer_bio(struct cache *cache, struct bio *bio)
				+{
				+	unsigned long irq_state;
				+
				+	spin_lock_irqsave(&cache->lock, irq_state);
				+	bio_list_add(&cache->deferred_bios, bio);
				+	spin_unlock_irqrestore(&cache->lock, irq_state);
				+
				+	wake_worker(cache);
				+}
			
 
				+
			
 
				+/*
				+ * Issue a flush bio, which must carry no data (bi_size == 0).  The copy
				+ * with req_nr == 0 is remapped to the origin device; any other copy is
				+ * remapped to the cache device at cblock 0.
				+ */
				+static void process_flush_bio(struct cache *cache, struct bio *bio)
				+{
				+	struct per_bio_data *pb = get_per_bio_data(bio);
				+
				+	BUG_ON(bio->bi_size);
				+
				+	if (pb->req_nr)
				+		remap_to_cache(cache, bio, 0);
				+	else
				+		remap_to_origin(cache, bio);
				+
				+	issue(cache, bio);
				+}
			
 
				+
			
 
				+/*
			
 
				+ * People generally discard large parts of a device, eg, the whole device
			
 
				+ * when formatting.  Splitting these large discards up into cache block
			
 
				+ * sized ios and then quiescing (always necessary for discard) takes too
			
 
				+ * long.
			
 
				+ *
			
 
				+ * We keep it simple, and allow any size of discard to come in, and just
			
 
				+ * mark off blocks on the discard bitset.  No passdown occurs!
			
 
				+ *
			
 
				+ * To implement passdown we need to change the bio_prison such that a cell
			
 
				+ * can have a key that spans many blocks.
			
 
				+ */
			
 
				+static void process_discard_bio(struct cache *cache, struct bio *bio)
				+{
				+	/*
				+	 * The first block comes from dm_sector_div_up() on the start
				+	 * sector; the limit is the end sector divided by the discard
				+	 * block size.  Every block in [first, limit) is marked.
				+	 */
				+	dm_block_t first = dm_sector_div_up(bio->bi_sector,
				+					    cache->discard_block_size);
				+	dm_block_t limit = bio->bi_sector + bio_sectors(bio);
				+	dm_block_t cur = first;
				+
				+	(void) sector_div(limit, cache->discard_block_size);
				+
				+	while (cur < limit) {
				+		set_discard(cache, to_dblock(cur));
				+		cur++;
				+	}
				+
				+	/* The bio is always completed successfully; nothing is passed down. */
				+	bio_endio(bio, 0);
				+}
			
 
				+
			
 
				+/*
				+ * True when the in-flight migrations plus one more, measured in sectors,
				+ * stay below cache->migration_threshold.
				+ */
				+static bool spare_migration_bandwidth(struct cache *cache)
				+{
				+	sector_t current_volume;
				+
				+	current_volume = (atomic_read(&cache->nr_migrations) + 1) *
				+		cache->sectors_per_block;
				+
				+	if (current_volume < cache->migration_threshold)
				+		return true;
				+
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * A bio counts as writethrough io when it is a WRITE, the cache has the
				+ * write_through feature set, and @cblock is not currently dirty.
				+ */
				+static bool is_writethrough_io(struct cache *cache, struct bio *bio,
				+			       dm_cblock_t cblock)
				+{
				+	if (bio_data_dir(bio) != WRITE)
				+		return false;
				+
				+	if (!cache->features.write_through)
				+		return false;
				+
				+	return !is_dirty(cache, cblock);
				+}
			
 
				+
			
 
				+/* Bump the read-hit or write-hit statistic according to the bio's direction. */
				+static void inc_hit_counter(struct cache *cache, struct bio *bio)
				+{
				+	if (bio_data_dir(bio) == READ)
				+		atomic_inc(&cache->stats.read_hit);
				+	else
				+		atomic_inc(&cache->stats.write_hit);
				+}
			
 
				+
			
 
				+/* Bump the read-miss or write-miss statistic according to the bio's direction. */
				+static void inc_miss_counter(struct cache *cache, struct bio *bio)
				+{
				+	if (bio_data_dir(bio) == READ)
				+		atomic_inc(&cache->stats.read_miss);
				+	else
				+		atomic_inc(&cache->stats.write_miss);
				+}
			
 
				+
			
 
				+/*
				+ * Map one bio that is neither a flush nor a discard.
				+ *
				+ * The bio is first detained in the prison cell for its origin block; a
				+ * positive return from bio_detain() ends processing here.  Otherwise the
				+ * policy is consulted and the bio is handled according to the op it
				+ * returns:
				+ *
				+ *   POLICY_HIT     - count a hit, remap and issue the bio
				+ *   POLICY_MISS    - count a miss, then either end a duplicate
				+ *                    writethrough bio or remap to the origin and issue
				+ *   POLICY_NEW     - start a promotion, which takes over new_ocell
				+ *   POLICY_REPLACE - detain the old origin block as well and start a
				+ *                    demote-then-promote, which takes over both cells
				+ *
				+ * Unless a migration took over new_ocell, it is passed to cell_defer()
				+ * before returning.
				+ */
				+static void process_bio(struct cache *cache, struct prealloc *structs,
				+			struct bio *bio)
				+{
				+	int r;
				+	bool release_cell = true;
				+	dm_oblock_t block = get_bio_block(cache, bio);
				+	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
				+	struct policy_result lookup_result;
				+	struct per_bio_data *pb = get_per_bio_data(bio);
				+	bool discarded_block = is_discarded_oblock(cache, block);
				+	/* Discarded blocks may migrate regardless of the bandwidth check. */
				+	bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
				+
				+	/*
				+	 * Check to see if that block is currently migrating.
				+	 */
				+	cell_prealloc = prealloc_get_cell(structs);
				+	r = bio_detain(cache, block, bio, cell_prealloc,
				+		       (cell_free_fn) prealloc_put_cell,
				+		       structs, &new_ocell);
				+	if (r > 0)
				+		return;
				+
				+	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
				+		       bio, &lookup_result);
				+
				+	/*
				+	 * NOTE(review): for any other non-zero r, lookup_result.op is
				+	 * used as-is below; confirm policy_map() always sets it.
				+	 */
				+	if (r == -EWOULDBLOCK)
				+		/* migration has been denied */
				+		lookup_result.op = POLICY_MISS;
				+
				+	switch (lookup_result.op) {
				+	case POLICY_HIT:
				+		inc_hit_counter(cache, bio);
				+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				+
				+		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
				+			/*
				+			 * No need to mark anything dirty in write through mode.
				+			 */
				+			/* req_nr 0 goes to the cache, any other copy to the origin. */
				+			pb->req_nr == 0 ?
				+				remap_to_cache(cache, bio, lookup_result.cblock) :
				+				remap_to_origin_clear_discard(cache, bio, block);
				+		} else
				+			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				+
				+		issue(cache, bio);
				+		break;
				+
				+	case POLICY_MISS:
				+		inc_miss_counter(cache, bio);
				+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				+
				+		if (pb->req_nr != 0) {
				+			/*
				+			 * This is a duplicate writethrough io that is no
				+			 * longer needed because the block has been demoted.
				+			 */
				+			bio_endio(bio, 0);
				+		} else {
				+			remap_to_origin_clear_discard(cache, bio, block);
				+			issue(cache, bio);
				+		}
				+		break;
				+
				+	case POLICY_NEW:
				+		atomic_inc(&cache->stats.promotion);
				+		promote(cache, structs, block, lookup_result.cblock, new_ocell);
				+		/* The migration now holds new_ocell. */
				+		release_cell = false;
				+		break;
				+
				+	case POLICY_REPLACE:
				+		cell_prealloc = prealloc_get_cell(structs);
				+		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
				+			       (cell_free_fn) prealloc_put_cell,
				+			       structs, &old_ocell);
				+		if (r > 0) {
				+			/*
				+			 * We have to be careful to avoid lock inversion of
				+			 * the cells.  So we back off, and wait for the
				+			 * old_ocell to become free.
				+			 */
				+			policy_force_mapping(cache->policy, block,
				+					     lookup_result.old_oblock);
				+			atomic_inc(&cache->stats.cache_cell_clash);
				+			break;
				+		}
				+		atomic_inc(&cache->stats.demotion);
				+		atomic_inc(&cache->stats.promotion);
				+
				+		demote_then_promote(cache, structs, lookup_result.old_oblock,
				+				    block, lookup_result.cblock,
				+				    old_ocell, new_ocell);
				+		/* The migration now holds both cells. */
				+		release_cell = false;
				+		break;
				+
				+	default:
				+		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
				+			    (unsigned) lookup_result.op);
				+		bio_io_error(bio);
				+	}
				+
				+	if (release_cell)
				+		cell_defer(cache, new_ocell, false);
				+}
			
 
				+
			
 
				+/*
				+ * Non-zero when jiffies has gone below the last commit time or more than
				+ * COMMIT_PERIOD past it.
				+ */
				+static int need_commit_due_to_time(struct cache *cache)
				+{
				+	if (jiffies < cache->last_commit_jiffies)
				+		return 1;
				+
				+	return jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
				+}
			
 
				+
			
 
				+/*
				+ * Commit the metadata when it changed in this transaction and either a
				+ * commit was requested or the commit period has expired.
				+ *
				+ * Returns 0 when no commit was needed, otherwise the result of
				+ * dm_cache_commit().
				+ */
				+static int commit_if_needed(struct cache *cache)
				+{
				+	if (!dm_cache_changed_this_transaction(cache->cmd))
				+		return 0;
				+
				+	if (!cache->commit_requested && !need_commit_due_to_time(cache))
				+		return 0;
				+
				+	atomic_inc(&cache->stats.commit_count);
				+	cache->last_commit_jiffies = jiffies;
				+	cache->commit_requested = false;
				+
				+	return dm_cache_commit(cache->cmd, false);
				+}
			
 
				+
			
 
				+/*
				+ * Take the whole deferred bio list and dispatch each bio to the flush,
				+ * discard or ordinary handler.  If the preallocated structs cannot be
				+ * topped up, the unprocessed bios are put back on the deferred list and
				+ * the loop stops.
				+ */
				+static void process_deferred_bios(struct cache *cache)
				+{
				+	struct prealloc structs;
				+	struct bio_list pending;
				+	struct bio *bio;
				+	unsigned long irq_state;
				+
				+	memset(&structs, 0, sizeof(structs));
				+	bio_list_init(&pending);
				+
				+	/* Move everything deferred so far onto a private list. */
				+	spin_lock_irqsave(&cache->lock, irq_state);
				+	bio_list_merge(&pending, &cache->deferred_bios);
				+	bio_list_init(&cache->deferred_bios);
				+	spin_unlock_irqrestore(&cache->lock, irq_state);
				+
				+	while (!bio_list_empty(&pending)) {
				+		/*
				+		 * Without free migration structs we cannot safely handle a
				+		 * bio that might need one, so hand the rest back and wait
				+		 * until there are prepared mappings to process.
				+		 */
				+		if (prealloc_data_structs(cache, &structs)) {
				+			spin_lock_irqsave(&cache->lock, irq_state);
				+			bio_list_merge(&cache->deferred_bios, &pending);
				+			spin_unlock_irqrestore(&cache->lock, irq_state);
				+			break;
				+		}
				+
				+		bio = bio_list_pop(&pending);
				+
				+		if (bio->bi_rw & REQ_FLUSH) {
				+			process_flush_bio(cache, bio);
				+			continue;
				+		}
				+
				+		if (bio->bi_rw & REQ_DISCARD) {
				+			process_discard_bio(cache, bio);
				+			continue;
				+		}
				+
				+		process_bio(cache, &structs, bio);
				+	}
				+
				+	prealloc_free_structs(cache, &structs);
				+}
			
 
				+
			
 
				+/*
				+ * Drain the deferred flush list.  Each bio is submitted when
				+ * @submit_bios is true and completed with an io error otherwise.
				+ */
				+static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
				+{
				+	struct bio_list pending;
				+	struct bio *bio;
				+	unsigned long irq_state;
				+
				+	bio_list_init(&pending);
				+
				+	spin_lock_irqsave(&cache->lock, irq_state);
				+	bio_list_merge(&pending, &cache->deferred_flush_bios);
				+	bio_list_init(&cache->deferred_flush_bios);
				+	spin_unlock_irqrestore(&cache->lock, irq_state);
				+
				+	for (bio = bio_list_pop(&pending); bio; bio = bio_list_pop(&pending)) {
				+		if (submit_bios)
				+			generic_make_request(bio);
				+		else
				+			bio_io_error(bio);
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * While there is spare migration bandwidth, ask the policy for blocks to
				+ * write back and start a writeback migration for each.  The loop stops
				+ * when preallocation fails, the policy returns non-zero, or the block's
				+ * cell cannot be obtained (in which case the block is marked dirty
				+ * again in the policy).
				+ */
				+static void writeback_some_dirty_blocks(struct cache *cache)
				+{
				+	struct prealloc structs;
				+	struct dm_bio_prison_cell *old_ocell;
				+	dm_oblock_t oblock;
				+	dm_cblock_t cblock;
				+
				+	memset(&structs, 0, sizeof(structs));
				+
				+	while (spare_migration_bandwidth(cache)) {
				+		if (prealloc_data_structs(cache, &structs))
				+			break;
				+
				+		if (policy_writeback_work(cache->policy, &oblock, &cblock))
				+			break;
				+
				+		if (get_cell(cache, oblock, &structs, &old_ocell)) {
				+			policy_set_dirty(cache->policy, oblock);
				+			break;
				+		}
				+
				+		writeback(cache, &structs, oblock, cblock, old_ocell);
				+	}
				+
				+	prealloc_free_structs(cache, &structs);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------
			
 
				+ * Main worker loop
			
 
				+ *--------------------------------------------------------------*/
			
 
				+/* Set the cache's quiescing flag under cache->lock. */
				+static void start_quiescing(struct cache *cache)
				+{
				+	unsigned long irq_state;
				+
				+	spin_lock_irqsave(&cache->lock, irq_state);
				+	cache->quiescing = 1;
				+	spin_unlock_irqrestore(&cache->lock, irq_state);
				+}
			
 
				+
			
 
				+/* Clear the cache's quiescing flag under cache->lock. */
				+static void stop_quiescing(struct cache *cache)
				+{
				+	unsigned long irq_state;
				+
				+	spin_lock_irqsave(&cache->lock, irq_state);
				+	cache->quiescing = 0;
				+	spin_unlock_irqrestore(&cache->lock, irq_state);
				+}
			
 
				+
			
 
				+/* Read the cache's quiescing flag under cache->lock. */
				+static bool is_quiescing(struct cache *cache)
				+{
				+	unsigned long irq_state;
				+	int snapshot;
				+
				+	spin_lock_irqsave(&cache->lock, irq_state);
				+	snapshot = cache->quiescing;
				+	spin_unlock_irqrestore(&cache->lock, irq_state);
				+
				+	return snapshot;
				+}
			
 
				+
			
 
				+/* Sleep on migration_wait until the count of migrations reads zero. */
				+static void wait_for_migrations(struct cache *cache)
				+{
				+	wait_event(cache->migration_wait,
				+		   atomic_read(&cache->nr_migrations) == 0);
				+}
			
 
				+
			
 
				+/*
				+ * Cancel the delayed waker work, then flush the cache's workqueue.
				+ *
				+ * NOTE(review): do_waker() re-queues itself on cache->wq and the cancel
				+ * used here is not the _sync variant; confirm that is sufficient.
				+ */
				+static void stop_worker(struct cache *cache)
				+{
				+	cancel_delayed_work(&cache->waker);
				+	flush_workqueue(cache->wq);
				+}
			
 
				+
			
 
				+/*
				+ * Empty the deferred bio list and end every bio on it with
				+ * DM_ENDIO_REQUEUE.  Note that cache->lock is not taken here.
				+ */
				+static void requeue_deferred_io(struct cache *cache)
				+{
				+	struct bio_list pending;
				+	struct bio *bio;
				+
				+	bio_list_init(&pending);
				+	bio_list_merge(&pending, &cache->deferred_bios);
				+	bio_list_init(&cache->deferred_bios);
				+
				+	for (bio = bio_list_pop(&pending); bio; bio = bio_list_pop(&pending))
				+		bio_endio(bio, DM_ENDIO_REQUEUE);
				+}
			
 
				+
			
 
				+/*
				+ * Report whether the worker has anything left to do.  The three
				+ * migration lists always count; the deferred bio lists only count
				+ * while the cache is not quiescing.
				+ */
				+static int more_work(struct cache *cache)
				+{
				+	if (!is_quiescing(cache) &&
				+	    (!bio_list_empty(&cache->deferred_bios) ||
				+	     !bio_list_empty(&cache->deferred_flush_bios)))
				+		return 1;
				+
				+	return !list_empty(&cache->quiesced_migrations) ||
				+		!list_empty(&cache->completed_migrations) ||
				+		!list_empty(&cache->need_commit_migrations);
				+}
			
 
				+
			
 
				+/*
				+ * Workqueue handler for cache->worker.  Each pass of the loop:
				+ *   1. processes deferred bios (skipped while quiescing),
				+ *   2. issues copies for quiesced migrations and completes finished ones,
				+ *   3. starts writeback of some dirty blocks,
				+ *   4. commits metadata if needed; on commit failure the deferred flush
				+ *      bios are errored, otherwise they are submitted and the migrations
				+ *      waiting on a commit are finished.
				+ * The loop repeats for as long as more_work() reports pending work.
				+ */
				+static void do_worker(struct work_struct *ws)
				+{
				+	struct cache *cache = container_of(ws, struct cache, worker);
				+
				+	do {
				+		if (!is_quiescing(cache))
				+			process_deferred_bios(cache);
				+
				+		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
				+		process_migrations(cache, &cache->completed_migrations, complete_migration);
				+
				+		writeback_some_dirty_blocks(cache);
				+
				+		/* A non-zero return means the commit failed. */
				+		if (commit_if_needed(cache)) {
				+			process_deferred_flush_bios(cache, false);
				+
				+			/*
				+			 * FIXME: rollback metadata or just go into a
				+			 * failure mode and error everything
				+			 */
				+		} else {
				+			process_deferred_flush_bios(cache, true);
				+			process_migrations(cache, &cache->need_commit_migrations,
				+					   migration_success_post_commit);
				+		}
				+	} while (more_work(cache));
				+}
			
 
				+
			
 
				+/*
				+ * Periodic waker: kicks the worker so that commits happen regularly and
				+ * not too much unwritten metadata builds up, then re-arms itself to run
				+ * again after COMMIT_PERIOD.
				+ */
				+static void do_waker(struct work_struct *ws)
				+{
				+	struct delayed_work *dwork = to_delayed_work(ws);
				+	struct cache *cache = container_of(dwork, struct cache, waker);
				+
				+	wake_worker(cache);
				+	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/* Ask the backing_dev_info of @dev's request queue whether @bdi_bits are congested. */
				+static int is_congested(struct dm_dev *dev, int bdi_bits)
				+{
				+	struct request_queue *queue;
				+
				+	queue = bdev_get_queue(dev->bdev);
				+
				+	return bdi_congested(&queue->backing_dev_info, bdi_bits);
				+}
			
 
				+
			
 
				+/*
				+ * Congestion callback for the target: returns 1 when either the origin
				+ * device or the cache device is congested, 0 otherwise.  The origin is
				+ * checked first.
				+ */
				+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				+{
				+	struct cache *cache = container_of(cb, struct cache, callbacks);
				+
				+	if (is_congested(cache->origin_dev, bdi_bits))
				+		return 1;
				+
				+	return is_congested(cache->cache_dev, bdi_bits) != 0;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------
			
 
				+ * Target methods
			
 
				+ *--------------------------------------------------------------*/
			
 
				+
			
 
				+/*
				+ * Release everything a struct cache owns and then the struct itself.
				+ *
				+ * This function gets called on the error paths of the constructor, so we
				+ * have to cope with a partially initialised struct.  Each member is
				+ * therefore only released when it is non-NULL.
				+ */
				+static void destroy(struct cache *cache)
				+{
				+	unsigned i;
				+
				+	/* Return the spare migration before its pool is destroyed. */
				+	if (cache->next_migration)
				+		mempool_free(cache->next_migration, cache->migration_pool);
				+
				+	if (cache->migration_pool)
				+		mempool_destroy(cache->migration_pool);
				+
				+	if (cache->all_io_ds)
				+		dm_deferred_set_destroy(cache->all_io_ds);
				+
				+	if (cache->prison)
				+		dm_bio_prison_destroy(cache->prison);
				+
				+	if (cache->wq)
				+		destroy_workqueue(cache->wq);
				+
				+	if (cache->dirty_bitset)
				+		free_bitset(cache->dirty_bitset);
				+
				+	if (cache->discard_bitset)
				+		free_bitset(cache->discard_bitset);
				+
				+	if (cache->copier)
				+		dm_kcopyd_client_destroy(cache->copier);
				+
				+	if (cache->cmd)
				+		dm_cache_metadata_close(cache->cmd);
				+
				+	if (cache->metadata_dev)
				+		dm_put_device(cache->ti, cache->metadata_dev);
				+
				+	if (cache->origin_dev)
				+		dm_put_device(cache->ti, cache->origin_dev);
				+
				+	if (cache->cache_dev)
				+		dm_put_device(cache->ti, cache->cache_dev);
				+
				+	if (cache->policy)
				+		dm_cache_policy_destroy(cache->policy);
				+
				+	/* Free each saved constructor argument, then the array itself. */
				+	for (i = 0; i < cache->nr_ctr_args ; i++)
				+		kfree(cache->ctr_args[i]);
				+	kfree(cache->ctr_args);
				+
				+	kfree(cache);
				+}
			
 
				+
			
 
				+/* Target destructor: tears down the cache stored in ti->private. */
				+static void cache_dtr(struct dm_target *ti)
				+{
				+	struct cache *doomed;
				+
				+	doomed = ti->private;
				+	destroy(doomed);
				+}
			
 
				+
			
 
				+/* Size of @dev: the block device inode's i_size shifted down by SECTOR_SHIFT. */
				+static sector_t get_dev_size(struct dm_dev *dev)
				+{
				+	struct inode *bd_inode = dev->bdev->bd_inode;
				+
				+	return i_size_read(bd_inode) >> SECTOR_SHIFT;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Construct a cache device mapping.
			
 
				+ *
			
 
				+ * cache <metadata dev> <cache dev> <origin dev> <block size>
			
 
				+ *       <#feature args> [<feature arg>]*
			
 
				+ *       <policy> <#policy args> [<policy arg>]*
			
 
				+ *
			
 
				+ * metadata dev    : fast device holding the persistent metadata
			
 
				+ * cache dev	   : fast device holding cached data blocks
			
 
				+ * origin dev	   : slow device holding original data blocks
			
 
				+ * block size	   : cache unit size in sectors
			
 
				+ *
			
 
				+ * #feature args   : number of feature arguments passed
			
 
				+ * feature args    : writethrough.  (The default is writeback.)
			
 
				+ *
			
 
				+ * policy	   : the replacement policy to use
			
 
				+ * #policy args    : an even number of policy arguments corresponding
			
 
				+ *		     to key/value pairs passed to the policy
			
 
				+ * policy args	   : key/value pairs passed to the policy
			
 
				+ *		     E.g. 'sequential_threshold 1024'
			
 
				+ *		     See cache-policies.txt for details.
			
 
				+ *
			
 
				+ * Optional feature arguments are:
			
 
				+ *   writethrough  : write through caching that prohibits cache block
			
 
				+ *		     content from being different from origin block content.
			
 
				+ *		     Without this argument, the default behaviour is to write
			
 
				+ *		     back cache block contents later for performance reasons,
			
 
				+ *		     so they may differ from the corresponding origin blocks.
			
 
				+ */
			
 
				+/*
				+ * Parsed constructor arguments.  Filled in by the parse_*() helpers and
				+ * released with destroy_cache_args().
				+ */
				+struct cache_args {
				+	/* Target being constructed; used for dm_get_device()/dm_put_device(). */
				+	struct dm_target *ti;
				+
				+	/* Opened by parse_metadata_dev(). */
				+	struct dm_dev *metadata_dev;
				+
				+	/* Opened by parse_cache_dev(); cache_sectors is its size in sectors. */
				+	struct dm_dev *cache_dev;
				+	sector_t cache_sectors;
				+
				+	/* Opened by parse_origin_dev(); origin_sectors is its size in sectors. */
				+	struct dm_dev *origin_dev;
				+	sector_t origin_sectors;
				+
				+	/* Cache block size in sectors, validated by parse_block_size(). */
				+	uint32_t block_size;
				+
				+	/* Policy name and its argument vector, set by parse_policy(). */
				+	const char *policy_name;
				+	int policy_argc;
				+	const char **policy_argv;
				+
				+	/* Feature flags, set by parse_features(). */
				+	struct cache_features features;
				+};
			
 
				+
			
 
				+/*
				+ * Drop any device references still held by @ca and free it.  Devices
				+ * whose pointer is NULL are skipped.
				+ */
				+static void destroy_cache_args(struct cache_args *ca)
				+{
				+	struct dm_target *ti = ca->ti;
				+
				+	if (ca->metadata_dev)
				+		dm_put_device(ti, ca->metadata_dev);
				+
				+	if (ca->cache_dev)
				+		dm_put_device(ti, ca->cache_dev);
				+
				+	if (ca->origin_dev)
				+		dm_put_device(ti, ca->origin_dev);
				+
				+	kfree(ca);
				+}
			
 
				+
			
 
				+/*
				+ * Returns true when @as still has an argument.  Otherwise sets *error
				+ * and returns false.
				+ */
				+static bool at_least_one_arg(struct dm_arg_set *as, char **error)
				+{
				+	if (as->argc)
				+		return true;
				+
				+	*error = "Insufficient args";
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * Consume the next argument as the metadata device and open it
				+ * read/write into ca->metadata_dev.  A warning is logged when the device
				+ * exceeds DM_CACHE_METADATA_MAX_SECTORS_WARNING sectors.
				+ *
				+ * Returns 0 on success, -EINVAL when no argument is left, or the
				+ * dm_get_device() error; *error is set on failure.
				+ */
				+static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
				+			      char **error)
				+{
				+	int r;
				+	sector_t metadata_dev_size;
				+	char b[BDEVNAME_SIZE];
				+
				+	if (!at_least_one_arg(as, error))
				+		return -EINVAL;
				+
				+	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
				+			  &ca->metadata_dev);
				+	if (r) {
				+		*error = "Error opening metadata device";
				+		return r;
				+	}
				+
				+	metadata_dev_size = get_dev_size(ca->metadata_dev);
				+	/*
				+	 * NOTE(review): the size is compared against
				+	 * DM_CACHE_METADATA_MAX_SECTORS_WARNING but the message prints
				+	 * THIN_METADATA_MAX_SECTORS; confirm the printed limit is the
				+	 * intended one.
				+	 */
				+	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
				+		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				+		       bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Consume the next argument as the cache device, open it read/write and
				+ * record its size in ca->cache_sectors.
				+ *
				+ * Returns 0 on success, -EINVAL when no argument is left, or the
				+ * dm_get_device() error; *error is set on failure.
				+ */
				+static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
				+			   char **error)
				+{
				+	int r;
				+
				+	if (!at_least_one_arg(as, error))
				+		return -EINVAL;
				+
				+	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
				+			  &ca->cache_dev);
				+	if (!r) {
				+		ca->cache_sectors = get_dev_size(ca->cache_dev);
				+		return 0;
				+	}
				+
				+	*error = "Error opening cache device";
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Consume the next argument as the origin device, open it read/write and
				+ * record its size in ca->origin_sectors.  The origin must be at least as
				+ * long as the target.
				+ *
				+ * Returns 0 on success, -EINVAL when no argument is left or the origin
				+ * is too small, or the dm_get_device() error; *error is set on failure.
				+ */
				+static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
				+			    char **error)
				+{
				+	int r;
				+
				+	if (!at_least_one_arg(as, error))
				+		return -EINVAL;
				+
				+	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
				+			  &ca->origin_dev);
				+	if (r) {
				+		*error = "Error opening origin device";
				+		return r;
				+	}
				+
				+	ca->origin_sectors = get_dev_size(ca->origin_dev);
				+	if (ca->origin_sectors < ca->ti->len) {
				+		*error = "Device size larger than cached device";
				+		return -EINVAL;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Consume the next argument as the cache block size in sectors.
				+ *
				+ * The value must parse as a decimal number, be non-zero, be at least
				+ * DATA_DEV_BLOCK_SIZE_MIN_SECTORS, have none of the bits below that
				+ * minimum set, fit in the uint32_t ca->block_size field, and not exceed
				+ * the size of the cache device.
				+ *
				+ * Returns 0 on success or -EINVAL (with *error set) otherwise.
				+ */
				+static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
				+			    char **error)
				+{
				+	unsigned long tmp;
				+
				+	if (!at_least_one_arg(as, error))
				+		return -EINVAL;
				+
				+	/*
				+	 * The final test rejects values that would be silently truncated
				+	 * (possibly to zero) when stored in the 32-bit block_size field.
				+	 */
				+	if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
				+	    tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
				+	    tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1) ||
				+	    tmp != (uint32_t) tmp) {
				+		*error = "Invalid data block size";
				+		return -EINVAL;
				+	}
				+
				+	if (tmp > ca->cache_sectors) {
				+		*error = "Data block size is larger than the cache device";
				+		return -EINVAL;
				+	}
				+
				+	ca->block_size = tmp;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Default features: write-back caching (write_through off) in CM_WRITE mode. */
				+static void init_features(struct cache_features *cf)
				+{
				+	cf->write_through = false;
				+	cf->mode = CM_WRITE;
				+}
			
 
				+
			
 
				+/*
				+ * Parse the feature argument group into ca->features, starting from the
				+ * defaults.  At most one feature argument is accepted; recognised values
				+ * (case-insensitive) are "writeback" and "writethrough".
				+ *
				+ * Returns 0 on success or -EINVAL with *error set.
				+ */
				+static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
				+			  char **error)
				+{
				+	static struct dm_arg _args[] = {
				+		{0, 1, "Invalid number of cache feature arguments"},
				+	};
				+
				+	struct cache_features *cf = &ca->features;
				+	const char *arg;
				+	unsigned argc;
				+
				+	init_features(cf);
				+
				+	if (dm_read_arg_group(_args, as, &argc, error))
				+		return -EINVAL;
				+
				+	for (; argc; argc--) {
				+		arg = dm_shift_arg(as);
				+
				+		if (!strcasecmp(arg, "writeback")) {
				+			cf->write_through = false;
				+			continue;
				+		}
				+
				+		if (!strcasecmp(arg, "writethrough")) {
				+			cf->write_through = true;
				+			continue;
				+		}
				+
				+		*error = "Unrecognised cache feature requested";
				+		return -EINVAL;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Parse the policy name followed by its argument group.  The policy
				+ * arguments are not copied: ca->policy_argv points into @as and the
				+ * arguments are then consumed from @as.
				+ *
				+ * Returns 0 on success or -EINVAL with *error set.
				+ */
				+static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
				+			char **error)
				+{
				+	static struct dm_arg _args[] = {
				+		{0, 1024, "Invalid number of policy arguments"},
				+	};
				+
				+	if (!at_least_one_arg(as, error))
				+		return -EINVAL;
				+
				+	ca->policy_name = dm_shift_arg(as);
				+
				+	if (dm_read_arg_group(_args, as, &ca->policy_argc, error))
				+		return -EINVAL;
				+
				+	ca->policy_argv = (const char **)as->argv;
				+	dm_consume_args(as, ca->policy_argc);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Parse the full constructor argument list into @ca.  The individual
				+ * parsers run in table order (metadata dev, cache dev, origin dev,
				+ * block size, features, policy) and the first non-zero result is
				+ * returned.
				+ */
				+static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
				+			    char **error)
				+{
				+	static int (* const parsers[])(struct cache_args *,
				+				       struct dm_arg_set *, char **) = {
				+		parse_metadata_dev,
				+		parse_cache_dev,
				+		parse_origin_dev,
				+		parse_block_size,
				+		parse_features,
				+		parse_policy,
				+	};
				+	struct dm_arg_set as;
				+	unsigned step;
				+	int r;
				+
				+	as.argc = argc;
				+	as.argv = argv;
				+
				+	for (step = 0; step < ARRAY_SIZE(parsers); step++) {
				+		r = parsers[step](ca, &as, error);
				+		if (r)
				+			return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static struct kmem_cache *migration_cache;
			
 
				+
			
 
				+/*
				+ * Apply <key> <value> argument pairs to policy @p.  An odd argument
				+ * count is rejected with -EINVAL; the first pair the policy refuses
				+ * stops the loop and its error is returned.
				+ */
				+static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
				+{
				+	int r;
				+
				+	if (argc & 1) {
				+		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
				+		return -EINVAL;
				+	}
				+
				+	for (; argc; argc -= 2, argv += 2) {
				+		r = policy_set_config_value(p, argv[0], argv[1]);
				+		if (r) {
				+			DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
				+			       argv[0], argv[1]);
				+			return r;
				+		}
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Create the replacement policy named in @ca and apply its key/value
				+ * arguments.
				+ *
				+ * On success cache->policy holds the new policy and 0 is returned.  On
				+ * failure *error is set, cache->policy is left NULL and a negative
				+ * errno is returned.
				+ */
				+static int create_cache_policy(struct cache *cache, struct cache_args *ca,
				+			       char **error)
				+{
				+	int r;
				+
				+	cache->policy =	dm_cache_policy_create(ca->policy_name,
				+					       cache->cache_size,
				+					       cache->origin_sectors,
				+					       cache->sectors_per_block);
				+	if (!cache->policy) {
				+		*error = "Error creating cache's policy";
				+		return -ENOMEM;
				+	}
				+
				+	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
				+	if (r) {
				+		*error = "Error setting cache policy's config values";
				+		dm_cache_policy_destroy(cache->policy);
				+		/*
				+		 * destroy() frees any non-NULL cache->policy, so clear the
				+		 * pointer to avoid destroying the policy a second time on
				+		 * the constructor's error path.
				+		 */
				+		cache->policy = NULL;
				+	}
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * We want the discard block size to be a power of two, at least the size
			
 
				+ * of the cache block size, and have no more than 2^14 discard blocks
			
 
				+ * across the origin.
			
 
				+ */
			
 
				+#define MAX_DISCARD_BLOCKS (1 << 14)
			
 
				+
			
 
				+/*
				+ * True when dividing @origin_size by @discard_block_size gives more
				+ * than MAX_DISCARD_BLOCKS.
				+ */
				+static bool too_many_discard_blocks(sector_t discard_block_size,
				+				    sector_t origin_size)
				+{
				+	sector_t nr_blocks = origin_size;
				+
				+	/* sector_div() divides its first argument in place. */
				+	(void) sector_div(nr_blocks, discard_block_size);
				+
				+	if (nr_blocks > MAX_DISCARD_BLOCKS)
				+		return true;
				+
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * Pick the discard block size: start from @cache_block_size rounded up
				+ * to a power of two and, for a non-zero @origin_size, keep doubling
				+ * while too_many_discard_blocks() says the origin would need too many.
				+ */
				+static sector_t calculate_discard_block_size(sector_t cache_block_size,
				+					     sector_t origin_size)
				+{
				+	sector_t size = roundup_pow_of_two(cache_block_size);
				+
				+	while (origin_size && too_many_discard_blocks(size, origin_size))
				+		size <<= 1;
				+
				+	return size;
				+}
			
 
				+
			
 
				+#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
			
 
				+
			
 
				+static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
			
 
				+
			
 
				+/*
				+ * Build a struct cache from the parsed constructor arguments.
				+ *
				+ * Ownership of the metadata, origin and cache devices is moved from @ca to
				+ * the new cache object.  On success the cache is stored in *result and 0
				+ * is returned.  On any failure after the initial allocation the partially
				+ * constructed cache is torn down with destroy() and a negative errno is
				+ * returned.
				+ */
				+static int cache_create(struct cache_args *ca, struct cache **result)
				+{
				+	int r = 0;
				+	char **error = &ca->ti->error;
				+	struct cache *cache;
				+	struct dm_target *ti = ca->ti;
				+	dm_block_t origin_blocks;
				+	struct dm_cache_metadata *cmd;
				+	bool may_format = ca->features.mode == CM_WRITE;
				+
				+	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
				+	if (!cache)
				+		return -ENOMEM;
				+
				+	cache->ti = ca->ti;
				+	ti->private = cache;
				+	ti->per_bio_data_size = sizeof(struct per_bio_data);
				+	ti->num_flush_bios = 2;
				+	ti->flush_supported = true;
				+
				+	ti->num_discard_bios = 1;
				+	ti->discards_supported = true;
				+	ti->discard_zeroes_data_unsupported = true;
				+
				+	memcpy(&cache->features, &ca->features, sizeof(cache->features));
				+
				+	if (cache->features.write_through)
				+		ti->num_write_bios = cache_num_write_bios;
				+
				+	cache->callbacks.congested_fn = cache_is_congested;
				+	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
				+
				+	/* Take over the devices so the args destructor won't release them. */
				+	cache->metadata_dev = ca->metadata_dev;
				+	cache->origin_dev = ca->origin_dev;
				+	cache->cache_dev = ca->cache_dev;
				+
				+	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
				+
				+	/* FIXME: factor out this whole section */
				+	origin_blocks = cache->origin_sectors = ca->origin_sectors;
				+	(void) sector_div(origin_blocks, ca->block_size);
				+	cache->origin_blocks = to_oblock(origin_blocks);
				+
				+	cache->sectors_per_block = ca->block_size;
				+	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
				+		r = -EINVAL;
				+		goto bad;
				+	}
				+
				+	if (ca->block_size & (ca->block_size - 1)) {
				+		/* Block size is not a power of two: no shift, divide instead. */
				+		dm_block_t cache_size = ca->cache_sectors;
				+
				+		cache->sectors_per_block_shift = -1;
				+		(void) sector_div(cache_size, ca->block_size);
				+		cache->cache_size = to_cblock(cache_size);
				+	} else {
				+		cache->sectors_per_block_shift = __ffs(ca->block_size);
				+		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
				+	}
				+
				+	r = create_cache_policy(cache, ca, error);
				+	if (r)
				+		goto bad;
				+	cache->policy_nr_args = ca->policy_argc;
				+
				+	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				+				     ca->block_size, may_format,
				+				     dm_cache_policy_get_hint_size(cache->policy));
				+	if (IS_ERR(cmd)) {
				+		*error = "Error creating metadata object";
				+		r = PTR_ERR(cmd);
				+		goto bad;
				+	}
				+	cache->cmd = cmd;
				+
				+	spin_lock_init(&cache->lock);
				+	bio_list_init(&cache->deferred_bios);
				+	bio_list_init(&cache->deferred_flush_bios);
				+	INIT_LIST_HEAD(&cache->quiesced_migrations);
				+	INIT_LIST_HEAD(&cache->completed_migrations);
				+	INIT_LIST_HEAD(&cache->need_commit_migrations);
				+	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
				+	atomic_set(&cache->nr_migrations, 0);
				+	init_waitqueue_head(&cache->migration_wait);
				+
				+	/*
				+	 * From here on the allocators report failure with NULL rather than
				+	 * an ERR_PTR, so r must be set explicitly; otherwise the stale 0
				+	 * would be returned and the caller would treat the destroyed cache
				+	 * as successfully created.
				+	 */
				+	cache->nr_dirty = 0;
				+	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
				+	if (!cache->dirty_bitset) {
				+		*error = "could not allocate dirty bitset";
				+		r = -ENOMEM;
				+		goto bad;
				+	}
				+	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
				+
				+	cache->discard_block_size =
				+		calculate_discard_block_size(cache->sectors_per_block,
				+					     cache->origin_sectors);
				+	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
				+	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
				+	if (!cache->discard_bitset) {
				+		*error = "could not allocate discard bitset";
				+		r = -ENOMEM;
				+		goto bad;
				+	}
				+	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				+
				+	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				+	if (IS_ERR(cache->copier)) {
				+		*error = "could not create kcopyd client";
				+		r = PTR_ERR(cache->copier);
				+		goto bad;
				+	}
				+
				+	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				+	if (!cache->wq) {
				+		*error = "could not create workqueue for metadata object";
				+		r = -ENOMEM;
				+		goto bad;
				+	}
				+	INIT_WORK(&cache->worker, do_worker);
				+	INIT_DELAYED_WORK(&cache->waker, do_waker);
				+	cache->last_commit_jiffies = jiffies;
				+
				+	cache->prison = dm_bio_prison_create(PRISON_CELLS);
				+	if (!cache->prison) {
				+		*error = "could not create bio prison";
				+		r = -ENOMEM;
				+		goto bad;
				+	}
				+
				+	cache->all_io_ds = dm_deferred_set_create();
				+	if (!cache->all_io_ds) {
				+		*error = "could not create all_io deferred set";
				+		r = -ENOMEM;
				+		goto bad;
				+	}
				+
				+	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
				+							 migration_cache);
				+	if (!cache->migration_pool) {
				+		*error = "Error creating cache's migration mempool";
				+		r = -ENOMEM;
				+		goto bad;
				+	}
				+
				+	cache->next_migration = NULL;
				+
				+	cache->need_tick_bio = true;
				+	cache->sized = false;
				+	cache->quiescing = false;
				+	cache->commit_requested = false;
				+	cache->loaded_mappings = false;
				+	cache->loaded_discards = false;
				+
				+	load_stats(cache);
				+
				+	atomic_set(&cache->stats.demotion, 0);
				+	atomic_set(&cache->stats.promotion, 0);
				+	atomic_set(&cache->stats.copies_avoided, 0);
				+	atomic_set(&cache->stats.cache_cell_clash, 0);
				+	atomic_set(&cache->stats.commit_count, 0);
				+	atomic_set(&cache->stats.discard_count, 0);
				+
				+	*result = cache;
				+	return 0;
				+
				+bad:
				+	destroy(cache);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Keep a private, heap-allocated copy of the constructor arguments on the
				+ * cache (cache->ctr_args / cache->nr_ctr_args).
				+ *
				+ * Returns 0 on success or -ENOMEM, in which case nothing is left
				+ * allocated and the cache is not modified.
				+ */
				+static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
				+{
				+	unsigned n;
				+	const char **args;
				+
				+	args = kcalloc(argc, sizeof(*args), GFP_KERNEL);
				+	if (!args)
				+		return -ENOMEM;
				+
				+	for (n = 0; n < argc; n++) {
				+		args[n] = kstrdup(argv[n], GFP_KERNEL);
				+		if (!args[n])
				+			goto bad;
				+	}
				+
				+	cache->nr_ctr_args = argc;
				+	cache->ctr_args = args;
				+
				+	return 0;
				+
				+bad:
				+	/* Unwind the strings duplicated so far, then the array itself. */
				+	while (n--)
				+		kfree(args[n]);
				+	kfree(args);
				+	return -ENOMEM;
				+}
			
 
				+
			
 
				+/*
				+ * Target constructor: parse the table line, create the cache object and
				+ * remember the constructor arguments (from the fourth one on).
				+ *
				+ * Returns 0 on success or a negative errno.  The temporary cache_args
				+ * structure is always released before returning.
				+ */
				+static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
				+{
				+	int r = -EINVAL;
				+	struct cache_args *ca;
				+	struct cache *cache = NULL;
				+
				+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
				+	if (!ca) {
				+		ti->error = "Error allocating memory for cache";
				+		return -ENOMEM;
				+	}
				+	ca->ti = ti;
				+
				+	r = parse_cache_args(ca, argc, argv, &ti->error);
				+	if (r)
				+		goto out;
				+
				+	/*
				+	 * cache_create() cleans up after itself on failure, so there is
				+	 * nothing to destroy here; just make sure we don't carry on with
				+	 * a NULL or already-destroyed cache.
				+	 */
				+	r = cache_create(ca, &cache);
				+	if (r)
				+		goto out;
				+
				+	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
				+	if (r) {
				+		destroy(cache);
				+		goto out;
				+	}
				+
				+	ti->private = cache;
				+
				+out:
				+	destroy_cache_args(ca);
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Tell the dm core how many copies of a write bio this target wants.
				+ *
				+ * Returns 2 when the policy lookup fails (negative result), or when the
				+ * lookup returns 0 and the looked-up cache block is not dirty; returns 1
				+ * otherwise.
				+ */
				+static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
				+{
				+	struct cache *cache = ti->private;
				+	dm_oblock_t oblock = get_bio_block(cache, bio);
				+	dm_cblock_t cblock;
				+	int r = policy_lookup(cache->policy, oblock, &cblock);
				+
				+	if (r < 0)
				+		return 2;	/* assume the worst */
				+
				+	if (r)
				+		return 1;
				+
				+	return is_dirty(cache, cblock) ? 1 : 2;
				+}
			
 
				+
			
 
				+/*
				+ * Map function: decide, without blocking, where a bio should go.
				+ *
				+ * Returns DM_MAPIO_REMAPPED when the bio has been redirected to the cache
				+ * or origin device and should be submitted by the core, or
				+ * DM_MAPIO_SUBMITTED when the bio has been deferred, completed or errored
				+ * here.
				+ */
				+static int cache_map(struct dm_target *ti, struct bio *bio)
				+{
				+	struct cache *cache = ti->private;
				+
				+	int r;
				+	dm_oblock_t block = get_bio_block(cache, bio);
				+	bool can_migrate = false;
				+	bool discarded_block;
				+	struct dm_bio_prison_cell *cell;
				+	struct policy_result lookup_result;
				+	struct per_bio_data *pb;
				+
				+	/*
				+	 * origin_blocks counts whole blocks only, so valid block indexes
				+	 * are 0 .. origin_blocks - 1 and the partial tail block has index
				+	 * origin_blocks itself; hence >= rather than >.
				+	 */
				+	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
				+		/*
				+		 * This can only occur if the io goes to a partial block at
				+		 * the end of the origin device.  We don't cache these.
				+		 * Just remap to the origin and carry on.
				+		 */
				+		remap_to_origin_clear_discard(cache, bio, block);
				+		return DM_MAPIO_REMAPPED;
				+	}
				+
				+	pb = init_per_bio_data(bio);
				+
				+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
				+		defer_bio(cache, bio);
				+		return DM_MAPIO_SUBMITTED;
				+	}
				+
				+	/*
				+	 * Check to see if that block is currently migrating.
				+	 */
				+	cell = alloc_prison_cell(cache);
				+	if (!cell) {
				+		defer_bio(cache, bio);
				+		return DM_MAPIO_SUBMITTED;
				+	}
				+
				+	r = bio_detain(cache, block, bio, cell,
				+		       (cell_free_fn) free_prison_cell,
				+		       cache, &cell);
				+	if (r) {
				+		if (r < 0)
				+			defer_bio(cache, bio);
				+
				+		return DM_MAPIO_SUBMITTED;
				+	}
				+
				+	discarded_block = is_discarded_oblock(cache, block);
				+
				+	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
				+		       bio, &lookup_result);
				+	if (r == -EWOULDBLOCK) {
				+		cell_defer(cache, cell, true);
				+		return DM_MAPIO_SUBMITTED;
				+
				+	} else if (r) {
				+		/* NOTE(review): the cell is still held on this path - confirm that is intended. */
				+		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
				+		bio_io_error(bio);
				+		return DM_MAPIO_SUBMITTED;
				+	}
				+
				+	switch (lookup_result.op) {
				+	case POLICY_HIT:
				+		inc_hit_counter(cache, bio);
				+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				+
				+		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
				+			/*
				+			 * No need to mark anything dirty in write through mode.
				+			 * The first copy of the bio goes to the cache, any
				+			 * other copy goes to the origin.
				+			 */
				+			if (pb->req_nr == 0)
				+				remap_to_cache(cache, bio, lookup_result.cblock);
				+			else
				+				remap_to_origin_clear_discard(cache, bio, block);
				+			cell_defer(cache, cell, false);
				+		} else {
				+			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				+			cell_defer(cache, cell, false);
				+		}
				+		break;
				+
				+	case POLICY_MISS:
				+		inc_miss_counter(cache, bio);
				+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				+
				+		if (pb->req_nr != 0) {
				+			/*
				+			 * This is a duplicate writethrough io that is no
				+			 * longer needed because the block has been demoted.
				+			 */
				+			bio_endio(bio, 0);
				+			cell_defer(cache, cell, false);
				+			return DM_MAPIO_SUBMITTED;
				+		} else {
				+			remap_to_origin_clear_discard(cache, bio, block);
				+			cell_defer(cache, cell, false);
				+		}
				+		break;
				+
				+	default:
				+		/* NOTE(review): the cell is still held on this path - confirm that is intended. */
				+		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
				+			    (unsigned) lookup_result.op);
				+		bio_io_error(bio);
				+		return DM_MAPIO_SUBMITTED;
				+	}
				+
				+	return DM_MAPIO_REMAPPED;
				+}
			
 
				+
			
 
				+/*
				+ * Bio completion hook.  If this bio carried the tick flag, tick the
				+ * policy and ask for another tick bio; then check whether the bio's
				+ * completion has quiesced any migrations.  Always returns 0.
				+ */
				+static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
				+{
				+	unsigned long flags;
				+	struct cache *cache = ti->private;
				+	struct per_bio_data *pb = get_per_bio_data(bio);
				+
				+	if (!pb->tick)
				+		goto out;
				+
				+	policy_tick(cache->policy);
				+
				+	/* need_tick_bio is updated under cache->lock. */
				+	spin_lock_irqsave(&cache->lock, flags);
				+	cache->need_tick_bio = true;
				+	spin_unlock_irqrestore(&cache->lock, flags);
				+
				+out:
				+	check_for_quiesced_migrations(cache, pb);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Record the in-core dirty state of every cache block in the metadata.
				+ *
				+ * Returns 0 on success or the first error from dm_cache_set_dirty().
				+ */
				+static int write_dirty_bitset(struct cache *cache)
				+{
				+	unsigned i;
				+	int r;	/* signed: holds a (negative) error code */
				+
				+	for (i = 0; i < from_cblock(cache->cache_size); i++) {
				+		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				+				       is_dirty(cache, to_cblock(i)));
				+		if (r)
				+			return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Resize the on-disk discard bitset to the current geometry and record
				+ * the in-core discard state of every discard block in it.
				+ *
				+ * Returns 0 on success or the first error encountered.
				+ */
				+static int write_discard_bitset(struct cache *cache)
				+{
				+	unsigned i;
				+	int r;	/* signed: holds a (negative) error code */
				+
				+	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
				+					   cache->discard_nr_blocks);
				+	if (r) {
				+		DMERR("could not resize on-disk discard bitset");
				+		return r;
				+	}
				+
				+	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
				+		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
				+					 is_discarded(cache, to_dblock(i)));
				+		if (r)
				+			return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Callback handed to policy_walk_mappings(): store the hint for one cache
				+ * block in the metadata.  @context is the struct cache; @oblock is not
				+ * used.
				+ */
				+static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
				+		     uint32_t hint)
				+{
				+	struct cache *c = context;
				+
				+	return dm_cache_save_hint(c->cmd, cblock, hint);
				+}
			
 
				+
			
 
				+/*
				+ * Save the policy's per-block hints: begin a hint pass on the metadata,
				+ * then walk the policy's mappings with save_hint().
				+ *
				+ * Returns 0 on success or the error from whichever step failed.
				+ */
				+static int write_hints(struct cache *cache)
				+{
				+	int r = dm_cache_begin_hints(cache->cmd, cache->policy);
				+
				+	if (r) {
				+		DMERR("dm_cache_begin_hints failed");
				+		return r;
				+	}
				+
				+	r = policy_walk_mappings(cache->policy, save_hint, cache);
				+	if (!r)
				+		return 0;
				+
				+	DMERR("policy_walk_mappings failed");
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Write the dirty bitset, discard bitset, stats and hints, then commit.
				+ * Every step is attempted even if an earlier one failed.
				+ *
				+ * returns true on success
				+ */
				+static bool sync_metadata(struct cache *cache)
				+{
				+	int dirty_err, discard_err, hints_err, commit_err;
				+	bool clean_shutdown;
				+
				+	dirty_err = write_dirty_bitset(cache);
				+	if (dirty_err)
				+		DMERR("could not write dirty bitset");
				+
				+	discard_err = write_discard_bitset(cache);
				+	if (discard_err)
				+		DMERR("could not write discard bitset");
				+
				+	save_stats(cache);
				+
				+	hints_err = write_hints(cache);
				+	if (hints_err)
				+		DMERR("could not write hints");
				+
				+	/*
				+	 * If writing the above metadata failed, we still commit, but don't
				+	 * set the clean shutdown flag.  This will effectively force every
				+	 * dirty bit to be set on reload.
				+	 */
				+	clean_shutdown = !dirty_err && !discard_err && !hints_err;
				+
				+	commit_err = dm_cache_commit(cache->cmd, clean_shutdown);
				+	if (commit_err)
				+		DMERR("could not write cache metadata.  Data loss may occur.");
				+
				+	return clean_shutdown && !commit_err;
				+}
			
 
				+
			
 
				+/*
				+ * Post-suspend hook: quiesce the cache, wait for migrations, stop the
				+ * worker, requeue deferred io, leave the quiescing state and finally
				+ * sync the metadata (the result of the sync is ignored).
				+ */
				+static void cache_postsuspend(struct dm_target *ti)
				+{
				+	struct cache *c = ti->private;
				+
				+	start_quiescing(c);
				+	wait_for_migrations(c);
				+	stop_worker(c);
				+	requeue_deferred_io(c);
				+	stop_quiescing(c);
				+
				+	/* Best effort: failures are logged inside sync_metadata(). */
				+	(void) sync_metadata(c);
				+}
			
 
				+
			
 
				+/*
				+ * Callback for dm_cache_load_mappings(): hand one oblock -> cblock mapping
				+ * (plus its hint) to the policy and mirror its dirty state in the cache.
				+ * @context is the struct cache.
				+ *
				+ * Returns 0 on success or the error from policy_load_mapping().
				+ */
				+static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
				+			bool dirty, uint32_t hint, bool hint_valid)
				+{
				+	struct cache *cache = context;
				+	int r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
				+
				+	if (r)
				+		return r;
				+
				+	if (!dirty) {
				+		clear_dirty(cache, oblock, cblock);
				+		return 0;
				+	}
				+
				+	set_dirty(cache, oblock, cblock);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Callback for dm_cache_load_discards(): set or clear the in-core discard
				+ * state of one discard block.  @context is the struct cache;
				+ * @discard_block_size is currently ignored.  Always returns 0.
				+ */
				+static int load_discard(void *context, sector_t discard_block_size,
				+			dm_dblock_t dblock, bool discard)
				+{
				+	struct cache *c = context;
				+
				+	/* FIXME: handle mis-matched block size */
				+
				+	if (!discard) {
				+		clear_discard(c, dblock);
				+		return 0;
				+	}
				+
				+	set_discard(c, dblock);
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Pre-resume hook.  Resizes the cache metadata if the cache device size
				+ * changed (or was never recorded), then loads the mappings and discards
				+ * from the metadata the first time through.
				+ *
				+ * Returns 0 on success or the error from the step that failed.
				+ */
				+static int cache_preresume(struct dm_target *ti)
				+{
				+	int r;
				+	struct cache *cache = ti->private;
				+	sector_t nr_cblocks = get_dev_size(cache->cache_dev);
				+
				+	/* Convert the device size in sectors to a count of cache blocks. */
				+	(void) sector_div(nr_cblocks, cache->sectors_per_block);
				+
				+	/*
				+	 * Check to see if the cache has resized.
				+	 */
				+	if (from_cblock(cache->cache_size) != nr_cblocks || !cache->sized) {
				+		cache->cache_size = to_cblock(nr_cblocks);
				+
				+		r = dm_cache_resize(cache->cmd, cache->cache_size);
				+		if (r) {
				+			DMERR("could not resize cache metadata");
				+			return r;
				+		}
				+
				+		cache->sized = true;
				+	}
				+
				+	if (!cache->loaded_mappings) {
				+		r = dm_cache_load_mappings(cache->cmd,
				+					   dm_cache_policy_get_name(cache->policy),
				+					   load_mapping, cache);
				+		if (r) {
				+			DMERR("could not load cache mappings");
				+			return r;
				+		}
				+
				+		cache->loaded_mappings = true;
				+	}
				+
				+	if (!cache->loaded_discards) {
				+		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
				+		if (r) {
				+			DMERR("could not load origin discards");
				+			return r;
				+		}
				+
				+		cache->loaded_discards = true;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Resume hook: request a fresh tick bio and run the waker once directly.
				+ */
				+static void cache_resume(struct dm_target *ti)
				+{
				+	struct cache *c = ti->private;
				+
				+	c->need_tick_bio = true;
				+	do_waker(&c->waker.work);
				+}
			
 
				+
			
 
				+/*
				+ * Status format:
				+ *
				+ * <#used metadata blocks>/<#total metadata blocks>
				+ * <#read hits> <#read misses> <#write hits> <#write misses>
				+ * <#demotions> <#promotions> <#blocks in cache> <#dirty>
				+ * <#features> <features>*
				+ * <#core args> <core args>*
				+ * <#policy args> <policy args>*
				+ *
				+ * For STATUSTYPE_TABLE the three device numbers are emitted followed by
				+ * the saved constructor arguments.  If the metadata counts cannot be
				+ * read, "Error" is emitted instead of the info line.
				+ */
				+static void cache_status(struct dm_target *ti, status_type_t type,
				+			 unsigned status_flags, char *result, unsigned maxlen)
				+{
				+	int r = 0;
				+	unsigned i;
				+	ssize_t sz = 0;
				+	dm_block_t nr_free_blocks_metadata = 0;
				+	dm_block_t nr_blocks_metadata = 0;
				+	char buf[BDEVNAME_SIZE];
				+	struct cache *cache = ti->private;
				+	dm_cblock_t residency;
				+
				+	switch (type) {
				+	case STATUSTYPE_INFO:
				+		/* Commit to ensure statistics aren't out-of-date */
				+		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
				+			r = dm_cache_commit(cache->cmd, false);
				+			if (r)
				+				DMERR("could not commit metadata for accurate status");
				+		}
				+
				+		r = dm_cache_get_free_metadata_block_count(cache->cmd,
				+							   &nr_free_blocks_metadata);
				+		if (r) {
				+			DMERR("could not get metadata free block count");
				+			goto err;
				+		}
				+
				+		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
				+		if (r) {
				+			DMERR("could not get metadata device size");
				+			goto err;
				+		}
				+
				+		residency = policy_residency(cache->policy);
				+
				+		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
				+		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				+		       (unsigned long long)nr_blocks_metadata,
				+		       (unsigned) atomic_read(&cache->stats.read_hit),
				+		       (unsigned) atomic_read(&cache->stats.read_miss),
				+		       (unsigned) atomic_read(&cache->stats.write_hit),
				+		       (unsigned) atomic_read(&cache->stats.write_miss),
				+		       (unsigned) atomic_read(&cache->stats.demotion),
				+		       (unsigned) atomic_read(&cache->stats.promotion),
				+		       (unsigned long long) from_cblock(residency),
				+		       cache->nr_dirty);
				+
				+		if (cache->features.write_through)
				+			DMEMIT("1 writethrough ");
				+		else
				+			DMEMIT("0 ");
				+
				+		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
				+		/* Only let the policy append if there is room left in the buffer. */
				+		if (sz < maxlen) {
				+			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
				+			if (r)
				+				DMERR("policy_emit_config_values returned %d", r);
				+		}
				+
				+		break;
				+
				+	case STATUSTYPE_TABLE:
				+		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
				+		DMEMIT("%s ", buf);
				+		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
				+		DMEMIT("%s ", buf);
				+		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
				+		DMEMIT("%s", buf);
				+
				+		/*
				+		 * A single loop over every saved argument: same output as
				+		 * treating the last one separately, but it cannot run off
				+		 * the end of the array when there are no arguments.
				+		 */
				+		for (i = 0; i < cache->nr_ctr_args; i++)
				+			DMEMIT(" %s", cache->ctr_args[i]);
				+	}
				+
				+	return;
				+
				+err:
				+	DMEMIT("Error");
				+}
			
 
				+
			
 
				+#define NOT_CORE_OPTION 1
			
 
				+
			
 
				+/*
				+ * Handle a <key> <value> pair that belongs to the cache core.
				+ *
				+ * Returns 0 if the option was recognised and applied, -EINVAL if its
				+ * value could not be parsed, or NOT_CORE_OPTION if the key is not one the
				+ * core understands.
				+ */
				+static int process_config_option(struct cache *cache, char **argv)
				+{
				+	unsigned long value;
				+
				+	if (strcasecmp(argv[0], "migration_threshold"))
				+		return NOT_CORE_OPTION;
				+
				+	if (kstrtoul(argv[1], 10, &value))
				+		return -EINVAL;
				+
				+	cache->migration_threshold = value;
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Supports <key> <value>.
				+ *
				+ * The key migration_threshold is supported by the cache target core.
				+ * Any other key is passed on to the policy.
				+ */
				+static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
				+{
				+	struct cache *cache = ti->private;
				+	int r;
				+
				+	if (argc != 2)
				+		return -EINVAL;
				+
				+	r = process_config_option(cache, argv);
				+	if (r != NOT_CORE_OPTION)
				+		return r;
				+
				+	return policy_set_config_value(cache->policy, argv[0], argv[1]);
				+}
			
 
				+
			
 
				+/*
				+ * Call @fn on the cache device and then, if that returned 0, on the
				+ * origin device.  Returns the result of the last call made.
				+ */
				+static int cache_iterate_devices(struct dm_target *ti,
				+				 iterate_devices_callout_fn fn, void *data)
				+{
				+	struct cache *cache = ti->private;
				+	int r;
				+
				+	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
				+	if (r)
				+		return r;
				+
				+	return fn(ti, cache->origin_dev, 0, ti->len, data);
				+}
			
 
				+
			
 
				+/*
				+ * We assume I/O is going to the origin (which is the volume
				+ * more likely to have restrictions e.g. by being striped).
				+ * (Looking up the exact location of the data would be expensive
				+ * and could always be out of date by the time the bio is submitted.)
				+ *
				+ * Returns @max_size if the origin's queue has no merge function,
				+ * otherwise the smaller of @max_size and what that function allows.
				+ */
				+static int cache_bvec_merge(struct dm_target *ti,
				+			    struct bvec_merge_data *bvm,
				+			    struct bio_vec *biovec, int max_size)
				+{
				+	struct cache *cache = ti->private;
				+	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
				+	int origin_limit;
				+
				+	if (!q->merge_bvec_fn)
				+		return max_size;
				+
				+	bvm->bi_bdev = cache->origin_dev->bdev;
				+	origin_limit = q->merge_bvec_fn(q, bvm, biovec);
				+
				+	return min(max_size, origin_limit);
				+}
			
 
				+
			
 
				+/*
				+ * Advertise discard limits derived from the discard block size: the
				+ * granularity is one discard block (in bytes) and a single discard may
				+ * cover at most 1024 discard blocks (in sectors).
				+ */
				+static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
				+{
				+	/*
				+	 * FIXME: these limits may be incompatible with the cache device
				+	 */
				+	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
				+	limits->max_discard_sectors = cache->discard_block_size * 1024;
				+}
			
 
				+
			
 
				+/*
				+ * Set the queue limits for the target: minimum io size 0, optimal io
				+ * size one cache block (in bytes), plus the discard limits.
				+ */
				+static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
				+{
				+	struct cache *c = ti->private;
				+
				+	blk_limits_io_min(limits, 0);
				+	blk_limits_io_opt(limits, c->sectors_per_block << SECTOR_SHIFT);
				+
				+	set_discard_limits(c, limits);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/* Device-mapper target descriptor wiring up the "cache" target's hooks. */
				+static struct target_type cache_target = {
				+	.name            = "cache",
				+	.version         = {1, 0, 0},
				+	.module          = THIS_MODULE,
				+	.ctr             = cache_ctr,
				+	.dtr             = cache_dtr,
				+	.map             = cache_map,
				+	.end_io          = cache_end_io,
				+	.postsuspend     = cache_postsuspend,
				+	.preresume       = cache_preresume,
				+	.resume          = cache_resume,
				+	.status          = cache_status,
				+	.message         = cache_message,
				+	.iterate_devices = cache_iterate_devices,
				+	.merge           = cache_bvec_merge,
				+	.io_hints        = cache_io_hints,
				+};
			
 
				+
			
 
				+/*
				+ * Module init: create the slab cache used for migrations, then register
				+ * the target.
				+ *
				+ * The slab cache is created first because the target's constructor uses
				+ * migration_cache to build its mempool; once the target is registered a
				+ * table load may call the constructor at any time.
				+ *
				+ * Returns 0 on success, -ENOMEM if the slab cache could not be created,
				+ * or the error from dm_register_target().
				+ */
				+static int __init dm_cache_init(void)
				+{
				+	int r;
				+
				+	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
				+	if (!migration_cache)
				+		return -ENOMEM;
				+
				+	r = dm_register_target(&cache_target);
				+	if (r) {
				+		DMERR("cache target registration failed: %d", r);
				+		kmem_cache_destroy(migration_cache);
				+		return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Module exit: unregister the target first, then free the migration slab
				+ * cache.
				+ */
				+static void __exit dm_cache_exit(void)
				+{
				+	dm_unregister_target(&cache_target);
				+
				+	kmem_cache_destroy(migration_cache);
				+}
			
 
				+
			
 
				+module_init(dm_cache_init);
			
 
				+module_exit(dm_cache_exit);
			
 
				+
			
 
				+MODULE_DESCRIPTION(DM_NAME " cache target");
			
 
				+MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
			
 
				+MODULE_LICENSE("GPL");
			
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1234,20 +1234,6 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Encode key into its hex representation
			
 
				- */
			
 
				-static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
			
 
				-{
			
 
				-	unsigned int i;
			
 
				-
			
 
				-	for (i = 0; i < size; i++) {
			
 
				-		sprintf(hex, "%02x", *key);
			
 
				-		hex += 2;
			
 
				-		key++;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 static void crypt_free_tfms(struct crypt_config *cc)
			
 
				 {
			
 
				 	unsigned i;
			
@@ -1651,7 +1637,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 
			
 
				 		if (opt_params == 1 && opt_string &&
			
 
				 		    !strcasecmp(opt_string, "allow_discards"))
			
 
				-			ti->num_discard_requests = 1;
			
 
				+			ti->num_discard_bios = 1;
			
 
				 		else if (opt_params) {
			
 
				 			ret = -EINVAL;
			
 
				 			ti->error = "Invalid feature arguments";
			
@@ -1679,7 +1665,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				 	ti->discard_zeroes_data_unsupported = true;
			
 
				 
			
 
				 	return 0;
			
@@ -1717,11 +1703,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 
				 	return DM_MAPIO_SUBMITTED;
			
 
				 }
			
 
				 
			
 
				-static int crypt_status(struct dm_target *ti, status_type_t type,
			
 
				-			unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void crypt_status(struct dm_target *ti, status_type_t type,
			
 
				+			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct crypt_config *cc = ti->private;
			
 
				-	unsigned int sz = 0;
			
 
				+	unsigned i, sz = 0;
			
 
				 
			
 
				 	switch (type) {
			
 
				 	case STATUSTYPE_INFO:
			
@@ -1731,27 +1717,20 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 
				 	case STATUSTYPE_TABLE:
			
 
				 		DMEMIT("%s ", cc->cipher_string);
			
 
				 
			
 
				-		if (cc->key_size > 0) {
			
 
				-			if ((maxlen - sz) < ((cc->key_size << 1) + 1))
			
 
				-				return -ENOMEM;
			
 
				-
			
 
				-			crypt_encode_key(result + sz, cc->key, cc->key_size);
			
 
				-			sz += cc->key_size << 1;
			
 
				-		} else {
			
 
				-			if (sz >= maxlen)
			
 
				-				return -ENOMEM;
			
 
				-			result[sz++] = '-';
			
 
				-		}
			
 
				+		if (cc->key_size > 0)
			
 
				+			for (i = 0; i < cc->key_size; i++)
			
 
				+				DMEMIT("%02x", cc->key[i]);
			
 
				+		else
			
 
				+			DMEMIT("-");
			
 
				 
			
 
				 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
			
 
				 				cc->dev->name, (unsigned long long)cc->start);
			
 
				 
			
 
				-		if (ti->num_discard_requests)
			
 
				+		if (ti->num_discard_bios)
			
 
				 			DMEMIT(" 1 allow_discards");
			
 
				 
			
 
				 		break;
			
 
				 	}
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static void crypt_postsuspend(struct dm_target *ti)
			
@@ -1845,7 +1824,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
				 
			
 
				 static struct target_type crypt_target = {
			
 
				 	.name   = "crypt",
			
 
				-	.version = {1, 12, 0},
			
 
				+	.version = {1, 12, 1},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = crypt_ctr,
			
 
				 	.dtr    = crypt_dtr,
			
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -198,8 +198,8 @@ out:
 
				 	mutex_init(&dc->timer_lock);
			
 
				 	atomic_set(&dc->may_delay, 1);
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				-	ti->num_discard_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				+	ti->num_discard_bios = 1;
			
 
				 	ti->private = dc;
			
 
				 	return 0;
			
 
				 
			
@@ -293,8 +293,8 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
 
				 	return delay_bio(dc, dc->read_delay, bio);
			
 
				 }
			
 
				 
			
 
				-static int delay_status(struct dm_target *ti, status_type_t type,
			
 
				-			unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void delay_status(struct dm_target *ti, status_type_t type,
			
 
				+			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct delay_c *dc = ti->private;
			
 
				 	int sz = 0;
			
@@ -314,8 +314,6 @@ static int delay_status(struct dm_target *ti, status_type_t type,
 
				 			       dc->write_delay);
			
 
				 		break;
			
 
				 	}
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int delay_iterate_devices(struct dm_target *ti,
			
@@ -337,7 +335,7 @@ out:
 
				 
			
 
				 static struct target_type delay_target = {
			
 
				 	.name	     = "delay",
			
 
				-	.version     = {1, 2, 0},
			
 
				+	.version     = {1, 2, 1},
			
 
				 	.module      = THIS_MODULE,
			
 
				 	.ctr	     = delay_ctr,
			
 
				 	.dtr	     = delay_dtr,
			
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -216,8 +216,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				-	ti->num_discard_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				+	ti->num_discard_bios = 1;
			
 
				 	ti->per_bio_data_size = sizeof(struct per_bio_data);
			
 
				 	ti->private = fc;
			
 
				 	return 0;
			
@@ -337,8 +337,8 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
 
				 	return error;
			
 
				 }
			
 
				 
			
 
				-static int flakey_status(struct dm_target *ti, status_type_t type,
			
 
				-			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void flakey_status(struct dm_target *ti, status_type_t type,
			
 
				+			  unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	unsigned sz = 0;
			
 
				 	struct flakey_c *fc = ti->private;
			
@@ -368,7 +368,6 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
 
				 
			
 
				 		break;
			
 
				 	}
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
			
@@ -411,7 +410,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 
				 
			
 
				 static struct target_type flakey_target = {
			
 
				 	.name   = "flakey",
			
 
				-	.version = {1, 3, 0},
			
 
				+	.version = {1, 3, 1},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = flakey_ctr,
			
 
				 	.dtr    = flakey_dtr,
			
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1067,6 +1067,7 @@ static void retrieve_status(struct dm_table *table,
 
				 	num_targets = dm_table_get_num_targets(table);
			
 
				 	for (i = 0; i < num_targets; i++) {
			
 
				 		struct dm_target *ti = dm_table_get_target(table, i);
			
 
				+		size_t l;
			
 
				 
			
 
				 		remaining = len - (outptr - outbuf);
			
 
				 		if (remaining <= sizeof(struct dm_target_spec)) {
			
@@ -1093,14 +1094,17 @@ static void retrieve_status(struct dm_table *table,
 
				 		if (ti->type->status) {
			
 
				 			if (param->flags & DM_NOFLUSH_FLAG)
			
 
				 				status_flags |= DM_STATUS_NOFLUSH_FLAG;
			
 
				-			if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
			
 
				-				param->flags |= DM_BUFFER_FULL_FLAG;
			
 
				-				break;
			
 
				-			}
			
 
				+			ti->type->status(ti, type, status_flags, outptr, remaining);
			
 
				 		} else
			
 
				 			outptr[0] = '\0';
			
 
				 
			
 
				-		outptr += strlen(outptr) + 1;
			
 
				+		l = strlen(outptr) + 1;
			
 
				+		if (l == remaining) {
			
 
				+			param->flags |= DM_BUFFER_FULL_FLAG;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		outptr += l;
			
 
				 		used = param->data_start + (outptr - outbuf);
			
 
				 
			
 
				 		outptr = align_ptr(outptr);
			
@@ -1410,6 +1414,22 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static bool buffer_test_overflow(char *result, unsigned maxlen)
			
 
				+{
			
 
				+	return !maxlen || strlen(result) + 1 >= maxlen;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Process device-mapper dependent messages.
			
 
				+ * Returns a number <= 1 if message was processed by device mapper.
			
 
				+ * Returns 2 if message should be delivered to the target.
			
 
				+ */
			
 
				+static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
			
 
				+			  char *result, unsigned maxlen)
			
 
				+{
			
 
				+	return 2;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Pass a message to the target that's at the supplied device offset.
			
 
				  */
			
@@ -1421,6 +1441,8 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 
				 	struct dm_table *table;
			
 
				 	struct dm_target *ti;
			
 
				 	struct dm_target_msg *tmsg = (void *) param + param->data_start;
			
 
				+	size_t maxlen;
			
 
				+	char *result = get_result_buffer(param, param_size, &maxlen);
			
 
				 
			
 
				 	md = find_device(param);
			
 
				 	if (!md)
			
@@ -1444,6 +1466,10 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 
				 		goto out_argv;
			
 
				 	}
			
 
				 
			
 
				+	r = message_for_md(md, argc, argv, result, maxlen);
			
 
				+	if (r <= 1)
			
 
				+		goto out_argv;
			
 
				+
			
 
				 	table = dm_get_live_table(md);
			
 
				 	if (!table)
			
 
				 		goto out_argv;
			
@@ -1469,44 +1495,68 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 
				  out_argv:
			
 
				 	kfree(argv);
			
 
				  out:
			
 
				-	param->data_size = 0;
			
 
				+	if (r >= 0)
			
 
				+		__dev_status(md, param);
			
 
				+
			
 
				+	if (r == 1) {
			
 
				+		param->flags |= DM_DATA_OUT_FLAG;
			
 
				+		if (buffer_test_overflow(result, maxlen))
			
 
				+			param->flags |= DM_BUFFER_FULL_FLAG;
			
 
				+		else
			
 
				+			param->data_size = param->data_start + strlen(result) + 1;
			
 
				+		r = 0;
			
 
				+	}
			
 
				+
			
 
				 	dm_put(md);
			
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * The ioctl parameter block consists of two parts, a dm_ioctl struct
			
 
				+ * followed by a data buffer.  This flag is set if the second part,
			
 
				+ * which has a variable size, is not used by the function processing
			
 
				+ * the ioctl.
			
 
				+ */
			
 
				+#define IOCTL_FLAGS_NO_PARAMS	1
			
 
				+
			
 
				 /*-----------------------------------------------------------------
			
 
				  * Implementation of open/close/ioctl on the special char
			
 
				  * device.
			
 
				  *---------------------------------------------------------------*/
			
 
				-static ioctl_fn lookup_ioctl(unsigned int cmd)
			
 
				+static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
			
 
				 {
			
 
				 	static struct {
			
 
				 		int cmd;
			
 
				+		int flags;
			
 
				 		ioctl_fn fn;
			
 
				 	} _ioctls[] = {
			
 
				-		{DM_VERSION_CMD, NULL},	/* version is dealt with elsewhere */
			
 
				-		{DM_REMOVE_ALL_CMD, remove_all},
			
 
				-		{DM_LIST_DEVICES_CMD, list_devices},
			
 
				-
			
 
				-		{DM_DEV_CREATE_CMD, dev_create},
			
 
				-		{DM_DEV_REMOVE_CMD, dev_remove},
			
 
				-		{DM_DEV_RENAME_CMD, dev_rename},
			
 
				-		{DM_DEV_SUSPEND_CMD, dev_suspend},
			
 
				-		{DM_DEV_STATUS_CMD, dev_status},
			
 
				-		{DM_DEV_WAIT_CMD, dev_wait},
			
 
				-
			
 
				-		{DM_TABLE_LOAD_CMD, table_load},
			
 
				-		{DM_TABLE_CLEAR_CMD, table_clear},
			
 
				-		{DM_TABLE_DEPS_CMD, table_deps},
			
 
				-		{DM_TABLE_STATUS_CMD, table_status},
			
 
				-
			
 
				-		{DM_LIST_VERSIONS_CMD, list_versions},
			
 
				-
			
 
				-		{DM_TARGET_MSG_CMD, target_message},
			
 
				-		{DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry}
			
 
				+		{DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */
			
 
				+		{DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all},
			
 
				+		{DM_LIST_DEVICES_CMD, 0, list_devices},
			
 
				+
			
 
				+		{DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create},
			
 
				+		{DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove},
			
 
				+		{DM_DEV_RENAME_CMD, 0, dev_rename},
			
 
				+		{DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend},
			
 
				+		{DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status},
			
 
				+		{DM_DEV_WAIT_CMD, 0, dev_wait},
			
 
				+
			
 
				+		{DM_TABLE_LOAD_CMD, 0, table_load},
			
 
				+		{DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear},
			
 
				+		{DM_TABLE_DEPS_CMD, 0, table_deps},
			
 
				+		{DM_TABLE_STATUS_CMD, 0, table_status},
			
 
				+
			
 
				+		{DM_LIST_VERSIONS_CMD, 0, list_versions},
			
 
				+
			
 
				+		{DM_TARGET_MSG_CMD, 0, target_message},
			
 
				+		{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
			
 
				 	};
			
 
				 
			
 
				-	return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
			
 
				+	if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
			
 
				+		return NULL;
			
 
				+
			
 
				+	*ioctl_flags = _ioctls[cmd].flags;
			
 
				+	return _ioctls[cmd].fn;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1543,7 +1593,8 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				-#define DM_PARAMS_VMALLOC	0x0001	/* Params alloced with vmalloc not kmalloc */
			
 
				+#define DM_PARAMS_KMALLOC	0x0001	/* Params alloced with kmalloc */
			
 
				+#define DM_PARAMS_VMALLOC	0x0002	/* Params alloced with vmalloc */
			
 
				 #define DM_WIPE_BUFFER		0x0010	/* Wipe input buffer before returning from ioctl */
			
 
				 
			
 
				 static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
			
@@ -1551,66 +1602,80 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla
 
				 	if (param_flags & DM_WIPE_BUFFER)
			
 
				 		memset(param, 0, param_size);
			
 
				 
			
 
				+	if (param_flags & DM_PARAMS_KMALLOC)
			
 
				+		kfree(param);
			
 
				 	if (param_flags & DM_PARAMS_VMALLOC)
			
 
				 		vfree(param);
			
 
				-	else
			
 
				-		kfree(param);
			
 
				 }
			
 
				 
			
 
				-static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags)
			
 
				+static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
			
 
				+		       int ioctl_flags,
			
 
				+		       struct dm_ioctl **param, int *param_flags)
			
 
				 {
			
 
				-	struct dm_ioctl tmp, *dmi;
			
 
				+	struct dm_ioctl *dmi;
			
 
				 	int secure_data;
			
 
				+	const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data);
			
 
				 
			
 
				-	if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
			
 
				+	if (copy_from_user(param_kernel, user, minimum_data_size))
			
 
				 		return -EFAULT;
			
 
				 
			
 
				-	if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
			
 
				+	if (param_kernel->data_size < minimum_data_size)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				-	secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
			
 
				+	secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG;
			
 
				 
			
 
				 	*param_flags = secure_data ? DM_WIPE_BUFFER : 0;
			
 
				 
			
 
				+	if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) {
			
 
				+		dmi = param_kernel;
			
 
				+		dmi->data_size = minimum_data_size;
			
 
				+		goto data_copied;
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	 * Try to avoid low memory issues when a device is suspended.
			
 
				 	 * Use kmalloc() rather than vmalloc() when we can.
			
 
				 	 */
			
 
				 	dmi = NULL;
			
 
				-	if (tmp.data_size <= KMALLOC_MAX_SIZE)
			
 
				-		dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			
 
				+	if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
			
 
				+		dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			
 
				+		if (dmi)
			
 
				+			*param_flags |= DM_PARAMS_KMALLOC;
			
 
				+	}
			
 
				 
			
 
				 	if (!dmi) {
			
 
				-		dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
			
 
				-		*param_flags |= DM_PARAMS_VMALLOC;
			
 
				+		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
			
 
				+		if (dmi)
			
 
				+			*param_flags |= DM_PARAMS_VMALLOC;
			
 
				 	}
			
 
				 
			
 
				 	if (!dmi) {
			
 
				-		if (secure_data && clear_user(user, tmp.data_size))
			
 
				+		if (secure_data && clear_user(user, param_kernel->data_size))
			
 
				 			return -EFAULT;
			
 
				 		return -ENOMEM;
			
 
				 	}
			
 
				 
			
 
				-	if (copy_from_user(dmi, user, tmp.data_size))
			
 
				+	if (copy_from_user(dmi, user, param_kernel->data_size))
			
 
				 		goto bad;
			
 
				 
			
 
				+data_copied:
			
 
				 	/*
			
 
				 	 * Abort if something changed the ioctl data while it was being copied.
			
 
				 	 */
			
 
				-	if (dmi->data_size != tmp.data_size) {
			
 
				+	if (dmi->data_size != param_kernel->data_size) {
			
 
				 		DMERR("rejecting ioctl: data size modified while processing parameters");
			
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				 	/* Wipe the user buffer so we do not return it to userspace */
			
 
				-	if (secure_data && clear_user(user, tmp.data_size))
			
 
				+	if (secure_data && clear_user(user, param_kernel->data_size))
			
 
				 		goto bad;
			
 
				 
			
 
				 	*param = dmi;
			
 
				 	return 0;
			
 
				 
			
 
				 bad:
			
 
				-	free_params(dmi, tmp.data_size, *param_flags);
			
 
				+	free_params(dmi, param_kernel->data_size, *param_flags);
			
 
				 
			
 
				 	return -EFAULT;
			
 
				 }
			
@@ -1621,6 +1686,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 
				 	param->flags &= ~DM_BUFFER_FULL_FLAG;
			
 
				 	param->flags &= ~DM_UEVENT_GENERATED_FLAG;
			
 
				 	param->flags &= ~DM_SECURE_DATA_FLAG;
			
 
				+	param->flags &= ~DM_DATA_OUT_FLAG;
			
 
				 
			
 
				 	/* Ignores parameters */
			
 
				 	if (cmd == DM_REMOVE_ALL_CMD ||
			
@@ -1648,11 +1714,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 
				 static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
			
 
				 {
			
 
				 	int r = 0;
			
 
				+	int ioctl_flags;
			
 
				 	int param_flags;
			
 
				 	unsigned int cmd;
			
 
				 	struct dm_ioctl *uninitialized_var(param);
			
 
				 	ioctl_fn fn = NULL;
			
 
				 	size_t input_param_size;
			
 
				+	struct dm_ioctl param_kernel;
			
 
				 
			
 
				 	/* only root can play with this */
			
 
				 	if (!capable(CAP_SYS_ADMIN))
			
@@ -1677,7 +1745,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 
				 	if (cmd == DM_VERSION_CMD)
			
 
				 		return 0;
			
 
				 
			
 
				-	fn = lookup_ioctl(cmd);
			
 
				+	fn = lookup_ioctl(cmd, &ioctl_flags);
			
 
				 	if (!fn) {
			
 
				 		DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
			
 
				 		return -ENOTTY;
			
@@ -1686,7 +1754,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 
				 	/*
			
 
				 	 * Copy the parameters into kernel space.
			
 
				 	 */
			
 
				-	r = copy_params(user, &param, &param_flags);
			
 
				+	r = copy_params(user, &param_kernel, ioctl_flags, &param, &param_flags);
			
 
				 
			
 
				 	if (r)
			
 
				 		return r;
			
@@ -1699,6 +1767,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 
				 	param->data_size = sizeof(*param);
			
 
				 	r = fn(param, input_param_size);
			
 
				 
			
 
				+	if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
			
 
				+	    unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
			
 
				+		DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);
			
 
				+
			
 
				 	/*
			
 
				 	 * Copy the results back to userland.
			
 
				 	 */
			
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
 
				 #include <linux/vmalloc.h>
			
 
				 #include <linux/workqueue.h>
			
 
				 #include <linux/mutex.h>
			
 
				+#include <linux/delay.h>
			
 
				 #include <linux/device-mapper.h>
			
 
				 #include <linux/dm-kcopyd.h>
			
 
				 
			
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
 
				 	struct workqueue_struct *kcopyd_wq;
			
 
				 	struct work_struct kcopyd_work;
			
 
				 
			
 
				+	struct dm_kcopyd_throttle *throttle;
			
 
				+
			
 
				 /*
			
 
				  * We maintain three lists of jobs:
			
 
				  *
			
@@ -68,6 +71,117 @@ struct dm_kcopyd_client {
 
				 
			
 
				 static struct page_list zero_page_list;
			
 
				 
			
 
				+static DEFINE_SPINLOCK(throttle_spinlock);
			
 
				+
			
 
				+/*
			
 
				+ * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
			
 
				+ * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
			
 
				+ * by 2.
			
 
				+ */
			
 
				+#define ACCOUNT_INTERVAL_SHIFT		SHIFT_HZ
			
 
				+
			
 
				+/*
			
 
				+ * Sleep this number of milliseconds.
			
 
				+ *
			
 
				+ * The value was decided experimentally.
			
 
				+ * Smaller values seem to cause an increased copy rate above the limit.
			
 
				+ * The reason for this is unknown but possibly due to jiffies rounding errors
			
 
				+ * or read/write cache inside the disk.
			
 
				+ */
			
 
				+#define SLEEP_MSEC			100
			
 
				+
			
 
				+/*
			
 
				+ * Maximum number of sleep events. There is a theoretical livelock if more
			
 
				+ * kcopyd clients do work simultaneously which this limit avoids.
			
 
				+ */
			
 
				+#define MAX_SLEEPS			10
			
 
				+
			
 
				+static void io_job_start(struct dm_kcopyd_throttle *t)
			
 
				+{
			
 
				+	unsigned throttle, now, difference;
			
 
				+	int slept = 0, skew;
			
 
				+
			
 
				+	if (unlikely(!t))
			
 
				+		return;
			
 
				+
			
 
				+try_again:
			
 
				+	spin_lock_irq(&throttle_spinlock);
			
 
				+
			
 
				+	throttle = ACCESS_ONCE(t->throttle);
			
 
				+
			
 
				+	if (likely(throttle >= 100))
			
 
				+		goto skip_limit;
			
 
				+
			
 
				+	now = jiffies;
			
 
				+	difference = now - t->last_jiffies;
			
 
				+	t->last_jiffies = now;
			
 
				+	if (t->num_io_jobs)
			
 
				+		t->io_period += difference;
			
 
				+	t->total_period += difference;
			
 
				+
			
 
				+	/*
			
 
				+	 * Maintain sane values if we got a temporary overflow.
			
 
				+	 */
			
 
				+	if (unlikely(t->io_period > t->total_period))
			
 
				+		t->io_period = t->total_period;
			
 
				+
			
 
				+	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
			
 
				+		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
			
 
				+		t->total_period >>= shift;
			
 
				+		t->io_period >>= shift;
			
 
				+	}
			
 
				+
			
 
				+	skew = t->io_period - throttle * t->total_period / 100;
			
 
				+
			
 
				+	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
			
 
				+		slept++;
			
 
				+		spin_unlock_irq(&throttle_spinlock);
			
 
				+		msleep(SLEEP_MSEC);
			
 
				+		goto try_again;
			
 
				+	}
			
 
				+
			
 
				+skip_limit:
			
 
				+	t->num_io_jobs++;
			
 
				+
			
 
				+	spin_unlock_irq(&throttle_spinlock);
			
 
				+}
			
 
				+
			
 
				+static void io_job_finish(struct dm_kcopyd_throttle *t)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	if (unlikely(!t))
			
 
				+		return;
			
 
				+
			
 
				+	spin_lock_irqsave(&throttle_spinlock, flags);
			
 
				+
			
 
				+	t->num_io_jobs--;
			
 
				+
			
 
				+	if (likely(ACCESS_ONCE(t->throttle) >= 100))
			
 
				+		goto skip_limit;
			
 
				+
			
 
				+	if (!t->num_io_jobs) {
			
 
				+		unsigned now, difference;
			
 
				+
			
 
				+		now = jiffies;
			
 
				+		difference = now - t->last_jiffies;
			
 
				+		t->last_jiffies = now;
			
 
				+
			
 
				+		t->io_period += difference;
			
 
				+		t->total_period += difference;
			
 
				+
			
 
				+		/*
			
 
				+		 * Maintain sane values if we got a temporary overflow.
			
 
				+		 */
			
 
				+		if (unlikely(t->io_period > t->total_period))
			
 
				+			t->io_period = t->total_period;
			
 
				+	}
			
 
				+
			
 
				+skip_limit:
			
 
				+	spin_unlock_irqrestore(&throttle_spinlock, flags);
			
 
				+}
			
 
				+
			
 
				+
			
 
				 static void wake(struct dm_kcopyd_client *kc)
			
 
				 {
			
 
				 	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
			
@@ -348,6 +462,8 @@ static void complete_io(unsigned long error, void *context)
 
				 	struct kcopyd_job *job = (struct kcopyd_job *) context;
			
 
				 	struct dm_kcopyd_client *kc = job->kc;
			
 
				 
			
 
				+	io_job_finish(kc->throttle);
			
 
				+
			
 
				 	if (error) {
			
 
				 		if (job->rw & WRITE)
			
 
				 			job->write_err |= error;
			
@@ -389,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job)
 
				 		.client = job->kc->io_client,
			
 
				 	};
			
 
				 
			
 
				+	io_job_start(job->kc->throttle);
			
 
				+
			
 
				 	if (job->rw == READ)
			
 
				 		r = dm_io(&io_req, 1, &job->source, NULL);
			
 
				 	else
			
@@ -695,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 
				 /*-----------------------------------------------------------------
			
 
				  * Client setup
			
 
				  *---------------------------------------------------------------*/
			
 
				-struct dm_kcopyd_client *dm_kcopyd_client_create(void)
			
 
				+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
			
 
				 {
			
 
				 	int r = -ENOMEM;
			
 
				 	struct dm_kcopyd_client *kc;
			
@@ -708,6 +826,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void)
 
				 	INIT_LIST_HEAD(&kc->complete_jobs);
			
 
				 	INIT_LIST_HEAD(&kc->io_jobs);
			
 
				 	INIT_LIST_HEAD(&kc->pages_jobs);
			
 
				+	kc->throttle = throttle;
			
 
				 
			
 
				 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
			
 
				 	if (!kc->job_pool)
			
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,9 +53,9 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				-	ti->num_discard_requests = 1;
			
 
				-	ti->num_write_same_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				+	ti->num_discard_bios = 1;
			
 
				+	ti->num_write_same_bios = 1;
			
 
				 	ti->private = lc;
			
 
				 	return 0;
			
 
				 
			
@@ -95,8 +95,8 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
 
				 	return DM_MAPIO_REMAPPED;
			
 
				 }
			
 
				 
			
 
				-static int linear_status(struct dm_target *ti, status_type_t type,
			
 
				-			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void linear_status(struct dm_target *ti, status_type_t type,
			
 
				+			  unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct linear_c *lc = (struct linear_c *) ti->private;
			
 
				 
			
@@ -110,7 +110,6 @@ static int linear_status(struct dm_target *ti, status_type_t type,
 
				 				(unsigned long long)lc->start);
			
 
				 		break;
			
 
				 	}
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
			
@@ -155,7 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 
				 
			
 
				 static struct target_type linear_target = {
			
 
				 	.name   = "linear",
			
 
				-	.version = {1, 2, 0},
			
 
				+	.version = {1, 2, 1},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = linear_ctr,
			
 
				 	.dtr    = linear_dtr,
			
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -905,8 +905,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 
				 		goto bad;
			
 
				 	}
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				-	ti->num_discard_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				+	ti->num_discard_bios = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 
			
@@ -1378,8 +1378,8 @@ static void multipath_resume(struct dm_target *ti)
 
				  *     [priority selector-name num_ps_args [ps_args]*
			
 
				  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
			
 
				  */
			
 
				-static int multipath_status(struct dm_target *ti, status_type_t type,
			
 
				-			    unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void multipath_status(struct dm_target *ti, status_type_t type,
			
 
				+			     unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	int sz = 0;
			
 
				 	unsigned long flags;
			
@@ -1485,8 +1485,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 
				 	}
			
 
				 
			
 
				 	spin_unlock_irqrestore(&m->lock, flags);
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
			
@@ -1695,7 +1693,7 @@ out:
 
				  *---------------------------------------------------------------*/
			
 
				 static struct target_type multipath_target = {
			
 
				 	.name = "multipath",
			
 
				-	.version = {1, 5, 0},
			
 
				+	.version = {1, 5, 1},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr = multipath_ctr,
			
 
				 	.dtr = multipath_dtr,
			
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1151,7 +1151,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
				 
			
 
				 	INIT_WORK(&rs->md.event_work, do_table_event);
			
 
				 	ti->private = rs;
			
 
				-	ti->num_flush_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				 
			
 
				 	mutex_lock(&rs->md.reconfig_mutex);
			
 
				 	ret = md_run(&rs->md);
			
@@ -1201,8 +1201,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 
				 	return DM_MAPIO_SUBMITTED;
			
 
				 }
			
 
				 
			
 
				-static int raid_status(struct dm_target *ti, status_type_t type,
			
 
				-		       unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void raid_status(struct dm_target *ti, status_type_t type,
			
 
				+			unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct raid_set *rs = ti->private;
			
 
				 	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
			
@@ -1344,8 +1344,6 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 
				 				DMEMIT(" -");
			
 
				 		}
			
 
				 	}
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
			
@@ -1405,7 +1403,7 @@ static void raid_resume(struct dm_target *ti)
 
				 
			
 
				 static struct target_type raid_target = {
			
 
				 	.name = "raid",
			
 
				-	.version = {1, 4, 1},
			
 
				+	.version = {1, 4, 2},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr = raid_ctr,
			
 
				 	.dtr = raid_dtr,
			
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -82,6 +82,9 @@ struct mirror_set {
 
				 	struct mirror mirror[0];
			
 
				 };
			
 
				 
			
 
				+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
			
 
				+		"A percentage of time allocated for raid resynchronization");
			
 
				+
			
 
				 static void wakeup_mirrord(void *context)
			
 
				 {
			
 
				 	struct mirror_set *ms = context;
			
@@ -1072,8 +1075,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	if (r)
			
 
				 		goto err_free_context;
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				-	ti->num_discard_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				+	ti->num_discard_bios = 1;
			
 
				 	ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
			
 
				 	ti->discard_zeroes_data_unsupported = true;
			
 
				 
			
@@ -1111,7 +1114,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto err_destroy_wq;
			
 
				 	}
			
 
				 
			
 
				-	ms->kcopyd_client = dm_kcopyd_client_create();
			
 
				+	ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
			
 
				 	if (IS_ERR(ms->kcopyd_client)) {
			
 
				 		r = PTR_ERR(ms->kcopyd_client);
			
 
				 		goto err_destroy_wq;
			
@@ -1347,8 +1350,8 @@ static char device_status_char(struct mirror *m)
 
				 }
			
 
				 
			
 
				 
			
 
				-static int mirror_status(struct dm_target *ti, status_type_t type,
			
 
				-			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void mirror_status(struct dm_target *ti, status_type_t type,
			
 
				+			  unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	unsigned int m, sz = 0;
			
 
				 	struct mirror_set *ms = (struct mirror_set *) ti->private;
			
@@ -1383,8 +1386,6 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 
				 		if (ms->features & DM_RAID1_HANDLE_ERRORS)
			
 
				 			DMEMIT(" 1 handle_errors");
			
 
				 	}
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int mirror_iterate_devices(struct dm_target *ti,
			
@@ -1403,7 +1404,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
				 
			
 
				 static struct target_type mirror_target = {
			
 
				 	.name	 = "mirror",
			
 
				-	.version = {1, 13, 1},
			
 
				+	.version = {1, 13, 2},
			
 
				 	.module	 = THIS_MODULE,
			
 
				 	.ctr	 = mirror_ctr,
			
 
				 	.dtr	 = mirror_dtr,
			
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -124,6 +124,9 @@ struct dm_snapshot {
 
				 #define RUNNING_MERGE          0
			
 
				 #define SHUTDOWN_MERGE         1
			
 
				 
			
 
				+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
			
 
				+		"A percentage of time allocated for copy on write");
			
 
				+
			
 
				 struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
			
 
				 {
			
 
				 	return s->origin;
			
@@ -1037,7 +1040,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	int i;
			
 
				 	int r = -EINVAL;
			
 
				 	char *origin_path, *cow_path;
			
 
				-	unsigned args_used, num_flush_requests = 1;
			
 
				+	unsigned args_used, num_flush_bios = 1;
			
 
				 	fmode_t origin_mode = FMODE_READ;
			
 
				 
			
 
				 	if (argc != 4) {
			
@@ -1047,7 +1050,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	if (dm_target_is_snapshot_merge(ti)) {
			
 
				-		num_flush_requests = 2;
			
 
				+		num_flush_bios = 2;
			
 
				 		origin_mode = FMODE_WRITE;
			
 
				 	}
			
 
				 
			
@@ -1108,7 +1111,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 		goto bad_hash_tables;
			
 
				 	}
			
 
				 
			
 
				-	s->kcopyd_client = dm_kcopyd_client_create();
			
 
				+	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
			
 
				 	if (IS_ERR(s->kcopyd_client)) {
			
 
				 		r = PTR_ERR(s->kcopyd_client);
			
 
				 		ti->error = "Could not create kcopyd client";
			
@@ -1127,7 +1130,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	spin_lock_init(&s->tracked_chunk_lock);
			
 
				 
			
 
				 	ti->private = s;
			
 
				-	ti->num_flush_requests = num_flush_requests;
			
 
				+	ti->num_flush_bios = num_flush_bios;
			
 
				 	ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
			
 
				 
			
 
				 	/* Add snapshot to the list of snapshots for this origin */
			
@@ -1691,7 +1694,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
				 	init_tracked_chunk(bio);
			
 
				 
			
 
				 	if (bio->bi_rw & REQ_FLUSH) {
			
 
				-		if (!dm_bio_get_target_request_nr(bio))
			
 
				+		if (!dm_bio_get_target_bio_nr(bio))
			
 
				 			bio->bi_bdev = s->origin->bdev;
			
 
				 		else
			
 
				 			bio->bi_bdev = s->cow->bdev;
			
@@ -1836,8 +1839,8 @@ static void snapshot_merge_resume(struct dm_target *ti)
 
				 	start_merge(s);
			
 
				 }
			
 
				 
			
 
				-static int snapshot_status(struct dm_target *ti, status_type_t type,
			
 
				-			   unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void snapshot_status(struct dm_target *ti, status_type_t type,
			
 
				+			    unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	unsigned sz = 0;
			
 
				 	struct dm_snapshot *snap = ti->private;
			
@@ -1883,8 +1886,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 
				 					  maxlen - sz);
			
 
				 		break;
			
 
				 	}
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int snapshot_iterate_devices(struct dm_target *ti,
			
@@ -2104,7 +2105,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	ti->private = dev;
			
 
				-	ti->num_flush_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -2138,8 +2139,8 @@ static void origin_resume(struct dm_target *ti)
 
				 	ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
			
 
				 }
			
 
				 
			
 
				-static int origin_status(struct dm_target *ti, status_type_t type,
			
 
				-			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void origin_status(struct dm_target *ti, status_type_t type,
			
 
				+			  unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct dm_dev *dev = ti->private;
			
 
				 
			
@@ -2152,8 +2153,6 @@ static int origin_status(struct dm_target *ti, status_type_t type,
 
				 		snprintf(result, maxlen, "%s", dev->name);
			
 
				 		break;
			
 
				 	}
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
			
@@ -2180,7 +2179,7 @@ static int origin_iterate_devices(struct dm_target *ti,
 
				 
			
 
				 static struct target_type origin_target = {
			
 
				 	.name    = "snapshot-origin",
			
 
				-	.version = {1, 8, 0},
			
 
				+	.version = {1, 8, 1},
			
 
				 	.module  = THIS_MODULE,
			
 
				 	.ctr     = origin_ctr,
			
 
				 	.dtr     = origin_dtr,
			
@@ -2193,7 +2192,7 @@ static struct target_type origin_target = {
 
				 
			
 
				 static struct target_type snapshot_target = {
			
 
				 	.name    = "snapshot",
			
 
				-	.version = {1, 11, 0},
			
 
				+	.version = {1, 11, 1},
			
 
				 	.module  = THIS_MODULE,
			
 
				 	.ctr     = snapshot_ctr,
			
 
				 	.dtr     = snapshot_dtr,
			
@@ -2306,3 +2305,5 @@ module_exit(dm_snapshot_exit);
 
				 MODULE_DESCRIPTION(DM_NAME " snapshot target");
			
 
				 MODULE_AUTHOR("Joe Thornber");
			
 
				 MODULE_LICENSE("GPL");
			
 
				+MODULE_ALIAS("dm-snapshot-origin");
			
 
				+MODULE_ALIAS("dm-snapshot-merge");
			
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -160,9 +160,9 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	if (r)
			
 
				 		return r;
			
 
				 
			
 
				-	ti->num_flush_requests = stripes;
			
 
				-	ti->num_discard_requests = stripes;
			
 
				-	ti->num_write_same_requests = stripes;
			
 
				+	ti->num_flush_bios = stripes;
			
 
				+	ti->num_discard_bios = stripes;
			
 
				+	ti->num_write_same_bios = stripes;
			
 
				 
			
 
				 	sc->chunk_size = chunk_size;
			
 
				 	if (chunk_size & (chunk_size - 1))
			
@@ -276,19 +276,19 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 
				 {
			
 
				 	struct stripe_c *sc = ti->private;
			
 
				 	uint32_t stripe;
			
 
				-	unsigned target_request_nr;
			
 
				+	unsigned target_bio_nr;
			
 
				 
			
 
				 	if (bio->bi_rw & REQ_FLUSH) {
			
 
				-		target_request_nr = dm_bio_get_target_request_nr(bio);
			
 
				-		BUG_ON(target_request_nr >= sc->stripes);
			
 
				-		bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
			
 
				+		target_bio_nr = dm_bio_get_target_bio_nr(bio);
			
 
				+		BUG_ON(target_bio_nr >= sc->stripes);
			
 
				+		bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
			
 
				 		return DM_MAPIO_REMAPPED;
			
 
				 	}
			
 
				 	if (unlikely(bio->bi_rw & REQ_DISCARD) ||
			
 
				 	    unlikely(bio->bi_rw & REQ_WRITE_SAME)) {
			
 
				-		target_request_nr = dm_bio_get_target_request_nr(bio);
			
 
				-		BUG_ON(target_request_nr >= sc->stripes);
			
 
				-		return stripe_map_range(sc, bio, target_request_nr);
			
 
				+		target_bio_nr = dm_bio_get_target_bio_nr(bio);
			
 
				+		BUG_ON(target_bio_nr >= sc->stripes);
			
 
				+		return stripe_map_range(sc, bio, target_bio_nr);
			
 
				 	}
			
 
				 
			
 
				 	stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
			
@@ -312,8 +312,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 
				  *
			
 
				  */
			
 
				 
			
 
				-static int stripe_status(struct dm_target *ti, status_type_t type,
			
 
				-			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void stripe_status(struct dm_target *ti, status_type_t type,
			
 
				+			  unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct stripe_c *sc = (struct stripe_c *) ti->private;
			
 
				 	char buffer[sc->stripes + 1];
			
@@ -340,7 +340,6 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
 
				 			    (unsigned long long)sc->stripe[i].physical_start);
			
 
				 		break;
			
 
				 	}
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
			
@@ -428,7 +427,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 
				 
			
 
				 static struct target_type stripe_target = {
			
 
				 	.name   = "striped",
			
 
				-	.version = {1, 5, 0},
			
 
				+	.version = {1, 5, 1},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr    = stripe_ctr,
			
 
				 	.dtr    = stripe_dtr,
			
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -217,7 +217,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
				 
			
 
				 	if (alloc_targets(t, num_targets)) {
			
 
				 		kfree(t);
			
 
				-		t = NULL;
			
 
				 		return -ENOMEM;
			
 
				 	}
			
 
				 
			
@@ -823,8 +822,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
				 
			
 
				 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
			
 
				 
			
 
				-	if (!tgt->num_discard_requests && tgt->discards_supported)
			
 
				-		DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
			
 
				+	if (!tgt->num_discard_bios && tgt->discards_supported)
			
 
				+		DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
			
 
				 		       dm_device_name(t->md), type);
			
 
				 
			
 
				 	return 0;
			
@@ -1360,7 +1359,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 
				 	while (i < dm_table_get_num_targets(t)) {
			
 
				 		ti = dm_table_get_target(t, i++);
			
 
				 
			
 
				-		if (!ti->num_flush_requests)
			
 
				+		if (!ti->num_flush_bios)
			
 
				 			continue;
			
 
				 
			
 
				 		if (ti->flush_supported)
			
@@ -1439,7 +1438,7 @@ static bool dm_table_supports_write_same(struct dm_table *t)
 
				 	while (i < dm_table_get_num_targets(t)) {
			
 
				 		ti = dm_table_get_target(t, i++);
			
 
				 
			
 
				-		if (!ti->num_write_same_requests)
			
 
				+		if (!ti->num_write_same_bios)
			
 
				 			return false;
			
 
				 
			
 
				 		if (!ti->type->iterate_devices ||
			
@@ -1657,7 +1656,7 @@ bool dm_table_supports_discards(struct dm_table *t)
 
				 	while (i < dm_table_get_num_targets(t)) {
			
 
				 		ti = dm_table_get_target(t, i++);
			
 
				 
			
 
				-		if (!ti->num_discard_requests)
			
 
				+		if (!ti->num_discard_bios)
			
 
				 			continue;
			
 
				 
			
 
				 		if (ti->discards_supported)
			
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -116,7 +116,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
 
				 	/*
			
 
				 	 * Return error for discards instead of -EOPNOTSUPP
			
 
				 	 */
			
 
				-	tt->num_discard_requests = 1;
			
 
				+	tt->num_discard_bios = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -280,7 +280,7 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
 
				 	*t = v & ((1 << 24) - 1);
			
 
				 }
			
 
				 
			
 
				-static void data_block_inc(void *context, void *value_le)
			
 
				+static void data_block_inc(void *context, const void *value_le)
			
 
				 {
			
 
				 	struct dm_space_map *sm = context;
			
 
				 	__le64 v_le;
			
@@ -292,7 +292,7 @@ static void data_block_inc(void *context, void *value_le)
 
				 	dm_sm_inc_block(sm, b);
			
 
				 }
			
 
				 
			
 
				-static void data_block_dec(void *context, void *value_le)
			
 
				+static void data_block_dec(void *context, const void *value_le)
			
 
				 {
			
 
				 	struct dm_space_map *sm = context;
			
 
				 	__le64 v_le;
			
@@ -304,7 +304,7 @@ static void data_block_dec(void *context, void *value_le)
 
				 	dm_sm_dec_block(sm, b);
			
 
				 }
			
 
				 
			
 
				-static int data_block_equal(void *context, void *value1_le, void *value2_le)
			
 
				+static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
			
 
				 {
			
 
				 	__le64 v1_le, v2_le;
			
 
				 	uint64_t b1, b2;
			
@@ -318,7 +318,7 @@ static int data_block_equal(void *context, void *value1_le, void *value2_le)
 
				 	return b1 == b2;
			
 
				 }
			
 
				 
			
 
				-static void subtree_inc(void *context, void *value)
			
 
				+static void subtree_inc(void *context, const void *value)
			
 
				 {
			
 
				 	struct dm_btree_info *info = context;
			
 
				 	__le64 root_le;
			
@@ -329,7 +329,7 @@ static void subtree_inc(void *context, void *value)
 
				 	dm_tm_inc(info->tm, root);
			
 
				 }
			
 
				 
			
 
				-static void subtree_dec(void *context, void *value)
			
 
				+static void subtree_dec(void *context, const void *value)
			
 
				 {
			
 
				 	struct dm_btree_info *info = context;
			
 
				 	__le64 root_le;
			
@@ -341,7 +341,7 @@ static void subtree_dec(void *context, void *value)
 
				 		DMERR("btree delete failed\n");
			
 
				 }
			
 
				 
			
 
				-static int subtree_equal(void *context, void *value1_le, void *value2_le)
			
 
				+static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
			
 
				 {
			
 
				 	__le64 v1_le, v2_le;
			
 
				 	memcpy(&v1_le, value1_le, sizeof(v1_le));
			
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -26,6 +26,9 @@
 
				 #define PRISON_CELLS 1024
			
 
				 #define COMMIT_PERIOD HZ
			
 
				 
			
 
				+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
			
 
				+		"A percentage of time allocated for copy on write");
			
 
				+
			
 
				 /*
			
 
				  * The block size of the device holding pool data must be
			
 
				  * between 64KB and 1GB.
			
@@ -226,6 +229,78 @@ struct thin_c {
 
				 
			
 
				 /*----------------------------------------------------------------*/
			
 
				 
			
 
				+/*
			
 
				+ * wake_worker() is used when new work is queued and when pool_resume is
			
 
				+ * ready to continue deferred IO processing.
			
 
				+ */
			
 
				+static void wake_worker(struct pool *pool)
			
 
				+{
			
 
				+	queue_work(pool->wq, &pool->worker);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
			
 
				+		      struct dm_bio_prison_cell **cell_result)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_bio_prison_cell *cell_prealloc;
			
 
				+
			
 
				+	/*
			
 
				+	 * Allocate a cell from the prison's mempool.
			
 
				+	 * This might block but it can't fail.
			
 
				+	 */
			
 
				+	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
			
 
				+
			
 
				+	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
			
 
				+	if (r)
			
 
				+		/*
			
 
				+		 * We reused an old cell; we can get rid of
			
 
				+		 * the new one.
			
 
				+		 */
			
 
				+		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+static void cell_release(struct pool *pool,
			
 
				+			 struct dm_bio_prison_cell *cell,
			
 
				+			 struct bio_list *bios)
			
 
				+{
			
 
				+	dm_cell_release(pool->prison, cell, bios);
			
 
				+	dm_bio_prison_free_cell(pool->prison, cell);
			
 
				+}
			
 
				+
			
 
				+static void cell_release_no_holder(struct pool *pool,
			
 
				+				   struct dm_bio_prison_cell *cell,
			
 
				+				   struct bio_list *bios)
			
 
				+{
			
 
				+	dm_cell_release_no_holder(pool->prison, cell, bios);
			
 
				+	dm_bio_prison_free_cell(pool->prison, cell);
			
 
				+}
			
 
				+
			
 
				+static void cell_defer_no_holder_no_free(struct thin_c *tc,
			
 
				+					 struct dm_bio_prison_cell *cell)
			
 
				+{
			
 
				+	struct pool *pool = tc->pool;
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	spin_lock_irqsave(&pool->lock, flags);
			
 
				+	dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios);
			
 
				+	spin_unlock_irqrestore(&pool->lock, flags);
			
 
				+
			
 
				+	wake_worker(pool);
			
 
				+}
			
 
				+
			
 
				+static void cell_error(struct pool *pool,
			
 
				+		       struct dm_bio_prison_cell *cell)
			
 
				+{
			
 
				+	dm_cell_error(pool->prison, cell);
			
 
				+	dm_bio_prison_free_cell(pool->prison, cell);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				 /*
			
 
				  * A global list of pools that uses a struct mapped_device as a key.
			
 
				  */
			
@@ -330,14 +405,20 @@ static void requeue_io(struct thin_c *tc)
 
				  * target.
			
 
				  */
			
 
				 
			
 
				+static bool block_size_is_power_of_two(struct pool *pool)
			
 
				+{
			
 
				+	return pool->sectors_per_block_shift >= 0;
			
 
				+}
			
 
				+
			
 
				 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
			
 
				 {
			
 
				+	struct pool *pool = tc->pool;
			
 
				 	sector_t block_nr = bio->bi_sector;
			
 
				 
			
 
				-	if (tc->pool->sectors_per_block_shift < 0)
			
 
				-		(void) sector_div(block_nr, tc->pool->sectors_per_block);
			
 
				+	if (block_size_is_power_of_two(pool))
			
 
				+		block_nr >>= pool->sectors_per_block_shift;
			
 
				 	else
			
 
				-		block_nr >>= tc->pool->sectors_per_block_shift;
			
 
				+		(void) sector_div(block_nr, pool->sectors_per_block);
			
 
				 
			
 
				 	return block_nr;
			
 
				 }
			
@@ -348,12 +429,12 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 
				 	sector_t bi_sector = bio->bi_sector;
			
 
				 
			
 
				 	bio->bi_bdev = tc->pool_dev->bdev;
			
 
				-	if (tc->pool->sectors_per_block_shift < 0)
			
 
				-		bio->bi_sector = (block * pool->sectors_per_block) +
			
 
				-				 sector_div(bi_sector, pool->sectors_per_block);
			
 
				-	else
			
 
				+	if (block_size_is_power_of_two(pool))
			
 
				 		bio->bi_sector = (block << pool->sectors_per_block_shift) |
			
 
				 				(bi_sector & (pool->sectors_per_block - 1));
			
 
				+	else
			
 
				+		bio->bi_sector = (block * pool->sectors_per_block) +
			
 
				+				 sector_div(bi_sector, pool->sectors_per_block);
			
 
				 }
			
 
				 
			
 
				 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
			
@@ -420,15 +501,6 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 
				 	issue(tc, bio);
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * wake_worker() is used when new work is queued and when pool_resume is
			
 
				- * ready to continue deferred IO processing.
			
 
				- */
			
 
				-static void wake_worker(struct pool *pool)
			
 
				-{
			
 
				-	queue_work(pool->wq, &pool->worker);
			
 
				-}
			
 
				-
			
 
				 /*----------------------------------------------------------------*/
			
 
				 
			
 
				 /*
			
@@ -515,14 +587,14 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 
				 	unsigned long flags;
			
 
				 
			
 
				 	spin_lock_irqsave(&pool->lock, flags);
			
 
				-	dm_cell_release(cell, &pool->deferred_bios);
			
 
				+	cell_release(pool, cell, &pool->deferred_bios);
			
 
				 	spin_unlock_irqrestore(&tc->pool->lock, flags);
			
 
				 
			
 
				 	wake_worker(pool);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Same as cell_defer except it omits the original holder of the cell.
			
 
				+ * Same as cell_defer above, except it omits the original holder of the cell.
			
 
				  */
			
 
				 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
			
 
				 {
			
@@ -530,7 +602,7 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
 
				 	unsigned long flags;
			
 
				 
			
 
				 	spin_lock_irqsave(&pool->lock, flags);
			
 
				-	dm_cell_release_no_holder(cell, &pool->deferred_bios);
			
 
				+	cell_release_no_holder(pool, cell, &pool->deferred_bios);
			
 
				 	spin_unlock_irqrestore(&pool->lock, flags);
			
 
				 
			
 
				 	wake_worker(pool);
			
@@ -540,13 +612,15 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 
				 {
			
 
				 	if (m->bio)
			
 
				 		m->bio->bi_end_io = m->saved_bi_end_io;
			
 
				-	dm_cell_error(m->cell);
			
 
				+	cell_error(m->tc->pool, m->cell);
			
 
				 	list_del(&m->list);
			
 
				 	mempool_free(m, m->tc->pool->mapping_pool);
			
 
				 }
			
 
				+
			
 
				 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
			
 
				 {
			
 
				 	struct thin_c *tc = m->tc;
			
 
				+	struct pool *pool = tc->pool;
			
 
				 	struct bio *bio;
			
 
				 	int r;
			
 
				 
			
@@ -555,7 +629,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 
				 		bio->bi_end_io = m->saved_bi_end_io;
			
 
				 
			
 
				 	if (m->err) {
			
 
				-		dm_cell_error(m->cell);
			
 
				+		cell_error(pool, m->cell);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
@@ -567,7 +641,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 
				 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
			
 
				 	if (r) {
			
 
				 		DMERR_LIMIT("dm_thin_insert_block() failed");
			
 
				-		dm_cell_error(m->cell);
			
 
				+		cell_error(pool, m->cell);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
@@ -585,7 +659,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 
				 
			
 
				 out:
			
 
				 	list_del(&m->list);
			
 
				-	mempool_free(m, tc->pool->mapping_pool);
			
 
				+	mempool_free(m, pool->mapping_pool);
			
 
				 }
			
 
				 
			
 
				 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
			
@@ -736,7 +810,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 
				 		if (r < 0) {
			
 
				 			mempool_free(m, pool->mapping_pool);
			
 
				 			DMERR_LIMIT("dm_kcopyd_copy() failed");
			
 
				-			dm_cell_error(cell);
			
 
				+			cell_error(pool, cell);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -802,7 +876,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 
				 		if (r < 0) {
			
 
				 			mempool_free(m, pool->mapping_pool);
			
 
				 			DMERR_LIMIT("dm_kcopyd_zero() failed");
			
 
				-			dm_cell_error(cell);
			
 
				+			cell_error(pool, cell);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -908,13 +982,13 @@ static void retry_on_resume(struct bio *bio)
 
				 	spin_unlock_irqrestore(&pool->lock, flags);
			
 
				 }
			
 
				 
			
 
				-static void no_space(struct dm_bio_prison_cell *cell)
			
 
				+static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
			
 
				 {
			
 
				 	struct bio *bio;
			
 
				 	struct bio_list bios;
			
 
				 
			
 
				 	bio_list_init(&bios);
			
 
				-	dm_cell_release(cell, &bios);
			
 
				+	cell_release(pool, cell, &bios);
			
 
				 
			
 
				 	while ((bio = bio_list_pop(&bios)))
			
 
				 		retry_on_resume(bio);
			
@@ -932,7 +1006,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 
				 	struct dm_thin_new_mapping *m;
			
 
				 
			
 
				 	build_virtual_key(tc->td, block, &key);
			
 
				-	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
			
 
				+	if (bio_detain(tc->pool, &key, bio, &cell))
			
 
				 		return;
			
 
				 
			
 
				 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
			
@@ -944,7 +1018,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 
				 		 * on this block.
			
 
				 		 */
			
 
				 		build_data_key(tc->td, lookup_result.block, &key2);
			
 
				-		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
			
 
				+		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
			
 
				 			cell_defer_no_holder(tc, cell);
			
 
				 			break;
			
 
				 		}
			
@@ -1020,13 +1094,13 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 
				 		break;
			
 
				 
			
 
				 	case -ENOSPC:
			
 
				-		no_space(cell);
			
 
				+		no_space(tc->pool, cell);
			
 
				 		break;
			
 
				 
			
 
				 	default:
			
 
				 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
			
 
				 			    __func__, r);
			
 
				-		dm_cell_error(cell);
			
 
				+		cell_error(tc->pool, cell);
			
 
				 		break;
			
 
				 	}
			
 
				 }
			
@@ -1044,7 +1118,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 
				 	 * of being broken so we have nothing further to do here.
			
 
				 	 */
			
 
				 	build_data_key(tc->td, lookup_result->block, &key);
			
 
				-	if (dm_bio_detain(pool->prison, &key, bio, &cell))
			
 
				+	if (bio_detain(pool, &key, bio, &cell))
			
 
				 		return;
			
 
				 
			
 
				 	if (bio_data_dir(bio) == WRITE && bio->bi_size)
			
@@ -1065,12 +1139,13 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 
				 {
			
 
				 	int r;
			
 
				 	dm_block_t data_block;
			
 
				+	struct pool *pool = tc->pool;
			
 
				 
			
 
				 	/*
			
 
				 	 * Remap empty bios (flushes) immediately, without provisioning.
			
 
				 	 */
			
 
				 	if (!bio->bi_size) {
			
 
				-		inc_all_io_entry(tc->pool, bio);
			
 
				+		inc_all_io_entry(pool, bio);
			
 
				 		cell_defer_no_holder(tc, cell);
			
 
				 
			
 
				 		remap_and_issue(tc, bio, 0);
			
@@ -1097,14 +1172,14 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 
				 		break;
			
 
				 
			
 
				 	case -ENOSPC:
			
 
				-		no_space(cell);
			
 
				+		no_space(pool, cell);
			
 
				 		break;
			
 
				 
			
 
				 	default:
			
 
				 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
			
 
				 			    __func__, r);
			
 
				-		set_pool_mode(tc->pool, PM_READ_ONLY);
			
 
				-		dm_cell_error(cell);
			
 
				+		set_pool_mode(pool, PM_READ_ONLY);
			
 
				+		cell_error(pool, cell);
			
 
				 		break;
			
 
				 	}
			
 
				 }
			
@@ -1112,6 +1187,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 
				 static void process_bio(struct thin_c *tc, struct bio *bio)
			
 
				 {
			
 
				 	int r;
			
 
				+	struct pool *pool = tc->pool;
			
 
				 	dm_block_t block = get_bio_block(tc, bio);
			
 
				 	struct dm_bio_prison_cell *cell;
			
 
				 	struct dm_cell_key key;
			
@@ -1122,7 +1198,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 
				 	 * being provisioned so we have nothing further to do here.
			
 
				 	 */
			
 
				 	build_virtual_key(tc->td, block, &key);
			
 
				-	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
			
 
				+	if (bio_detain(pool, &key, bio, &cell))
			
 
				 		return;
			
 
				 
			
 
				 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
			
@@ -1130,9 +1206,9 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 
				 	case 0:
			
 
				 		if (lookup_result.shared) {
			
 
				 			process_shared_bio(tc, bio, block, &lookup_result);
			
 
				-			cell_defer_no_holder(tc, cell);
			
 
				+			cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
			
 
				 		} else {
			
 
				-			inc_all_io_entry(tc->pool, bio);
			
 
				+			inc_all_io_entry(pool, bio);
			
 
				 			cell_defer_no_holder(tc, cell);
			
 
				 
			
 
				 			remap_and_issue(tc, bio, lookup_result.block);
			
@@ -1141,7 +1217,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 
				 
			
 
				 	case -ENODATA:
			
 
				 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			
 
				-			inc_all_io_entry(tc->pool, bio);
			
 
				+			inc_all_io_entry(pool, bio);
			
 
				 			cell_defer_no_holder(tc, cell);
			
 
				 
			
 
				 			remap_to_origin_and_issue(tc, bio);
			
@@ -1378,7 +1454,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
				 	dm_block_t block = get_bio_block(tc, bio);
			
 
				 	struct dm_thin_device *td = tc->td;
			
 
				 	struct dm_thin_lookup_result result;
			
 
				-	struct dm_bio_prison_cell *cell1, *cell2;
			
 
				+	struct dm_bio_prison_cell cell1, cell2;
			
 
				+	struct dm_bio_prison_cell *cell_result;
			
 
				 	struct dm_cell_key key;
			
 
				 
			
 
				 	thin_hook_bio(tc, bio);
			
@@ -1420,18 +1497,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
				 		}
			
 
				 
			
 
				 		build_virtual_key(tc->td, block, &key);
			
 
				-		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
			
 
				+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
			
 
				 			return DM_MAPIO_SUBMITTED;
			
 
				 
			
 
				 		build_data_key(tc->td, result.block, &key);
			
 
				-		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
			
 
				-			cell_defer_no_holder(tc, cell1);
			
 
				+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
			
 
				+			cell_defer_no_holder_no_free(tc, &cell1);
			
 
				 			return DM_MAPIO_SUBMITTED;
			
 
				 		}
			
 
				 
			
 
				 		inc_all_io_entry(tc->pool, bio);
			
 
				-		cell_defer_no_holder(tc, cell2);
			
 
				-		cell_defer_no_holder(tc, cell1);
			
 
				+		cell_defer_no_holder_no_free(tc, &cell2);
			
 
				+		cell_defer_no_holder_no_free(tc, &cell1);
			
 
				 
			
 
				 		remap(tc, bio, result.block);
			
 
				 		return DM_MAPIO_REMAPPED;
			
@@ -1636,7 +1713,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
				 		goto bad_prison;
			
 
				 	}
			
 
				 
			
 
				-	pool->copier = dm_kcopyd_client_create();
			
 
				+	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
			
 
				 	if (IS_ERR(pool->copier)) {
			
 
				 		r = PTR_ERR(pool->copier);
			
 
				 		*error = "Error creating pool's kcopyd client";
			
@@ -1938,7 +2015,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
				 	pt->data_dev = data_dev;
			
 
				 	pt->low_water_blocks = low_water_blocks;
			
 
				 	pt->adjusted_pf = pt->requested_pf = pf;
			
 
				-	ti->num_flush_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				 
			
 
				 	/*
			
 
				 	 * Only need to enable discards if the pool should pass
			
@@ -1946,7 +2023,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
				 	 * processing will cause mappings to be removed from the btree.
			
 
				 	 */
			
 
				 	if (pf.discard_enabled && pf.discard_passdown) {
			
 
				-		ti->num_discard_requests = 1;
			
 
				+		ti->num_discard_bios = 1;
			
 
				 
			
 
				 		/*
			
 
				 		 * Setting 'discards_supported' circumvents the normal
			
@@ -2299,8 +2376,8 @@ static void emit_flags(struct pool_features *pf, char *result,
 
				  *    <transaction id> <used metadata sectors>/<total metadata sectors>
			
 
				  *    <used data sectors>/<total data sectors> <held metadata root>
			
 
				  */
			
 
				-static int pool_status(struct dm_target *ti, status_type_t type,
			
 
				-		       unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void pool_status(struct dm_target *ti, status_type_t type,
			
 
				+			unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	int r;
			
 
				 	unsigned sz = 0;
			
@@ -2326,32 +2403,41 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 
				 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			
 
				 			(void) commit_or_fallback(pool);
			
 
				 
			
 
				-		r = dm_pool_get_metadata_transaction_id(pool->pmd,
			
 
				-							&transaction_id);
			
 
				-		if (r)
			
 
				-			return r;
			
 
				+		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
			
 
				+		if (r) {
			
 
				+			DMERR("dm_pool_get_metadata_transaction_id returned %d", r);
			
 
				+			goto err;
			
 
				+		}
			
 
				 
			
 
				-		r = dm_pool_get_free_metadata_block_count(pool->pmd,
			
 
				-							  &nr_free_blocks_metadata);
			
 
				-		if (r)
			
 
				-			return r;
			
 
				+		r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
			
 
				+		if (r) {
			
 
				+			DMERR("dm_pool_get_free_metadata_block_count returned %d", r);
			
 
				+			goto err;
			
 
				+		}
			
 
				 
			
 
				 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
			
 
				-		if (r)
			
 
				-			return r;
			
 
				+		if (r) {
			
 
				+			DMERR("dm_pool_get_metadata_dev_size returned %d", r);
			
 
				+			goto err;
			
 
				+		}
			
 
				 
			
 
				-		r = dm_pool_get_free_block_count(pool->pmd,
			
 
				-						 &nr_free_blocks_data);
			
 
				-		if (r)
			
 
				-			return r;
			
 
				+		r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
			
 
				+		if (r) {
			
 
				+			DMERR("dm_pool_get_free_block_count returned %d", r);
			
 
				+			goto err;
			
 
				+		}
			
 
				 
			
 
				 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
			
 
				-		if (r)
			
 
				-			return r;
			
 
				+		if (r) {
			
 
				+			DMERR("dm_pool_get_data_dev_size returned %d", r);
			
 
				+			goto err;
			
 
				+		}
			
 
				 
			
 
				 		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
			
 
				-		if (r)
			
 
				-			return r;
			
 
				+		if (r) {
			
 
				+			DMERR("dm_pool_get_metadata_snap returned %d", r);
			
 
				+			goto err;
			
 
				+		}
			
 
				 
			
 
				 		DMEMIT("%llu %llu/%llu %llu/%llu ",
			
 
				 		       (unsigned long long)transaction_id,
			
@@ -2388,8 +2474,10 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 
				 		emit_flags(&pt->requested_pf, result, sz, maxlen);
			
 
				 		break;
			
 
				 	}
			
 
				+	return;
			
 
				 
			
 
				-	return 0;
			
 
				+err:
			
 
				+	DMEMIT("Error");
			
 
				 }
			
 
				 
			
 
				 static int pool_iterate_devices(struct dm_target *ti,
			
@@ -2414,11 +2502,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 
				 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
			
 
				 }
			
 
				 
			
 
				-static bool block_size_is_power_of_two(struct pool *pool)
			
 
				-{
			
 
				-	return pool->sectors_per_block_shift >= 0;
			
 
				-}
			
 
				-
			
 
				 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
			
 
				 {
			
 
				 	struct pool *pool = pt->pool;
			
@@ -2432,15 +2515,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 
				 	if (pt->adjusted_pf.discard_passdown) {
			
 
				 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
			
 
				 		limits->discard_granularity = data_limits->discard_granularity;
			
 
				-	} else if (block_size_is_power_of_two(pool))
			
 
				+	} else
			
 
				 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
			
 
				-	else
			
 
				-		/*
			
 
				-		 * Use largest power of 2 that is a factor of sectors_per_block
			
 
				-		 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
			
 
				-		 */
			
 
				-		limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
			
 
				-						  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
			
 
				 }
			
 
				 
			
 
				 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
			
@@ -2468,7 +2544,7 @@ static struct target_type pool_target = {
 
				 	.name = "thin-pool",
			
 
				 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
			
 
				 		    DM_TARGET_IMMUTABLE,
			
 
				-	.version = {1, 6, 0},
			
 
				+	.version = {1, 6, 1},
			
 
				 	.module = THIS_MODULE,
			
 
				 	.ctr = pool_ctr,
			
 
				 	.dtr = pool_dtr,
			
@@ -2588,17 +2664,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
				 	if (r)
			
 
				 		goto bad_thin_open;
			
 
				 
			
 
				-	ti->num_flush_requests = 1;
			
 
				+	ti->num_flush_bios = 1;
			
 
				 	ti->flush_supported = true;
			
 
				 	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
			
 
				 
			
 
				 	/* In case the pool supports discards, pass them on. */
			
 
				 	if (tc->pool->pf.discard_enabled) {
			
 
				 		ti->discards_supported = true;
			
 
				-		ti->num_discard_requests = 1;
			
 
				+		ti->num_discard_bios = 1;
			
 
				 		ti->discard_zeroes_data_unsupported = true;
			
 
				-		/* Discard requests must be split on a block boundary */
			
 
				-		ti->split_discard_requests = true;
			
 
				+		/* Discard bios must be split on a block boundary */
			
 
				+		ti->split_discard_bios = true;
			
 
				 	}
			
 
				 
			
 
				 	dm_put(pool_md);
			
@@ -2676,8 +2752,8 @@ static void thin_postsuspend(struct dm_target *ti)
 
				 /*
			
 
				  * <nr mapped sectors> <highest mapped sector>
			
 
				  */
			
 
				-static int thin_status(struct dm_target *ti, status_type_t type,
			
 
				-		       unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void thin_status(struct dm_target *ti, status_type_t type,
			
 
				+			unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	int r;
			
 
				 	ssize_t sz = 0;
			
@@ -2687,7 +2763,7 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 
				 
			
 
				 	if (get_pool_mode(tc->pool) == PM_FAIL) {
			
 
				 		DMEMIT("Fail");
			
 
				-		return 0;
			
 
				+		return;
			
 
				 	}
			
 
				 
			
 
				 	if (!tc->td)
			
@@ -2696,12 +2772,16 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 
				 		switch (type) {
			
 
				 		case STATUSTYPE_INFO:
			
 
				 			r = dm_thin_get_mapped_count(tc->td, &mapped);
			
 
				-			if (r)
			
 
				-				return r;
			
 
				+			if (r) {
			
 
				+				DMERR("dm_thin_get_mapped_count returned %d", r);
			
 
				+				goto err;
			
 
				+			}
			
 
				 
			
 
				 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			
 
				-			if (r < 0)
			
 
				-				return r;
			
 
				+			if (r < 0) {
			
 
				+				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
			
 
				+				goto err;
			
 
				+			}
			
 
				 
			
 
				 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			
 
				 			if (r)
			
@@ -2721,7 +2801,10 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	return 0;
			
 
				+	return;
			
 
				+
			
 
				+err:
			
 
				+	DMEMIT("Error");
			
 
				 }
			
 
				 
			
 
				 static int thin_iterate_devices(struct dm_target *ti,
			
@@ -2748,7 +2831,7 @@ static int thin_iterate_devices(struct dm_target *ti,
 
				 
			
 
				 static struct target_type thin_target = {
			
 
				 	.name = "thin",
			
 
				-	.version = {1, 7, 0},
			
 
				+	.version = {1, 7, 1},
			
 
				 	.module	= THIS_MODULE,
			
 
				 	.ctr = thin_ctr,
			
 
				 	.dtr = thin_dtr,
			
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -508,8 +508,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
 
				 /*
			
 
				  * Status: V (valid) or C (corruption found)
			
 
				  */
			
 
				-static int verity_status(struct dm_target *ti, status_type_t type,
			
 
				-			 unsigned status_flags, char *result, unsigned maxlen)
			
 
				+static void verity_status(struct dm_target *ti, status_type_t type,
			
 
				+			  unsigned status_flags, char *result, unsigned maxlen)
			
 
				 {
			
 
				 	struct dm_verity *v = ti->private;
			
 
				 	unsigned sz = 0;
			
@@ -540,8 +540,6 @@ static int verity_status(struct dm_target *ti, status_type_t type,
 
				 				DMEMIT("%02x", v->salt[x]);
			
 
				 		break;
			
 
				 	}
			
 
				-
			
 
				-	return 0;
			
 
				 }
			
 
				 
			
 
				 static int verity_ioctl(struct dm_target *ti, unsigned cmd,
			
@@ -860,7 +858,7 @@ bad:
 
				 
			
 
				 static struct target_type verity_target = {
			
 
				 	.name		= "verity",
			
 
				-	.version	= {1, 1, 0},
			
 
				+	.version	= {1, 1, 1},
			
 
				 	.module		= THIS_MODULE,
			
 
				 	.ctr		= verity_ctr,
			
 
				 	.dtr		= verity_dtr,
			
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -25,7 +25,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
				 	/*
			
 
				 	 * Silently drop discards, avoiding -EOPNOTSUPP.
			
 
				 	 */
			
 
				-	ti->num_discard_requests = 1;
			
 
				+	ti->num_discard_bios = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -163,7 +163,6 @@ struct mapped_device {
 
				 	 * io objects are allocated from here.
			
 
				 	 */
			
 
				 	mempool_t *io_pool;
			
 
				-	mempool_t *tio_pool;
			
 
				 
			
 
				 	struct bio_set *bs;
			
 
				 
			
@@ -197,7 +196,6 @@ struct mapped_device {
 
				  */
			
 
				 struct dm_md_mempools {
			
 
				 	mempool_t *io_pool;
			
 
				-	mempool_t *tio_pool;
			
 
				 	struct bio_set *bs;
			
 
				 };
			
 
				 
			
@@ -205,12 +203,6 @@ struct dm_md_mempools {
 
				 static struct kmem_cache *_io_cache;
			
 
				 static struct kmem_cache *_rq_tio_cache;
			
 
				 
			
 
				-/*
			
 
				- * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
			
 
				- * still used for _io_cache, I'm leaving this for a later cleanup
			
 
				- */
			
 
				-static struct kmem_cache *_rq_bio_info_cache;
			
 
				-
			
 
				 static int __init local_init(void)
			
 
				 {
			
 
				 	int r = -ENOMEM;
			
@@ -224,13 +216,9 @@ static int __init local_init(void)
 
				 	if (!_rq_tio_cache)
			
 
				 		goto out_free_io_cache;
			
 
				 
			
 
				-	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
			
 
				-	if (!_rq_bio_info_cache)
			
 
				-		goto out_free_rq_tio_cache;
			
 
				-
			
 
				 	r = dm_uevent_init();
			
 
				 	if (r)
			
 
				-		goto out_free_rq_bio_info_cache;
			
 
				+		goto out_free_rq_tio_cache;
			
 
				 
			
 
				 	_major = major;
			
 
				 	r = register_blkdev(_major, _name);
			
@@ -244,8 +232,6 @@ static int __init local_init(void)
 
				 
			
 
				 out_uevent_exit:
			
 
				 	dm_uevent_exit();
			
 
				-out_free_rq_bio_info_cache:
			
 
				-	kmem_cache_destroy(_rq_bio_info_cache);
			
 
				 out_free_rq_tio_cache:
			
 
				 	kmem_cache_destroy(_rq_tio_cache);
			
 
				 out_free_io_cache:
			
@@ -256,7 +242,6 @@ out_free_io_cache:
 
				 
			
 
				 static void local_exit(void)
			
 
				 {
			
 
				-	kmem_cache_destroy(_rq_bio_info_cache);
			
 
				 	kmem_cache_destroy(_rq_tio_cache);
			
 
				 	kmem_cache_destroy(_io_cache);
			
 
				 	unregister_blkdev(_major, _name);
			
@@ -448,12 +433,12 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 
				 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
			
 
				 					    gfp_t gfp_mask)
			
 
				 {
			
 
				-	return mempool_alloc(md->tio_pool, gfp_mask);
			
 
				+	return mempool_alloc(md->io_pool, gfp_mask);
			
 
				 }
			
 
				 
			
 
				 static void free_rq_tio(struct dm_rq_target_io *tio)
			
 
				 {
			
 
				-	mempool_free(tio, tio->md->tio_pool);
			
 
				+	mempool_free(tio, tio->md->io_pool);
			
 
				 }
			
 
				 
			
 
				 static int md_in_flight(struct mapped_device *md)
			
@@ -985,12 +970,13 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
			
 
				 
			
 
				-static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
			
 
				+static void __map_bio(struct dm_target_io *tio)
			
 
				 {
			
 
				 	int r;
			
 
				 	sector_t sector;
			
 
				 	struct mapped_device *md;
			
 
				 	struct bio *clone = &tio->clone;
			
 
				+	struct dm_target *ti = tio->ti;
			
 
				 
			
 
				 	clone->bi_end_io = clone_endio;
			
 
				 	clone->bi_private = tio;
			
@@ -1031,32 +1017,54 @@ struct clone_info {
 
				 	unsigned short idx;
			
 
				 };
			
 
				 
			
 
				+static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
			
 
				+{
			
 
				+	bio->bi_sector = sector;
			
 
				+	bio->bi_size = to_bytes(len);
			
 
				+}
			
 
				+
			
 
				+static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
			
 
				+{
			
 
				+	bio->bi_idx = idx;
			
 
				+	bio->bi_vcnt = idx + bv_count;
			
 
				+	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
			
 
				+}
			
 
				+
			
 
				+static void clone_bio_integrity(struct bio *bio, struct bio *clone,
			
 
				+				unsigned short idx, unsigned len, unsigned offset,
			
 
				+				unsigned trim)
			
 
				+{
			
 
				+	if (!bio_integrity(bio))
			
 
				+		return;
			
 
				+
			
 
				+	bio_integrity_clone(clone, bio, GFP_NOIO);
			
 
				+
			
 
				+	if (trim)
			
 
				+		bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Creates a little bio that just does part of a bvec.
			
 
				  */
			
 
				-static void split_bvec(struct dm_target_io *tio, struct bio *bio,
			
 
				-		       sector_t sector, unsigned short idx, unsigned int offset,
			
 
				-		       unsigned int len, struct bio_set *bs)
			
 
				+static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
			
 
				+			    sector_t sector, unsigned short idx,
			
 
				+			    unsigned offset, unsigned len)
			
 
				 {
			
 
				 	struct bio *clone = &tio->clone;
			
 
				 	struct bio_vec *bv = bio->bi_io_vec + idx;
			
 
				 
			
 
				 	*clone->bi_io_vec = *bv;
			
 
				 
			
 
				-	clone->bi_sector = sector;
			
 
				+	bio_setup_sector(clone, sector, len);
			
 
				+
			
 
				 	clone->bi_bdev = bio->bi_bdev;
			
 
				 	clone->bi_rw = bio->bi_rw;
			
 
				 	clone->bi_vcnt = 1;
			
 
				-	clone->bi_size = to_bytes(len);
			
 
				 	clone->bi_io_vec->bv_offset = offset;
			
 
				 	clone->bi_io_vec->bv_len = clone->bi_size;
			
 
				 	clone->bi_flags |= 1 << BIO_CLONED;
			
 
				 
			
 
				-	if (bio_integrity(bio)) {
			
 
				-		bio_integrity_clone(clone, bio, GFP_NOIO);
			
 
				-		bio_integrity_trim(clone,
			
 
				-				   bio_sector_offset(bio, idx, offset), len);
			
 
				-	}
			
 
				+	clone_bio_integrity(bio, clone, idx, len, offset, 1);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1064,29 +1072,23 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio,
 
				  */
			
 
				 static void clone_bio(struct dm_target_io *tio, struct bio *bio,
			
 
				 		      sector_t sector, unsigned short idx,
			
 
				-		      unsigned short bv_count, unsigned int len,
			
 
				-		      struct bio_set *bs)
			
 
				+		      unsigned short bv_count, unsigned len)
			
 
				 {
			
 
				 	struct bio *clone = &tio->clone;
			
 
				+	unsigned trim = 0;
			
 
				 
			
 
				 	__bio_clone(clone, bio);
			
 
				-	clone->bi_sector = sector;
			
 
				-	clone->bi_idx = idx;
			
 
				-	clone->bi_vcnt = idx + bv_count;
			
 
				-	clone->bi_size = to_bytes(len);
			
 
				-	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
			
 
				-
			
 
				-	if (bio_integrity(bio)) {
			
 
				-		bio_integrity_clone(clone, bio, GFP_NOIO);
			
 
				-
			
 
				-		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			
 
				-			bio_integrity_trim(clone,
			
 
				-					   bio_sector_offset(bio, idx, 0), len);
			
 
				-	}
			
 
				+	bio_setup_sector(clone, sector, len);
			
 
				+	bio_setup_bv(clone, idx, bv_count);
			
 
				+
			
 
				+	if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			
 
				+		trim = 1;
			
 
				+	clone_bio_integrity(bio, clone, idx, len, 0, trim);
			
 
				 }
			
 
				 
			
 
				 static struct dm_target_io *alloc_tio(struct clone_info *ci,
			
 
				-				      struct dm_target *ti, int nr_iovecs)
			
 
				+				      struct dm_target *ti, int nr_iovecs,
			
 
				+				      unsigned target_bio_nr)
			
 
				 {
			
 
				 	struct dm_target_io *tio;
			
 
				 	struct bio *clone;
			
@@ -1097,96 +1099,104 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
 
				 	tio->io = ci->io;
			
 
				 	tio->ti = ti;
			
 
				 	memset(&tio->info, 0, sizeof(tio->info));
			
 
				-	tio->target_request_nr = 0;
			
 
				+	tio->target_bio_nr = target_bio_nr;
			
 
				 
			
 
				 	return tio;
			
 
				 }
			
 
				 
			
 
				-static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
			
 
				-				   unsigned request_nr, sector_t len)
			
 
				+static void __clone_and_map_simple_bio(struct clone_info *ci,
			
 
				+				       struct dm_target *ti,
			
 
				+				       unsigned target_bio_nr, sector_t len)
			
 
				 {
			
 
				-	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs);
			
 
				+	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
			
 
				 	struct bio *clone = &tio->clone;
			
 
				 
			
 
				-	tio->target_request_nr = request_nr;
			
 
				-
			
 
				 	/*
			
 
				 	 * Discard requests require the bio's inline iovecs be initialized.
			
 
				 	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
			
 
				 	 * and discard, so no need for concern about wasted bvec allocations.
			
 
				 	 */
			
 
				-
			
 
				 	 __bio_clone(clone, ci->bio);
			
 
				-	if (len) {
			
 
				-		clone->bi_sector = ci->sector;
			
 
				-		clone->bi_size = to_bytes(len);
			
 
				-	}
			
 
				+	if (len)
			
 
				+		bio_setup_sector(clone, ci->sector, len);
			
 
				 
			
 
				-	__map_bio(ti, tio);
			
 
				+	__map_bio(tio);
			
 
				 }
			
 
				 
			
 
				-static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
			
 
				-				    unsigned num_requests, sector_t len)
			
 
				+static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
			
 
				+				  unsigned num_bios, sector_t len)
			
 
				 {
			
 
				-	unsigned request_nr;
			
 
				+	unsigned target_bio_nr;
			
 
				 
			
 
				-	for (request_nr = 0; request_nr < num_requests; request_nr++)
			
 
				-		__issue_target_request(ci, ti, request_nr, len);
			
 
				+	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
			
 
				+		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
			
 
				 }
			
 
				 
			
 
				-static int __clone_and_map_empty_flush(struct clone_info *ci)
			
 
				+static int __send_empty_flush(struct clone_info *ci)
			
 
				 {
			
 
				 	unsigned target_nr = 0;
			
 
				 	struct dm_target *ti;
			
 
				 
			
 
				 	BUG_ON(bio_has_data(ci->bio));
			
 
				 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
			
 
				-		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
			
 
				+		__send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Perform all io with a single clone.
			
 
				- */
			
 
				-static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
			
 
				+static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
			
 
				+				     sector_t sector, int nr_iovecs,
			
 
				+				     unsigned short idx, unsigned short bv_count,
			
 
				+				     unsigned offset, unsigned len,
			
 
				+				     unsigned split_bvec)
			
 
				 {
			
 
				 	struct bio *bio = ci->bio;
			
 
				 	struct dm_target_io *tio;
			
 
				+	unsigned target_bio_nr;
			
 
				+	unsigned num_target_bios = 1;
			
 
				+
			
 
				+	/*
			
 
				+	 * Does the target want to receive duplicate copies of the bio?
			
 
				+	 */
			
 
				+	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
			
 
				+		num_target_bios = ti->num_write_bios(ti, bio);
			
 
				 
			
 
				-	tio = alloc_tio(ci, ti, bio->bi_max_vecs);
			
 
				-	clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx,
			
 
				-		  ci->sector_count, ci->md->bs);
			
 
				-	__map_bio(ti, tio);
			
 
				-	ci->sector_count = 0;
			
 
				+	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
			
 
				+		tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
			
 
				+		if (split_bvec)
			
 
				+			clone_split_bio(tio, bio, sector, idx, offset, len);
			
 
				+		else
			
 
				+			clone_bio(tio, bio, sector, idx, bv_count, len);
			
 
				+		__map_bio(tio);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-typedef unsigned (*get_num_requests_fn)(struct dm_target *ti);
			
 
				+typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
			
 
				 
			
 
				-static unsigned get_num_discard_requests(struct dm_target *ti)
			
 
				+static unsigned get_num_discard_bios(struct dm_target *ti)
			
 
				 {
			
 
				-	return ti->num_discard_requests;
			
 
				+	return ti->num_discard_bios;
			
 
				 }
			
 
				 
			
 
				-static unsigned get_num_write_same_requests(struct dm_target *ti)
			
 
				+static unsigned get_num_write_same_bios(struct dm_target *ti)
			
 
				 {
			
 
				-	return ti->num_write_same_requests;
			
 
				+	return ti->num_write_same_bios;
			
 
				 }
			
 
				 
			
 
				 typedef bool (*is_split_required_fn)(struct dm_target *ti);
			
 
				 
			
 
				 static bool is_split_required_for_discard(struct dm_target *ti)
			
 
				 {
			
 
				-	return ti->split_discard_requests;
			
 
				+	return ti->split_discard_bios;
			
 
				 }
			
 
				 
			
 
				-static int __clone_and_map_changing_extent_only(struct clone_info *ci,
			
 
				-						get_num_requests_fn get_num_requests,
			
 
				-						is_split_required_fn is_split_required)
			
 
				+static int __send_changing_extent_only(struct clone_info *ci,
			
 
				+				       get_num_bios_fn get_num_bios,
			
 
				+				       is_split_required_fn is_split_required)
			
 
				 {
			
 
				 	struct dm_target *ti;
			
 
				 	sector_t len;
			
 
				-	unsigned num_requests;
			
 
				+	unsigned num_bios;
			
 
				 
			
 
				 	do {
			
 
				 		ti = dm_table_find_target(ci->map, ci->sector);
			
@@ -1199,8 +1209,8 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
 
				 		 * reconfiguration might also have changed that since the
			
 
				 		 * check was performed.
			
 
				 		 */
			
 
				-		num_requests = get_num_requests ? get_num_requests(ti) : 0;
			
 
				-		if (!num_requests)
			
 
				+		num_bios = get_num_bios ? get_num_bios(ti) : 0;
			
 
				+		if (!num_bios)
			
 
				 			return -EOPNOTSUPP;
			
 
				 
			
 
				 		if (is_split_required && !is_split_required(ti))
			
@@ -1208,7 +1218,7 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
 
				 		else
			
 
				 			len = min(ci->sector_count, max_io_len(ci->sector, ti));
			
 
				 
			
 
				-		__issue_target_requests(ci, ti, num_requests, len);
			
 
				+		__send_duplicate_bios(ci, ti, num_bios, len);
			
 
				 
			
 
				 		ci->sector += len;
			
 
				 	} while (ci->sector_count -= len);
			
@@ -1216,108 +1226,129 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int __clone_and_map_discard(struct clone_info *ci)
			
 
				+static int __send_discard(struct clone_info *ci)
			
 
				 {
			
 
				-	return __clone_and_map_changing_extent_only(ci, get_num_discard_requests,
			
 
				-						    is_split_required_for_discard);
			
 
				+	return __send_changing_extent_only(ci, get_num_discard_bios,
			
 
				+					   is_split_required_for_discard);
			
 
				 }
			
 
				 
			
 
				-static int __clone_and_map_write_same(struct clone_info *ci)
			
 
				+static int __send_write_same(struct clone_info *ci)
			
 
				 {
			
 
				-	return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL);
			
 
				+	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
			
 
				 }
			
 
				 
			
 
				-static int __clone_and_map(struct clone_info *ci)
			
 
				+/*
			
 
				+ * Find maximum number of sectors / bvecs we can process with a single bio.
			
 
				+ */
			
 
				+static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
			
 
				 {
			
 
				 	struct bio *bio = ci->bio;
			
 
				-	struct dm_target *ti;
			
 
				-	sector_t len = 0, max;
			
 
				-	struct dm_target_io *tio;
			
 
				-
			
 
				-	if (unlikely(bio->bi_rw & REQ_DISCARD))
			
 
				-		return __clone_and_map_discard(ci);
			
 
				-	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
			
 
				-		return __clone_and_map_write_same(ci);
			
 
				+	sector_t bv_len, total_len = 0;
			
 
				 
			
 
				-	ti = dm_table_find_target(ci->map, ci->sector);
			
 
				-	if (!dm_target_is_valid(ti))
			
 
				-		return -EIO;
			
 
				+	for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
			
 
				+		bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
			
 
				 
			
 
				-	max = max_io_len(ci->sector, ti);
			
 
				+		if (bv_len > max)
			
 
				+			break;
			
 
				 
			
 
				-	if (ci->sector_count <= max) {
			
 
				-		/*
			
 
				-		 * Optimise for the simple case where we can do all of
			
 
				-		 * the remaining io with a single clone.
			
 
				-		 */
			
 
				-		__clone_and_map_simple(ci, ti);
			
 
				+		max -= bv_len;
			
 
				+		total_len += bv_len;
			
 
				+	}
			
 
				 
			
 
				-	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
			
 
				-		/*
			
 
				-		 * There are some bvecs that don't span targets.
			
 
				-		 * Do as many of these as possible.
			
 
				-		 */
			
 
				-		int i;
			
 
				-		sector_t remaining = max;
			
 
				-		sector_t bv_len;
			
 
				+	return total_len;
			
 
				+}
			
 
				 
			
 
				-		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			
 
				-			bv_len = to_sector(bio->bi_io_vec[i].bv_len);
			
 
				+static int __split_bvec_across_targets(struct clone_info *ci,
			
 
				+				       struct dm_target *ti, sector_t max)
			
 
				+{
			
 
				+	struct bio *bio = ci->bio;
			
 
				+	struct bio_vec *bv = bio->bi_io_vec + ci->idx;
			
 
				+	sector_t remaining = to_sector(bv->bv_len);
			
 
				+	unsigned offset = 0;
			
 
				+	sector_t len;
			
 
				 
			
 
				-			if (bv_len > remaining)
			
 
				-				break;
			
 
				+	do {
			
 
				+		if (offset) {
			
 
				+			ti = dm_table_find_target(ci->map, ci->sector);
			
 
				+			if (!dm_target_is_valid(ti))
			
 
				+				return -EIO;
			
 
				 
			
 
				-			remaining -= bv_len;
			
 
				-			len += bv_len;
			
 
				+			max = max_io_len(ci->sector, ti);
			
 
				 		}
			
 
				 
			
 
				-		tio = alloc_tio(ci, ti, bio->bi_max_vecs);
			
 
				-		clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len,
			
 
				-			  ci->md->bs);
			
 
				-		__map_bio(ti, tio);
			
 
				+		len = min(remaining, max);
			
 
				+
			
 
				+		__clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
			
 
				+					 bv->bv_offset + offset, len, 1);
			
 
				 
			
 
				 		ci->sector += len;
			
 
				 		ci->sector_count -= len;
			
 
				-		ci->idx = i;
			
 
				+		offset += to_bytes(len);
			
 
				+	} while (remaining -= len);
			
 
				 
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * Handle a bvec that must be split between two or more targets.
			
 
				-		 */
			
 
				-		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
			
 
				-		sector_t remaining = to_sector(bv->bv_len);
			
 
				-		unsigned int offset = 0;
			
 
				+	ci->idx++;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Select the correct strategy for processing a non-flush bio.
			
 
				+ */
			
 
				+static int __split_and_process_non_flush(struct clone_info *ci)
			
 
				+{
			
 
				+	struct bio *bio = ci->bio;
			
 
				+	struct dm_target *ti;
			
 
				+	sector_t len, max;
			
 
				+	int idx;
			
 
				+
			
 
				+	if (unlikely(bio->bi_rw & REQ_DISCARD))
			
 
				+		return __send_discard(ci);
			
 
				+	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
			
 
				+		return __send_write_same(ci);
			
 
				 
			
 
				-		do {
			
 
				-			if (offset) {
			
 
				-				ti = dm_table_find_target(ci->map, ci->sector);
			
 
				-				if (!dm_target_is_valid(ti))
			
 
				-					return -EIO;
			
 
				+	ti = dm_table_find_target(ci->map, ci->sector);
			
 
				+	if (!dm_target_is_valid(ti))
			
 
				+		return -EIO;
			
 
				 
			
 
				-				max = max_io_len(ci->sector, ti);
			
 
				-			}
			
 
				+	max = max_io_len(ci->sector, ti);
			
 
				 
			
 
				-			len = min(remaining, max);
			
 
				+	/*
			
 
				+	 * Optimise for the simple case where we can do all of
			
 
				+	 * the remaining io with a single clone.
			
 
				+	 */
			
 
				+	if (ci->sector_count <= max) {
			
 
				+		__clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
			
 
				+					 ci->idx, bio->bi_vcnt - ci->idx, 0,
			
 
				+					 ci->sector_count, 0);
			
 
				+		ci->sector_count = 0;
			
 
				+		return 0;
			
 
				+	}
			
 
				 
			
 
				-			tio = alloc_tio(ci, ti, 1);
			
 
				-			split_bvec(tio, bio, ci->sector, ci->idx,
			
 
				-				   bv->bv_offset + offset, len, ci->md->bs);
			
 
				+	/*
			
 
				+	 * There are some bvecs that don't span targets.
			
 
				+	 * Do as many of these as possible.
			
 
				+	 */
			
 
				+	if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
			
 
				+		len = __len_within_target(ci, max, &idx);
			
 
				 
			
 
				-			__map_bio(ti, tio);
			
 
				+		__clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
			
 
				+					 ci->idx, idx - ci->idx, 0, len, 0);
			
 
				 
			
 
				-			ci->sector += len;
			
 
				-			ci->sector_count -= len;
			
 
				-			offset += to_bytes(len);
			
 
				-		} while (remaining -= len);
			
 
				+		ci->sector += len;
			
 
				+		ci->sector_count -= len;
			
 
				+		ci->idx = idx;
			
 
				 
			
 
				-		ci->idx++;
			
 
				+		return 0;
			
 
				 	}
			
 
				 
			
 
				-	return 0;
			
 
				+	/*
			
 
				+	 * Handle a bvec that must be split between two or more targets.
			
 
				+	 */
			
 
				+	return __split_bvec_across_targets(ci, ti, max);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Split the bio into several clones and submit it to targets.
			
 
				+ * Entry point to split a bio into clones and submit them to the targets.
			
 
				  */
			
 
				 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
			
 
				 {
			
@@ -1341,16 +1372,17 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
				 	ci.idx = bio->bi_idx;
			
 
				 
			
 
				 	start_io_acct(ci.io);
			
 
				+
			
 
				 	if (bio->bi_rw & REQ_FLUSH) {
			
 
				 		ci.bio = &ci.md->flush_bio;
			
 
				 		ci.sector_count = 0;
			
 
				-		error = __clone_and_map_empty_flush(&ci);
			
 
				+		error = __send_empty_flush(&ci);
			
 
				 		/* dec_pending submits any data associated with flush */
			
 
				 	} else {
			
 
				 		ci.bio = bio;
			
 
				 		ci.sector_count = bio_sectors(bio);
			
 
				 		while (ci.sector_count && !error)
			
 
				-			error = __clone_and_map(&ci);
			
 
				+			error = __split_and_process_non_flush(&ci);
			
 
				 	}
			
 
				 
			
 
				 	/* drop the extra reference count */
			
@@ -1923,8 +1955,6 @@ static void free_dev(struct mapped_device *md)
 
				 	unlock_fs(md);
			
 
				 	bdput(md->bdev);
			
 
				 	destroy_workqueue(md->wq);
			
 
				-	if (md->tio_pool)
			
 
				-		mempool_destroy(md->tio_pool);
			
 
				 	if (md->io_pool)
			
 
				 		mempool_destroy(md->io_pool);
			
 
				 	if (md->bs)
			
@@ -1947,24 +1977,33 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 
				 {
			
 
				 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
			
 
				 
			
 
				-	if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) {
			
 
				-		/*
			
 
				-		 * The md already has necessary mempools. Reload just the
			
 
				-		 * bioset because front_pad may have changed because
			
 
				-		 * a different table was loaded.
			
 
				-		 */
			
 
				-		bioset_free(md->bs);
			
 
				-		md->bs = p->bs;
			
 
				-		p->bs = NULL;
			
 
				+	if (md->io_pool && md->bs) {
			
 
				+		/* The md already has necessary mempools. */
			
 
				+		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
			
 
				+			/*
			
 
				+			 * Reload bioset because front_pad may have changed
			
 
				+			 * because a different table was loaded.
			
 
				+			 */
			
 
				+			bioset_free(md->bs);
			
 
				+			md->bs = p->bs;
			
 
				+			p->bs = NULL;
			
 
				+		} else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
			
 
				+			/*
			
 
				+			 * There's no need to reload with request-based dm
			
 
				+			 * because the size of front_pad doesn't change.
			
 
				+			 * Note for future: If you are to reload bioset,
			
 
				+			 * prep-ed requests in the queue may refer
			
 
				+			 * to bio from the old bioset, so you must walk
			
 
				+			 * through the queue to unprep.
			
 
				+			 */
			
 
				+		}
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
			
 
				+	BUG_ON(!p || md->io_pool || md->bs);
			
 
				 
			
 
				 	md->io_pool = p->io_pool;
			
 
				 	p->io_pool = NULL;
			
 
				-	md->tio_pool = p->tio_pool;
			
 
				-	p->tio_pool = NULL;
			
 
				 	md->bs = p->bs;
			
 
				 	p->bs = NULL;
			
 
				 
			
@@ -2395,7 +2434,7 @@ static void dm_queue_flush(struct mapped_device *md)
 
				  */
			
 
				 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
			
 
				 {
			
 
				-	struct dm_table *live_map, *map = ERR_PTR(-EINVAL);
			
 
				+	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
			
 
				 	struct queue_limits limits;
			
 
				 	int r;
			
 
				 
			
@@ -2418,10 +2457,12 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 
				 		dm_table_put(live_map);
			
 
				 	}
			
 
				 
			
 
				-	r = dm_calculate_queue_limits(table, &limits);
			
 
				-	if (r) {
			
 
				-		map = ERR_PTR(r);
			
 
				-		goto out;
			
 
				+	if (!live_map) {
			
 
				+		r = dm_calculate_queue_limits(table, &limits);
			
 
				+		if (r) {
			
 
				+			map = ERR_PTR(r);
			
 
				+			goto out;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	map = __bind(md, table, &limits);
			
@@ -2719,52 +2760,42 @@ EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
				 
			
 
				 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
			
 
				 {
			
 
				-	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
			
 
				-	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
			
 
				+	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
			
 
				+	struct kmem_cache *cachep;
			
 
				+	unsigned int pool_size;
			
 
				+	unsigned int front_pad;
			
 
				 
			
 
				 	if (!pools)
			
 
				 		return NULL;
			
 
				 
			
 
				-	per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io));
			
 
				+	if (type == DM_TYPE_BIO_BASED) {
			
 
				+		cachep = _io_cache;
			
 
				+		pool_size = 16;
			
 
				+		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
			
 
				+	} else if (type == DM_TYPE_REQUEST_BASED) {
			
 
				+		cachep = _rq_tio_cache;
			
 
				+		pool_size = MIN_IOS;
			
 
				+		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
			
 
				+		/* per_bio_data_size is not used. See __bind_mempools(). */
			
 
				+		WARN_ON(per_bio_data_size != 0);
			
 
				+	} else
			
 
				+		goto out;
			
 
				 
			
 
				-	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			
 
				-			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			
 
				-			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
			
 
				+	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
			
 
				 	if (!pools->io_pool)
			
 
				-		goto free_pools_and_out;
			
 
				-
			
 
				-	pools->tio_pool = NULL;
			
 
				-	if (type == DM_TYPE_REQUEST_BASED) {
			
 
				-		pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
			
 
				-		if (!pools->tio_pool)
			
 
				-			goto free_io_pool_and_out;
			
 
				-	}
			
 
				+		goto out;
			
 
				 
			
 
				-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
			
 
				-		bioset_create(pool_size,
			
 
				-			      per_bio_data_size + offsetof(struct dm_target_io, clone)) :
			
 
				-		bioset_create(pool_size,
			
 
				-			      offsetof(struct dm_rq_clone_bio_info, clone));
			
 
				+	pools->bs = bioset_create(pool_size, front_pad);
			
 
				 	if (!pools->bs)
			
 
				-		goto free_tio_pool_and_out;
			
 
				+		goto out;
			
 
				 
			
 
				 	if (integrity && bioset_integrity_create(pools->bs, pool_size))
			
 
				-		goto free_bioset_and_out;
			
 
				+		goto out;
			
 
				 
			
 
				 	return pools;
			
 
				 
			
 
				-free_bioset_and_out:
			
 
				-	bioset_free(pools->bs);
			
 
				-
			
 
				-free_tio_pool_and_out:
			
 
				-	if (pools->tio_pool)
			
 
				-		mempool_destroy(pools->tio_pool);
			
 
				-
			
 
				-free_io_pool_and_out:
			
 
				-	mempool_destroy(pools->io_pool);
			
 
				-
			
 
				-free_pools_and_out:
			
 
				-	kfree(pools);
			
 
				+out:
			
 
				+	dm_free_md_mempools(pools);
			
 
				 
			
 
				 	return NULL;
			
 
				 }
			
@@ -2777,9 +2808,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 
				 	if (pools->io_pool)
			
 
				 		mempool_destroy(pools->io_pool);
			
 
				 
			
 
				-	if (pools->tio_pool)
			
 
				-		mempool_destroy(pools->tio_pool);
			
 
				-
			
 
				 	if (pools->bs)
			
 
				 		bioset_free(pools->bs);
			
 
				 
			
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,6 +1,6 @@
 
				 config DM_PERSISTENT_DATA
			
 
				        tristate
			
 
				-       depends on BLK_DEV_DM && EXPERIMENTAL
			
 
				+       depends on BLK_DEV_DM
			
 
				        select LIBCRC32C
			
 
				        select DM_BUFIO
			
 
				        ---help---
			
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,5 +1,7 @@
 
				 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
			
 
				 dm-persistent-data-objs := \
			
 
				+	dm-array.o \
			
 
				+	dm-bitset.o \
			
 
				 	dm-block-manager.o \
			
 
				 	dm-space-map-common.o \
			
 
				 	dm-space-map-disk.o \
			
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -0,0 +1,808 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm-array.h"
			
 
				+#include "dm-space-map.h"
			
 
				+#include "dm-transaction-manager.h"
			
 
				+
			
 
				+#include <linux/export.h>
			
 
				+#include <linux/device-mapper.h>
			
 
				+
			
 
				+#define DM_MSG_PREFIX "array"
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * The array is implemented as a fully populated btree, which points to
			
 
				+ * blocks that contain the packed values.  This is more space efficient
			
 
				+ * than just using a btree since we don't store 1 key per value.
			
 
				+ */
			
 
				+struct array_block {
			
 
				+	__le32 csum;
			
 
				+	__le32 max_entries;
			
 
				+	__le32 nr_entries;
			
 
				+	__le32 value_size;
			
 
				+	__le64 blocknr; /* Block this node is supposed to live in. */
			
 
				+} __packed;
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Validator methods.  As usual we calculate a checksum, and also write the
			
 
				+ * block location into the header (paranoia about ssds remapping areas by
			
 
				+ * mistake).
			
 
				+ */
			
 
				+#define CSUM_XOR 595846735
			
 
				+
			
 
				+static void array_block_prepare_for_write(struct dm_block_validator *v,
			
 
				+					  struct dm_block *b,
			
 
				+					  size_t size_of_block)
			
 
				+{
			
 
				+	struct array_block *bh_le = dm_block_data(b);
			
 
				+
			
 
				+	bh_le->blocknr = cpu_to_le64(dm_block_location(b));
			
 
				+	bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
			
 
				+						 size_of_block - sizeof(__le32),
			
 
				+						 CSUM_XOR));
			
 
				+}
			
 
				+
			
 
				+static int array_block_check(struct dm_block_validator *v,
			
 
				+			     struct dm_block *b,
			
 
				+			     size_t size_of_block)
			
 
				+{
			
 
				+	struct array_block *bh_le = dm_block_data(b);
			
 
				+	__le32 csum_disk;
			
 
				+
			
 
				+	if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
			
 
				+		DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
			
 
				+			    (unsigned long long) le64_to_cpu(bh_le->blocknr),
			
 
				+			    (unsigned long long) dm_block_location(b));
			
 
				+		return -ENOTBLK;
			
 
				+	}
			
 
				+
			
 
				+	csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
			
 
				+					       size_of_block - sizeof(__le32),
			
 
				+					       CSUM_XOR));
			
 
				+	if (csum_disk != bh_le->csum) {
			
 
				+		DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
			
 
				+			    (unsigned) le32_to_cpu(csum_disk),
			
 
				+			    (unsigned) le32_to_cpu(bh_le->csum));
			
 
				+		return -EILSEQ;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static struct dm_block_validator array_validator = {
			
 
				+	.name = "array",
			
 
				+	.prepare_for_write = array_block_prepare_for_write,
			
 
				+	.check = array_block_check
			
 
				+};
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Functions for manipulating the array blocks.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Returns a pointer to a value within an array block.
			
 
				+ *
			
 
				+ * index - The index into _this_ specific block.
			
 
				+ */
			
 
				+static void *element_at(struct dm_array_info *info, struct array_block *ab,
			
 
				+			unsigned index)
			
 
				+{
			
 
				+	unsigned char *entry = (unsigned char *) (ab + 1);
			
 
				+
			
 
				+	entry += index * info->value_type.size;
			
 
				+
			
 
				+	return entry;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Applies 'fn' to every populated entry of an array block, in index
				+ * order.  The value type's context is handed to 'fn' as its first
				+ * argument, and a pointer to the entry as its second.
				+ */
				+static void on_entries(struct dm_array_info *info, struct array_block *ab,
				+		       void (*fn)(void *, const void *))
				+{
				+	unsigned idx = 0;
				+	const unsigned count = le32_to_cpu(ab->nr_entries);
				+
				+	while (idx < count) {
				+		fn(info->value_type.context, element_at(info, ab, idx));
				+		idx++;
				+	}
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Increment every value in an array block.
			
 
				+ */
			
 
				+static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
			
 
				+{
			
 
				+	struct dm_btree_value_type *vt = &info->value_type;
			
 
				+
			
 
				+	if (vt->inc)
			
 
				+		on_entries(info, ab, vt->inc);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Decrement every value in an array block.
			
 
				+ */
			
 
				+static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
			
 
				+{
			
 
				+	struct dm_btree_value_type *vt = &info->value_type;
			
 
				+
			
 
				+	if (vt->dec)
			
 
				+		on_entries(info, ab, vt->dec);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Each array block can hold this many values.
			
 
				+ */
			
 
				+static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
			
 
				+{
			
 
				+	return (size_of_block - sizeof(struct array_block)) / value_size;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Allocate a new array block.  The caller will need to unlock block.
			
 
				+ */
			
 
				+static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
			
 
				+			uint32_t max_entries,
			
 
				+			struct dm_block **block, struct array_block **ab)
			
 
				+{
			
 
				+	int r;
			
 
				+
			
 
				+	r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	(*ab) = dm_block_data(*block);
			
 
				+	(*ab)->max_entries = cpu_to_le32(max_entries);
			
 
				+	(*ab)->nr_entries = cpu_to_le32(0);
			
 
				+	(*ab)->value_size = cpu_to_le32(info->value_type.size);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Pad an array block out with a particular value.  Every instance will
			
 
				+ * cause an increment of the value_type.  new_nr must be >= the current
			
 
				+ * number of entries.
			
 
				+ */
			
 
				+static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
			
 
				+			const void *value, unsigned new_nr)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	uint32_t nr_entries;
			
 
				+	struct dm_btree_value_type *vt = &info->value_type;
			
 
				+
			
 
				+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
			
 
				+	BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
			
 
				+
			
 
				+	nr_entries = le32_to_cpu(ab->nr_entries);
			
 
				+	for (i = nr_entries; i < new_nr; i++) {
			
 
				+		if (vt->inc)
			
 
				+			vt->inc(vt->context, value);
			
 
				+		memcpy(element_at(info, ab, i), value, vt->size);
			
 
				+	}
			
 
				+	ab->nr_entries = cpu_to_le32(new_nr);
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Drops entries from the back of an array block until only new_nr are
				+ * left.  If the value type has a dec method it is called on each dropped
				+ * value, last entry first.  new_nr must be <= the current number of
				+ * entries.
				+ */
				+static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
				+			unsigned new_nr)
				+{
				+	struct dm_btree_value_type *vt = &info->value_type;
				+	uint32_t remaining;
				+
				+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
				+	BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
				+
				+	if (vt->dec) {
				+		remaining = le32_to_cpu(ab->nr_entries);
				+		while (remaining > new_nr) {
				+			remaining--;
				+			vt->dec(vt->context, element_at(info, ab, remaining));
				+		}
				+	}
				+
				+	ab->nr_entries = cpu_to_le32(new_nr);
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Read locks a block, and coerces it to an array block.  The caller must
			
 
				+ * unlock 'block' when finished.
			
 
				+ */
			
 
				+static int get_ablock(struct dm_array_info *info, dm_block_t b,
			
 
				+		      struct dm_block **block, struct array_block **ab)
			
 
				+{
			
 
				+	int r;
			
 
				+
			
 
				+	r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	*ab = dm_block_data(*block);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Unlocks an array block.
			
 
				+ */
			
 
				+static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
			
 
				+{
			
 
				+	return dm_tm_unlock(info->btree_info.tm, block);
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * Btree manipulation.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Looks up an array block in the btree, and then read locks it.
			
 
				+ *
			
 
				+ * index is the index of the array_block, (ie. the array index
			
 
				+ * / max_entries).
			
 
				+ */
			
 
				+static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
			
 
				+			 unsigned index, struct dm_block **block,
			
 
				+			 struct array_block **ab)
			
 
				+{
			
 
				+	int r;
			
 
				+	uint64_t key = index;
			
 
				+	__le64 block_le;
			
 
				+
			
 
				+	r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	return get_ablock(info, le64_to_cpu(block_le), block, ab);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Insert an array block into the btree.  The block is _not_ unlocked.
			
 
				+ */
			
 
				+static int insert_ablock(struct dm_array_info *info, uint64_t index,
			
 
				+			 struct dm_block *block, dm_block_t *root)
			
 
				+{
			
 
				+	__le64 block_le = cpu_to_le64(dm_block_location(block));
			
 
				+
			
 
				+	__dm_bless_for_disk(block_le);
			
 
				+	return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Looks up an array block in the btree.  Then shadows it, and updates the
				+ * btree to point to this new shadow.  'root' is an input/output parameter
				+ * for both the current root block, and the new one.
				+ *
				+ * index - the btree key of the array block (ie. array index / max_entries).
				+ * block, ab - on success, the shadowed block and its data.  The caller
				+ *             must unlock 'block' when finished.
				+ *
				+ * Returns 0 on success.  On failure an error is returned and 'block' is
				+ * not left locked, so callers may simply bail out.
				+ */
				+static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
				+			 unsigned index, struct dm_block **block,
				+			 struct array_block **ab)
				+{
				+	int r, inc;
				+	uint64_t key = index;
				+	dm_block_t b;
				+	__le64 block_le;
				+
				+	/*
				+	 * lookup
				+	 */
				+	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
				+	if (r)
				+		return r;
				+	b = le64_to_cpu(block_le);
				+
				+	/*
				+	 * shadow
				+	 */
				+	r = dm_tm_shadow_block(info->btree_info.tm, b,
				+			       &array_validator, block, &inc);
				+	if (r)
				+		return r;
				+
				+	*ab = dm_block_data(*block);
				+	if (inc)
				+		inc_ablock_entries(info, *ab);
				+
				+	/*
				+	 * Reinsert.
				+	 *
				+	 * The shadow op will often be a noop.  Only insert if it really
				+	 * copied data.
				+	 */
				+	if (dm_block_location(*block) != b) {
				+		/*
				+		 * NOTE(review): dm_tm_shadow_block is expected to have
				+		 * already dropped a reference on the old block, yet the
				+		 * btree still points at it.  Overwriting that btree value
				+		 * runs block_dec() on the old block, so take a reference
				+		 * first to stop the count being decremented twice.
				+		 */
				+		dm_tm_inc(info->btree_info.tm, b);
				+
				+		r = insert_ablock(info, index, *block, root);
				+		if (r) {
				+			/*
				+			 * Every caller returns on error without unlocking,
				+			 * so release the shadow here rather than leak it.
				+			 */
				+			unlock_ablock(info, *block);
				+		}
				+	}
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Allocate a new array block, and fill it with some values.
			
 
				+ */
			
 
				+static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
			
 
				+			     uint32_t max_entries,
			
 
				+			     unsigned block_index, uint32_t nr,
			
 
				+			     const void *value, dm_block_t *root)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+
			
 
				+	r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	fill_ablock(info, ab, value, nr);
			
 
				+	r = insert_ablock(info, block_index, block, root);
			
 
				+	unlock_ablock(info, block);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Inserts completely populated array blocks (max_entries copies of
				+ * 'value' each) at btree indexes [begin_block, end_block).  Stops at, and
				+ * returns, the first error.
				+ */
				+static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
				+			       unsigned begin_block, unsigned end_block,
				+			       unsigned max_entries, const void *value,
				+			       dm_block_t *root)
				+{
				+	int r;
				+	unsigned b;
				+
				+	for (b = begin_block; b != end_block; b++) {
				+		r = insert_new_ablock(info, size_of_block, max_entries, b,
				+				      max_entries, value, root);
				+		if (r)
				+			return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * There are a bunch of functions involved with resizing an array.  This
			
 
				+ * structure holds information that is commonly needed by them.  Purely here
			
 
				+ * to reduce parameter count.
			
 
				+ */
			
 
				+struct resize {
			
 
				+	/*
			
 
				+	 * Describes the array.
			
 
				+	 */
			
 
				+	struct dm_array_info *info;
			
 
				+
			
 
				+	/*
			
 
				+	 * The current root of the array.  This gets updated.
			
 
				+	 */
			
 
				+	dm_block_t root;
			
 
				+
			
 
				+	/*
			
 
				+	 * Metadata block size.  Used to calculate the nr entries in an
			
 
				+	 * array block.
			
 
				+	 */
			
 
				+	size_t size_of_block;
			
 
				+
			
 
				+	/*
			
 
				+	 * Maximum nr entries in an array block.
			
 
				+	 */
			
 
				+	unsigned max_entries;
			
 
				+
			
 
				+	/*
			
 
				+	 * nr of completely full blocks in the array.
			
 
				+	 *
			
 
				+	 * 'old' refers to before the resize, 'new' after.
			
 
				+	 */
			
 
				+	unsigned old_nr_full_blocks, new_nr_full_blocks;
			
 
				+
			
 
				+	/*
			
 
				+	 * Number of entries in the final block.  0 iff only full blocks in
			
 
				+	 * the array.
			
 
				+	 */
			
 
				+	unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;
			
 
				+
			
 
				+	/*
			
 
				+	 * The default value used when growing the array.
			
 
				+	 */
			
 
				+	const void *value;
			
 
				+};
			
 
				+
			
 
				+/*
				+ * Removes a run of consecutive array blocks from the btree, updating
				+ * resize->root after each removal.  The values held in the blocks are
				+ * decremented as a side effect of the btree remove.
				+ *
				+ * begin_index - index of the first array block to remove.
				+ * end_index - one past the last array block to remove (not removed).
				+ *
				+ * Returns 0, or the first error from dm_btree_remove.
				+ */
				+static int drop_blocks(struct resize *resize, unsigned begin_index,
				+		       unsigned end_index)
				+{
				+	unsigned idx;
				+
				+	for (idx = begin_index; idx != end_index; idx++) {
				+		uint64_t key = idx;
				+		int r = dm_btree_remove(&resize->info->btree_info,
				+					resize->root, &key, &resize->root);
				+
				+		if (r)
				+			return r;
				+	}
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Calculates how many blocks are needed for the array.
			
 
				+ */
			
 
				+static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
			
 
				+				       unsigned nr_entries_in_last_block)
			
 
				+{
			
 
				+	return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Shrink an array.
			
 
				+ */
			
 
				+static int shrink(struct resize *resize)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned begin, end;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+
			
 
				+	/*
			
 
				+	 * Lose some blocks from the back?
			
 
				+	 */
			
 
				+	if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
			
 
				+		begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
			
 
				+					       resize->new_nr_entries_in_last_block);
			
 
				+		end = total_nr_blocks_needed(resize->old_nr_full_blocks,
			
 
				+					     resize->old_nr_entries_in_last_block);
			
 
				+
			
 
				+		r = drop_blocks(resize, begin, end);
			
 
				+		if (r)
			
 
				+			return r;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Trim the new tail block
			
 
				+	 */
			
 
				+	if (resize->new_nr_entries_in_last_block) {
			
 
				+		r = shadow_ablock(resize->info, &resize->root,
			
 
				+				  resize->new_nr_full_blocks, &block, &ab);
			
 
				+		if (r)
			
 
				+			return r;
			
 
				+
			
 
				+		trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
			
 
				+		unlock_ablock(resize->info, block);
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Grow an array.
			
 
				+ */
			
 
				+static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+
			
 
				+	r = shadow_ablock(resize->info, &resize->root,
			
 
				+			  resize->old_nr_full_blocks, &block, &ab);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	fill_ablock(resize->info, ab, resize->value, new_nr_entries);
			
 
				+	unlock_ablock(resize->info, block);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+static int grow_add_tail_block(struct resize *resize)
			
 
				+{
			
 
				+	return insert_new_ablock(resize->info, resize->size_of_block,
			
 
				+				 resize->max_entries,
			
 
				+				 resize->new_nr_full_blocks,
			
 
				+				 resize->new_nr_entries_in_last_block,
			
 
				+				 resize->value, &resize->root);
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Grows an array when the number of full blocks increases.  A partially
				+ * filled old tail block is topped up first, then the remaining full
				+ * blocks are inserted, followed by a partial new tail block if one is
				+ * needed.
				+ *
				+ * Returns 0 on success, or the first error encountered.
				+ */
				+static int grow_needs_more_blocks(struct resize *resize)
				+{
				+	int r;
				+	unsigned old_nr_blocks = resize->old_nr_full_blocks;
				+
				+	if (resize->old_nr_entries_in_last_block > 0) {
				+		/*
				+		 * The old tail block sits at index old_nr_full_blocks.
				+		 * Once topped up it is a full block in its own right, so
				+		 * the new full blocks must start after it; otherwise the
				+		 * insert below would replace it and lose its entries.
				+		 */
				+		old_nr_blocks++;
				+
				+		r = grow_extend_tail_block(resize, resize->max_entries);
				+		if (r)
				+			return r;
				+	}
				+
				+	r = insert_full_ablocks(resize->info, resize->size_of_block,
				+				old_nr_blocks,
				+				resize->new_nr_full_blocks,
				+				resize->max_entries, resize->value,
				+				&resize->root);
				+	if (r)
				+		return r;
				+
				+	if (resize->new_nr_entries_in_last_block)
				+		r = grow_add_tail_block(resize);
				+
				+	return r;
				+}
			
 
				+
			
 
				+/*
				+ * Grows an array, choosing how from the change in block layout: more
				+ * full blocks are needed, a brand new tail block is needed, or the
				+ * existing tail block just needs extending.
				+ */
				+static int grow(struct resize *resize)
				+{
				+	if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
				+		return grow_needs_more_blocks(resize);
				+
				+	if (!resize->old_nr_entries_in_last_block)
				+		return grow_add_tail_block(resize);
				+
				+	return grow_extend_tail_block(resize,
				+				      resize->new_nr_entries_in_last_block);
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * These are the value_type functions for the btree elements, which point
			
 
				+ * to array blocks.
			
 
				+ */
			
 
				+static void block_inc(void *context, const void *value)
			
 
				+{
			
 
				+	__le64 block_le;
			
 
				+	struct dm_array_info *info = context;
			
 
				+
			
 
				+	memcpy(&block_le, value, sizeof(block_le));
			
 
				+	dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
			
 
				+}
			
 
				+
			
 
				+static void block_dec(void *context, const void *value)
			
 
				+{
			
 
				+	int r;
			
 
				+	uint64_t b;
			
 
				+	__le64 block_le;
			
 
				+	uint32_t ref_count;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+	struct dm_array_info *info = context;
			
 
				+
			
 
				+	memcpy(&block_le, value, sizeof(block_le));
			
 
				+	b = le64_to_cpu(block_le);
			
 
				+
			
 
				+	r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
			
 
				+	if (r) {
			
 
				+		DMERR_LIMIT("couldn't get reference count for block %llu",
			
 
				+			    (unsigned long long) b);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	if (ref_count == 1) {
			
 
				+		/*
			
 
				+		 * We're about to drop the last reference to this ablock.
			
 
				+		 * So we need to decrement the ref count of the contents.
			
 
				+		 */
			
 
				+		r = get_ablock(info, b, &block, &ab);
			
 
				+		if (r) {
			
 
				+			DMERR_LIMIT("couldn't get array block %llu",
			
 
				+				    (unsigned long long) b);
			
 
				+			return;
			
 
				+		}
			
 
				+
			
 
				+		dec_ablock_entries(info, ab);
			
 
				+		unlock_ablock(info, block);
			
 
				+	}
			
 
				+
			
 
				+	dm_tm_dec(info->btree_info.tm, b);
			
 
				+}
			
 
				+
			
 
				+static int block_equal(void *context, const void *value1, const void *value2)
			
 
				+{
			
 
				+	return !memcmp(value1, value2, sizeof(__le64));
			
 
				+}
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+void dm_array_info_init(struct dm_array_info *info,
			
 
				+			struct dm_transaction_manager *tm,
			
 
				+			struct dm_btree_value_type *vt)
			
 
				+{
			
 
				+	struct dm_btree_value_type *bvt = &info->btree_info.value_type;
			
 
				+
			
 
				+	memcpy(&info->value_type, vt, sizeof(info->value_type));
			
 
				+	info->btree_info.tm = tm;
			
 
				+	info->btree_info.levels = 1;
			
 
				+
			
 
				+	bvt->context = info;
			
 
				+	bvt->size = sizeof(__le64);
			
 
				+	bvt->inc = block_inc;
			
 
				+	bvt->dec = block_dec;
			
 
				+	bvt->equal = block_equal;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_info_init);
			
 
				+
			
 
				+int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
			
 
				+{
			
 
				+	return dm_btree_empty(&info->btree_info, root);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_empty);
			
 
				+
			
 
				+/*
				+ * Resizes an array from old_size to new_size entries.
				+ *
				+ * root - the current root of the array.
				+ * value - the value used to populate new entries when growing.
				+ * new_root - set to the root of the array after the resize; on success it
				+ *            is always written, even when the size does not change.
				+ *
				+ * Returns 0 on success, or the error from the grow/shrink operation, in
				+ * which case *new_root is left untouched.
				+ */
				+static int array_resize(struct dm_array_info *info, dm_block_t root,
				+			uint32_t old_size, uint32_t new_size,
				+			const void *value, dm_block_t *new_root)
				+{
				+	int r;
				+	struct resize resize;
				+
				+	if (old_size == new_size) {
				+		/*
				+		 * Nothing to do, but the caller still expects a valid
				+		 * root back on success.
				+		 */
				+		*new_root = root;
				+		return 0;
				+	}
				+
				+	resize.info = info;
				+	resize.root = root;
				+	resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
				+	resize.max_entries = calc_max_entries(info->value_type.size,
				+					      resize.size_of_block);
				+
				+	resize.old_nr_full_blocks = old_size / resize.max_entries;
				+	resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
				+	resize.new_nr_full_blocks = new_size / resize.max_entries;
				+	resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
				+	resize.value = value;
				+
				+	r = ((new_size > old_size) ? grow : shrink)(&resize);
				+	if (r)
				+		return r;
				+
				+	*new_root = resize.root;
				+	return 0;
				+}
			
 
				+
			
 
				+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
			
 
				+		    uint32_t old_size, uint32_t new_size,
			
 
				+		    const void *value, dm_block_t *new_root)
			
 
				+		    __dm_written_to_disk(value)
			
 
				+{
			
 
				+	int r = array_resize(info, root, old_size, new_size, value, new_root);
			
 
				+	__dm_unbless_for_disk(value);
			
 
				+	return r;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_resize);
			
 
				+
			
 
				+int dm_array_del(struct dm_array_info *info, dm_block_t root)
			
 
				+{
			
 
				+	return dm_btree_del(&info->btree_info, root);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_del);
			
 
				+
			
 
				+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
			
 
				+		       uint32_t index, void *value_le)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+	size_t size_of_block;
			
 
				+	unsigned entry, max_entries;
			
 
				+
			
 
				+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
			
 
				+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
			
 
				+
			
 
				+	r = lookup_ablock(info, root, index / max_entries, &block, &ab);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	entry = index % max_entries;
			
 
				+	if (entry >= le32_to_cpu(ab->nr_entries))
			
 
				+		r = -ENODATA;
			
 
				+	else
			
 
				+		memcpy(value_le, element_at(info, ab, entry),
			
 
				+		       info->value_type.size);
			
 
				+
			
 
				+	unlock_ablock(info, block);
			
 
				+	return r;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_get_value);
			
 
				+
			
 
				+static int array_set_value(struct dm_array_info *info, dm_block_t root,
			
 
				+			   uint32_t index, const void *value, dm_block_t *new_root)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+	size_t size_of_block;
			
 
				+	unsigned max_entries;
			
 
				+	unsigned entry;
			
 
				+	void *old_value;
			
 
				+	struct dm_btree_value_type *vt = &info->value_type;
			
 
				+
			
 
				+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
			
 
				+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
			
 
				+
			
 
				+	r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+	*new_root = root;
			
 
				+
			
 
				+	entry = index % max_entries;
			
 
				+	if (entry >= le32_to_cpu(ab->nr_entries)) {
			
 
				+		r = -ENODATA;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	old_value = element_at(info, ab, entry);
			
 
				+	if (vt->dec &&
			
 
				+	    (!vt->equal || !vt->equal(vt->context, old_value, value))) {
			
 
				+		vt->dec(vt->context, old_value);
			
 
				+		if (vt->inc)
			
 
				+			vt->inc(vt->context, value);
			
 
				+	}
			
 
				+
			
 
				+	memcpy(old_value, value, info->value_type.size);
			
 
				+
			
 
				+out:
			
 
				+	unlock_ablock(info, block);
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
			
 
				+		 uint32_t index, const void *value, dm_block_t *new_root)
			
 
				+		 __dm_written_to_disk(value)
			
 
				+{
			
 
				+	int r;
			
 
				+
			
 
				+	r = array_set_value(info, root, index, value, new_root);
			
 
				+	__dm_unbless_for_disk(value);
			
 
				+	return r;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_set_value);
			
 
				+
			
 
				+struct walk_info {
			
 
				+	struct dm_array_info *info;
			
 
				+	int (*fn)(void *context, uint64_t key, void *leaf);
			
 
				+	void *context;
			
 
				+};
			
 
				+
			
 
				+static int walk_ablock(void *context, uint64_t *keys, void *leaf)
			
 
				+{
			
 
				+	struct walk_info *wi = context;
			
 
				+
			
 
				+	int r;
			
 
				+	unsigned i;
			
 
				+	__le64 block_le;
			
 
				+	unsigned nr_entries, max_entries;
			
 
				+	struct dm_block *block;
			
 
				+	struct array_block *ab;
			
 
				+
			
 
				+	memcpy(&block_le, leaf, sizeof(block_le));
			
 
				+	r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	max_entries = le32_to_cpu(ab->max_entries);
			
 
				+	nr_entries = le32_to_cpu(ab->nr_entries);
			
 
				+	for (i = 0; i < nr_entries; i++) {
			
 
				+		r = wi->fn(wi->context, keys[0] * max_entries + i,
			
 
				+			   element_at(wi->info, ab, i));
			
 
				+
			
 
				+		if (r)
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	unlock_ablock(wi->info, block);
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
			
 
				+		  int (*fn)(void *, uint64_t key, void *leaf),
			
 
				+		  void *context)
			
 
				+{
			
 
				+	struct walk_info wi;
			
 
				+
			
 
				+	wi.info = info;
			
 
				+	wi.fn = fn;
			
 
				+	wi.context = context;
			
 
				+
			
 
				+	return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_array_walk);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -0,0 +1,166 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+#ifndef _LINUX_DM_ARRAY_H
			
 
				+#define _LINUX_DM_ARRAY_H
			
 
				+
			
 
				+#include "dm-btree.h"
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * The dm-array is a persistent version of an array.  It packs the data
			
 
				+ * more efficiently than a btree which will result in less disk space use,
			
 
				+ * and a performance boost.  The element get and set operations are still
			
 
				+ * O(ln(n)), but with a much smaller constant.
			
 
				+ *
			
 
				+ * The value type structure is reused from the btree type to support proper
			
 
				+ * reference counting of values.
			
 
				+ *
			
 
				+ * The arrays implicitly know their length, and bounds are checked for
			
 
				+ * lookups and updates.  It doesn't store this in an accessible place
			
 
				+ * because it would waste a whole metadata block.  Make sure you store the
			
 
				+ * size along with the array root in your encompassing data.
			
 
				+ *
			
 
				+ * Array entries are indexed via an unsigned integer starting from zero.
			
 
				+ * Arrays are not sparse; if you resize an array to have 'n' entries then
			
 
				+ * 'n - 1' will be the last valid index.
			
 
				+ *
			
 
				+ * Typical use:
			
 
				+ *
			
 
				+ * a) initialise a dm_array_info structure.  This describes the array
			
 
				+ *    values and ties it into a specific transaction manager.  It holds no
			
 
				+ *    instance data; the same info can be used for many similar arrays if
			
 
				+ *    you wish.
			
 
				+ *
			
 
				+ * b) Get yourself a root.  The root is the index of a block of data on the
			
 
				+ *    disk that holds a particular instance of an array.  You may have a
			
 
				+ *    pre existing root in your metadata that you wish to use, or you may
			
 
				+ *    want to create a brand new, empty array with dm_array_empty().
			
 
				+ *
			
 
				+ * Like the other data structures in this library, dm_array objects are
			
 
				+ * immutable between transactions.  Update functions will return you the
			
 
				+ * root for a _new_ array.  If you've incremented the old root, via
			
 
				+ * dm_tm_inc(), before calling the update function you may continue to use
			
 
				+ * it in parallel with the new root.
			
 
				+ *
			
 
				+ * c) resize an array with dm_array_resize().
			
 
				+ *
			
 
				+ * d) Get a value from the array with dm_array_get_value().
			
 
				+ *
			
 
				+ * e) Set a value in the array with dm_array_set_value().
			
 
				+ *
			
 
				+ * f) Walk an array of values in index order with dm_array_walk().  More
			
 
				+ *    efficient than making many calls to dm_array_get_value().
			
 
				+ *
			
 
				+ * g) Destroy the array with dm_array_del().  This tells the transaction
			
 
				+ *    manager that you're no longer using this data structure so it can
			
 
				+ *    recycle its blocks.  (dm_array_dec() would be a better name for it,
			
 
				+ *    but del is in keeping with dm_btree_del()).
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Describes an array.  Don't initialise this structure yourself, use the
			
 
				+ * init function below.
			
 
				+ */
			
 
				+struct dm_array_info {
			
 
				+	struct dm_transaction_manager *tm;
			
 
				+	struct dm_btree_value_type value_type;
			
 
				+	struct dm_btree_info btree_info;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Sets up a dm_array_info structure.  You don't need to do anything with
			
 
				+ * this structure when you finish using it.
			
 
				+ *
			
 
				+ * info - the structure being filled in.
			
 
				+ * tm   - the transaction manager that should supervise this structure.
			
 
				+ * vt   - describes the leaf values.
			
 
				+ */
			
 
				+void dm_array_info_init(struct dm_array_info *info,
			
 
				+			struct dm_transaction_manager *tm,
			
 
				+			struct dm_btree_value_type *vt);
			
 
				+
			
 
				+/*
			
 
				+ * Create an empty, zero length array.
			
 
				+ *
			
 
				+ * info - describes the array
			
 
				+ * root - on success this will be filled out with the root block
			
 
				+ */
			
 
				+int dm_array_empty(struct dm_array_info *info, dm_block_t *root);
			
 
				+
			
 
				+/*
			
 
				+ * Resizes the array.
			
 
				+ *
			
 
				+ * info - describes the array
			
 
				+ * root - the root block of the array on disk
			
 
				+ * old_size - the caller is responsible for remembering the size of
			
 
				+ *            the array
			
 
				+ * new_size - can be bigger or smaller than old_size
			
 
				+ * value - if we're growing the array the new entries will have this value
			
 
				+ * new_root - on success, points to the new root block
			
 
				+ *
			
 
				+ * If growing the inc function for 'value' will be called the appropriate
			
 
				+ * number of times.  So if the caller is holding a reference they may want
			
 
				+ * to drop it.
			
 
				+ */
			
 
				+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
			
 
				+		    uint32_t old_size, uint32_t new_size,
			
 
				+		    const void *value, dm_block_t *new_root)
			
 
				+	__dm_written_to_disk(value);
			
 
				+
			
 
				+/*
			
 
				+ * Frees a whole array.  The value_type's decrement operation will be called
			
 
				+ * for all values in the array
			
 
				+ */
			
 
				+int dm_array_del(struct dm_array_info *info, dm_block_t root);
			
 
				+
			
 
				+/*
			
 
				+ * Lookup a value in the array
			
 
				+ *
			
 
				+ * info - describes the array
			
 
				+ * root - root block of the array
			
 
				+ * index - array index
			
 
				+ * value - the value to be read.  Will be in on-disk format of course.
			
 
				+ *
			
 
				+ * -ENODATA will be returned if the index is out of bounds.
			
 
				+ */
			
 
				+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
			
 
				+		       uint32_t index, void *value);
			
 
				+
			
 
				+/*
			
 
				+ * Set an entry in the array.
			
 
				+ *
			
 
				+ * info - describes the array
			
 
				+ * root - root block of the array
			
 
				+ * index - array index
			
 
				+ * value - value to be written to disk.  Make sure you confirm the value is
			
 
				+ *         in on-disk format with __dm_bless_for_disk() before calling.
			
 
				+ * new_root - the new root block
			
 
				+ *
			
 
				+ * The old value being overwritten will be decremented, the new value
			
 
				+ * incremented.
			
 
				+ *
			
 
				+ * -ENODATA will be returned if the index is out of bounds.
			
 
				+ */
			
 
				+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
			
 
				+		       uint32_t index, const void *value, dm_block_t *new_root)
			
 
				+	__dm_written_to_disk(value);
			
 
				+
			
 
				+/*
			
 
				+ * Walk through all the entries in an array.
			
 
				+ *
			
 
				+ * info - describes the array
			
 
				+ * root - root block of the array
			
 
				+ * fn - called back for every element
			
 
				+ * context - passed to the callback
			
 
				+ */
			
 
				+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
			
 
				+		  int (*fn)(void *context, uint64_t key, void *leaf),
			
 
				+		  void *context);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#endif	/* _LINUX_DM_ARRAY_H */
			
--- a/drivers/md/persistent-data/dm-bitset.c
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -0,0 +1,163 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+
			
 
				+#include "dm-bitset.h"
			
 
				+#include "dm-transaction-manager.h"
			
 
				+
			
 
				+#include <linux/export.h>
			
 
				+#include <linux/device-mapper.h>
			
 
				+
			
 
				+#define DM_MSG_PREFIX "bitset"
			
 
				+#define BITS_PER_ARRAY_ENTRY 64
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+static struct dm_btree_value_type bitset_bvt = {
			
 
				+	.context = NULL,
			
 
				+	.size = sizeof(__le64),
			
 
				+	.inc = NULL,
			
 
				+	.dec = NULL,
			
 
				+	.equal = NULL,
			
 
				+};
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
			
 
				+			 struct dm_disk_bitset *info)
			
 
				+{
			
 
				+	dm_array_info_init(&info->array_info, tm, &bitset_bvt);
			
 
				+	info->current_index_set = false;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_disk_bitset_init);
			
 
				+
			
 
				+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
			
 
				+{
			
 
				+	return dm_array_empty(&info->array_info, root);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_empty);
			
 
				+
			
 
				+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		     uint32_t old_nr_entries, uint32_t new_nr_entries,
			
 
				+		     bool default_value, dm_block_t *new_root)
			
 
				+{
			
 
				+	uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY);
			
 
				+	uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY);
			
 
				+	__le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0);
			
 
				+
			
 
				+	__dm_bless_for_disk(&value);
			
 
				+	return dm_array_resize(&info->array_info, root, old_blocks, new_blocks,
			
 
				+			       &value, new_root);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_resize);
			
 
				+
			
 
				+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root)
			
 
				+{
			
 
				+	return dm_array_del(&info->array_info, root);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_del);
			
 
				+
			
 
				+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		    dm_block_t *new_root)
			
 
				+{
			
 
				+	int r;
			
 
				+	__le64 value;
			
 
				+
			
 
				+	if (!info->current_index_set)
			
 
				+		return 0;
			
 
				+
			
 
				+	value = cpu_to_le64(info->current_bits);
			
 
				+
			
 
				+	__dm_bless_for_disk(&value);
			
 
				+	r = dm_array_set_value(&info->array_info, root, info->current_index,
			
 
				+			       &value, new_root);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	info->current_index_set = false;
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_flush);
			
 
				+
			
 
				+static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		     uint32_t array_index)
			
 
				+{
			
 
				+	int r;
			
 
				+	__le64 value;
			
 
				+
			
 
				+	r = dm_array_get_value(&info->array_info, root, array_index, &value);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	info->current_bits = le64_to_cpu(value);
			
 
				+	info->current_index_set = true;
			
 
				+	info->current_index = array_index;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+			   uint32_t index, dm_block_t *new_root)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned array_index = index / BITS_PER_ARRAY_ENTRY;
			
 
				+
			
 
				+	if (info->current_index_set) {
			
 
				+		if (info->current_index == array_index)
			
 
				+			return 0;
			
 
				+
			
 
				+		r = dm_bitset_flush(info, root, new_root);
			
 
				+		if (r)
			
 
				+			return r;
			
 
				+	}
			
 
				+
			
 
				+	return read_bits(info, root, array_index);
			
 
				+}
			
 
				+
			
 
				+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		      uint32_t index, dm_block_t *new_root)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
			
 
				+
			
 
				+	r = get_array_entry(info, root, index, new_root);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	set_bit(b, (unsigned long *) &info->current_bits);
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
			
 
				+
			
 
				+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+			uint32_t index, dm_block_t *new_root)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
			
 
				+
			
 
				+	r = get_array_entry(info, root, index, new_root);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	clear_bit(b, (unsigned long *) &info->current_bits);
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
			
 
				+
			
 
				+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		       uint32_t index, dm_block_t *new_root, bool *result)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
			
 
				+
			
 
				+	r = get_array_entry(info, root, index, new_root);
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				+	*result = test_bit(b, (unsigned long *) &info->current_bits);
			
 
				+	return 0;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
--- a/drivers/md/persistent-data/dm-bitset.h
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -0,0 +1,165 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Red Hat, Inc.
			
 
				+ *
			
 
				+ * This file is released under the GPL.
			
 
				+ */
			
 
				+#ifndef _LINUX_DM_BITSET_H
			
 
				+#define _LINUX_DM_BITSET_H
			
 
				+
			
 
				+#include "dm-array.h"
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+/*
			
 
				+ * This bitset type is a thin wrapper round a dm_array of 64bit words.  It
			
 
				+ * uses a tiny, one word cache to reduce the number of array lookups and so
			
 
				+ * increase performance.
			
 
				+ *
			
 
				+ * Like the dm-array that it's based on, the caller needs to keep track of
			
 
				+ * the size of the bitset separately.  The underlying dm-array implicitly
			
 
				+ * knows how many words it's storing and will return -ENODATA if you try
			
 
				+ * and access an out of bounds word.  However, an out of bounds bit in the
			
 
				+ * final word will _not_ be detected, you have been warned.
			
 
				+ *
			
 
				+ * Bits are indexed from zero.
			
 
				+ *
			
 
				+ * Typical use:
			
 
				+ *
			
 
				+ * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init().
			
 
				+ *    This describes the bitset and includes the cache.  It's not called
			
 
				+ *    dm_bitset_info in line with other data structures because it does
			
 
				+ *    include instance data.
			
 
				+ *
			
 
				+ * b) Get yourself a root.  The root is the index of a block of data on the
			
 
				+ *    disk that holds a particular instance of a bitset.  You may have a
			
 
				+ *    pre existing root in your metadata that you wish to use, or you may
			
 
				+ *    want to create a brand new, empty bitset with dm_bitset_empty().
			
 
				+ *
			
 
				+ * Like the other data structures in this library, dm_bitset objects are
			
 
				+ * immutable between transactions.  Update functions will return you the
			
 
				+ * root for a _new_ array.  If you've incremented the old root, via
			
 
				+ * dm_tm_inc(), before calling the update function you may continue to use
			
 
				+ * it in parallel with the new root.
			
 
				+ *
			
 
				+ * Even read operations may trigger the cache to be flushed and as such
			
 
				+ * return a root for a new, updated bitset.
			
 
				+ *
			
 
				+ * c) resize a bitset with dm_bitset_resize().
			
 
				+ *
			
 
				+ * d) Set a bit with dm_bitset_set_bit().
			
 
				+ *
			
 
				+ * e) Clear a bit with dm_bitset_clear_bit().
			
 
				+ *
			
 
				+ * f) Test a bit with dm_bitset_test_bit().
			
 
				+ *
			
 
				+ * g) Flush all updates from the cache with dm_bitset_flush().
			
 
				+ *
			
 
				+ * h) Destroy the bitset with dm_bitset_del().  This tells the transaction
			
 
				+ *    manager that you're no longer using this data structure so it can
			
 
				+ *    recycle its blocks.  (dm_bitset_dec() would be a better name for it,
			
 
				+ *    but del is in keeping with dm_btree_del()).
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * Opaque object.  Unlike dm_array_info, you should have one of these per
			
 
				+ * bitset.  Initialise with dm_disk_bitset_init().
			
 
				+ */
			
 
				+struct dm_disk_bitset {
			
 
				+	struct dm_array_info array_info;
			
 
				+
			
 
				+	uint32_t current_index;
			
 
				+	uint64_t current_bits;
			
 
				+
			
 
				+	bool current_index_set:1;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Sets up a dm_disk_bitset structure.  You don't need to do anything with
			
 
				+ * this structure when you finish using it.
			
 
				+ *
			
 
				+ * tm - the transaction manager that should supervise this structure
			
 
				+ * info - the structure being initialised
			
 
				+ */
			
 
				+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
			
 
				+			 struct dm_disk_bitset *info);
			
 
				+
			
 
				+/*
			
 
				+ * Create an empty, zero length bitset.
			
 
				+ *
			
 
				+ * info - describes the bitset
			
 
				+ * new_root - on success, points to the new root block
			
 
				+ */
			
 
				+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
			
 
				+
			
 
				+/*
			
 
				+ * Resize the bitset.
			
 
				+ *
			
 
				+ * info - describes the bitset
			
 
				+ * old_root - the root block of the array on disk
			
 
				+ * old_nr_entries - the number of bits in the old bitset
			
 
				+ * new_nr_entries - the number of bits you want in the new bitset
			
 
				+ * default_value - the value for any new bits
			
 
				+ * new_root - on success, points to the new root block
			
 
				+ */
			
 
				+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root,
			
 
				+		     uint32_t old_nr_entries, uint32_t new_nr_entries,
			
 
				+		     bool default_value, dm_block_t *new_root);
			
 
				+
			
 
				+/*
			
 
				+ * Frees the bitset.
			
 
				+ */
			
 
				+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root);
			
 
				+
			
 
				+/*
			
 
				+ * Set a bit.
			
 
				+ *
			
 
				+ * info - describes the bitset
			
 
				+ * root - the root block of the bitset
			
 
				+ * index - the bit index
			
 
				+ * new_root - on success, points to the new root block
			
 
				+ *
			
 
				+ * -ENODATA will be returned if the index is out of bounds.
			
 
				+ */
			
 
				+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		      uint32_t index, dm_block_t *new_root);
			
 
				+
			
 
				+/*
			
 
				+ * Clears a bit.
			
 
				+ *
			
 
				+ * info - describes the bitset
			
 
				+ * root - the root block of the bitset
			
 
				+ * index - the bit index
			
 
				+ * new_root - on success, points to the new root block
			
 
				+ *
			
 
				+ * -ENODATA will be returned if the index is out of bounds.
			
 
				+ */
			
 
				+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+			uint32_t index, dm_block_t *new_root);
			
 
				+
			
 
				+/*
			
 
				+ * Tests a bit.
			
 
				+ *
			
 
				+ * info - describes the bitset
			
 
				+ * root - the root block of the bitset
			
 
				+ * index - the bit index
			
 
				+ * new_root - on success, points to the new root block (cached values may have been written)
			
 
				+ * result - the bit value you're after
			
 
				+ *
			
 
				+ * -ENODATA will be returned if the index is out of bounds.
			
 
				+ */
			
 
				+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		       uint32_t index, dm_block_t *new_root, bool *result);
			
 
				+
			
 
				+/*
			
 
				+ * Flush any cached changes to disk.
			
 
				+ *
			
 
				+ * info - describes the bitset
			
 
				+ * root - the root block of the bitset
			
 
				+ * new_root - on success, points to the new root block
			
 
				+ */
			
 
				+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
			
 
				+		    dm_block_t *new_root);
			
 
				+
			
 
				+/*----------------------------------------------------------------*/
			
 
				+
			
 
				+#endif /* _LINUX_DM_BITSET_H */
			
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -613,6 +613,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 
				 
			
 
				 	return dm_bufio_write_dirty_buffers(bm->bufio);
			
 
				 }
			
 
				+EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
			
 
				 
			
 
				 void dm_bm_set_read_only(struct dm_block_manager *bm)
			
 
				 {
			
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -64,6 +64,7 @@ struct ro_spine {
 
				 void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
			
 
				 int exit_ro_spine(struct ro_spine *s);
			
 
				 int ro_step(struct ro_spine *s, dm_block_t new_child);
			
 
				+void ro_pop(struct ro_spine *s);
			
 
				 struct btree_node *ro_node(struct ro_spine *s);
			
 
				 
			
 
				 struct shadow_spine {
			
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -164,6 +164,13 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				+void ro_pop(struct ro_spine *s)
			
 
				+{
			
 
				+	BUG_ON(!s->count);
			
 
				+	--s->count;
			
 
				+	unlock_block(s->info, s->nodes[s->count]);
			
 
				+}
			
 
				+
			
 
				 struct btree_node *ro_node(struct ro_spine *s)
			
 
				 {
			
 
				 	struct dm_block *block;
			
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -807,3 +807,55 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 
				 	return r ? r : count;
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
			
 
				+
			
 
				+/*
			
 
				+ * FIXME: We shouldn't use a recursive algorithm when we have limited stack
			
 
				+ * space.  Also this only works for single level trees.
			
 
				+ */
			
 
				+static int walk_node(struct ro_spine *s, dm_block_t block,
			
 
				+		     int (*fn)(void *context, uint64_t *keys, void *leaf),
			
 
				+		     void *context)
			
 
				+{
			
 
				+	int r;
			
 
				+	unsigned i, nr;
			
 
				+	struct btree_node *n;
			
 
				+	uint64_t keys;
			
 
				+
			
 
				+	r = ro_step(s, block);
			
 
				+	n = ro_node(s);
			
 
				+
			
 
				+	nr = le32_to_cpu(n->header.nr_entries);
			
 
				+	for (i = 0; i < nr; i++) {
			
 
				+		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			
 
				+			r = walk_node(s, value64(n, i), fn, context);
			
 
				+			if (r)
			
 
				+				goto out;
			
 
				+		} else {
			
 
				+			keys = le64_to_cpu(*key_ptr(n, i));
			
 
				+			r = fn(context, &keys, value_ptr(n, i));
			
 
				+			if (r)
			
 
				+				goto out;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	ro_pop(s);
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
			
 
				+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
			
 
				+		  void *context)
			
 
				+{
			
 
				+	int r;
			
 
				+	struct ro_spine spine;
			
 
				+
			
 
				+	BUG_ON(info->levels > 1);
			
 
				+
			
 
				+	init_ro_spine(&spine, info);
			
 
				+	r = walk_node(&spine, root, fn, context);
			
 
				+	exit_ro_spine(&spine);
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(dm_btree_walk);
			
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -58,21 +58,21 @@ struct dm_btree_value_type {
 
				 	 * somewhere.) This method is _not_ called for insertion of a new
			
 
				 	 * value: It is assumed the ref count is already 1.
			
 
				 	 */
			
 
				-	void (*inc)(void *context, void *value);
			
 
				+	void (*inc)(void *context, const void *value);
			
 
				 
			
 
				 	/*
			
 
				 	 * This value is being deleted.  The btree takes care of freeing
			
 
				 	 * the memory pointed to by @value.  Often the del function just
			
 
				 	 * needs to decrement a reference count somewhere.
			
 
				 	 */
			
 
				-	void (*dec)(void *context, void *value);
			
 
				+	void (*dec)(void *context, const void *value);
			
 
				 
			
 
				 	/*
			
 
				 	 * A test for equality between two values.  When a value is
			
 
				 	 * overwritten with a new one, the old one has the dec method
			
 
				 	 * called _unless_ the new and old value are deemed equal.
			
 
				 	 */
			
 
				-	int (*equal)(void *context, void *value1, void *value2);
			
 
				+	int (*equal)(void *context, const void *value1, const void *value2);
			
 
				 };
			
 
				 
			
 
				 /*
			
@@ -142,4 +142,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 
				 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			
 
				 			      uint64_t *result_keys);
			
 
				 
			
 
				+/*
			
 
				+ * Iterate through a btree, calling fn() on each entry.
			
 
				+ * It only works for single level trees and is internally recursive, so
			
 
				+ * monitor stack usage carefully.
			
 
				+ */
			
 
				+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
			
 
				+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
			
 
				+		  void *context);
			
 
				+
			
 
				 #endif	/* _LINUX_DM_BTREE_H */
			
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -68,8 +68,8 @@ typedef void (*dm_postsuspend_fn) (struct dm_target *ti);
 
				 typedef int (*dm_preresume_fn) (struct dm_target *ti);
			
 
				 typedef void (*dm_resume_fn) (struct dm_target *ti);
			
 
				 
			
 
				-typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type,
			
 
				-			     unsigned status_flags, char *result, unsigned maxlen);
			
 
				+typedef void (*dm_status_fn) (struct dm_target *ti, status_type_t status_type,
			
 
				+			      unsigned status_flags, char *result, unsigned maxlen);
			
 
				 
			
 
				 typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv);
			
 
				 
			
@@ -175,6 +175,14 @@ struct target_type {
 
				 #define DM_TARGET_IMMUTABLE		0x00000004
			
 
				 #define dm_target_is_immutable(type)	((type)->features & DM_TARGET_IMMUTABLE)
			
 
				 
			
 
				+/*
			
 
				+ * Some targets need to be sent the same WRITE bio several times so
			
 
				+ * that they can send copies of it to different devices.  This function
			
 
				+ * examines any supplied bio and returns the number of copies of it the
			
 
				+ * target requires.
			
 
				+ */
			
 
				+typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
			
 
				+
			
 
				 struct dm_target {
			
 
				 	struct dm_table *table;
			
 
				 	struct target_type *type;
			
@@ -187,26 +195,26 @@ struct dm_target {
 
				 	uint32_t max_io_len;
			
 
				 
			
 
				 	/*
			
 
				-	 * A number of zero-length barrier requests that will be submitted
			
 
				+	 * A number of zero-length barrier bios that will be submitted
			
 
				 	 * to the target for the purpose of flushing cache.
			
 
				 	 *
			
 
				-	 * The request number can be accessed with dm_bio_get_target_request_nr.
			
 
				-	 * It is a responsibility of the target driver to remap these requests
			
 
				+	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
			
 
				+	 * It is a responsibility of the target driver to remap these bios
			
 
				 	 * to the real underlying devices.
			
 
				 	 */
			
 
				-	unsigned num_flush_requests;
			
 
				+	unsigned num_flush_bios;
			
 
				 
			
 
				 	/*
			
 
				-	 * The number of discard requests that will be submitted to the target.
			
 
				-	 * The request number can be accessed with dm_bio_get_target_request_nr.
			
 
				+	 * The number of discard bios that will be submitted to the target.
			
 
				+	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
			
 
				 	 */
			
 
				-	unsigned num_discard_requests;
			
 
				+	unsigned num_discard_bios;
			
 
				 
			
 
				 	/*
			
 
				-	 * The number of WRITE SAME requests that will be submitted to the target.
			
 
				-	 * The request number can be accessed with dm_bio_get_target_request_nr.
			
 
				+	 * The number of WRITE SAME bios that will be submitted to the target.
			
 
				+	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
			
 
				 	 */
			
 
				-	unsigned num_write_same_requests;
			
 
				+	unsigned num_write_same_bios;
			
 
				 
			
 
				 	/*
			
 
				 	 * The minimum number of extra bytes allocated in each bio for the
			
@@ -214,6 +222,13 @@ struct dm_target {
 
				 	 */
			
 
				 	unsigned per_bio_data_size;
			
 
				 
			
 
				+	/*
			
 
				+	 * If defined, this function is called to find out how many
			
 
				+	 * duplicate bios should be sent to the target when writing
			
 
				+	 * data.
			
 
				+	 */
			
 
				+	dm_num_write_bios_fn num_write_bios;
			
 
				+
			
 
				 	/* target specific data */
			
 
				 	void *private;
			
 
				 
			
@@ -233,10 +248,10 @@ struct dm_target {
 
				 	bool discards_supported:1;
			
 
				 
			
 
				 	/*
			
 
				-	 * Set if the target required discard request to be split
			
 
				+	 * Set if the target required discard bios to be split
			
 
				 	 * on max_io_len boundary.
			
 
				 	 */
			
 
				-	bool split_discard_requests:1;
			
 
				+	bool split_discard_bios:1;
			
 
				 
			
 
				 	/*
			
 
				 	 * Set if this target does not return zeroes on discarded blocks.
			
@@ -261,7 +276,7 @@ struct dm_target_io {
 
				 	struct dm_io *io;
			
 
				 	struct dm_target *ti;
			
 
				 	union map_info info;
			
 
				-	unsigned target_request_nr;
			
 
				+	unsigned target_bio_nr;
			
 
				 	struct bio clone;
			
 
				 };
			
 
				 
			
@@ -275,9 +290,9 @@ static inline struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 
				 	return (struct bio *)((char *)data + data_size + offsetof(struct dm_target_io, clone));
			
 
				 }
			
 
				 
			
 
				-static inline unsigned dm_bio_get_target_request_nr(const struct bio *bio)
			
 
				+static inline unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
			
 
				 {
			
 
				-	return container_of(bio, struct dm_target_io, clone)->target_request_nr;
			
 
				+	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
			
 
				 }
			
 
				 
			
 
				 int dm_register_target(struct target_type *t);
			
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -21,11 +21,34 @@
 
				 
			
 
				 #define DM_KCOPYD_IGNORE_ERROR 1
			
 
				 
			
 
				+struct dm_kcopyd_throttle {
			
 
				+	unsigned throttle;
			
 
				+	unsigned num_io_jobs;
			
 
				+	unsigned io_period;
			
 
				+	unsigned total_period;
			
 
				+	unsigned last_jiffies;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * kcopyd clients that want to support throttling must pass an initialised
			
 
				+ * dm_kcopyd_throttle struct into dm_kcopyd_client_create().
			
 
				+ * Two or more clients may share the same instance of this struct between
			
 
				+ * them if they wish to be throttled as a group.
			
 
				+ *
			
 
				+ * This macro also creates a corresponding module parameter to configure
			
 
				+ * the amount of throttling.
			
 
				+ */
			
 
				+#define DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(name, description)	\
			
 
				+static struct dm_kcopyd_throttle dm_kcopyd_throttle = { 100, 0, 0, 0, 0 }; \
			
 
				+module_param_named(name, dm_kcopyd_throttle.throttle, uint, 0644); \
			
 
				+MODULE_PARM_DESC(name, description)
			
 
				+
			
 
				 /*
			
 
				  * To use kcopyd you must first create a dm_kcopyd_client object.
			
 
				+ * throttle can be NULL if you don't want any throttling.
			
 
				  */
			
 
				 struct dm_kcopyd_client;
			
 
				-struct dm_kcopyd_client *dm_kcopyd_client_create(void);
			
 
				+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle);
			
 
				 void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
			
 
				 
			
 
				 /*
			
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 
				 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
			
 
				 
			
 
				 #define DM_VERSION_MAJOR	4
			
 
				-#define DM_VERSION_MINOR	23
			
 
				-#define DM_VERSION_PATCHLEVEL	1
			
 
				-#define DM_VERSION_EXTRA	"-ioctl (2012-12-18)"
			
 
				+#define DM_VERSION_MINOR	24
			
 
				+#define DM_VERSION_PATCHLEVEL	0
			
 
				+#define DM_VERSION_EXTRA	"-ioctl (2013-01-15)"
			
 
				 
			
 
				 /* Status bits */
			
 
				 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
			
@@ -336,4 +336,9 @@ enum {
 
				  */
			
 
				 #define DM_SECURE_DATA_FLAG		(1 << 15) /* In */
			
 
				 
			
 
				+/*
			
 
				+ * If set, a message generated output data.
			
 
				+ */
			
 
				+#define DM_DATA_OUT_FLAG		(1 << 16) /* Out */
			
 
				+
			
 
				 #endif				/* _LINUX_DM_IOCTL_H */