21 files changed, 1544 insertions, 467 deletions
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
index d7c440b444c..df52a849957 100644
--- a/Documentation/device-mapper/cache-policies.txt
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -30,8 +30,10 @@ multiqueue
 
 This policy is the default.
 
-The multiqueue policy has two sets of 16 queues: one set for entries
-waiting for the cache and another one for those in the cache.
+The multiqueue policy has three sets of 16 queues: one set for entries
+waiting for the cache and another two for those in the cache (a set for
+clean entries and a set for dirty entries).
+
 Cache entries in the queues are aged based on logical time. Entry into
 the cache is based on variable thresholds and queue selection is based
 on hit count on entry. The policy aims to take different cache miss
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 33d45ee0b73..274752f8bdf 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -68,10 +68,11 @@ So large block sizes are bad because they waste cache space.  And small
 block sizes are bad because they increase the amount of metadata (both
 in core and on disk).
 
-Writeback/writethrough
-----------------------
+Cache operating modes
+---------------------
 
-The cache has two modes, writeback and writethrough.
+The cache has three operating modes: writeback, writethrough and
+passthrough.
 
 If writeback, the default, is selected then a write to a block that is
 cached will go only to the cache and the block will be marked dirty in
@@ -81,8 +82,31 @@ If writethrough is selected then a write to a cached block will not
 complete until it has hit both the origin and cache devices.  Clean
 blocks should remain clean.
 
+If passthrough is selected, useful when the cache contents are not known
+to be coherent with the origin device, then all reads are served from
+the origin device (all reads miss the cache) and all writes are
+forwarded to the origin device; additionally, write hits cause cache
+block invalidates.  To enable passthrough mode the cache must be clean.
+Passthrough mode allows a cache device to be activated without having to
+worry about coherency.  Coherency that exists is maintained, although
+the cache will gradually cool as writes take place.  If the coherency of
+the cache can later be verified, or established through use of the
+"invalidate_cblocks" message, the cache device can be transitioned to
+writethrough or writeback mode while still warm.  Otherwise, the cache
+contents can be discarded prior to transitioning to the desired
+operating mode.
+
 A simple cleaner policy is provided, which will clean (write back) all
-dirty blocks in a cache.  Useful for decommissioning a cache.
+dirty blocks in a cache.  Useful for decommissioning a cache or when
+shrinking a cache.  Shrinking the cache's fast device requires all cache
+blocks, in the area of the cache being removed, to be clean.  If the
+area being removed from the cache still contains dirty blocks the resize
+will fail.  Care must be taken to never reduce the volume used for the
+cache's fast device until the cache is clean.  This is of particular
+importance if writeback mode is used.  Writethrough and passthrough
+modes already maintain a clean cache.  Future support to partially clean
+the cache, above a specified threshold, will allow for keeping the cache
+warm and in writeback mode during resize.
 
 Migration throttling
 --------------------
@@ -161,7 +185,7 @@ Constructor
  block size      : cache unit size in sectors
 
  #feature args   : number of feature arguments passed
- feature args    : writethrough.  (The default is writeback.)
+ feature args    : writethrough or passthrough (The default is writeback.)
 
  policy          : the replacement policy to use
  #policy args    : an even number of arguments corresponding to
@@ -177,6 +201,13 @@ Optional feature arguments are:
 		   back cache block contents later for performance reasons,
 		   so they may differ from the corresponding origin blocks.
 
+   passthrough	 : a degraded mode useful for various cache coherency
+		   situations (e.g., rolling back snapshots of
+		   underlying storage).	 Reads and writes always go to
+		   the origin.	If a write goes to a cached origin
+		   block, then the cache block is invalidated.
+		   To enable passthrough mode the cache must be clean.
+
 A policy called 'default' is always registered.  This is an alias for
 the policy we currently think is giving best all round performance.
 
@@ -231,12 +262,26 @@ The message format is:
 E.g.
    dmsetup message my_cache 0 sequential_threshold 1024
 
+
+Invalidation is removing an entry from the cache without writing it
+back.  Cache blocks can be invalidated via the invalidate_cblocks
+message, which takes an arbitrary number of cblock ranges.  Each cblock
+must be expressed as a decimal value, in the future a variant message
+that takes cblock ranges expressed in hexidecimal may be needed to
+better support efficient invalidation of larger caches.  The cache must
+be in passthrough mode when invalidate_cblocks is used.
+
+   invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]*
+
+E.g.
+   dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789
+
 Examples
 ========
 
 The test suite can be found here:
 
-https://github.com/jthornber/thinp-test-suite
+https://github.com/jthornber/device-mapper-test-suite
 
 dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
 	/dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0'
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 2c656ae43ba..c81839b52c4 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -4,12 +4,15 @@ dm-crypt
 Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
+For a more detailed description of supported parameters see:
+http://code.google.com/p/cryptsetup/wiki/DMCrypt
+
 Parameters: <cipher> <key> <iv_offset> <device path> \
 	      <offset> [<#opt_params> <opt_params>]
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
-    (In format cipher[:keycount]-chainmode-ivopts:ivmode).
+    (In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
     Examples:
        des
        aes-cbc-essiv:sha256
@@ -19,7 +22,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
 
 <key>
     Key used for encryption. It is encoded as a hexadecimal number.
-    You can only use key sizes that are valid for the selected cipher.
+    You can only use key sizes that are valid for the selected cipher
+    in combination with the selected iv mode.
+    Note that for some iv modes the key string can contain additional
+    keys (for example IV seed) so the key contains more parts concatenated
+    into a single string.
 
 <keycount>
     Multi-key compatibility mode. You can define <keycount> keys and
diff --git a/MAINTAINERS b/MAINTAINERS
index 647d21877d7..6610106bece 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2647,6 +2647,7 @@ M:	dm-devel@redhat.com
 L:	dm-devel@redhat.com
 W:	http://sources.redhat.com/dm
 Q:	http://patchwork.kernel.org/project/dm-devel/list/
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git
 T:	quilt http://people.redhat.com/agk/patches/linux/editing/
 S:	Maintained
 F:	Documentation/device-mapper/
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 30b426ed744..f2ccbc3b9fe 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -297,6 +297,17 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_LOG_USERSPACE
+	tristate "Mirror userspace logging"
+	depends on DM_MIRROR && NET
+	select CONNECTOR
+	---help---
+	  The userspace logging module provides a mechanism for
+	  relaying the dm-dirty-log API to userspace.  Log designs
+	  which are more suited to userspace implementation (e.g.
+	  shared storage logs) or experimental logs can be implemented
+	  by leveraging this framework.
+
 config DM_RAID
        tristate "RAID 1/4/5/6/10 target"
        depends on BLK_DEV_DM
@@ -323,17 +334,6 @@ config DM_RAID
 	 RAID-5, RAID-6 distributes the syndromes across the drives
 	 in one of the available parity distribution methods.
 
-config DM_LOG_USERSPACE
-	tristate "Mirror userspace logging"
-	depends on DM_MIRROR && NET
-	select CONNECTOR
-	---help---
-	  The userspace logging module provides a mechanism for
-	  relaying the dm-dirty-log API to userspace.  Log designs
-	  which are more suited to userspace implementation (e.g.
-	  shared storage logs) or experimental logs can be implemented
-	  by leveraging this framework.
-
 config DM_ZERO
 	tristate "Zero target"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255bbff..9ef0752e8a0 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -20,7 +20,13 @@
 
 #define CACHE_SUPERBLOCK_MAGIC 06142003
 #define CACHE_SUPERBLOCK_LOCATION 0
-#define CACHE_VERSION 1
+
+/*
+ * defines a range of metadata versions that this module can handle.
+ */
+#define MIN_CACHE_VERSION 1
+#define MAX_CACHE_VERSION 1
+
 #define CACHE_METADATA_CACHE_SIZE 64
 
 /*
@@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
 						      SUPERBLOCK_CSUM_XOR));
 }
 
+static int check_metadata_version(struct cache_disk_superblock *disk_super)
+{
+	uint32_t metadata_version = le32_to_cpu(disk_super->version);
+	if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
+		DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
+		      metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int sb_check(struct dm_block_validator *v,
 		    struct dm_block *b,
 		    size_t sb_block_size)
@@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v,
 		return -EILSEQ;
 	}
 
-	return 0;
+	return check_metadata_version(disk_super);
 }
 
 static struct dm_block_validator sb_validator = {
@@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd,
 
 /*----------------------------------------------------------------*/
 
-static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
 {
 	int r;
 	unsigned i;
@@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
 		return r;
 
 	data_le = dm_block_data(b);
-	*result = 1;
+	*result = true;
 	for (i = 0; i < sb_block_size; i++) {
 		if (data_le[i] != zero) {
-			*result = 0;
+			*result = false;
 			break;
 		}
 	}
@@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
 	disk_super->flags = 0;
 	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
 	disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
-	disk_super->version = cpu_to_le32(CACHE_VERSION);
+	disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
 	memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
 	memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
 	disk_super->policy_hint_size = 0;
@@ -411,7 +429,8 @@ bad:
 static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
 				     bool format_device)
 {
-	int r, unformatted;
+	int r;
+	bool unformatted = false;
 
 	r = __superblock_all_zeroes(cmd->bm, &unformatted);
 	if (r)
@@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
 	kfree(cmd);
 }
 
+/*
+ * Checks that the given cache block is either unmapped or clean.
+ */
+static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
+				   bool *result)
+{
+	int r;
+	__le64 value;
+	dm_oblock_t ob;
+	unsigned flags;
+
+	r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
+	if (r) {
+		DMERR("block_unmapped_or_clean failed");
+		return r;
+	}
+
+	unpack_value(value, &ob, &flags);
+	*result = !((flags & M_VALID) && (flags & M_DIRTY));
+
+	return 0;
+}
+
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+					dm_cblock_t begin, dm_cblock_t end,
+					bool *result)
+{
+	int r;
+	*result = true;
+
+	while (begin != end) {
+		r = block_unmapped_or_clean(cmd, begin, result);
+		if (r)
+			return r;
+
+		if (!*result) {
+			DMERR("cache block %llu is dirty",
+			      (unsigned long long) from_cblock(begin));
+			return 0;
+		}
+
+		begin = to_cblock(from_cblock(begin) + 1);
+	}
+
+	return 0;
+}
+
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 {
 	int r;
+	bool clean;
 	__le64 null_mapping = pack_value(0, 0);
 
 	down_write(&cmd->root_lock);
 	__dm_bless_for_disk(&null_mapping);
+
+	if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
+		r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
+		if (r) {
+			__dm_unbless_for_disk(&null_mapping);
+			goto out;
+		}
+
+		if (!clean) {
+			DMERR("unable to shrink cache due to dirty blocks");
+			r = -EINVAL;
+			__dm_unbless_for_disk(&null_mapping);
+			goto out;
+		}
+	}
+
 	r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
 			    from_cblock(new_cache_size),
 			    &null_mapping, &cmd->root);
 	if (!r)
 		cmd->cache_blocks = new_cache_size;
 	cmd->changed = true;
+
+out:
 	up_write(&cmd->root_lock);
 
 	return r;
@@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
 
 	return r;
 }
+
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
+{
+	return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index f45cef21f3d..cd906f14f98 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
 int dm_cache_save_hint(struct dm_cache_metadata *cmd,
 		       dm_cblock_t cblock, uint32_t hint);
 
+/*
+ * Query method.  Are all the blocks in the cache clean?
+ */
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+
 /*----------------------------------------------------------------*/
 
 #endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 0928abdc49f..2256a1f24f7 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p,
 
 static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
 {
-	return p->remove_mapping(p, oblock);
+	p->remove_mapping(p, oblock);
+}
+
+static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	return p->remove_cblock(p, cblock);
 }
 
 static inline void policy_force_mapping(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 4296155090b..416b7b752a6 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min)
 
 /*----------------------------------------------------------------*/
 
-static unsigned long *alloc_bitset(unsigned nr_entries)
-{
-	size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
-	return vzalloc(s);
-}
-
-static void free_bitset(unsigned long *bits)
-{
-	vfree(bits);
-}
-
-/*----------------------------------------------------------------*/
-
 /*
  * Large, sequential ios are probably better left on the origin device since
  * spindles tend to have good bandwidth.
@@ -151,6 +138,21 @@ static void queue_init(struct queue *q)
 }
 
 /*
+ * Checks to see if the queue is empty.
+ * FIXME: reduce cpu usage.
+ */
+static bool queue_empty(struct queue *q)
+{
+	unsigned i;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++)
+		if (!list_empty(q->qs + i))
+			return false;
+
+	return true;
+}
+
+/*
  * Insert an entry to the back of the given level.
  */
 static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
@@ -218,17 +220,116 @@ struct entry {
 	struct hlist_node hlist;
 	struct list_head list;
 	dm_oblock_t oblock;
-	dm_cblock_t cblock;	/* valid iff in_cache */
 
 	/*
 	 * FIXME: pack these better
 	 */
-	bool in_cache:1;
+	bool dirty:1;
 	unsigned hit_count;
 	unsigned generation;
 	unsigned tick;
 };
 
+/*
+ * Rather than storing the cblock in an entry, we allocate all entries in
+ * an array, and infer the cblock from the entry position.
+ *
+ * Free entries are linked together into a list.
+ */
+struct entry_pool {
+	struct entry *entries, *entries_end;
+	struct list_head free;
+	unsigned nr_allocated;
+};
+
+static int epool_init(struct entry_pool *ep, unsigned nr_entries)
+{
+	unsigned i;
+
+	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
+	if (!ep->entries)
+		return -ENOMEM;
+
+	ep->entries_end = ep->entries + nr_entries;
+
+	INIT_LIST_HEAD(&ep->free);
+	for (i = 0; i < nr_entries; i++)
+		list_add(&ep->entries[i].list, &ep->free);
+
+	ep->nr_allocated = 0;
+
+	return 0;
+}
+
+static void epool_exit(struct entry_pool *ep)
+{
+	vfree(ep->entries);
+}
+
+static struct entry *alloc_entry(struct entry_pool *ep)
+{
+	struct entry *e;
+
+	if (list_empty(&ep->free))
+		return NULL;
+
+	e = list_entry(list_pop(&ep->free), struct entry, list);
+	INIT_LIST_HEAD(&e->list);
+	INIT_HLIST_NODE(&e->hlist);
+	ep->nr_allocated++;
+
+	return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
+{
+	struct entry *e = ep->entries + from_cblock(cblock);
+	list_del(&e->list);
+
+	INIT_LIST_HEAD(&e->list);
+	INIT_HLIST_NODE(&e->hlist);
+	ep->nr_allocated++;
+
+	return e;
+}
+
+static void free_entry(struct entry_pool *ep, struct entry *e)
+{
+	BUG_ON(!ep->nr_allocated);
+	ep->nr_allocated--;
+	INIT_HLIST_NODE(&e->hlist);
+	list_add(&e->list, &ep->free);
+}
+
+/*
+ * Returns NULL if the entry is free.
+ */
+static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
+{
+	struct entry *e = ep->entries + from_cblock(cblock);
+	return !hlist_unhashed(&e->hlist) ? e : NULL;
+}
+
+static bool epool_empty(struct entry_pool *ep)
+{
+	return list_empty(&ep->free);
+}
+
+static bool in_pool(struct entry_pool *ep, struct entry *e)
+{
+	return e >= ep->entries && e < ep->entries_end;
+}
+
+static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
+{
+	return to_cblock(e - ep->entries);
+}
+
+/*----------------------------------------------------------------*/
+
 struct mq_policy {
 	struct dm_cache_policy policy;
 
@@ -238,13 +339,22 @@ struct mq_policy {
 	struct io_tracker tracker;
 
 	/*
-	 * We maintain two queues of entries.  The cache proper contains
-	 * the currently active mappings.  Whereas the pre_cache tracks
-	 * blocks that are being hit frequently and potential candidates
-	 * for promotion to the cache.
+	 * Entries come from two pools, one of pre-cache entries, and one
+	 * for the cache proper.
+	 */
+	struct entry_pool pre_cache_pool;
+	struct entry_pool cache_pool;
+
+	/*
+	 * We maintain three queues of entries.  The cache proper,
+	 * consisting of a clean and dirty queue, contains the currently
+	 * active mappings.  Whereas the pre_cache tracks blocks that
+	 * are being hit frequently and potential candidates for promotion
+	 * to the cache.
 	 */
 	struct queue pre_cache;
-	struct queue cache;
+	struct queue cache_clean;
+	struct queue cache_dirty;
 
 	/*
 	 * Keeps track of time, incremented by the core.  We use this to
@@ -282,25 +392,6 @@ struct mq_policy {
 	unsigned promote_threshold;
 
 	/*
-	 * We need cache_size entries for the cache, and choose to have
-	 * cache_size entries for the pre_cache too.  One motivation for
-	 * using the same size is to make the hit counts directly
-	 * comparable between pre_cache and cache.
-	 */
-	unsigned nr_entries;
-	unsigned nr_entries_allocated;
-	struct list_head free;
-
-	/*
-	 * Cache blocks may be unallocated.  We store this info in a
-	 * bitset.
-	 */
-	unsigned long *allocation_bitset;
-	unsigned nr_cblocks_allocated;
-	unsigned find_free_nr_words;
-	unsigned find_free_last_word;
-
-	/*
 	 * The hash table allows us to quickly find an entry by origin
 	 * block.  Both pre_cache and cache entries are in here.
 	 */
@@ -310,49 +401,6 @@ struct mq_policy {
 };
 
 /*----------------------------------------------------------------*/
-/* Free/alloc mq cache entry structures. */
-static void takeout_queue(struct list_head *lh, struct queue *q)
-{
-	unsigned level;
-
-	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_splice(q->qs + level, lh);
-}
-
-static void free_entries(struct mq_policy *mq)
-{
-	struct entry *e, *tmp;
-
-	takeout_queue(&mq->free, &mq->pre_cache);
-	takeout_queue(&mq->free, &mq->cache);
-
-	list_for_each_entry_safe(e, tmp, &mq->free, list)
-		kmem_cache_free(mq_entry_cache, e);
-}
-
-static int alloc_entries(struct mq_policy *mq, unsigned elts)
-{
-	unsigned u = mq->nr_entries;
-
-	INIT_LIST_HEAD(&mq->free);
-	mq->nr_entries_allocated = 0;
-
-	while (u--) {
-		struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
-
-		if (!e) {
-			free_entries(mq);
-			return -ENOMEM;
-		}
-
-
-		list_add(&e->list, &mq->free);
-	}
-
-	return 0;
-}
-
-/*----------------------------------------------------------------*/
 
 /*
  * Simple hash table implementation.  Should replace with the standard hash
@@ -388,96 +436,14 @@ static void hash_remove(struct entry *e)
 
 /*----------------------------------------------------------------*/
 
-/*
- * Allocates a new entry structure.  The memory is allocated in one lump,
- * so we just handing it out here.  Returns NULL if all entries have
- * already been allocated.  Cannot fail otherwise.
- */
-static struct entry *alloc_entry(struct mq_policy *mq)
-{
-	struct entry *e;
-
-	if (mq->nr_entries_allocated >= mq->nr_entries) {
-		BUG_ON(!list_empty(&mq->free));
-		return NULL;
-	}
-
-	e = list_entry(list_pop(&mq->free), struct entry, list);
-	INIT_LIST_HEAD(&e->list);
-	INIT_HLIST_NODE(&e->hlist);
-
-	mq->nr_entries_allocated++;
-	return e;
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Mark cache blocks allocated or not in the bitset.
- */
-static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
-	BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
-	BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
-
-	set_bit(from_cblock(cblock), mq->allocation_bitset);
-	mq->nr_cblocks_allocated++;
-}
-
-static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
-	BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
-	BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
-
-	clear_bit(from_cblock(cblock), mq->allocation_bitset);
-	mq->nr_cblocks_allocated--;
-}
-
 static bool any_free_cblocks(struct mq_policy *mq)
 {
-	return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
+	return !epool_empty(&mq->cache_pool);
 }
 
-/*
- * Fills result out with a cache block that isn't in use, or return
- * -ENOSPC.  This does _not_ mark the cblock as allocated, the caller is
- * reponsible for that.
- */
-static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
-			      dm_cblock_t *result, unsigned *last_word)
+static bool any_clean_cblocks(struct mq_policy *mq)
 {
-	int r = -ENOSPC;
-	unsigned w;
-
-	for (w = begin; w < end; w++) {
-		/*
-		 * ffz is undefined if no zero exists
-		 */
-		if (mq->allocation_bitset[w] != ~0UL) {
-			*last_word = w;
-			*result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
-			if (from_cblock(*result) < from_cblock(mq->cache_size))
-				r = 0;
-
-			break;
-		}
-	}
-
-	return r;
-}
-
-static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
-{
-	int r;
-
-	if (!any_free_cblocks(mq))
-		return -ENOSPC;
-
-	r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
-	if (r == -ENOSPC && mq->find_free_last_word)
-		r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
-
-	return r;
+	return !queue_empty(&mq->cache_clean);
 }
 
 /*----------------------------------------------------------------*/
@@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e)
 	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
 }
 
+static bool in_cache(struct mq_policy *mq, struct entry *e)
+{
+	return in_pool(&mq->cache_pool, e);
+}
+
 /*
  * Inserts the entry into the pre_cache or the cache.  Ensures the cache
- * block is marked as allocated if necc.  Inserts into the hash table.  Sets the
- * tick which records when the entry was last moved about.
+ * block is marked as allocated if necc.  Inserts into the hash table.
+ * Sets the tick which records when the entry was last moved about.
  */
 static void push(struct mq_policy *mq, struct entry *e)
 {
 	e->tick = mq->tick;
 	hash_insert(mq, e);
 
-	if (e->in_cache) {
-		alloc_cblock(mq, e->cblock);
-		queue_push(&mq->cache, queue_level(e), &e->list);
-	} else
+	if (in_cache(mq, e))
+		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+			   queue_level(e), &e->list);
+	else
 		queue_push(&mq->pre_cache, queue_level(e), &e->list);
 }
 
 /*
  * Removes an entry from pre_cache or cache.  Removes from the hash table.
- * Frees off the cache block if necc.
  */
 static void del(struct mq_policy *mq, struct entry *e)
 {
 	queue_remove(&e->list);
 	hash_remove(e);
-	if (e->in_cache)
-		free_cblock(mq, e->cblock);
 }
 
 /*
@@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e)
  */
 static struct entry *pop(struct mq_policy *mq, struct queue *q)
 {
-	struct entry *e = container_of(queue_pop(q), struct entry, list);
+	struct entry *e;
+	struct list_head *h = queue_pop(q);
 
-	if (e) {
-		hash_remove(e);
+	if (!h)
+		return NULL;
 
-		if (e->in_cache)
-			free_cblock(mq, e->cblock);
-	}
+	e = container_of(h, struct entry, list);
+	hash_remove(e);
 
 	return e;
 }
@@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
  * of the entries.
  *
  * At the moment the threshold is taken by averaging the hit counts of some
- * of the entries in the cache (the first 20 entries of the first level).
+ * of the entries in the cache (the first 20 entries across all levels in
+ * ascending order, giving preference to the clean entries at each level).
  *
  * We can be much cleverer than this though.  For example, each promotion
  * could bump up the threshold helping to prevent churn.  Much more to do
@@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq)
 	struct list_head *head;
 	struct entry *e;
 
-	if ((mq->hit_count >= mq->generation_period) &&
-	    (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
-
+	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
 		mq->hit_count = 0;
 		mq->generation++;
 
 		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
-			head = mq->cache.qs + level;
+			head = mq->cache_clean.qs + level;
+			list_for_each_entry(e, head, list) {
+				nr++;
+				total += e->hit_count;
+
+				if (++count >= MAX_TO_AVERAGE)
+					break;
+			}
+
+			head = mq->cache_dirty.qs + level;
 			list_for_each_entry(e, head, list) {
 				nr++;
 				total += e->hit_count;
@@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 {
-	dm_cblock_t result;
-	struct entry *demoted = pop(mq, &mq->cache);
+	struct entry *demoted = pop(mq, &mq->cache_clean);
+
+	if (!demoted)
+		/*
+		 * We could get a block from mq->cache_dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
 
-	BUG_ON(!demoted);
-	result = demoted->cblock;
 	*oblock = demoted->oblock;
-	demoted->in_cache = false;
-	demoted->hit_count = 1;
-	push(mq, demoted);
+	free_entry(&mq->cache_pool, demoted);
+
+	/*
+	 * We used to put the demoted block into the pre-cache, but I think
+	 * it's simpler to just let it work it's way up from zero again.
+	 * Stops blocks flickering in and out of the cache.
+	 */
 
-	return result;
+	return 0;
 }
 
 /*
@@ -662,17 +649,18 @@ static dm_cblock_t demote_cb