23 files changed, 2184 insertions, 921 deletions
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index ceb359050a5..0c2dec7aec2 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,8 +1,18 @@
 config DM_PERSISTENT_DATA
        tristate
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       depends on BLK_DEV_DM
        select LIBCRC32C
        select DM_BUFIO
        ---help---
 	 Library providing immutable on-disk data structure support for
 	 device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+       boolean "Keep stack trace of persistent data block lock holders"
+       depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+       select STACKTRACE
+       ---help---
+	 Enable this for messages that may help debug problems with the
+	 block manager locking used by thin provisioning and caching.
+
+	 If unsure, say N.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
index cfa95f66223..ff528792c35 100644
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,7 +1,8 @@
 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
 dm-persistent-data-objs := \
+	dm-array.o \
+	dm-bitset.o \
 	dm-block-manager.o \
-	dm-space-map-checker.o \
 	dm-space-map-common.o \
 	dm-space-map-disk.o \
 	dm-space-map-metadata.o \
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
new file mode 100644
index 00000000000..1d75b1dc1e2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-array.h"
+#include "dm-space-map.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "array"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The array is implemented as a fully populated btree, which points to
+ * blocks that contain the packed values.  This is more space efficient
+ * than just using a btree since we don't store 1 key per value.
+ */
+struct array_block {
+	__le32 csum;
+	__le32 max_entries;
+	__le32 nr_entries;
+	__le32 value_size;
+	__le64 blocknr; /* Block this node is supposed to live in. */
+} __packed;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Validator methods.  As usual we calculate a checksum, and also write the
+ * block location into the header (paranoia about ssds remapping areas by
+ * mistake).
+ */
+#define CSUM_XOR 595846735
+
+static void array_block_prepare_for_write(struct dm_block_validator *v,
+					  struct dm_block *b,
+					  size_t size_of_block)
+{
+	struct array_block *bh_le = dm_block_data(b);
+
+	bh_le->blocknr = cpu_to_le64(dm_block_location(b));
+	bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+						 size_of_block - sizeof(__le32),
+						 CSUM_XOR));
+}
+
+static int array_block_check(struct dm_block_validator *v,
+			     struct dm_block *b,
+			     size_t size_of_block)
+{
+	struct array_block *bh_le = dm_block_data(b);
+	__le32 csum_disk;
+
+	if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
+		DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
+			    (unsigned long long) le64_to_cpu(bh_le->blocknr),
+			    (unsigned long long) dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+					       size_of_block - sizeof(__le32),
+					       CSUM_XOR));
+	if (csum_disk != bh_le->csum) {
+		DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
+			    (unsigned) le32_to_cpu(csum_disk),
+			    (unsigned) le32_to_cpu(bh_le->csum));
+		return -EILSEQ;
+	}
+
+	return 0;
+}
+
+static struct dm_block_validator array_validator = {
+	.name = "array",
+	.prepare_for_write = array_block_prepare_for_write,
+	.check = array_block_check
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Functions for manipulating the array blocks.
+ */
+
+/*
+ * Returns a pointer to a value within an array block.
+ *
+ * index - The index into _this_ specific block.
+ */
+static void *element_at(struct dm_array_info *info, struct array_block *ab,
+			unsigned index)
+{
+	unsigned char *entry = (unsigned char *) (ab + 1);
+
+	entry += index * info->value_type.size;
+
+	return entry;
+}
+
+/*
+ * Utility function that calls one of the value_type methods on every value
+ * in an array block.
+ */
+static void on_entries(struct dm_array_info *info, struct array_block *ab,
+		       void (*fn)(void *, const void *))
+{
+	unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
+
+	for (i = 0; i < nr_entries; i++)
+		fn(info->value_type.context, element_at(info, ab, i));
+}
+
+/*
+ * Increment every value in an array block.
+ */
+static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	if (vt->inc)
+		on_entries(info, ab, vt->inc);
+}
+
+/*
+ * Decrement every value in an array block.
+ */
+static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	if (vt->dec)
+		on_entries(info, ab, vt->dec);
+}
+
+/*
+ * Each array block can hold this many values.
+ */
+static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
+{
+	return (size_of_block - sizeof(struct array_block)) / value_size;
+}
+
+/*
+ * Allocate a new array block.  The caller will need to unlock block.
+ */
+static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
+			uint32_t max_entries,
+			struct dm_block **block, struct array_block **ab)
+{
+	int r;
+
+	r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
+	if (r)
+		return r;
+
+	(*ab) = dm_block_data(*block);
+	(*ab)->max_entries = cpu_to_le32(max_entries);
+	(*ab)->nr_entries = cpu_to_le32(0);
+	(*ab)->value_size = cpu_to_le32(info->value_type.size);
+
+	return 0;
+}
+
+/*
+ * Pad an array block out with a particular value.  Every instance will
+ * cause an increment of the value_type.  new_nr must always be more than
+ * the current number of entries.
+ */
+static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
+			const void *value, unsigned new_nr)
+{
+	unsigned i;
+	uint32_t nr_entries;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+	BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
+
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = nr_entries; i < new_nr; i++) {
+		if (vt->inc)
+			vt->inc(vt->context, value);
+		memcpy(element_at(info, ab, i), value, vt->size);
+	}
+	ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Remove some entries from the back of an array block.  Every value
+ * removed will be decremented.  new_nr must be <= the current number of
+ * entries.
+ */
+static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
+			unsigned new_nr)
+{
+	unsigned i;
+	uint32_t nr_entries;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+	BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
+
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = nr_entries; i > new_nr; i--)
+		if (vt->dec)
+			vt->dec(vt->context, element_at(info, ab, i - 1));
+	ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Read locks a block, and coerces it to an array block.  The caller must
+ * unlock 'block' when finished.
+ */
+static int get_ablock(struct dm_array_info *info, dm_block_t b,
+		      struct dm_block **block, struct array_block **ab)
+{
+	int r;
+
+	r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
+	if (r)
+		return r;
+
+	*ab = dm_block_data(*block);
+	return 0;
+}
+
+/*
+ * Unlocks an array block.
+ */
+static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
+{
+	return dm_tm_unlock(info->btree_info.tm, block);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Btree manipulation.
+ */
+
+/*
+ * Looks up an array block in the btree, and then read locks it.
+ *
+ * index is the index of the index of the array_block, (ie. the array index
+ * / max_entries).
+ */
+static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
+			 unsigned index, struct dm_block **block,
+			 struct array_block **ab)
+{
+	int r;
+	uint64_t key = index;
+	__le64 block_le;
+
+	r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
+	if (r)
+		return r;
+
+	return get_ablock(info, le64_to_cpu(block_le), block, ab);
+}
+
+/*
+ * Insert an array block into the btree.  The block is _not_ unlocked.
+ */
+static int insert_ablock(struct dm_array_info *info, uint64_t index,
+			 struct dm_block *block, dm_block_t *root)
+{
+	__le64 block_le = cpu_to_le64(dm_block_location(block));
+
+	__dm_bless_for_disk(block_le);
+	return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
+}
+
+/*
+ * Looks up an array block in the btree.  Then shadows it, and updates the
+ * btree to point to this new shadow.  'root' is an input/output parameter
+ * for both the current root block, and the new one.
+ */
+static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
+			 unsigned index, struct dm_block **block,
+			 struct array_block **ab)
+{
+	int r, inc;
+	uint64_t key = index;
+	dm_block_t b;
+	__le64 block_le;
+
+	/*
+	 * lookup
+	 */
+	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
+	if (r)
+		return r;
+	b = le64_to_cpu(block_le);
+
+	/*
+	 * shadow
+	 */
+	r = dm_tm_shadow_block(info->btree_info.tm, b,
+			       &array_validator, block, &inc);
+	if (r)
+		return r;
+
+	*ab = dm_block_data(*block);
+	if (inc)
+		inc_ablock_entries(info, *ab);
+
+	/*
+	 * Reinsert.
+	 *
+	 * The shadow op will often be a noop.  Only insert if it really
+	 * copied data.
+	 */
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
+		r = insert_ablock(info, index, *block, root);
+	}
+
+	return r;
+}
+
+/*
+ * Allocate an new array block, and fill it with some values.
+ */
+static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
+			     uint32_t max_entries,
+			     unsigned block_index, uint32_t nr,
+			     const void *value, dm_block_t *root)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
+	if (r)
+		return r;
+
+	fill_ablock(info, ab, value, nr);
+	r = insert_ablock(info, block_index, block, root);
+	unlock_ablock(info, block);
+
+	return r;
+}
+
+static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
+			       unsigned begin_block, unsigned end_block,
+			       unsigned max_entries, const void *value,
+			       dm_block_t *root)
+{
+	int r = 0;
+
+	for (; !r && begin_block != end_block; begin_block++)
+		r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root);
+
+	return r;
+}
+
+/*
+ * There are a bunch of functions involved with resizing an array.  This
+ * structure holds information that commonly needed by them.  Purely here
+ * to reduce parameter count.
+ */
+struct resize {
+	/*
+	 * Describes the array.
+	 */
+	struct dm_array_info *info;
+
+	/*
+	 * The current root of the array.  This gets updated.
+	 */
+	dm_block_t root;
+
+	/*
+	 * Metadata block size.  Used to calculate the nr entries in an
+	 * array block.
+	 */
+	size_t size_of_block;
+
+	/*
+	 * Maximum nr entries in an array block.
+	 */
+	unsigned max_entries;
+
+	/*
+	 * nr of completely full blocks in the array.
+	 *
+	 * 'old' refers to before the resize, 'new' after.
+	 */
+	unsigned old_nr_full_blocks, new_nr_full_blocks;
+
+	/*
+	 * Number of entries in the final block.  0 iff only full blocks in
+	 * the array.
+	 */
+	unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;
+
+	/*
+	 * The default value used when growing the array.
+	 */
+	const void *value;
+};
+
+/*
+ * Removes a consecutive set of array blocks from the btree.  The values
+ * in block are decremented as a side effect of the btree remove.
+ *
+ * begin_index - the index of the first array block to remove.
+ * end_index - the one-past-the-end value.  ie. this block is not removed.
+ */
+static int drop_blocks(struct resize *resize, unsigned begin_index,
+		       unsigned end_index)
+{
+	int r;
+
+	while (begin_index != end_index) {
+		uint64_t key = begin_index++;
+		r = dm_btree_remove(&resize->info->btree_info, resize->root,
+				    &key, &resize->root);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Calculates how many blocks are needed for the array.
+ */
+static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
+				       unsigned nr_entries_in_last_block)
+{
+	return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
+}
+
+/*
+ * Shrink an array.
+ */
+static int shrink(struct resize *resize)
+{
+	int r;
+	unsigned begin, end;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	/*
+	 * Lose some blocks from the back?
+	 */
+	if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
+		begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
+					       resize->new_nr_entries_in_last_block);
+		end = total_nr_blocks_needed(resize->old_nr_full_blocks,
+					     resize->old_nr_entries_in_last_block);
+
+		r = drop_blocks(resize, begin, end);
+		if (r)
+			return r;
+	}
+
+	/*
+	 * Trim the new tail block
+	 */
+	if (resize->new_nr_entries_in_last_block) {
+		r = shadow_ablock(resize->info, &resize->root,
+				  resize->new_nr_full_blocks, &block, &ab);
+		if (r)
+			return r;
+
+		trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
+		unlock_ablock(resize->info, block);
+	}
+
+	return 0;
+}
+
+/*
+ * Grow an array.
+ */
+static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	r = shadow_ablock(resize->info, &resize->root,
+			  resize->old_nr_full_blocks, &block, &ab);
+	if (r)
+		return r;
+
+	fill_ablock(resize->info, ab, resize->value, new_nr_entries);
+	unlock_ablock(resize->info, block);
+
+	return r;
+}
+
+static int grow_add_tail_block(struct resize *resize)
+{
+	return insert_new_ablock(resize->info, resize->size_of_block,
+				 resize->max_entries,
+				 resize->new_nr_full_blocks,
+				 resize->new_nr_entries_in_last_block,
+				 resize->value, &resize->root);
+}
+
+static int grow_needs_more_blocks(struct resize *resize)
+{
+	int r;
+	unsigned old_nr_blocks = resize->old_nr_full_blocks;
+
+	if (resize->old_nr_entries_in_last_block > 0) {
+		old_nr_blocks++;
+
+		r = grow_extend_tail_block(resize, resize->max_entries);
+		if (r)
+			return r;
+	}
+
+	r = insert_full_ablocks(resize->info, resize->size_of_block,
+				old_nr_blocks,
+				resize->new_nr_full_blocks,
+				resize->max_entries, resize->value,
+				&resize->root);
+	if (r)
+		return r;
+
+	if (resize->new_nr_entries_in_last_block)
+		r = grow_add_tail_block(resize);
+
+	return r;
+}
+
+static int grow(struct resize *resize)
+{
+	if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
+		return grow_needs_more_blocks(resize);
+
+	else if (resize->old_nr_entries_in_last_block)
+		return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);
+
+	else
+		return grow_add_tail_block(resize);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These are the value_type functions for the btree elements, which point
+ * to array blocks.
+ */
+static void block_inc(void *context, const void *value)
+{
+	__le64 block_le;
+	struct dm_array_info *info = context;
+
+	memcpy(&block_le, value, sizeof(block_le));
+	dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
+}
+
+static void block_dec(void *context, const void *value)
+{
+	int r;
+	uint64_t b;
+	__le64 block_le;
+	uint32_t ref_count;
+	struct dm_block *block;
+	struct array_block *ab;
+	struct dm_array_info *info = context;
+
+	memcpy(&block_le, value, sizeof(block_le));
+	b = le64_to_cpu(block_le);
+
+	r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
+	if (r) {
+		DMERR_LIMIT("couldn't get reference count for block %llu",
+			    (unsigned long long) b);
+		return;
+	}
+
+	if (ref_count == 1) {
+		/*
+		 * We're about to drop the last reference to this ablock.
+		 * So we need to decrement the ref count of the contents.
+		 */
+		r = get_ablock(info, b, &block, &ab);
+		if (r) {
+			DMERR_LIMIT("couldn't get array block %llu",
+				    (unsigned long long) b);
+			return;
+		}
+
+		dec_ablock_entries(info, ab);
+		unlock_ablock(info, block);
+	}
+
+	dm_tm_dec(info->btree_info.tm, b);
+}
+
+static int block_equal(void *context, const void *value1, const void *value2)
+{
+	return !memcmp(value1, value2, sizeof(__le64));
+}
+
+/*----------------------------------------------------------------*/
+
+void dm_array_info_init(struct dm_array_info *info,
+			struct dm_transaction_manager *tm,
+			struct dm_btree_value_type *vt)
+{
+	struct dm_btree_value_type *bvt = &info->btree_info.value_type;
+
+	memcpy(&info->value_type, vt, sizeof(info->value_type));
+	info->btree_info.tm = tm;
+	info->btree_info.levels = 1;
+
+	bvt->context = info;
+	bvt->size = sizeof(__le64);
+	bvt->inc = block_inc;
+	bvt->dec = block_dec;
+	bvt->equal = block_equal;
+}
+EXPORT_SYMBOL_GPL(dm_array_info_init);
+
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
+{
+	return dm_btree_empty(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_empty);
+
+static int array_resize(struct dm_array_info *info, dm_block_t root,
+			uint32_t old_size, uint32_t new_size,
+			const void *value, dm_block_t *new_root)
+{
+	int r;
+	struct resize resize;
+
+	if (old_size == new_size)
+		return 0;
+
+	resize.info = info;
+	resize.root = root;
+	resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	resize.max_entries = calc_max_entries(info->value_type.size,
+					      resize.size_of_block);
+
+	resize.old_nr_full_blocks = old_size / resize.max_entries;
+	resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
+	resize.new_nr_full_blocks = new_size / resize.max_entries;
+	resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
+	resize.value = value;
+
+	r = ((new_size > old_size) ? grow : shrink)(&resize);
+	if (r)
+		return r;
+
+	*new_root = resize.root;
+	return 0;
+}
+
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+		    uint32_t old_size, uint32_t new_size,
+		    const void *value, dm_block_t *new_root)
+		    __dm_written_to_disk(value)
+{
+	int r = array_resize(info, root, old_size, new_size, value, new_root);
+	__dm_unbless_for_disk(value);
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_resize);
+
+int dm_array_del(struct dm_array_info *info, dm_block_t root)
+{
+	return dm_btree_del(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_del);
+
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+		       uint32_t index, void *value_le)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+	size_t size_of_block;
+	unsigned entry, max_entries;
+
+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+	r = lookup_ablock(info, root, index / max_entries, &block, &ab);
+	if (r)
+		return r;
+
+	entry = index % max_entries;
+	if (entry >= le32_to_cpu(ab->nr_entries))
+		r = -ENODATA;
+	else
+		memcpy(value_le, element_at(info, ab, entry),
+		       info->value_type.size);
+
+	unlock_ablock(info, block);
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_get_value);
+
+static int array_set_value(struct dm_array_info *info, dm_block_t root,
+			   uint32_t index, const void *value, dm_block_t *new_root)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+	size_t size_of_block;
+	unsigned max_entries;
+	unsigned entry;
+	void *old_value;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+	r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
+	if (r)
+		return r;
+	*new_root = root;
+
+	entry = index % max_entries;
+	if (entry >= le32_to_cpu(ab->nr_entries)) {
+		r = -ENODATA;
+		goto out;
+	}
+
+	old_value = element_at(info, ab, entry);
+	if (vt->dec &&
+	    (!vt->equal || !vt->equal(vt->context, old_value, value))) {
+		vt->dec(vt->context, old_value);
+		if (vt->inc)
+			vt->inc(vt->context, value);
+	}
+
+	memcpy(old_value, value, info->value_type.size);
+
+out:
+	unlock_ablock(info, block);
+	return r;
+}
+
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+		 uint32_t index, const void *value, dm_block_t *new_root)
+		 __dm_written_to_disk(value)
+{
+	int r;
+
+	r = array_set_value(info, root, index, value, new_root);
+	__dm_unbless_for_disk(value);
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_set_value);
+
+struct walk_info {
+	struct dm_array_info *info;
+	int (*fn)(void *context, uint64_t key, void *leaf);
+	void *context;
+};
+
+static int walk_ablock(void *context, uint64_t *keys, void *leaf)
+{
+	struct walk_info *wi = context;
+
+	int r;
+	unsigned i;
+	__le64 block_le;
+	unsigned nr_entries, max_entries;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	memcpy(&block_le, leaf, sizeof(block_le));
+	r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
+	if (r)
+		return r;
+
+	max_entries = le32_to_cpu(ab->max_entries);
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = 0; i < nr_entries; i++) {
+		r = wi->fn(wi->context, keys[0] * max_entries + i,
+			   element_at(wi->info, ab, i));
+
+		if (r)
+			break;
+	}
+
+	unlock_ablock(wi->info, block);
+	return r;
+}
+
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+		  int (*fn)(void *, uint64_t key, void *leaf),
+		  void *context)
+{
+	struct walk_info wi;
+
+	wi.info = info;
+	wi.fn = fn;
+	wi.context = context;
+
+	return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
+}
+EXPORT_SYMBOL_GPL(dm_array_walk);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
new file mode 100644
index 00000000000..ea177d6fa58
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_ARRAY_H
+#define _LINUX_DM_ARRAY_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The dm-array is a persistent version of an array.  It packs the data
+ * more efficiently than a btree which will result in less disk space use,
+ * and a performance boost.  The element get and set operations are still
+ * O(ln(n)), but with a much smaller constant.
+ *
+ * The value type structure is reused from the btree type to support proper
+ * reference counting of values.
+ *
+ * The arrays implicitly know their length, and bounds are checked for
+ * lookups and updated.  It doesn't store this in an accessible place
+ * because it would waste a whole metadata block.  Make sure you store the
+ * size along with the array root in your encompassing data.
+ *
+ * Array entries are indexed via an unsigned integer starting from zero.
+ * Arrays are not sparse; if you resize an array to have 'n' entries then
+ * 'n - 1' will be the last valid index.
+ *
+ * Typical use:
+ *
+ * a) initialise a dm_array_info structure.  This describes the array
+ *    values and ties it into a specific transaction manager.  It holds no
+ *    instance data; the same info can be used for many similar arrays if
+ *    you wish.
+ *
+ * b) Get yourself a root.  The root is the index of a block of data on the
+ *    disk that holds a particular instance of an array.  You may have a
+ *    pre existing root in your metadata that you wish to use, or you may
+ *    want to create a brand new, empty array with dm_array_empty().
+ *
+ * Like the other data structures in this library, dm_array objects are
+ * immutable between transactions.  Update functions will return you the
+ * root for a _new_ array.  If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * c) resize an array with dm_array_resize().
+ *
+ * d) Get a value from the array with dm_array_get_value().
+ *
+ * e) Set a value in the array with dm_array_set_value().
+ *
+ * f) Walk an array of values in index order with dm_array_walk().  More
+ *    efficient than making many calls to dm_array_get_value().
+ *
+ * g) Destroy the array with dm_array_del().  This tells the transaction
+ *    manager that you're no longer using this data structure so it can
+ *    recycle it's blocks.  (dm_array_dec() would be a better name for it,
+ *    but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Describes an array.  Don't initialise this structure yourself, use the
+ * init function below.
+ */
+struct dm_array_info {
+	struct dm_transaction_manager *tm;
+	struct dm_btree_value_type value_type;
+	struct dm_btree_info btree_info;
+};
+
+/*
+ * Sets up a dm_array_info structure.  You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * info - the structure being filled in.
+ * tm   - the transaction manager that should supervise this structure.
+ * vt   - describes the leaf values.
+ */
+void dm_array_info_init(struct dm_array_info *info,
+			struct dm_transaction_manager *tm,
+			struct dm_btree_value_type *vt);
+
+/*
+ * Create an empty, zero length array.
+ *
+ * info - describes the array
+ * root - on success this will be filled out with the root block
+ */
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root);
+
+/*
+ * Resizes the array.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * old_size - the caller is responsible for remembering the size of
+ *            the array
+ * new_size - can be bigger or smaller than old_size
+ * value - if we're growing the array the new entries will have this value
+ * new_root - on success, points to the new root block
+ *
+ * If growing the inc function for 'value' will be called the appropriate
+ * number of times.  So if the caller is holding a reference they may want
+ * to drop it.
+ */
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+		    uint32_t old_size, uint32_t new_size,
+		    const void *value, dm_block_t *new_root)
+	__dm_written_to_disk(value);
+
+/*
+ * Frees a whole array.  The value_type's decrement operation will be called
+ * for all values in the array
+ */
+int dm_array_del(struct dm_array_info *info, dm_block_t root);
+
+/*
+ * Lookup a value in the array
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - the value to be read.  Will be in on-disk format of course.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+		       uint32_t index, void *value);
+
+/*
+ * Set an entry in the array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - value to be written to disk.  Make sure you confirm the value is
+ *         in on-disk format with__dm_bless_for_disk() before calling.
+ * new_root - the new root block
+ *
+ * The old value being overwritten will be decremented, the new value
+ * incremented.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+		       uint32_t index, const void *value, dm_block_t *new_root)
+	__dm_written_to_disk(value);
+
+/*
+ * Walk through all the entries in an array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * fn - called back for every element
+ * context - passed to the callback
+ */
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t key, void *leaf),
+		  void *context);
+
+/*----------------------------------------------------------------*/
+
+#endif	/* _LINUX_DM_ARRAY_H */
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
new file mode 100644
index 00000000000..36f7cc2c710
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bitset.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "bitset"
+#define BITS_PER_ARRAY_ENTRY 64
+
+/*----------------------------------------------------------------*/
+
+static struct dm_btree_value_type bitset_bvt = {
+	.context = NULL,
+	.size = sizeof(__le64),
+	.inc = NULL,
+	.dec = NULL,
+	.equal = NULL,
+};
+
+/*----------------------------------------------------------------*/
+
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+			 struct dm_disk_bitset *info)
+{
+	dm_array_info_init(&info->array_info, tm, &bitset_bvt);
+	info->current_index_set = false;
+}
+EXPORT_SYMBOL_GPL(dm_disk_bitset_init);
+
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
+{
+	return dm_array_empty(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_empty);
+
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
+		     uint32_t old_nr_entries, uint32_t new_nr_entries,
+		     bool default_value, dm_block_t *new_root)
+{
+	uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY);
+	uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY);
+	__le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0);
+
+	__dm_bless_for_disk(&value);
+	return dm_array_resize(&info->array_info, root, old_blocks, new_blocks,
+			       &value, new_root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_resize);
+
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root)
+{
+	return dm_array_del(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_del);
+
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+		    dm_block_t *new_root)
+{
+	int r;
+	__le64 value;
+
+	if (!info->current_index_set || !info->dirty)
+		return 0;
+
+	value = cpu_to_le64(info->current_bits);
+
+	__dm_bless_for_disk(&value);
+	r = dm_array_set_value(&info->array_info, root, info->current_index,
+			       &value, new_root);
+	if (r)
+		return r;
+
+	info->current_index_set = false;
+	info->dirty = false;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_flush);
+
+static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
+		     uint32_t array_index)
+{
+	int r;
+	__le64 value;
+
+	r = dm_array_get_value(&info->array_info, root, array_index, &value);
+	if (r)
+		return r;
+
+	info->current_bits = le64_to_cpu(value);
+	info->current_index_set = true;
+	info->current_index = array_index;
+	info->dirty = false;
+
+	return 0;
+}
+
+static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root,
+			   uint32_t index, dm_block_t *new_root)
+{
+	int r;
+	unsigned array_index = index / BITS_PER_ARRAY_ENTRY;
+
+	if (info->current_index_set) {
+		if (info->current_index == array_index)
+			return 0;
+
+		r = dm_bitset_flush(info, root, new_root);
+		if (r)
+			return r;
+	}
+
+	return read_bits(info, root, array_index);
+}
+
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+		      uint32_t index, dm_block_t *new_root)
+{
+	int r;
+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+	r = get_array_entry(info, root, index, new_root);
+	if (r)
+		return r;
+
+	set_bit(b, (unsigned long *) &info->current_bits);
+	info->dirty = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
+
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+			uint32_t index, dm_block_t *new_root)
+{
+	int r;
+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+	r = get_array_entry(info, root, index, new_root);
+	if (r)
+		return r;
+
+	clear_bit(b, (unsigned long *) &info->current_bits);
+	info->dirty = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
+
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+		       uint32_t index, dm_block_t *new_root, bool *result)
+{
+	int r;
+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+	r = get_array_entry(info, root, index, new_root);
+	if (r)
+		return r;
+
+	*result = test_bit(b, (unsigned long *) &info->current_bits);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
new file mode 100644
index 00000000000..c2287d672ef
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_BITSET_H
+#define _LINUX_DM_BITSET_H
+
+#include "dm-array.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This bitset type is a thin wrapper round a dm_array of 64bit words.  It
+ * uses a tiny, one word cache to reduce the number of array lookups and so
+ * increase performance.
+ *
+ * Like the dm-array that it's based on, the caller needs to keep track of
+ * the size of the bitset separately.  The underlying dm-array implicitly
+ * knows how many words it's storing and will return -ENODATA if you try
+ * and access an out of bounds word.  However, an out of bounds bit in the
+ * final word will _not_ be detected, you have been warned.
+ *
+ * Bits are indexed from zero.
+
+ * Typical use:
+ *
+ * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init().
+ *    This describes the bitset and includes the cache.  It's not called it
+ *    dm_bitset_info in line with other data structures because it does
+ *    include instance data.
+ *
+ * b) Get yourself a root.  The root is the index of a block of data on the
+ *    disk that holds a particular instance of an bitset.  You may have a
+ *    pre existing root in your metadata that you wish to use, or you may
+ *    want to create a brand new, empty bitset with dm_bitset_empty().
+ *
+ * Like the other data structures in this library, dm_bitset objects are
+ * immutable between transactions.  Update functions will return you the
+ * root for a _new_ array.  If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * Even read operations may trigger the cache to be flushed and as such
+ * return a root for a new, updated bitset.
+ *
+ * c) resize a bitset with dm_bitset_resize().
+ *
+ * d) Set a bit with dm_bitset_set_bit().
+ *
+ * e) Clear a bit with dm_bitset_clear_bit().
+ *
+ * f) Test a bit with dm_bitset_test_bit().
+ *
+ * g) Flush all updates from the cache with dm_bitset_flush().
+ *
+ * h) Destroy the bitset with dm_bitset_del().  This tells the transaction
+ *    manager that you're no longer using this data structure so it can
+ *    recycle it's blocks.  (dm_bitset_dec() would be a better name for it,
+ *    but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Opaque object.  Unlike dm_array_info, you should have one of these per
+ * bitset.  Initialise with dm_disk_bitset_init().
+ */
+struct dm_disk_bitset {
+	struct dm_array_info array_info;
+
+	uint32_t current_index;
+	uint64_t current_bits;
+
+	bool current_index_set:1;
+	bool dirty:1;
+};
+
+/*
+ * Sets up a dm_disk_bitset structure.  You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * tm - the transaction manager that should supervise this structure
+ * info - the structure being initialised
+ */
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+			 struct dm_disk_bitset *info);
+
+/*
+ * Create an empty, zero length bitset.
+ *
+ * info - describes the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
+
+/*
+ * Resize the bitset.
+ *
+ * info - describes the bitset
+ * old_root - the root block of the array on disk
+ * old_nr_entries - the number of bits in the old bitset
+ * new_nr_entries - the number of bits you want in the new bitset
+ * default_value - the value for any new bits
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root,
+		     uint32_t old_nr_entries, uint32_t new_nr_entries,
+		     bool default_value, dm_block_t *new_root);
+
+/*
+ * Frees the bitset.
+ */
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root);
+
+/*
+ * Set a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+		      uint32_t index, dm_block_t *new_root);
+
+/*
+ * Clears a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+			uint32_t index, dm_block_t *new_root);
+
+/*
+ * Tests a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block (cached values may have been written)
+ * result - the bit value you're after
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+		       uint32_t index, dm_block_t *new_root, bool *result);
+
+/*
+ * Flush any cached changes to disk.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+		    dm_block_t *new_root);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 0317ecdc6e5..087411c95ff 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -25,7 +25,7 @@
  * may be held at once.  This is just an implementation detail.
  *
  * ii) Recursive locking attempts are detected and return EINVAL.  A stack
- * trace is also emitted for the previous lock aquisition.
+ * trace is also emitted for the previous lock acquisition.
  *
  * iii) Priority is given to write locks.
  */
@@ -104,12 +104,12 @@ static int __check_holder(struct block_lock *lock)
 
 	for (i = 0; i < MAX_HOLDERS; i++) {
 		if (lock->holders[i] == current) {
-			DMERR("recursive lock detected in pool metadata");
+			DMERR("recursive lock detected in metadata");
 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 			DMERR("previously held here:");
 			print_stack_trace(lock->traces + i, 4);
 
-			DMERR("subsequent aquisition attempted here:");
+			DMERR("subsequent acquisition attempted here:");
 			t.nr_entries = 0;
 			t.max_entries = MAX_STACK;
 			t.entries = entries;
@@ -325,11 +325,6 @@ static struct dm_buffer *to_buffer(struct dm_block *b)
 	return (struct dm_buffer *) b;
 }
 
-static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
-{
-	return (struct dm_bufio_client *) bm;
-}
-
 dm_block_t dm_block_location(struct dm_block *b)
 {
 	return dm_bufio_get_block_number(to_buffer(b));
@@ -367,34 +362,60 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf)
 /*----------------------------------------------------------------
  * Public interface
  *--------------------------------------------------------------*/
+struct dm_block_manager {
+	struct dm_bufio_client *bufio;
+	bool read_only:1;
+};
+
 struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
 						 unsigned block_size,
 						 unsigned cache_size,
 						 unsigned max_held_per_thread)
 {
-	return (struct dm_block_manager *)
-		dm_bufio_client_create(bdev, block_size, max_held_per_thread,
-				       sizeof(struct buffer_aux),
-				       dm_block_manager_alloc_callback,
-				       dm_block_manager_write_callback);
+	int r;
+	struct dm_block_manager *bm;
+
+	bm = kmalloc(sizeof(*bm), GFP_KERNEL);
+	if (!bm) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
+					   sizeof(struct buffer_aux),
+					   dm_block_manager_alloc_callback,
+					   dm_block_manager_write_callback);
+	if (IS_ERR(bm->bufio)) {
+		r = PTR_ERR(bm->bufio);
+		kfree(bm);
+		goto bad;
+	}
+
+	bm->read_only = false;
+
+	return bm;
+
+bad:
+	return ERR_PTR(r);
 }
 EXPORT_SYMBOL_GPL(dm_block_manager_create);
 
 void dm_block_manager_destroy(struct dm_block_manager *bm)
 {
-	return dm_bufio_client_destroy(to_bufio(bm));
+	dm_bufio_client_destroy(bm->bufio);
+	kfree(bm);
 }
 EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
 
 unsigned dm_bm_block_size(struct dm_block_manager *bm)
 {
-	return dm_bufio_get_block_size(to_bufio(bm));
+	return dm_bufio_get_block_size(bm->bufio);
 }
 EXPORT_SYMBOL_GPL(dm_bm_block_size);
 
 dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
 {
-	return dm_bufio_get_device_size(to_bufio(bm));
+	return dm_bufio_get_device_size(bm->bufio);
 }
 
 static int dm_bm_validate_buffer(struct dm_block_manager *bm,
@@ -406,16 +427,18 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
 		int r;
 		if (!v)
 			return 0;
-		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm)));
-		if (unlikely(r))
+		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
+		if (unlikely(r)) {
+			DMERR_LIMIT("%s validator check failed for block %llu", v->name,
+				    (unsigned long long) dm_bufio_get_block_number(buf));
 			return r;
+		}
 		aux->validator = v;
 	} else {
 		if (unlikely(aux->validator != v)) {
-			DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
-				aux->validator->name, v ? v->name : "NULL",
-				(unsigned long long)
-					dm_bufio_get_block_number(buf));
+			DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
+				    aux->validator->name, v ? v->name : "NULL",
+				    (unsigned long long) dm_bufio_get_block_number(buf));
 			return -EINVAL;
 		}
 	}
@@ -430,7 +453,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
 	void *p;
 	int r;
 
-	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 
@@ -463,7 +486,10 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
 	void *p;
 	int r;
 
-	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (bm->read_only)
+		return -EPERM;
+
+	p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 
@@ -496,7 +522,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
 	void *p;
 	int r;
 
-	p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
+	p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 	if (unlikely(!p))
@@ -529,7 +555,10 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
 	struct buffer_aux *aux;
 	void *p;
 
-	p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (bm->read_only)
+		return -EPERM;
+
+	p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
 	if (unlikely(IS_ERR(p)))
 		return PTR_ERR(p);
 
@@ -547,6 +576,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
 
 int dm_bm_unlock(struct dm_block *b)
 {
@@ -565,45 +595,31 @@ int dm_bm_unlock(struct dm_block *b)
 }
 EXPORT_SYMBOL_GPL(dm_bm_unlock);
 
-int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
+int dm_bm_flush(struct dm_block_manager *bm)
 {
-	struct buffer_aux *aux;
-
-	aux = dm_bufio_get_aux_data(to_buffer(b));
+	if (bm->read_only)
+		return -EPERM;
 
-	if (aux->write_locked) {
-		dm_bufio_mark_buffer_dirty(to_buffer(b));
-		bl_up_write(&aux->lock);
-	} else
-		bl_up_read(&aux->lock);
-
-	dm_bufio_release_move(to_buffer(b), n);
-	return 0;
+	return dm_bufio_write_dirty_buffers(bm->bufio);
 }
+EXPORT_SYMBOL_GPL(dm_bm_flush);
 
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
-			   struct dm_block *superblock)
+void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
 {
-	int r;
-
-	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
-	if (unlikely(r))
-		return r;
-	r = dm_bufio_issue_flush(to_bufio(bm));
-	if (unlikely(r))
-		return r;
-
-	dm_bm_unlock(superblock);
+	dm_bufio_prefetch(bm->bufio, b, 1);
+}
 
-	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
-	if (unlikely(r))
-		return r;
-	r = dm_bufio_issue_flush(to_bufio(bm));
-	if (unlikely(r))
-		return r;
+void dm_bm_set_read_only(struct dm_block_manager *bm)
+{
+	bm->read_only = true;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
-	return 0;
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
 }
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 924833d2dfa..1b95dfc1778 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -97,14 +97,6 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
 int dm_bm_unlock(struct dm_block *b);
 
 /*
- * An optimisation; we often want to copy a block's contents to a new
- * block.  eg, as part of the shadowing operation.  It's far better for
- * bufio to do this move behind the scenes than hold 2 locks and memcpy the
- * data.
- */
-int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
-
-/*
  * It's a common idiom to have a superblock that should be committed last.
  *
  * @superblock should be write-locked on entry. It will be unlocked during
@@ -113,8 +105,26 @@ int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
  *
  * This method always blocks.
  */
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
-			   struct dm_block *superblock);
+int dm_bm_flush(struct dm_block_manager *bm);
+
+/*
+ * Request data is prefetched into the cache.
+ */
+void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
+
+/*
+ * Switches the bm to a read only mode.  Once read-only mode
+ * has been entered the following functions will return -EPERM.
+ *
+ *   dm_bm_write_lock
+ *   dm_bm_write_lock_zero
+ *   dm_bm_flush_and_unlock
+ *
+ * Additionally you should not use dm_bm_unlock_move, however no error will
+ * be returned if you do.
+ */
+void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f..37d367bb9aa 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -36,13 +36,13 @@ struct node_header {
 	__le32 padding;
 } __packed;
 
-struct node {
+struct btree_node {
 	struct node_header header;
 	__le64 keys[0];
 } __packed;
 
 
-void inc_children(struct dm_transaction_manager *tm, struct node *n,
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
 		  struct dm_btree_value_type *vt);
 
 int new_block(struct dm_btree_info *info, struct dm_block **result);
@@ -64,7 +64,8 @@ struct ro_spine {
 void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
 int exit_ro_spine(struct ro_spine *s);
 int ro_step(struct ro_spine *s, dm_block_t new_child);
-struct node *ro_node(struct ro_spine *s);
+void ro_pop(struct ro_spine *s);
+struct btree_node *ro_node(struct ro_spine *s);
 
 struct shadow_spine {
 	struct dm_btree_info *info;
@@ -98,29 +99,26 @@ int shadow_root(struct shadow_spine *s);
 /*
  * Some inlines.
  */
-static inline __le64 *key_ptr(struct node *n, uint32_t index)
+static inline __le64 *key_ptr(struct btree_node *n, uint32_t index)
 {
 	return n->keys + index;
 }
 
-static inline void *value_base(struct node *n)
+static inline void *value_base(struct btree_node *n)
 {
 	return &n->keys[le32_to_cpu(n->header.max_entries)];
 }
 
-/*
- * FIXME: Now that value size is stored in node we don't need the third parm.
- */
-static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
+static inline void *value_ptr(struct btree_node *n, uint32_t index)
 {
-	BUG_ON(value_size != le32_to_cpu(n->header.value_size));
+	uint32_t value_size = le32_to_cpu(n->header.value_size);
 	return value_base(n) + (value_size * index);
 }
 
 /*
  * Assumes the values are suitably-aligned and converts to core format.
  */
-static inline uint64_t value64(struct node *n, uint32_t index)
+static inline uint64_t value64(struct btree_node *n, uint32_t index)
 {
 	__le64 *values_le = value_base(n);
 
@@ -130,7 +128,7 @@ static inline uint64_t value64(struct node *n, uint32_t index)
 /*
  * Searching for a key within a single node.
  */
-int lower_bound(struct node *n, uint64_t key);
+int lower_bound(struct btree_node *n, uint64_t key);
 
 extern struct dm_block_validator btree_node_validator;
 
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389..b88757cd0d1 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -53,7 +53,7 @@
 /*
  * Some little utilities for moving node data around.
  */
-static void node_shift(struct node *n, int shift)
+static void node_shift(struct btree_node *n, int shift)
 {
 	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 	uint32_t value_size = le32_to_cpu(n->header.value_size);
@@ -61,25 +61,25 @@ static void node_shift(struct node *n, int shift)
 	if (shift < 0) {
 		shift = -shift;
 		BUG_ON(shift > nr_entries);
-		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
+		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
 		memmove(key_ptr(n, 0),
 			key_ptr(n, shift),
 			(nr_entries - shift) * sizeof(__le64));
-		memmove(value_ptr(n, 0, value_size),
-			value_ptr(n, shift, value_size),
+		memmove(value_ptr(n, 0),
+			value_ptr(n, shift),
 			(nr_entries - shift) * value_size);
 	} else {
 		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 		memmove(key_ptr(n, shift),
 			key_ptr(n, 0),
 			nr_entries * sizeof(__le64));
-		memmove(value_ptr(n, shift, value_size),
-			value_ptr(n, 0, value_size),
+		memmove(value_ptr(n, shift),
+			value_ptr(n, 0),
 			nr_entries * value_size);
 	}
 }
 
-static void node_copy(struct node *left, struct node *right, int shift)
+static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
 {
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t value_size = le32_to_cpu(left->header.value_size);
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
 		memcpy(key_ptr(left, nr_left),
 		       key_ptr(right, 0),
 		       shift * sizeof(__le64));
-		memcpy(value_ptr(left, nr_left, value_size),
-		       value_ptr(right, 0, value_size),
+		memcpy(value_ptr(left, nr_left),
+		       value_ptr(right, 0),
 		       shift * value_size);
 	} else {
 		BUG_ON(shift > le32_to_cpu(right->header.max_entries));
 		memcpy(key_ptr(right, 0),
 		       key_ptr(left, nr_left - shift),
 		       shift * sizeof(__le64));
-		memcpy(value_ptr(right, 0, value_size),
-		       value_ptr(left, nr_left - shift, value_size),
+		memcpy(value_ptr(right, 0),
+		       value_ptr(left, nr_left - shift),
 		       shift * value_size);
 	}
 }
@@ -108,7 +108,7 @@ static void node_copy(struct node *left, struct node *right, int shift)
 /*
  * Delete a specific entry from a leaf node.
  */
-static void delete_at(struct node *n, unsigned index)
+static void delete_at(struct btree_node *n, unsigned index)
 {
 	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
 	unsigned nr_to_copy = nr_entries - (index + 1);
@@ -120,43 +120,27 @@ static void delete_at(struct node *n, unsigned index)
 			key_ptr(n, index + 1),
 			nr_to_copy * sizeof(__le64));
 
-		memmove(value_ptr(n, index, value_size),
-			value_ptr(n, index + 1, value_size),
+		memmove(value_ptr(n, index),
+			value_ptr(n, index + 1),
 			nr_to_copy * value_size);
 	}
 
 	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
 }
 
-static unsigned del_threshold(struct node *n)
+static unsigned merge_threshold(struct btree_node *n)
 {
 	return le32_to_cpu(n->header.max_entries) / 3;
 }
 
-static unsigned merge_threshold(struct node *n)
-{
-	/*
-	 * The extra one is because we know we're potentially going to
-	 * delete an entry.
-	 */
-	return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
-}
-
 struct child {
 	unsigned index;
 	struct dm_block *block;
-	struct node *n;
+	struct btree_node *n;
 };
 
-static struct dm_btree_value_type le64_type = {
-	.context = NULL,
-	.size = sizeof(__le64),
-	.inc = NULL,
-	.dec = NULL,
-	.equal = NULL
-};
-
-static int init_child(struct dm_btree_info *info, struct node *parent,
+static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
+		      struct btree_node *parent,
 		      unsigned index, struct child *result)
 {
 	int r, inc;
@@ -173,9 +157,9 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
 	result->n = dm_block_data(result->block);
 
 	if (inc)
-		inc_children(info->tm, result->n, &le64_type);
+		inc_children(info->tm, result->n, vt);
 
-	*((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
+	*((__le64 *) value_ptr(parent, index)) =
 		cpu_to_le64(dm_block_location(result->block));
 
 	return 0;
@@ -186,8 +170,17 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
 	return dm_tm_unlock(info->tm, c->block);
 }
 
-static void shift(struct node *left, struct node *right, int count)
+static void shift(struct btree_node *left, struct btree_node *right, int count)
 {
+	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+	BUG_ON(max_entries != r_max_entries);
+	BUG_ON(nr_left - count > max_entries);
+	BUG_ON(nr_right + count > max_entries);
+
 	if (!count)
 		return;
 
@@ -199,24 +192,20 @@ static void shift(struct node *left, struct node *right, int count)
 		node_shift(right, count);
 	}
 
-	left->header.nr_entries =
-		cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
-	BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
-
-	right->header.nr_entries =
-		cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
-	BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
+	left->header.nr_entries = cpu_to_le32(nr_left - count);
+	right->header.nr_entries = cpu_to_le32(nr_right + count);
 }
 
-static void __rebalance2(struct dm_btree_info *info, struct node *parent,
+static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
 			 struct child *l, struct child *r)
 {
-	struct node *left = l->n;
-	struct node *right = r->n;
+	struct btree_node *left = l->n;
+	struct btree_node *right = r->n;
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+	unsigned threshold = 2 * merge_threshold(left) + 1;
 
-	if (nr_left + nr_right <= merge_threshold(left)) {
+	if (nr_left + nr_right < threshold) {
 		/*
 		 * Merge
 		 */
@@ -234,28 +223,25 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
 		 * Rebalance.
 		 */
 		unsigned target_left = (nr_left + nr_right) / 2;
-		unsigned shift_ = nr_left - target_left;
-		BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
-		BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
 		shift(left, right, nr_left - target_left);
 		*key_ptr(parent, r->index) = right->keys[0];
 	}
 }
 
 static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
-		      unsigned left_index)
+		      struct dm_btree_value_type *vt, unsigned left_index)
 {
 	int r;
-	struct node *parent;
+	struct btree_node *parent;
 	struct child left, right;
 
 	parent = dm_block_data(shadow_current(s));
 
-	r = init_child(info, parent, left_index, &left);
+	r = init_child(info, vt, parent, left_index, &left);
 	if (r)
 		return r;
 
-	r = init_child(info, parent, left_index + 1, &right);
+	r = init_child(info, vt, parent, left_index + 1, &right);
 	if (r) {
 		exit_child(info, &left);
 		return r;
@@ -272,95 +258,129 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
 	return exit_child(info, &right);
 }
 
-static void __rebalance3(struct dm_btree_info *info, struct node *parent,
-			 struct child *l, struct child *c, struct child *r)
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2.  This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
+			       struct child *l, struct child *c, struct child *r,
+			       struct btree_node *left, struct btree_node *center, struct btree_node *right,
+			       uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 {
-	struct node *left = l->n;
-	struct node *center = c->n;
-	struct node *right = r->n;
-
-	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
-	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
-	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+	unsigned shift = min(max_entries - nr_left, nr_center);
+
+	BUG_ON(nr_left + shift > max_entries);
+	node_copy(left, center, -shift);
+	left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+	if (shift != nr_center) {
+		shift = nr_center - shift;
+		BUG_ON((nr_right + shift) > max_entries);
+		node_shift(right, shift);
+		node_copy(center, right, shift);
+		right->header.nr_entries = cpu_to_le32(nr_right + shift);
+	}
+	*key_ptr(parent, r->index) = right->keys[0];
 
-	unsigned target;
+	delete_at(parent, c->index);
+	r->index--;
 
-	BUG_ON(left->header.max_entries != center->header.max_entries);
-	BUG_ON(center->header.max_entries != right->header.max_entries);
-
-	if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
-		/*
-		 * Delete center node:
-		 *
-		 * We dump as many entries from center as possible into
-		 * left, then the rest in right, then rebalance2.  This
-		 * wastes some cpu, but I want something simple atm.
-		 */
-		unsigned shift = min(max_entries - nr_left, nr_center);
-
-		BUG_ON(nr_left + shift > max_entries);
-		node_copy(left, center, -shift);
-		left->header.nr_entries = cpu_to_le32(nr_left + shift);
-
-		if (shift != nr_center) {
-			shift = nr_center - shift;
-			BUG_ON((nr_right + shift) >= max_entries);
-			node_shift(right, shift);
-			node_copy(center, right, shift);
-			right->header.nr_entries = cpu_to_le32(nr_right + shift);
-		}
-		*key_ptr(parent, r->index) = right->keys[0];
+	dm_tm_dec(info->tm, dm_block_location(c->block));
+	__rebalance2(info, parent, l, r);
+}
 
-		delete_at(parent, c->index);
-		r->index--;
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
+			  struct child *l, struct child *c, struct child *r,
+			  struct btree_node *left, struct btree_node *center, struct btree_node *right,
+			  uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+	int s;
+	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+	unsigned target = (nr_left + nr_center + nr_right) / 3;
+	BUG_ON(target > max_entries);
 
-		dm_tm_dec(info->tm, dm_block_location(c->block));
-		__rebalance2(info, parent, l, r);
+	if (nr_left < nr_right) {
+		s = nr_left - target;
 
-		return;
-	}
+		if (s < 0 && nr_center < -s) {
+			/* not enough in central node */
+			shift(left, center, nr_center);
+			s = nr_center - target;
+			shift(left, right, s);
+			nr_right += s;
+		} else
+			shift(left, center, s);
 
-	/*
-	 * Rebalance
-	 */
-	target = (nr_left + nr_center + nr_right) / 3;
-	BUG_ON(target > max_entries);
+		shift(center, right, target - nr_right);
 
-	/*
-	 * Adjust the left node
-	 */
-	shift(left, center, nr_left - target);
+	} else {
+		s = target - nr_right;
+		if (s > 0 && nr_center < s) {
+			/* not enough in central node */
+			shift(center, right, nr_center);
+			s = target - nr_center;
+			shift(left, right, s);
+			nr_left -= s;
+		} else
+			shift(center, right, s);
+
+		shift(left, center, nr_left - target);
+	}
 
-	/*
-	 * Adjust the right node
-	 */
-	shift(center, right, target - nr_right);
 	*key_ptr(parent, c->index) = center->keys[0];
 	*key_ptr(parent, r->index) = right->keys[0];
 }
 
+static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
+			 struct child *l, struct child *c, struct child *r)
+{
+	struct btree_node *left = l->n;
+	struct btree_node *center = c->n;
+	struct btree_node *right = r->n;
+
+	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
+	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+
+	unsigned threshold = merge_threshold(left) * 4 + 1;
+
+	BUG_ON(left->header.max_entries != center->header.max_entries);
+	BUG_ON(center->header.max_entries != right->header.max_entries);
+
+	if ((nr_left + nr_center + nr_right) < threshold)
+		delete_center_node(info, parent, l, c, r, left, center, right,
+				   nr_left, nr_center, nr_right);
+	else
+		redistribute3(info, parent, l, c, r, left, center, right,
+			      nr_left, nr_center, nr_right);
+}
+
 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
-		      unsigned left_index)
+		      struct dm_btree_value_type *vt, unsigned left_index)
 {
 	int r;
-	struct node *parent = dm_block_data(shadow_current(s));
+	struct btree_node *parent = dm_block_data(shadow_current(s));
 	struct child left, center, right;
 
 	/*
 	 * FIXME: fill out an array?
 	 */
-	r = init_child(info, parent, left_index, &left);
+	r = init_child(info, vt, parent, left_index, &left);
 	if (r)
 		return r;
 
-	r = init_child(info, parent, left_index + 1, &center);
+	r = init_child(info, vt, parent, left_index + 1, &center);
 	if (r) {
 		exit_child(info, &left);
 		return r;
 	}
 
-	r = init_child(info, parent, left_index + 2, &right);
+	r = init_child(info, vt, parent, left_index + 2, &right);
 	if (r) {
 		exit_child(info, &left);
 		exit_child(info, &center);
@@ -394,7 +414,7 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
 {
 	int r;
 	struct dm_block *block;
-	struct node *n;
+	struct btree_node *n;
 
 	r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
 	if (r)
@@ -407,11 +427,12 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
 }
 
 static int rebalance_children(struct shadow_spine *s,
-			      struct dm_btree_info *info, uint64_t key)
+			      struct dm_btree_info *info,
+			      struct dm_btree_value_type *vt, uint64_t key)
 {
 	int i, r, has_left_sibling, has_right_sibling;
 	uint32_t child_entries;
-	struct node *n;
+	struct btree_node *n;
 
 	n = dm_block_data(shadow_current(s));
 
@@ -441,25 +462,22 @@ static int rebalance_children(struct shadow_spine *s,
 	if (r)
 		return r;
 
-	if (child_entries > del_threshold(n))
-		return 0;
-
 	has_left_sibling = i > 0;
 	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 
 	if (!has_left_sibling)
-		r = rebalance2(s, info, i);
+		r = rebalance2(s, info, vt, i);
 
 	else if (!has_right_sibling)
-		r = rebalance2(s, info, i - 1);
+		r = rebalance2(s, info, vt, i - 1);
 
 	else
-		r = rebalance3(s, info, i - 1);
+		r = rebalance3(s, info, vt, i - 1);
 
 	return r;
 }
 
-static int do_leaf(struct node *n, uint64_t key, unsigned *index)
+static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
 {
 	int i = lower_bound(n, key);
 
@@ -482,7 +500,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 		      uint64_t key, unsigned *index)
 {
 	int i = *index, r;
-	struct node *n;
+	struct btree_node *n;
 
 	for (;;) {
 		r = shadow_step(s, root, vt);
@@ -496,7 +514,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 		 */
 		if (shadow_has_parent(s)) {
 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
-			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
+			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
 			       &location, sizeof(__le64));
 		}
 
@@ -505,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 			return do_leaf(n, key, index);
 
-		r = rebalance_children(s, info, key);
+		r = rebalance_children(s, info, vt, key);
 		if (r)
 			break;
 
@@ -526,13 +544,21 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 	return r;
 }
 
+static struct dm_btree_value_type le64_type = {
+	.context = NULL,
+	.size = sizeof(__le64),
+	.inc = NULL,
+	.dec = NULL,
+	.equal = NULL
+};
+
 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 		    uint64_t *keys, dm_block_t *new_root)
 {
 	unsigned level, last_level = info->levels - 1;
 	int index = 0, r = 0;
 	struct shadow_spine spine;
-	struct node *n;
+	struct btree_node *n;
 
 	init_shadow_spine(&spine, info);
 	for (level = 0; level < info->levels; level++) {
@@ -553,7 +579,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 
 		if (info->value_type.dec)
 			info->value_type.dec(info->value_type.context,
-					     value_ptr(n, index, info->value_type.size));
+					     value_ptr(n, index));
 
 		delete_at(n, index);
 	}
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index d9a7912ee8e..cf9fd676ae4 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -23,7 +23,7 @@ static void node_prepare_for_write(struct dm_block_validator *v,
 				   struct dm_block *b,
 				   size_t block_size)
 {
-	struct node *n = dm_block_data(b);
+	struct btree_node *n = dm_block_data(b);
 	struct node_header *h = &n->header;
 
 	h->blocknr = cpu_to_le64(dm_block_location(b));
@@ -38,15 +38,15 @@ static int node_check(struct dm_block_validator *v,
 		      struct dm_block *b,
 		      size_t block_size)
 {
-	struct node *n = dm_block_data(b);
+	struct btree_node *n = dm_block_data(b);
 	struct node_header *h = &n->header;
 	size_t value_size;
 	__le32 csum_disk;
 	uint32_t flags;
 
 	if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
-		DMERR("node_check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(h->blocknr), dm_block_location(b));
+		DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu",
+			    le64_to_cpu(h->blocknr), dm_block_location(b));
 		return -ENOTBLK;
 	}
 
@@ -54,8 +54,8 @@ static int node_check(struct dm_block_validator *v,
 					       block_size - sizeof(__le32),
 					       BTREE_CSUM_XOR));
 	if (csum_disk != h->csum) {
-		DMERR("node_check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
+		DMERR_LIMIT("node_check failed: csum %u != wanted %u",
+			    le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
 		return -EILSEQ;
 	}
 
@@ -63,12 +63,12 @@ static int node_check(struct dm_block_validator *v,
 
 	if (sizeof(struct node_header) +
 	    (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
-		DMERR("node_check failed: max_entries too large");
+		DMERR_LIMIT("node_check failed: max_entries too large");
 		return -EILSEQ;
 	}
 
 	if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
-		DMERR("node_check failed, too many entries");
+		DMERR_LIMIT("node_check failed: too many entries");
 		return -EILSEQ;
 	}
 
@@ -77,7 +77,7 @@ static int node_check(struct dm_block_validator *v,
 	 */
 	flags = le32_to_cpu(h->flags);
 	if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
-		DMERR("node_check failed, node is neither INTERNAL or LEAF");
+		DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF");
 		return -EILSEQ;
 	}
 
@@ -164,7 +164,14 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
 	return r;
 }
 
-struct node *ro_node(struct ro_spine *s)
+void ro_pop(struct ro_spine *s)
+{
+	BUG_ON(!s->count);
+	--s->count;
+	unlock_block(s->info, s->nodes[s->count]);
+}
+
+struct btree_node *ro_node(struct ro_spine *s)
 {
 	struct dm_block *block;
 
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26..416060c2570 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -38,7 +38,7 @@ static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
 /*----------------------------------------------------------------*/
 
 /* makes the assumption that no two keys are the same. */
-static int bsearch(struct node *n, uint64_t key, int want_hi)
+static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
 {
 	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
 
@@ -58,12 +58,12 @@ static int bsearch(struct node *n, uint64_t key, int want_hi)
 	return want_hi ? hi : lo;
 }
 
-int lower_bound(struct node *n, uint64_t key)
+int lower_bound(struct btree_node *n, uint64_t key)
 {
 	return bsearch(n, key, 0);
 }
 
-void inc_children(struct dm_transaction_manager *tm, struct node *n,
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
 		  struct dm_btree_value_type *vt)
 {
 	unsigned i;
@@ -74,11 +74,10 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
 			dm_tm_inc(tm, value64(n, i));
 	else if (vt->inc)
 		for (i = 0; i < nr_entries; i++)
-			vt->inc(vt->context,
-				value_ptr(n, i, vt->size));
+			vt->inc(vt->context, value_ptr(n, i));
 }
 
-static int insert_at(size_t value_size, struct node *node, unsigned index,
+static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
 		      uint64_t key, void *value)
 		      __dm_written_to_disk(value)
 {
@@ -123,7 +122,7 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
 {
 	int r;
 	struct dm_block *b;
-	struct node *n;
+	struct btree_node *n;
 	size_t block_size;
 	uint32_t max_entries;
 
@@ -155,13 +154,14 @@ EXPORT_SYMBOL_GPL(dm_btree_empty);
 #define MAX_SPINE_DEPTH 64
 struct frame {
 	struct dm_block *b;
-	struct node *n;
+	struct btree_node *n;
 	unsigned level;
 	unsigned nr_children;
 	unsigned current_child;
 };
 
 struct del_stack {
+	struct dm_btree_info *info;
 	struct dm_transaction_manager *tm;
 	int top;
 	struct frame spine[MAX_SPINE_DEPTH];
@@ -184,6 +184,20 @@ static int unprocessed_frames(struct del_stack *s)
 	return s->top >= 0;
 }
 
+static void prefetch_children(struct del_stack *s, struct frame *f)
+{
+	unsigned i;
+	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
+
+	for (i = 0; i < f->nr_children; i++)
+		dm_bm_prefetch(bm, value64(f->n, i));
+}
+
+static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
+{
+	return f->level < (info->levels - 1);
+}
+
 static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 {
 	int r;
@@ -206,6 +220,7 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 		dm_tm_dec(s->tm, b);
 
 	else {
+		uint32_t flags;
 		struct frame *f = s->spine + ++s->top;
 
 		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
@@ -218,6 +233,10 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 		f->level = level;
 		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
 		f->current_child = 0;
+
+		flags = le32_to_cpu(f->n->header.flags);
+		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
+			prefetch_children(s, f);
 	}
 
 	return 0;
@@ -239,10 +258,11 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return -ENOMEM;
+	s->info = info;
 	s->tm = info->tm;
 	s->top = -1;
 
-	r = push_frame(s, root, 1);
+	r = push_frame(s, root, 0);
 	if (r)
 		goto out;
 
@@ -268,7 +288,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 			if (r)
 				goto out;
 
-		} else if (f->level != (info->levels - 1)) {
+		} else if (is_internal_level(info, f)) {
 			b = value64(f->n, f->current_child);
 			f->current_child++;
 			r = push_frame(s, b, f->level + 1);
@@ -281,9 +301,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 
 				for (i = 0; i < f->nr_children; i++)
 					info->value_type.dec(info->value_type.context,
-							     value_ptr(f->n, i, info->value_type.size));
+							     value_ptr(f->n, i));
 			}
-			f->current_child = f->nr_children;
+			pop_frame(s);
 		}
 	}
 
@@ -296,7 +316,7 @@ EXPORT_SYMBOL_GPL(dm_btree_del);
 /*----------------------------------------------------------------*/
 
 static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
-			    int (*search_fn)(struct node *, uint64_t),
+			    int (*search_fn)(struct btree_node *, uint64_t),
 			    uint64_t *result_key, void *v, size_t value_size)
 {
 	int i, r;
@@ -320,7 +340,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
 	} while (!(flags & LEAF_NODE));
 
 	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
-	memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
+	memcpy(v, value_ptr(ro_node(s), i), value_size);
 
 	return 0;
 }
@@ -407,7 +427,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
 	size_t size;
 	unsigned nr_left, nr_right;
 	struct dm_block *left, *right, *parent;
-	struct node *ln, *rn, *pn;
+	struct btree_node *ln, *rn, *pn;
 	__le64 location;
 
 	left = shadow_current(s);
@@ -432,7 +452,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
 
 	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
 		sizeof(uint64_t) : s->info->value_type.size;
-	memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
+	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
 	       size * nr_right);
 
 	/*
@@ -443,7 +463,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
 	pn = dm_block_data(parent);
 	location = cpu_to_le64(dm_block_location(left));
 	__dm_bless_for_disk(&location);
-	memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
+	memcpy_disk(value_ptr(pn, parent_index),
 		    &location, sizeof(__le64));
 
 	location = cpu_to_le64(dm_block_location(right));
@@ -492,7 +512,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 	size_t size;
 	unsigned nr_left, nr_right;
 	struct dm_block *left, *right, *new_parent;
-	struct node *pn, *ln, *rn;
+	struct btree_node *pn, *ln, *rn;
 	__le64 val;
 
 	new_parent = shadow_current(s);
@@ -529,8 +549,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 
 	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
 		sizeof(__le64) : s->info->value_type.size;
-	memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
-	memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
+	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
 	       nr_right * size);
 
 	/* new_parent should just point to l and r now */
@@ -545,12 +565,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 	val = cpu_to_le64(dm_block_location(left));
 	__dm_bless_for_disk(&val);
 	pn->keys[0] = ln->keys[0];
-	memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
+	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
 
 	val = cpu_to_le64(dm_block_location(right));
 	__dm_bless_for_disk(&val);
 	pn->keys[1] = rn->keys[0];
-	memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
+	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
 
 	/*
 	 * rejig the spine.  This is ugly, since it knows too
@@ -577,7 +597,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
 			    uint64_t key, unsigned *index)
 {
 	int r, i = *index, top = 1;
-	struct node *node;
+	struct btree_node *node;
 
 	for (;;) {
 		r = shadow_step(s, root, vt);
@@ -595,7 +615,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 
 			__dm_bless_for_disk(&location);
-			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
+			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
 				    &location, sizeof(__le64));
 		}
 
@@ -644,7 +664,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
 	unsigned level, index = -1, last_level = info->levels - 1;
 	dm_block_t block = root;
 	struct shadow_spine spine;
-	struct node *n;
+	struct btree_node *n;
 	struct dm_btree_value_type le64_type;
 
 	le64_type.context = NULL;
@@ -710,12 +730,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
 		    (!info->value_type.equal ||
 		     !info->value_type.equal(
 			     info->value_type.context,
-			     value_ptr(n, index, info->value_type.size),
+			     value_ptr(n, index),
 			     value))) {
 			info->value_type.dec(info->value_type.context,
-					     value_ptr(n, index, info->value_type.size));
+					     value_ptr(n, index));
 		}
-		memcpy_disk(value_ptr(n, index, info->value_type.size),
+		memcpy_disk(value_ptr(n, index),
 			    value, info->value_type.size);
 	}
 
@@ -750,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
 
 /*----------------------------------------------------------------*/
 
-static int find_highest_key(struct ro_spine *s, dm_block_t block,
-			    uint64_t *result_key, dm_block_t *next_block)
+static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
+		    uint64_t *result_key, dm_block_t *next_block)
 {
 	int i, r;
 	uint32_t flags;
@@ -768,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
 		else
 			i--;
 
-		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
+		if (find_highest)
+			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
+		else
+			*result_key = le64_to_cpu(ro_node(s)->keys[0]);
+
 		if (next_block || flags & INTERNAL_NODE)
 			block = value64(ro_node(s), i);
 
@@ -779,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
 	return 0;
 }
 
-int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
-			      uint64_t *result_keys)
+static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
+			     bool find_highest, uint64_t *result_keys)
 {
 	int r = 0, count = 0, level;
 	struct ro_spine spine;
 
 	init_ro_spine(&spine, info);
 	for (level = 0; level < info->levels; level++) {
-		r = find_highest_key(&spine, root, result_keys + level,
-				     level == info->levels - 1 ? NULL : &root);
+		r = find_key(&spine, root, find_highest, result_keys + level,
+			     level == info->levels - 1 ? NULL : &root);
 		if (r == -ENODATA) {
 			r = 0;
 			break;
@@ -802,4 +826,71 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 
 	return r ? r : count;
 }
+
+int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
+			      uint64_t *result_keys)
+{
+	return dm_btree_find_key(info, root, true, result_keys);
+}
 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
+
+int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
+			     uint64_t *result_keys)
+{
+	return dm_btree_find_key(info, root, false, result_keys);
+}
+EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * FIXME: We shouldn't use a recursive algorithm when we have limited stack
+ * space.  Also this only works for single level trees.
+ */
+static int walk_node(struct ro_spine *s, dm_block_t block,
+		     int (*fn)(void *context, uint64_t *keys, void *leaf),
+		     void *context)
+{
+	int r;
+	unsigned i, nr;
+	struct btree_node *n;
+	uint64_t keys;
+
+	r = ro_step(s, block);
+	n = ro_node(s);
+
+	nr = le32_to_cpu(n->header.nr_entries);
+	for (i = 0; i < nr; i++) {
+		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
+			r = walk_node(s, value64(n, i), fn, context);
+			if (r)
+				goto out;
+		} else {
+			keys = le64_to_cpu(*key_ptr(n, i));
+			r = fn(context, &keys, value_ptr(n, i));
+			if (r)
+				goto out;
+		}
+	}
+
+out:
+	ro_pop(s);
+	return r;
+}
+
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
+		  void *context)
+{
+	int r;
+	struct ro_spine spine;
+
+	BUG_ON(info->levels > 1);
+
+	init_ro_spine(&spine, info);
+	r = walk_node(&spine, root, fn, context);
+	exit_ro_spine(&spine);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index ae02c84410f..dacfc34180b 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -35,7 +35,7 @@ struct dm_transaction_manager;
  */
 
 /*
- * Infomation about the values stored within the btree.
+ * Information about the values stored within the btree.
  */
 struct dm_btree_value_type {
 	void *context;
@@ -58,21 +58,21 @@ struct dm_btree_value_type {
 	 * somewhere.) This method is _not_ called for insertion of a new
 	 * value: It is assumed the ref count is already 1.
 	 */
-	void (*inc)(void *context, void *value);
+	void (*inc)(void *context, const void *value);
 
 	/*
 	 * This value is being deleted.  The btree takes care of freeing
 	 * the memory pointed to by @value.  Often the del function just
 	 * needs to decrement a reference count somewhere.
 	 */
-	void (*dec)(void *context, void *value);
+	void (*dec)(void *context, const void *value);
 
 	/*
 	 * A test for equality between two values.  When a value is
 	 * overwritten with a new one, the old one has the dec method
 	 * called _unless_ the new and old value are deemed equal.
 	 */
-	int (*equal)(void *context, void *value1, void *value2);
+	int (*equal)(void *context, const void *value1, const void *value2);
 };
 
 /*
@@ -137,9 +137,26 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 /*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have
+ * no lowest key.
+ */
+int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
+			     uint64_t *result_keys);
+
+/*
+ * Returns < 0 on failure.  Otherwise the number of key entries that have
+ * been filled out.  Remember trees can have zero entries, and as such have
  * no highest key.
  */
 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 			      uint64_t *result_keys);
 
+/*
+ * Iterate through the a btree, calling fn() on each entry.
+ * It only works for single level trees and is internally recursive, so
+ * monitor stack usage carefully.
+ */
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
+		  void *context);
+
 #endif	/* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c
deleted file mode 100644
index 50ed53bf4aa..00000000000
--- a/drivers/md/persistent-data/dm-space-map-checker.c
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright (C) 2011 Red Hat, Inc.
- *
- * This file is released under the GPL.
- */
-
-#include "dm-space-map-checker.h"
-
-#include <linux/device-mapper.h>
-#include <linux/export.h>
-
-#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
-
-#define DM_MSG_PREFIX "space map checker"
-
-/*----------------------------------------------------------------*/
-
-struct count_array {
-	dm_block_t nr;
-	dm_block_t nr_free;
-
-	uint32_t *counts;
-};
-
-static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	*count = ca->counts[b];
-	return 0;
-}
-
-static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	*r = ca->counts[b] > 1;
-	return 0;
-}
-
-static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
-{
-	uint32_t old_count;
-
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	old_count = ca->counts[b];
-
-	if (!count && old_count)
-		ca->nr_free++;
-
-	else if (count && !old_count)
-		ca->nr_free--;
-
-	ca->counts[b] = count;
-	return 0;
-}
-
-static int ca_inc_block(struct count_array *ca, dm_block_t b)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	ca_set_count(ca, b, ca->counts[b] + 1);
-	return 0;
-}
-
-static int ca_dec_block(struct count_array *ca, dm_block_t b)
-{
-	if (b >= ca->nr)
-		return -EINVAL;
-
-	BUG_ON(ca->counts[b] == 0);
-	ca_set_count(ca, b, ca->counts[b] - 1);
-	return 0;
-}
-
-static int ca_create(struct count_array *ca, struct dm_space_map *sm)
-{
-	int r;
-	dm_block_t nr_blocks;
-
-	r = dm_sm_get_nr_blocks(sm, &nr_blocks);
-	if (r)
-		return r;
-
-	ca->nr = nr_blocks;
-	ca->nr_free = nr_blocks;
-	ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL);
-	if (!ca->counts)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static int ca_load(struct count_array *ca, struct dm_space_map *sm)
-{
-	int r;
-	uint32_t count;
-	dm_block_t nr_blocks, i;
-
-	r = dm_sm_get_nr_blocks(sm, &nr_blocks);
-	if (r)
-		return r;
-
-	BUG_ON(ca->nr != nr_blocks);
-
-	DMWARN("Loading debug space map from disk.  This may take some time");
-	for (i = 0; i < nr_blocks; i++) {
-		r = dm_sm_get_count(sm, i, &count);
-		if (r) {
-			DMERR("load failed");
-			return r;
-		}
-
-		ca_set_count(ca, i, count);
-	}
-	DMWARN("Load complete");
-
-	return 0;
-}
-
-static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
-{
-	dm_block_t nr_blocks = ca->nr + extra_blocks;
-	uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL);
-	if (!counts)
-		return -ENOMEM;
-
-	memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
-	kfree(ca->counts);
-	ca->nr = nr_blocks;
-	ca->nr_free += extra_blocks;
-	ca->counts = counts;
-	return 0;
-}
-
-static int ca_commit(struct count_array *old, struct count_array *new)
-{
-	if (old->nr != new->nr) {
-		BUG_ON(old->nr > new->nr);
-		ca_extend(old, new->nr - old->nr);
-	}
-
-	BUG_ON(old->nr != new->nr);
-	old->nr_free = new->nr_free;
-	memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
-	return 0;
-}
-
-static void ca_destroy(struct count_array *ca)
-{
-	kfree(ca->counts);
-}
-
-/*----------------------------------------------------------------*/
-
-struct sm_checker {
-	struct dm_space_map sm;
-
-	struct count_array old_counts;
-	struct count_array counts;
-
-	struct dm_space_map *real_sm;
-};
-
-static void sm_checker_destroy(struct dm_space_map *sm)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-
-	dm_sm_destroy(smc->real_sm);
-	ca_destroy(&smc->old_counts);
-	ca_destroy(&smc->counts);
-	kfree(smc);
-}
-
-static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_get_nr_blocks(smc->real_sm, count);
-	if (!r)
-		BUG_ON(smc->old_counts.nr != *count);
-	return r;
-}
-
-static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_get_nr_free(smc->real_sm, count);
-	if (!r) {
-		/*
-		 * Slow, but we know it's correct.
-		 */
-		dm_block_t b, n = 0;
-		for (b = 0; b < smc->old_counts.nr; b++)
-			if (smc->old_counts.counts[b] == 0 &&
-			    smc->counts.counts[b] == 0)
-				n++;
-
-		if (n != *count)
-			DMERR("free block counts differ, checker %u, sm-disk:%u",
-			      (unsigned) n, (unsigned) *count);
-	}
-	return r;
-}
-
-static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_new_block(smc->real_sm, b);
-
-	if (!r) {
-		BUG_ON(*b >= smc->old_counts.nr);
-		BUG_ON(smc->old_counts.counts[*b] != 0);
-		BUG_ON(*b >= smc->counts.nr);
-		BUG_ON(smc->counts.counts[*b] != 0);
-		ca_set_count(&smc->counts, *b, 1);
-	}
-
-	return r;
-}
-
-static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_inc_block(smc->real_sm, b);
-	int r2 = ca_inc_block(&smc->counts, b);
-	BUG_ON(r != r2);
-	return r;
-}
-
-static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_dec_block(smc->real_sm, b);
-	int r2 = ca_dec_block(&smc->counts, b);
-	BUG_ON(r != r2);
-	return r;
-}
-
-static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	uint32_t result2 = 0;
-	int r = dm_sm_get_count(smc->real_sm, b, result);
-	int r2 = ca_get_count(&smc->counts, b, &result2);
-
-	BUG_ON(r != r2);
-	if (!r)
-		BUG_ON(*result != result2);
-	return r;
-}
-
-static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int result2 = 0;
-	int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
-	int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
-
-	BUG_ON(r != r2);
-	if (!r)
-		BUG_ON(!(*result) && result2);
-	return r;
-}
-
-static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	uint32_t old_rc;
-	int r = dm_sm_set_count(smc->real_sm, b, count);
-	int r2;
-
-	BUG_ON(b >= smc->counts.nr);
-	old_rc = smc->counts.counts[b];
-	r2 = ca_set_count(&smc->counts, b, count);
-	BUG_ON(r != r2);
-
-	return r;
-}
-
-static int sm_checker_commit(struct dm_space_map *sm)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r;
-
-	r = dm_sm_commit(smc->real_sm);
-	if (r)
-		return r;
-
-	r = ca_commit(&smc->old_counts, &smc->counts);
-	if (r)
-		return r;
-
-	return 0;
-}
-
-static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	int r = dm_sm_extend(smc->real_sm, extra_blocks);
-	if (r)
-		return r;
-
-	return ca_extend(&smc->counts, extra_blocks);
-}
-
-static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	return dm_sm_root_size(smc->real_sm, result);
-}
-
-static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
-{
-	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
-	return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
-}
-
-/*----------------------------------------------------------------*/
-
-static struct dm_space_map ops_ = {
-	.destroy = sm_checker_destroy,
-	.get_nr_blocks = sm_checker_get_nr_blocks,
-	.get_nr_free = sm_checker_get_nr_free,
-	.inc_block = sm_checker_inc_block,
-	.dec_block = sm_checker_dec_block,
-	.new_block = sm_checker_new_block,
-	.get_count = sm_checker_get_count,
-	.count_is_more_than_one = sm_checker_count_more_than_one,
-	.set_count = sm_checker_set_count,
-	.commit = sm_checker_commit,
-	.extend = sm_checker_extend,
-	.root_size = sm_checker_root_size,
-	.copy_root = sm_checker_copy_root
-};
-
-struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
-{
-	int r;
-	struct sm_checker *smc;
-
-	if (!sm)
-		return NULL;
-
-	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
-	if (!smc)
-		return NULL;
-
-	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
-	r = ca_create(&smc->old_counts, sm);
-	if (r) {
-		kfree(smc);
-		return NULL;
-	}
-
-	r = ca_create(&smc->counts, sm);
-	if (r) {
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return NULL;
-	}
-
-	smc->real_sm = sm;
-
-	r = ca_load(&smc->counts, sm);
-	if (r) {
-		ca_destroy(&smc->counts);
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return NULL;
-	}
-
-	r = ca_commit(&smc->old_counts, &smc->counts);
-	if (r) {
-		ca_destroy(&smc->counts);
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return NULL;
-	}
-
-	return &smc->sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create);
-
-struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
-{
-	int r;
-	struct sm_checker *smc;
-
-	if (!sm)
-		return NULL;
-
-	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
-	if (!smc)
-		return NULL;
-
-	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
-	r = ca_create(&smc->old_counts, sm);
-	if (r) {
-		kfree(smc);
-		return NULL;
-	}
-
-	r = ca_create(&smc->counts, sm);
-	if (r) {
-		ca_destroy(&smc->old_counts);
-		kfree(smc);
-		return NULL;
-	}
-
-	smc->real_sm = sm;
-	return &smc->sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
-
-/*----------------------------------------------------------------*/
-
-#else
-
-struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
-{
-	return sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create);
-
-struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
-{
-	return sm;
-}
-EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
-
-/*----------------------------------------------------------------*/
-
-#endif
diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h
deleted file mode 100644
index 444dccf6688..00000000000
--- a/drivers/md/persistent-data/dm-space-map-checker.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2011 Red Hat, Inc.
- *
- * This file is released under the GPL.
- */
-
-#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
-#define SNAPSHOTS_SPACE_MAP_CHECKER_H
-
-#include "dm-space-map.h"
-
-/*----------------------------------------------------------------*/
-
-/*
- * This space map wraps a real on-disk space map, and verifies all of its
- * operations.  It uses a lot of memory, so only use if you have a specific
- * problem that you're debugging.
- *
- * Ownership of @sm passes.
- */
-struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
-struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
-
-/*----------------------------------------------------------------*/
-
-#endif
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cd..aacbe70c2c2 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -39,8 +39,8 @@ static int index_check(struct dm_block_validator *v,
 	__le32 csum_disk;
 
 	if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
-		DMERR("index_check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(mi_le->blocknr), dm_block_location(b));
+		DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu",
+			    le64_to_cpu(mi_le->blocknr), dm_block_location(b));
 		return -ENOTBLK;
 	}
 
@@ -48,8 +48,8 @@ static int index_check(struct dm_block_validator *v,
 					       block_size - sizeof(__le32),
 					       INDEX_CSUM_XOR));
 	if (csum_disk != mi_le->csum) {
-		DMERR("index_check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
+		DMERR_LIMIT("index_check failed: csum %u != wanted %u",
+			    le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
 		return -EILSEQ;
 	}
 
@@ -89,8 +89,8 @@ static int bitmap_check(struct dm_block_validator *v,
 	__le32 csum_disk;
 
 	if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
-		DMERR("bitmap check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(disk_header->blocknr), dm_block_location(b));
+		DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu",
+			    le64_to_cpu(disk_header->blocknr), dm_block_location(b));
 		return -ENOTBLK;
 	}
 
@@ -98,8 +98,8 @@ static int bitmap_check(struct dm_block_validator *v,
 					       block_size - sizeof(__le32),
 					       BITMAP_CSUM_XOR));
 	if (csum_disk != disk_header->csum) {
-		DMERR("bitmap check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
+		DMERR_LIMIT("bitmap check failed: csum %u != wanted %u",
+			    le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
 		return -EILSEQ;
 	}
 
@@ -224,6 +224,7 @@ static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
 	ll->nr_blocks = 0;
 	ll->bitmap_root = 0;
 	ll->ref_count_root = 0;
+	ll->bitmap_index_changed = false;
 
 	return 0;
 }
@@ -244,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
 		return -EINVAL;
 	}
 
+	/*
+	 * We need to set this before the dm_tm_new_block() call below.
+	 */
+	ll->nr_blocks = nr_blocks;
 	for (i = old_blocks; i < blocks; i++) {
 		struct dm_block *b;
 		struct disk_index_entry idx;
@@ -251,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
 		r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
 		if (r < 0)
 			return r;
+
 		idx.blocknr = cpu_to_le64(dm_block_location(b));
 
 		r = dm_tm_unlock(ll->tm, b);
@@ -265,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
 			return r;
 	}
 
-	ll->nr_blocks = nr_blocks;
 	return 0;
 }
 
@@ -291,16 +296,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
 	return dm_tm_unlock(ll->tm, blk);
 }
 
-int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b,
+				      uint32_t *result)
 {
 	__le32 le_rc;
-	int r = sm_ll_lookup_bitmap(ll, b, result);
-
-	if (r)
-		return r;
-
-	if (*result != 3)
-		return r;
+	int r;
 
 	r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
 	if (r < 0)
@@ -311,6 +311,19 @@ int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
 	return r;
 }
 
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+{
+	int r = sm_ll_lookup_bitmap(ll, b, result);
+
+	if (r)
+		return r;
+
+	if (*result != 3)
+		return r;
+
+	return sm_ll_lookup_big_ref_count(ll, b, result);
+}
+
 int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 			  dm_block_t end, dm_block_t *result)
 {
@@ -371,11 +384,12 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 	return -ENOSPC;
 }
 
-int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
-		 uint32_t ref_count, enum allocation_event *ev)
+static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
+			void *context, enum allocation_event *ev)
 {
 	int r;
-	uint32_t bit, old;
+	uint32_t bit, old, ref_count;
 	struct dm_block *nb;
 	dm_block_t index = b;
 	struct disk_index_entry ie_disk;
@@ -398,6 +412,20 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	bm_le = dm_bitmap_data(nb);
 	old = sm_lookup_bitmap(bm_le, bit);
 
+	if (old > 2) {
+		r = sm_ll_lookup_big_ref_count(ll, b, &old);
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
+			return r;
+		}
+	}
+
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
+
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
 
@@ -405,8 +433,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 		if (r < 0)
 			return r;
 
-#if 0
-		/* FIXME: dm_btree_remove doesn't handle this yet */
 		if (old > 2) {
 			r = dm_btree_remove(&ll->ref_count_info,
 					    ll->ref_count_root,
@@ -414,7 +440,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 			if (r)
 				return r;
 		}
-#endif
 
 	} else {
 		__le32 le_rc = cpu_to_le32(ref_count);
@@ -436,50 +461,70 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	if (ref_count && !old) {
 		*ev = SM_ALLOC;
 		ll->nr_allocated++;
-		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1);
+		le32_add_cpu(&ie_disk.nr_free, -1);
 		if (le32_to_cpu(ie_disk.none_free_before) == bit)
 			ie_disk.none_free_before = cpu_to_le32(bit + 1);
 
 	} else if (old && !ref_count) {
 		*ev = SM_FREE;
 		ll->nr_allocated--;
-		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1);
+		le32_add_cpu(&ie_disk.nr_free, 1);
 		ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
 	}
 
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	int r;
-	uint32_t rc;
-
-	r = sm_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
+	*new = *((uint32_t *) context);
+	return 0;
+}
 
-	return sm_ll_insert(ll, b, rc + 1, ev);
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
+		 uint32_t ref_count, enum allocation_event *ev)
+{
+	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	int r;
-	uint32_t rc;
+	*new = old + 1;
+	return 0;
+}
 
-	r = sm_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
+}
 
-	if (!rc)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
+{
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
 		return -EINVAL;
+	}
+
+	*new = old - 1;
+	return 0;
+}
 
-	return sm_ll_insert(ll, b, rc - 1, ev);
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+	return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev);
 }
 
 int sm_ll_commit(struct ll_disk *ll)
 {
-	return ll->commit(ll);
+	int r = 0;
+
+	if (ll->bitmap_index_changed) {
+		r = ll->commit(ll);
+		if (!r)
+			ll->bitmap_index_changed = false;
+	}
+
+	return r;
 }
 
 /*----------------------------------------------------------------*/
@@ -494,6 +539,7 @@ static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
 static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
 			       struct disk_index_entry *ie)
 {
+	ll->bitmap_index_changed = true;
 	memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
 	return 0;
 }
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index 8f220821a9a..b3078d5eda0 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -78,6 +78,7 @@ struct ll_disk {
 	open_index_fn open_index;
 	max_index_entries_fn max_entries;
 	commit_fn commit;
+	bool bitmap_index_changed:1;
 };
 
 struct disk_sm_root {
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index fc469ba9f62..cfbf9617e46 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -4,7 +4,6 @@
  * This file is released under the GPL.
  */
 
-#include "dm-space-map-checker.h"
 #include "dm-space-map-common.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map.h"
@@ -141,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
 
 static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
 {
-	int r;
-	uint32_t old_count;
 	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	r = sm_ll_dec(&smd->ll, b, &ev);
-	if (!r && (ev == SM_FREE)) {
-		/*
-		 * It's only free if it's also free in the last
-		 * transaction.
-		 */
-		r = sm_ll_lookup(&smd->old_ll, b, &old_count);
-		if (r)
-			return r;
-
-		if (!old_count)
-			smd->nr_allocated_this_transaction--;
-	}
-
-	return r;
+	return sm_ll_dec(&smd->ll, b, &ev);
 }
 
 static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
@@ -249,12 +232,12 @@ static struct dm_space_map ops = {
 	.new_block = sm_disk_new_block,
 	.commit = sm_disk_commit,
 	.root_size = sm_disk_root_size,
-	.copy_root = sm_disk_copy_root
+	.copy_root = sm_disk_copy_root,
+	.register_threshold_callback = NULL
 };
 
-static struct dm_space_map *dm_sm_disk_create_real(
-	struct dm_transaction_manager *tm,
-	dm_block_t nr_blocks)
+struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
+				       dm_block_t nr_blocks)
 {
 	int r;
 	struct sm_disk *smd;
@@ -285,18 +268,10 @@ bad:
 	kfree(smd);
 	return ERR_PTR(r);
 }
-
-struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
-				       dm_block_t nr_blocks)
-{
-	struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
-	return dm_sm_checker_create_fresh(sm);
-}
 EXPORT_SYMBOL_GPL(dm_sm_disk_create);
 
-static struct dm_space_map *dm_sm_disk_open_real(
-	struct dm_transaction_manager *tm,
-	void *root_le, size_t len)
+struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
+				     void *root_le, size_t len)
 {
 	int r;
 	struct sm_disk *smd;
@@ -323,13 +298,6 @@ bad:
 	kfree(smd);
 	return ERR_PTR(r);
 }
-
-struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
-				     void *root_le, size_t len)
-{
-	return dm_sm_checker_create(
-		dm_sm_disk_open_real(tm, root_le, len));
-}
 EXPORT_SYMBOL_GPL(dm_sm_disk_open);
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e89ae5e7a51..786b689bdfc 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -17,6 +17,55 @@
 /*----------------------------------------------------------------*/
 
 /*
+ * An edge triggered threshold.
+ */
+struct threshold {
+	bool threshold_set;
+	bool value_set;
+	dm_block_t threshold;
+	dm_block_t current_value;
+	dm_sm_threshold_fn fn;
+	void *context;
+};
+
+static void threshold_init(struct threshold *t)
+{
+	t->threshold_set = false;
+	t->value_set = false;
+}
+
+static void set_threshold(struct threshold *t, dm_block_t value,
+			  dm_sm_threshold_fn fn, void *context)
+{
+	t->threshold_set = true;
+	t->threshold = value;
+	t->fn = fn;
+	t->context = context;
+}
+
+static bool below_threshold(struct threshold *t, dm_block_t value)
+{
+	return t->threshold_set && value <= t->threshold;
+}
+
+static bool threshold_already_triggered(struct threshold *t)
+{
+	return t->value_set && below_threshold(t, t->current_value);
+}
+
+static void check_threshold(struct threshold *t, dm_block_t value)
+{
+	if (below_threshold(t, value) &&
+	    !threshold_already_triggered(t))
+		t->fn(t->context);
+
+	t->value_set = true;
+	t->current_value = value;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
  * Space map interface.
  *
  * The low level disk format is written using the standard btree and
@@ -42,6 +91,69 @@ struct block_op {
 	dm_block_t block;
 };
 
+struct bop_ring_buffer {
+	unsigned begin;
+	unsigned end;
+	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+	brb->begin = 0;
+	brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+	return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+	unsigned r = old + 1;
+	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+		    enum block_op_type type, dm_block_t b)
+{
+	struct block_op *bop;
+	unsigned next = brb_next(brb, brb->end);
+
+	/*
+	 * We don't allow the last bop to be filled, this way we can
+	 * differentiate between full and empty.
+	 */
+	if (next == brb->begin)
+		return -ENOMEM;
+
+	bop = brb->bops + brb->end;
+	bop->type = type;
+	bop->block = b;
+
+	brb->end = next;
+
+	return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+	struct block_op *bop;
+
+	if (brb_empty(brb))
+		return -ENODATA;
+
+	bop = brb->bops + brb->begin;
+	result->type = bop->type;
+	result->block = bop->block;
+
+	brb->begin = brb_next(brb, brb->begin);
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
 struct sm_metadata {
 	struct dm_space_map sm;
 
@@ -52,23 +164,20 @@ struct sm_metadata {
 
 	unsigned recursion_count;
 	unsigned allocated_this_transaction;
-	unsigned nr_uncommitted;
-	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+	struct bop_ring_buffer uncommitted;
+
+	struct threshold threshold;
 };
 
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-	struct block_op *op;
+	int r = brb_push(&smm->uncommitted, type, b);
 
-	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+	if (r) {
 		DMERR("too many recursive allocations");
 		return -ENOMEM;
 	}
 
-	op = smm->uncommitted + smm->nr_uncommitted++;
-	op->type = type;
-	op->block = b;
-
 	return 0;
 }
 
@@ -107,11 +216,17 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
-		while (smm->nr_uncommitted && !r) {
-			smm->nr_uncommitted--;
-			r = commit_bop(smm, smm->uncommitted +
-				       smm->nr_uncommitted);
+	if (smm->recursion_count == 1) {
+		while (!brb_empty(&smm->uncommitted)) {
+			struct block_op bop;
+
+			r = brb_pop(&smm->uncommitted, &bop);
+			if (r) {
+				DMERR("bug in bop ring buffer");
+				break;
+			}
+
+			r = commit_bop(smm, &bop);
 			if (r)
 				break;
 		}
@@ -144,12 +259,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm)
 	kfree(smm);
 }
 
-static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
-{
-	DMERR("doesn't support extend");
-	return -EINVAL;
-}
-
 static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
 {
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
@@ -172,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 				 uint32_t *result)
 {
-	int r, i;
+	int r;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	unsigned adjustment = 0;
 
@@ -180,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -209,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 					      dm_block_t b, int *result)
 {
-	int r, i, adjustment = 0;
+	int r, adjustment = 0;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	uint32_t rc;
 
@@ -217,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -335,9 +451,23 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
 
 static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 {
+	dm_block_t count;
+	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
 	int r = sm_metadata_new_block_(sm, b);
-	if (r)
-		DMERR("out of metadata space");
+	if (r) {
+		DMERR_LIMIT("unable to allocate new metadata block");
+		return r;
+	}
+
+	r = sm_metadata_get_nr_free(sm, &count);
+	if (r) {
+		DMERR_LIMIT("couldn't get free block count");
+		return r;
+	}
+
+	check_threshold(&smm->threshold, count);
+
 	return r;
 }
 
@@ -357,6 +487,18 @@ static int sm_metadata_commit(struct dm_space_map *sm)
 	return 0;
 }
 
+static int sm_metadata_register_threshold_callback(struct dm_space_map *sm,
+						   dm_block_t threshold,
+						   dm_sm_threshold_fn fn,
+						   void *context)
+{
+	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+	set_threshold(&smm->threshold, threshold, fn, context);
+
+	return 0;
+}
+
 static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
 {
 	*result = sizeof(struct disk_sm_root);
@@ -382,6 +524,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
 	return 0;
 }
 
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
+
 static struct dm_space_map ops = {
 	.destroy = sm_metadata_destroy,
 	.extend = sm_metadata_extend,
@@ -395,7 +539,8 @@ static struct dm_space_map ops = {
 	.new_block = sm_metadata_new_block,
 	.commit = sm_metadata_commit,
 	.root_size = sm_metadata_root_size,
-	.copy_root = sm_metadata_copy_root
+	.copy_root = sm_metadata_copy_root,
+	.register_threshold_callback = sm_metadata_register_threshold_callback
 };
 
 /*----------------------------------------------------------------*/
@@ -410,7 +555,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm)
 
 static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 {
-	DMERR("boostrap doesn't support extend");
+	DMERR("bootstrap doesn't support extend");
 
 	return -EINVAL;
 }
@@ -450,7 +595,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
 static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
 				  uint32_t count)
 {
-	DMERR("boostrap doesn't support set_count");
+	DMERR("bootstrap doesn't support set_count");
 
 	return -EINVAL;
 }
@@ -491,7 +636,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm)
 
 static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
 {
-	DMERR("boostrap doesn't support root_size");
+	DMERR("bootstrap doesn't support root_size");
 
 	return -EINVAL;
 }
@@ -499,7 +644,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
 static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
 				  size_t max)
 {
-	DMERR("boostrap doesn't support copy_root");
+	DMERR("bootstrap doesn't support copy_root");
 
 	return -EINVAL;
 }
@@ -517,11 +662,60 @@ static struct dm_space_map bootstrap_ops = {
 	.new_block = sm_bootstrap_new_block,
 	.commit = sm_bootstrap_commit,
 	.root_size = sm_bootstrap_root_size,
-	.copy_root = sm_bootstrap_copy_root
+	.copy_root = sm_bootstrap_copy_root,
+	.register_threshold_callback = NULL
 };
 
 /*----------------------------------------------------------------*/
 
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+	int r, i;
+	enum allocation_event ev;
+	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+	dm_block_t old_len = smm->ll.nr_blocks;
+
+	/*
+	 * Flick into a mode where all blocks get allocated in the new area.
+	 */
+	smm->begin = old_len;
+	memcpy(sm, &bootstrap_ops, sizeof(*sm));
+
+	/*
+	 * Extend.
+	 */
+	r = sm_ll_extend(&smm->ll, extra_blocks);
+	if (r)
+		goto out;
+
+	/*
+	 * We repeatedly increment then commit until the commit doesn't
+	 * allocate any new blocks.
+	 */
+	do {
+		for (i = old_len; !r && i < smm->begin; i++) {
+			r = sm_ll_inc(&smm->ll, i, &ev);
+			if (r)
+				goto out;
+		}
+		old_len = smm->begin;
+
+		r = sm_ll_commit(&smm->ll);
+		if (r)
+			goto out;
+
+	} while (old_len != smm->begin);
+
+out:
+	/*
+	 * Switch back to normal behaviour.
+	 */
+	memcpy(sm, &ops, sizeof(*sm));
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
 struct dm_space_map *dm_sm_metadata_init(void)
 {
 	struct sm_metadata *smm;
@@ -548,7 +742,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	smm->begin = superblock + 1;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
+	threshold_init(&smm->threshold);
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
 
@@ -556,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+		nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
 	r = sm_ll_extend(&smm->ll, nr_blocks);
 	if (r)
 		return r;
@@ -589,7 +786,8 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
 	smm->begin = 0;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
+	threshold_init(&smm->threshold);
 
 	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
 	return 0;
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf..64df923974d 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
 
 #include "dm-transaction-manager.h"
 
+#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
+
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries.  Each
+ * index entry contains allocation info about ~16k metadata blocks.
+ */
+#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
+#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
+
 /*
  * Unfortunately we have to use two-phase construction due to the cycle
  * between the tm and sm.
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
index 1cbfc6b1638..3e6d1153b7c 100644
--- a/drivers/md/persistent-data/dm-space-map.h
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -9,6 +9,8 @@
 
 #include "dm-block-manager.h"
 
+typedef void (*dm_sm_threshold_fn)(void *context);
+
 /*
  * struct dm_space_map keeps a record of how many times each block in a device
  * is referenced.  It needs to be fixed on disk as part of the transaction.
@@ -59,6 +61,15 @@ struct dm_space_map {
 	 */
 	int (*root_size)(struct dm_space_map *sm, size_t *result);
 	int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
+
+	/*
+	 * You can register one threshold callback which is edge-triggered
+	 * when the free space in the space map drops below the threshold.
+	 */
+	int (*register_threshold_callback)(struct dm_space_map *sm,
+					   dm_block_t threshold,
+					   dm_sm_threshold_fn fn,
+					   void *context);
 };
 
 /*----------------------------------------------------------------*/
@@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le
 	return sm->copy_root(sm, copy_to_here_le, len);
 }
 
+static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm,
+						    dm_block_t threshold,
+						    dm_sm_threshold_fn fn,
+						    void *context)
+{
+	if (sm->register_threshold_callback)
+		return sm->register_threshold_callback(sm, threshold, fn, context);
+
+	return -EINVAL;
+}
+
+
 #endif	/* _LINUX_DM_SPACE_MAP_H */
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 6f8d38747d7..3bc30a0ae3d 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -5,7 +5,6 @@
  */
 #include "dm-transaction-manager.h"
 #include "dm-space-map.h"
-#include "dm-space-map-checker.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map-metadata.h"
 #include "dm-persistent-data-internal.h"
@@ -26,8 +25,8 @@ struct shadow_info {
 /*
  * It would be nice if we scaled with the size of transaction.
  */
-#define HASH_SIZE 256
-#define HASH_MASK (HASH_SIZE - 1)
+#define DM_HASH_SIZE 256
+#define DM_HASH_MASK (DM_HASH_SIZE - 1)
 
 struct dm_transaction_manager {
 	int is_clone;
@@ -37,7 +36,7 @@ struct dm_transaction_manager {
 	struct dm_space_map *sm;
 
 	spinlock_t lock;
-	struct hlist_head buckets[HASH_SIZE];
+	struct hlist_head buckets[DM_HASH_SIZE];
 };
 
 /*----------------------------------------------------------------*/
@@ -45,12 +44,11 @@ struct dm_transaction_manager {
 static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 {
 	int r = 0;
-	unsigned bucket = dm_hash_block(b, HASH_MASK);
+	unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
 	struct shadow_info *si;
-	struct hlist_node *n;
 
 	spin_lock(&tm->lock);
-	hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
+	hlist_for_each_entry(si, tm->buckets + bucket, hlist)
 		if (si->where == b) {
 			r = 1;
 			break;
@@ -72,7 +70,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 	si = kmalloc(sizeof(*si), GFP_NOIO);
 	if (si) {
 		si->where = b;
-		bucket = dm_hash_block(b, HASH_MASK);
+		bucket = dm_hash_block(b, DM_HASH_MASK);
 		spin_lock(&tm->lock);
 		hlist_add_head(&si->hlist, tm->buckets + bucket);
 		spin_unlock(&tm->lock);
@@ -82,14 +80,14 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 static void wipe_shadow_table(struct dm_transaction_manager *tm)
 {
 	struct shadow_info *si;
-	struct hlist_node *n, *tmp;
+	struct hlist_node *tmp;
 	struct hlist_head *bucket;
 	int i;
 
 	spin_lock(&tm->lock);
-	for (i = 0; i < HASH_SIZE; i++) {
+	for (i = 0; i < DM_HASH_SIZE; i++) {
 		bucket = tm->buckets + i;
-		hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
+		hlist_for_each_entry_safe(si, tmp, bucket, hlist)
 			kfree(si);
 
 		INIT_HLIST_HEAD(bucket);
@@ -116,7 +114,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
 	tm->sm = sm;
 
 	spin_lock_init(&tm->lock);
-	for (i = 0; i < HASH_SIZE; i++)
+	for (i = 0; i < DM_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(tm->buckets + i);
 
 	return tm;
@@ -138,6 +136,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
 
 void dm_tm_destroy(struct dm_transaction_manager *tm)
 {
+	if (!tm->is_clone)
+		wipe_shadow_table(tm);
+
 	kfree(tm);
 }
 EXPORT_SYMBOL_GPL(dm_tm_destroy);
@@ -153,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm)
 	if (r < 0)
 		return r;
 
-	return 0;
+	return dm_bm_flush(tm->bm);
 }
 EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
 
@@ -163,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
 		return -EWOULDBLOCK;
 
 	wipe_shadow_table(tm);
+	dm_bm_unlock(root);
 
-	return dm_bm_flush_and_unlock(tm->bm, root);
+	return dm_bm_flush(tm->bm);
 }
 EXPORT_SYMBOL_GPL(dm_tm_commit);
 
@@ -217,13 +219,24 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 	if (r < 0)
 		return r;
 
-	r = dm_bm_unlock_move(orig_block, new);
-	if (r < 0) {
+	/*
+	 * It would be tempting to use dm_bm_unlock_move here, but some
+	 * code, such as the space maps, keeps using the old data structures
+	 * secure in the knowledge they won't be changed until the next
+	 * transaction.  Using unlock_move would force a synchronous read
+	 * since the old block would no longer be in the cache.
+	 */
+	r = dm_bm_write_lock_zero(tm->bm, new, v, result);
+	if (r) {
 		dm_bm_unlock(orig_block);
 		return r;
 	}
 
-	return dm_bm_write_lock(tm->bm, new, v, result);
+	memcpy(dm_block_data(*result), dm_block_data(orig_block),
+	       dm_bm_block_size(tm->bm));
+
+	dm_bm_unlock(orig_block);
+	return r;
 }
 
 int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
@@ -249,6 +262,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 
 	return r;
 }
+EXPORT_SYMBOL_GPL(dm_tm_shadow_block);
 
 int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
 		    struct dm_block_validator *v,
@@ -259,6 +273,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
 
 	return dm_bm_read_lock(tm->bm, b, v, blk);
 }
+EXPORT_SYMBOL_GPL(dm_tm_read_lock);
 
 int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
 {
@@ -306,94 +321,61 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
 
 static int dm_tm_create_internal(struct dm_block_manager *bm,
 				 dm_block_t sb_location,
-				 struct dm_block_validator *sb_validator,
-				 size_t root_offset, size_t root_max_len,
 				 struct dm_transaction_manager **tm,
 				 struct dm_space_map **sm,
-				 struct dm_block **sblock,
-				 int create)
+				 int create,
+				 void *sm_root, size_t sm_len)
 {
 	int r;
-	struct dm_space_map *inner;
 
-	inner = dm_sm_metadata_init();
-	if (IS_ERR(inner))
-		return PTR_ERR(inner);
+	*sm = dm_sm_metadata_init();
+	if (IS_ERR(*sm))
+		return PTR_ERR(*sm);
 
-	*tm = dm_tm_create(bm, inner);
+	*tm = dm_tm_create(bm, *sm);
 	if (IS_ERR(*tm)) {
-		dm_sm_destroy(inner);
+		dm_sm_destroy(*sm);
 		return PTR_ERR(*tm);
 	}
 
 	if (create) {
-		r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location,
-					  sb_validator, sblock);
-		if (r < 0) {
-			DMERR("couldn't lock superblock");
-			goto bad1;
-		}
-
-		r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
+		r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
 					  sb_location);
 		if (r) {
 			DMERR("couldn't create metadata space map");
-			goto bad2;
+			goto bad;
 		}
 
-		*sm = dm_sm_checker_create(inner);
-		if (!*sm)
-			goto bad2;
-
 	} else {
-		r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
-				     sb_validator, sblock);
-		if (r < 0) {
-			DMERR("couldn't lock superblock");
-			goto bad1;
-		}
-
-		r = dm_sm_metadata_open(inner, *tm,
-					dm_block_data(*sblock) + root_offset,
-					root_max_len);
+		r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
 		if (r) {
 			DMERR("couldn't open metadata space map");
-			goto bad2;
+			goto bad;
 		}
-
-		*sm = dm_sm_checker_create(inner);
-		if (!*sm)
-			goto bad2;
 	}
 
 	return 0;
 
-bad2:
-	dm_tm_unlock(*tm, *sblock);
-bad1:
+bad:
 	dm_tm_destroy(*tm);
-	dm_sm_destroy(inner);
+	dm_sm_destroy(*sm);
 	return r;
 }
 
 int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-			 struct dm_block_validator *sb_validator,
 			 struct dm_transaction_manager **tm,
-			 struct dm_space_map **sm, struct dm_block **sblock)
+			 struct dm_space_map **sm)
 {
-	return dm_tm_create_internal(bm, sb_location, sb_validator,
-				     0, 0, tm, sm, sblock, 1);
+	return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
 }
 EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
 
 int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-		       struct dm_block_validator *sb_validator,
-		       size_t root_offset, size_t root_max_len,
+		       void *sm_root, size_t root_len,
 		       struct dm_transaction_manager **tm,
-		       struct dm_space_map **sm, struct dm_block **sblock)
+		       struct dm_space_map **sm)
 {
-	return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset,
-				     root_max_len, tm, sm, sblock, 0);
+	return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
 }
 EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
 
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 6da784871db..2772ed2a781 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac
 /*
  * We use a 2-phase commit here.
  *
- * i) In the first phase the block manager is told to start flushing, and
- * the changes to the space map are written to disk.  You should interrogate
- * your particular space map to get detail of its root node etc. to be
- * included in your superblock.
+ * i) Make all changes for the transaction *except* for the superblock.
+ * Then call dm_tm_pre_commit() to flush them to disk.
  *
- * ii) @root will be committed last.  You shouldn't use more than the
- * first 512 bytes of @root if you wish the transaction to survive a power
- * failure.  You *must* have a write lock held on @root for both stage (i)
- * and (ii).  The commit will drop the write lock.
+ * ii) Lock your superblock.  Update.  Then call dm_tm_commit() which will
+ * unlock the superblock and flush it.  No other blocks should be updated
+ * during this period.  Care should be taken to never unlock a partially
+ * updated superblock; perform any operations that could fail *before* you
+ * take the superblock lock.
  */
 int dm_tm_pre_commit(struct dm_transaction_manager *tm);
-int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
 
 /*
  * These methods are the only way to get hold of a writeable block.
@@ -115,16 +114,17 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
  *
  * Returns a tm that has an open transaction to write the new disk sm.
  * Caller should store the new sm root and commit.
+ *
+ * The superblock location is passed so the metadata space map knows it
+ * shouldn't be used.
  */
 int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-			 struct dm_block_validator *sb_validator,
 			 struct dm_transaction_manager **tm,
-			 struct dm_space_map **sm, struct dm_block **sblock);
+			 struct dm_space_map **sm);
 
 int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
-		       struct dm_block_validator *sb_validator,
-		       size_t root_offset, size_t root_max_len,
+		       void *sm_root, size_t root_len,
 		       struct dm_transaction_manager **tm,
-		       struct dm_space_map **sm, struct dm_block **sblock);
+		       struct dm_space_map **sm);
 
 #endif	/* _LINUX_DM_TRANSACTION_MANAGER_H */