111 files changed, 34783 insertions, 3910 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb31..5bdedf6df15 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MULTICORE_RAID456
-	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
-	depends on MD_RAID456
-	depends on SMP
-	depends on EXPERIMENTAL
-	---help---
-	  Enable the raid456 module to dispatch per-stripe raid operations to a
-	  thread pool.
-
-	  If unsure, say N.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
@@ -185,8 +174,14 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+source "drivers/md/bcache/Kconfig"
+
+config BLK_DEV_DM_BUILTIN
+	boolean
+
 config BLK_DEV_DM
 	tristate "Device mapper support"
+	select BLK_DEV_DM_BUILTIN
 	---help---
 	  Device-mapper is a low level volume manager.  It works by allowing
 	  people to specify mappings for ranges of logical sectors.  Various
@@ -210,7 +205,7 @@ config DM_DEBUG
 
 config DM_BUFIO
        tristate
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       depends on BLK_DEV_DM
        ---help---
 	 This interface allows you to do buffered I/O on a device and acts
 	 as a cache, holding recently-read blocks in memory and performing
@@ -218,7 +213,7 @@ config DM_BUFIO
 
 config DM_BIO_PRISON
        tristate
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       depends on BLK_DEV_DM
        ---help---
 	 Some bio locking schemes used by other device-mapper targets
 	 including thin provisioning.
@@ -247,26 +242,59 @@ config DM_CRYPT
 config DM_SNAPSHOT
        tristate "Snapshot target"
        depends on BLK_DEV_DM
+       select DM_BUFIO
        ---help---
          Allow volume managers to take writable snapshots of a device.
 
 config DM_THIN_PROVISIONING
-       tristate "Thin provisioning target (EXPERIMENTAL)"
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       tristate "Thin provisioning target"
+       depends on BLK_DEV_DM
        select DM_PERSISTENT_DATA
        select DM_BIO_PRISON
        ---help---
          Provides thin provisioning and snapshots that share a data store.
 
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of thin provisioning block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning.
+config DM_CACHE
+       tristate "Cache target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM
+       default n
+       select DM_PERSISTENT_DATA
+       select DM_BIO_PRISON
+       ---help---
+         dm-cache attempts to improve performance of a block device by
+         moving frequently used data to a smaller, higher performance
+         device.  Different 'policy' plugins can be used to change the
+         algorithms used to select which blocks are promoted, demoted,
+         cleaned etc.  It supports writeback and writethrough modes.
+
+config DM_CACHE_MQ
+       tristate "MQ Cache Policy (EXPERIMENTAL)"
+       depends on DM_CACHE
+       default y
+       ---help---
+         A cache policy that uses a multiqueue ordered by recent hit
+         count to select which blocks should be promoted and demoted.
+         This is meant to be a general purpose policy.  It prioritises
+         reads over writes.
+
+config DM_CACHE_CLEANER
+       tristate "Cleaner Cache Policy (EXPERIMENTAL)"
+       depends on DM_CACHE
+       default y
+       ---help---
+         A simple cache policy that writes back all data to the
+         origin.  Used when decommissioning a dm-cache.
 
-	  If unsure, say N.
+config DM_ERA
+       tristate "Era target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM
+       default n
+       select DM_PERSISTENT_DATA
+       select DM_BIO_PRISON
+       ---help---
+         dm-era tracks which parts of a block device are written to
+         over time.  Useful for maintaining cache coherency when using
+         vendor snapshots.
 
 config DM_MIRROR
        tristate "Mirror target"
@@ -275,6 +303,17 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_LOG_USERSPACE
+	tristate "Mirror userspace logging"
+	depends on DM_MIRROR && NET
+	select CONNECTOR
+	---help---
+	  The userspace logging module provides a mechanism for
+	  relaying the dm-dirty-log API to userspace.  Log designs
+	  which are more suited to userspace implementation (e.g.
+	  shared storage logs) or experimental logs can be implemented
+	  by leveraging this framework.
+
 config DM_RAID
        tristate "RAID 1/4/5/6/10 target"
        depends on BLK_DEV_DM
@@ -301,17 +340,6 @@ config DM_RAID
 	 RAID-5, RAID-6 distributes the syndromes across the drives
 	 in one of the available parity distribution methods.
 
-config DM_LOG_USERSPACE
-	tristate "Mirror userspace logging (EXPERIMENTAL)"
-	depends on DM_MIRROR && EXPERIMENTAL && NET
-	select CONNECTOR
-	---help---
-	  The userspace logging module provides a mechanism for
-	  relaying the dm-dirty-log API to userspace.  Log designs
-	  which are more suited to userspace implementation (e.g.
-	  shared storage logs) or experimental logs can be implemented
-	  by leveraging this framework.
-
 config DM_ZERO
 	tristate "Zero target"
 	depends on BLK_DEV_DM
@@ -350,8 +378,8 @@ config DM_MULTIPATH_ST
 	  If unsure, say N.
 
 config DM_DELAY
-	tristate "I/O delaying target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "I/O delaying target"
+	depends on BLK_DEV_DM
 	---help---
 	A target that delays reads and/or writes and can send
 	them to different devices.  Useful for testing.
@@ -365,14 +393,14 @@ config DM_UEVENT
 	Generate udev events for DM events.
 
 config DM_FLAKEY
-       tristate "Flakey target (EXPERIMENTAL)"
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       tristate "Flakey target"
+       depends on BLK_DEV_DM
        ---help---
          A target that intermittently fails I/O for debugging purposes.
 
 config DM_VERITY
-	tristate "Verity target support (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Verity target support"
+	depends on BLK_DEV_DM
 	select CRYPTO
 	select CRYPTO_HASH
 	select DM_BUFIO
@@ -390,4 +418,18 @@ config DM_VERITY
 
 	  If unsure, say N.
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM
+	---help---
+	  This device-mapper target creates a device that supports an arbitrary
+	  mapping of fixed-size regions of I/O across a fixed set of paths.
+	  The path used for any specific region can be switched dynamically
+	  by sending the target a message.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b4932..a2da532b1c2 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,7 +3,7 @@
 #
 
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
@@ -11,6 +11,10 @@ dm-mirror-y	+= dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-mq-y   += dm-cache-policy-mq.o
+dm-cache-cleaner-y += dm-cache-policy-cleaner.o
+dm-era-y	+= dm-era-target.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
 
@@ -26,8 +30,10 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
+obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
 obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o
 obj-$(CONFIG_DM_BIO_PRISON)	+= dm-bio-prison.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
@@ -36,6 +42,7 @@ obj-$(CONFIG_DM_FLAKEY)		+= dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)	+= persistent-data/
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
@@ -44,6 +51,10 @@ obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)	+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
+obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
+obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
+obj-$(CONFIG_DM_ERA)		+= dm-era.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
new file mode 100644
index 00000000000..4d200883c50
--- /dev/null
+++ b/drivers/md/bcache/Kconfig
@@ -0,0 +1,26 @@
+
+config BCACHE
+	tristate "Block device as cache"
+	---help---
+	Allows a block device to be used as cache for other devices; uses
+	a btree for indexing and the layout is optimized for SSDs.
+
+	See Documentation/bcache.txt for details.
+
+config BCACHE_DEBUG
+	bool "Bcache debugging"
+	depends on BCACHE
+	---help---
+	Don't select this option unless you're a developer
+
+	Enables extra debugging tools, allows expensive runtime checks to be
+	turned on.
+
+config BCACHE_CLOSURES_DEBUG
+	bool "Debug closures"
+	depends on BCACHE
+	select DEBUG_FS
+	---help---
+	Keeps all active closures in a linked list and provides a debugfs
+	interface to list them, which makes it possible to see asynchronous
+	operations that get stuck.
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
new file mode 100644
index 00000000000..c488b846f83
--- /dev/null
+++ b/drivers/md/bcache/Makefile
@@ -0,0 +1,8 @@
+
+obj-$(CONFIG_BCACHE)	+= bcache.o
+
+bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
+	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+	util.o writeback.o
+
+CFLAGS_request.o	+= -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
new file mode 100644
index 00000000000..443d03fbac4
--- /dev/null
+++ b/drivers/md/bcache/alloc.c
@@ -0,0 +1,696 @@
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens are important but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * There is another freelist, because sometimes we have buckets that we know
+ * have nothing pointing into them - these we can reuse without waiting for
+ * priorities to be rewritten. These come from freed btree nodes and buckets
+ * that garbage collection discovered no longer had valid keys pointing into
+ * them (because they were overwritten). That's the unused list - buckets on the
+ * unused list move to the free list, optionally being discarded in the process.
+ *
+ * It's also important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyways - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch_bucket_alloc() allocates a single bucket from a specific cache.
+ *
+ * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * out of a cache set.
+ *
+ * free_some_buckets() drives all the processes described above. It's called
+ * from bch_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+
+#include <linux/blkdev.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+#include <trace/events/bcache.h>
+
+/* Bucket heap / gen */
+
+uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
+{
+	uint8_t ret = ++b->gen;
+
+	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
+	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
+
+	return ret;
+}
+
+void bch_rescale_priorities(struct cache_set *c, int sectors)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
+	unsigned i;
+	int r;
+
+	atomic_sub(sectors, &c->rescale);
+
+	do {
+		r = atomic_read(&c->rescale);
+
+		if (r >= 0)
+			return;
+	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
+
+	mutex_lock(&c->bucket_lock);
+
+	c->min_prio = USHRT_MAX;
+
+	for_each_cache(ca, c, i)
+		for_each_bucket(b, ca)
+			if (b->prio &&
+			    b->prio != BTREE_PRIO &&
+			    !atomic_read(&b->pin)) {
+				b->prio--;
+				c->min_prio = min(c->min_prio, b->prio);
+			}
+
+	mutex_unlock(&c->bucket_lock);
+}
+
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
+
+static inline bool can_inc_bucket_gen(struct bucket *b)
+{
+	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
+}
+
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
+{
+	BUG_ON(!ca->set->gc_mark_valid);
+
+	return (!GC_MARK(b) ||
+		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
+		!atomic_read(&b->pin) &&
+		can_inc_bucket_gen(b);
+}
+
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+	lockdep_assert_held(&ca->set->bucket_lock);
+	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+	if (GC_SECTORS_USED(b))
+		trace_bcache_invalidate(ca, b - ca->buckets);
+
+	bch_inc_gen(ca, b);
+	b->prio = INITIAL_PRIO;
+	atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+	__bch_invalidate_one_bucket(ca, b);
+
+	fifo_push(&ca->free_inc, b - ca->buckets);
+}
+
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_prio()
+ * first: we also take into account the number of sectors of live data in that
+ * bucket, and in order for that multiply to make sense we have to scale bucket
+ *
+ * Thus, we scale the bucket priorities so that the bucket with the smallest
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
+#define bucket_prio(b)							\
+({									\
+	unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\
+									\
+	(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);	\
+})
+
+#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
+#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))
+
+static void invalidate_buckets_lru(struct cache *ca)
+{
+	struct bucket *b;
+	ssize_t i;
+
+	ca->heap.used = 0;
+
+	for_each_bucket(b, ca) {
+		if (!bch_can_invalidate_bucket(ca, b))
+			continue;
+
+		if (!heap_full(&ca->heap))
+			heap_add(&ca->heap, b, bucket_max_cmp);
+		else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
+			ca->heap.data[0] = b;
+			heap_sift(&ca->heap, 0, bucket_max_cmp);
+		}
+	}
+
+	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
+		heap_sift(&ca->heap, i, bucket_min_cmp);
+
+	while (!fifo_full(&ca->free_inc)) {
+		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
+			/*
+			 * We don't want to be calling invalidate_buckets()
+			 * multiple times when it can't do anything
+			 */
+			ca->invalidate_needs_gc = 1;
+			wake_up_gc(ca->set);
+			return;
+		}
+
+		bch_invalidate_one_bucket(ca, b);
+	}
+}
+
+static void invalidate_buckets_fifo(struct cache *ca)
+{
+	struct bucket *b;
+	size_t checked = 0;
+
+	while (!fifo_full(&ca->free_inc)) {
+		if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
+		    ca->fifo_last_bucket >= ca->sb.nbuckets)
+			ca->fifo_last_bucket = ca->sb.first_bucket;
+
+		b = ca->buckets + ca->fifo_last_bucket++;
+
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
+
+		if (++checked >= ca->sb.nbuckets) {
+			ca->invalidate_needs_gc = 1;
+			wake_up_gc(ca->set);
+			return;
+		}
+	}
+}
+
+static void invalidate_buckets_random(struct cache *ca)
+{
+	struct bucket *b;
+	size_t checked = 0;
+
+	while (!fifo_full(&ca->free_inc)) {
+		size_t n;
+		get_random_bytes(&n, sizeof(n));
+
+		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
+		n += ca->sb.first_bucket;
+
+		b = ca->buckets + n;
+
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
+
+		if (++checked >= ca->sb.nbuckets / 2) {
+			ca->invalidate_needs_gc = 1;
+			wake_up_gc(ca->set);
+			return;
+		}
+	}
+}
+
+static void invalidate_buckets(struct cache *ca)
+{
+	BUG_ON(ca->invalidate_needs_gc);
+
+	switch (CACHE_REPLACEMENT(&ca->sb)) {
+	case CACHE_REPLACEMENT_LRU:
+		invalidate_buckets_lru(ca);
+		break;
+	case CACHE_REPLACEMENT_FIFO:
+		invalidate_buckets_fifo(ca);
+		break;
+	case CACHE_REPLACEMENT_RANDOM:
+		invalidate_buckets_random(ca);
+		break;
+	}
+}
+
+#define allocator_wait(ca, cond)					\
+do {									\
+	while (1) {							\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (cond)						\
+			break;						\
+									\
+		mutex_unlock(&(ca)->set->bucket_lock);			\
+		if (kthread_should_stop())				\
+			return 0;					\
+									\
+		try_to_freeze();					\
+		schedule();						\
+		mutex_lock(&(ca)->set->bucket_lock);			\
+	}								\
+	__set_current_state(TASK_RUNNING);				\
+} while (0)
+
+static int bch_allocator_push(struct cache *ca, long bucket)
+{
+	unsigned i;
+
+	/* Prios/gens are actually the most important reserve */
+	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+		return true;
+
+	for (i = 0; i < RESERVE_NR; i++)
+		if (fifo_push(&ca->free[i], bucket))
+			return true;
+
+	return false;
+}
+
+static int bch_allocator_thread(void *arg)
+{
+	struct cache *ca = arg;
+
+	mutex_lock(&ca->set->bucket_lock);
+
+	while (1) {
+		/*
+		 * First, we pull buckets off of the unused and free_inc lists,
+		 * possibly issue discards to them, then we add the bucket to
+		 * the free list:
+		 */
+		while (!fifo_empty(&ca->free_inc)) {
+			long bucket;
+
+			fifo_pop(&ca->free_inc, bucket);
+
+			if (ca->discard) {
+				mutex_unlock(&ca->set->bucket_lock);
+				blkdev_issue_discard(ca->bdev,
+					bucket_to_sector(ca->set, bucket),
+					ca->sb.block_size, GFP_KERNEL, 0);
+				mutex_lock(&ca->set->bucket_lock);
+			}
+
+			allocator_wait(ca, bch_allocator_push(ca, bucket));
+			wake_up(&ca->set->btree_cache_wait);
+			wake_up(&ca->set->bucket_wait);
+		}
+
+		/*
+		 * We've run out of free buckets, we need to find some buckets
+		 * we can invalidate. First, invalidate them in memory and add
+		 * them to the free_inc list:
+		 */
+
+retry_invalidate:
+		allocator_wait(ca, ca->set->gc_mark_valid &&
+			       !ca->invalidate_needs_gc);
+		invalidate_buckets(ca);
+
+		/*
+		 * Now, we write their new gens to disk so we can start writing
+		 * new stuff to them:
+		 */
+		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
+		if (CACHE_SYNC(&ca->set->sb)) {
+			/*
+			 * This could deadlock if an allocation with a btree
+			 * node locked ever blocked - having the btree node
+			 * locked would block garbage collection, but here we're
+			 * waiting on garbage collection before we invalidate
+			 * and free anything.
+			 *
+			 * But this should be safe since the btree code always
+			 * uses btree_check_reserve() before allocating now, and
+			 * if it fails it blocks without btree nodes locked.
+			 */
+			if (!fifo_full(&ca->free_inc))
+				goto retry_invalidate;
+
+			bch_prio_write(ca);
+		}
+	}
+}
+
+/* Allocation */
+
+long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
+{
+	DEFINE_WAIT(w);
+	struct bucket *b;
+	long r;
+
+	/* fastpath */
+	if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+	    fifo_pop(&ca->free[reserve], r))
+		goto out;
+
+	if (!wait) {
+		trace_bcache_alloc_fail(ca, reserve);
+		return -1;
+	}
+
+	do {
+		prepare_to_wait(&ca->set->bucket_wait, &w,
+				TASK_UNINTERRUPTIBLE);
+
+		mutex_unlock(&ca->set->bucket_lock);
+		schedule();
+		mutex_lock(&ca->set->bucket_lock);
+	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
+		 !fifo_pop(&ca->free[reserve], r));
+
+	finish_wait(&ca->set->bucket_wait, &w);
+out:
+	wake_up_process(ca->alloc_thread);
+
+	trace_bcache_alloc(ca, reserve);
+
+	if (expensive_debug_checks(ca->set)) {
+		size_t iter;
+		long i;
+		unsigned j;
+
+		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
+			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
+
+		for (j = 0; j < RESERVE_NR; j++)
+			fifo_for_each(i, &ca->free[j], iter)
+				BUG_ON(i == r);
+		fifo_for_each(i, &ca->free_inc, iter)
+			BUG_ON(i == r);
+	}
+
+	b = ca->buckets + r;
+
+	BUG_ON(atomic_read(&b->pin) != 1);
+
+	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
+
+	if (reserve <= RESERVE_PRIO) {
+		SET_GC_MARK(b, GC_MARK_METADATA);
+		SET_GC_MOVE(b, 0);
+		b->prio = BTREE_PRIO;
+	} else {
+		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+		SET_GC_MOVE(b, 0);
+		b->prio = INITIAL_PRIO;
+	}
+
+	return r;
+}
+
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+	SET_GC_MARK(b, 0);
+	SET_GC_SECTORS_USED(b, 0);
+}
+
+void bch_bucket_free(struct cache_set *c, struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		__bch_bucket_free(PTR_CACHE(c, k, i),
+				  PTR_BUCKET(c, k, i));
+}
+
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
+			   struct bkey *k, int n, bool wait)
+{
+	int i;
+
+	lockdep_assert_held(&c->bucket_lock);
+	BUG_ON(!n || n > c->caches_loaded || n > 8);
+
+	bkey_init(k);
+
+	/* sort by free space/prio of oldest data in caches */
+
+	for (i = 0; i < n; i++) {
+		struct cache *ca = c->cache_by_alloc[i];
+		long b = bch_bucket_alloc(ca, reserve, wait);
+
+		if (b == -1)
+			goto err;
+
+		k->ptr[i] = PTR(ca->buckets[b].gen,
+				bucket_to_sector(c, b),
+				ca->sb.nr_this_dev);
+
+		SET_KEY_PTRS(k, i + 1);
+	}
+
+	return 0;
+err:
+	bch_bucket_free(c, k);
+	bkey_put(c, k);
+	return -1;
+}
+
+int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
+			 struct bkey *k, int n, bool wait)
+{
+	int ret;
+	mutex_lock(&c->bucket_lock);
+	ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
+
+/* Sector allocator */
+
+struct open_bucket {
+	struct list_head	list;
+	unsigned		last_write_point;
+	unsigned		sectors_free;
+	BKEY_PADDED(key);
+};
+
+/*
+ * We keep multiple buckets open for writes, and try to segregate different
+ * write streams for better cache utilization: first we look for a bucket where
+ * the last write to it was sequential with the current write, and failing that
+ * we look for a bucket that was last used by the same task.
+ *
+ * The ideas is if you've got multiple tasks pulling data into the cache at the
+ * same time, you'll get better cache utilization if you try to segregate their
+ * data and preserve locality.
+ *
+ * For example, say you've starting Firefox at the same time you're copying a
+ * bunch of files. Firefox will likely end up being fairly hot and stay in the
+ * cache awhile, but the data you copied might not be; if you wrote all that
+ * data to the same buckets it'd get invalidated at the same time.
+ *
+ * Both of those tasks will be doing fairly random IO so we can't rely on
+ * detecting sequential IO to segregate their data, but going off of the task
+ * should be a sane heuristic.
+ */
+static struct open_bucket *pick_data_bucket(struct cache_set *c,
+					    const struct bkey *search,
+					    unsigned write_point,
+					    struct bkey *alloc)
+{
+	struct open_bucket *ret, *ret_task = NULL;
+
+	list_for_each_entry_reverse(ret, &c->data_buckets, list)
+		if (!bkey_cmp(&ret->key, search))
+			goto found;
+		else if (ret->last_write_point == write_point)
+			ret_task = ret;
+
+	ret = ret_task ?: list_first_entry(&c->data_buckets,
+					   struct open_bucket, list);
+found:
+	if (!ret->sectors_free && KEY_PTRS(alloc)) {
+		ret->sectors_free = c->sb.bucket_size;
+		bkey_copy(&ret->key, alloc);
+		bkey_init(alloc);
+	}
+
+	if (!ret->sectors_free)
+		ret = NULL;
+
+	return ret;
+}
+
+/*
+ * Allocates some space in the cache to write to, and k to point to the newly
+ * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
+ * sectors were actually allocated.
+ *
+ * If s->writeback is true, will not fail.
+ */
+bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
+		       unsigned write_point, unsigned write_prio, bool wait)
+{
+	struct open_bucket *b;
+	BKEY_PADDED(key) alloc;
+	unsigned i;
+
+	/*
+	 * We might have to allocate a new bucket, which we can't do with a
+	 * spinlock held. So if we have to allocate, we drop the lock, allocate
+	 * and then retry. KEY_PTRS() indicates whether alloc points to
+	 * allocated bucket(s).
+	 */
+
+	bkey_init(&alloc.key);
+	spin_lock(&c->data_bucket_lock);
+
+	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
+		unsigned watermark = write_prio
+			? RESERVE_MOVINGGC
+			: RESERVE_NONE;
+
+		spin_unlock(&c->data_bucket_lock);
+
+		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
+			return false;
+
+		spin_lock(&c->data_bucket_lock);
+	}
+
+	/*
+	 * If we had to allocate, we might race and not need to allocate the
+	 * second time we call find_data_bucket(). If we allocated a bucket but
+	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
+	 */
+	if (KEY_PTRS(&alloc.key))
+		bkey_put(c, &alloc.key);
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++)
+		EBUG_ON(ptr_stale(c, &b->key, i));
+
+	/* Set up the pointer to the space we're allocating: */
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++)
+		k->ptr[i] = b->key.ptr[i];
+
+	sectors = min(sectors, b->sectors_free);
+
+	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
+	SET_KEY_SIZE(k, sectors);
+	SET_KEY_PTRS(k, KEY_PTRS(&b->key));
+
+	/*
+	 * Move b to the end of the lru, and keep track of what this bucket was
+	 * last used for:
+	 */
+	list_move_tail(&b->list, &c->data_buckets);
+	bkey_copy_key(&b->key, k);
+	b->last_write_point = write_point;
+
+	b->sectors_free	-= sectors;
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++) {
+		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
+
+		atomic_long_add(sectors,
+				&PTR_CACHE(c, &b->key, i)->sectors_written);
+	}
+
+	if (b->sectors_free < c->sb.block_size)
+		b->sectors_free = 0;
+
+	/*
+	 * k takes refcounts on the buckets it points to until it's inserted
+	 * into the btree, but if we're done with this bucket we just transfer
+	 * get_data_bucket()'s refcount.
+	 */
+	if (b->sectors_free)
+		for (i = 0; i < KEY_PTRS(&b->key); i++)
+			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
+
+	spin_unlock(&c->data_bucket_lock);
+	return true;
+}
+
+/* Init */
+
+void bch_open_buckets_free(struct cache_set *c)
+{
+	struct open_bucket *b;
+
+	while (!list_empty(&c->data_buckets)) {
+		b = list_first_entry(&c->data_buckets,
+				     struct open_bucket, list);
+		list_del(&b->list);
+		kfree(b);
+	}
+}
+
+int bch_open_buckets_alloc(struct cache_set *c)
+{
+	int i;
+
+	spin_lock_init(&c->data_bucket_lock);
+
+	for (i = 0; i < 6; i++) {
+		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+		if (!b)
+			return -ENOMEM;
+
+		list_add(&b->list, &c->data_buckets);
+	}
+
+	return 0;
+}
+
+int bch_cache_allocator_start(struct cache *ca)
+{
+	struct task_struct *k = kthread_run(bch_allocator_thread,
+					    ca, "bcache_allocator");
+	if (IS_ERR(k))
+		return PTR_ERR(k);
+
+	ca->alloc_thread = k;
+	return 0;
+}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
new file mode 100644
index 00000000000..d2ebcf32309
--- /dev/null
+++ b/drivers/md/bcache/bcache.h
@@ -0,0 +1,942 @@
+#ifndef _BCACHE_H
+#define _BCACHE_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on cache it, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part design around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished by either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data it each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/bcache.h>
+#include <linux/bio.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bset.h"
+#include "util.h"
+#include "closure.h"
+
+struct bucket {
+	atomic_t	pin;
+	uint16_t	prio;
+	uint8_t		gen;
+	uint8_t		last_gc; /* Most out of date gen in the btree */
+	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+};
+
+/*
+ * I'd use bitfields for these, but I don't trust the compiler not to screw me
+ * as multiple threads touch struct bucket without locking
+ */
+
+BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
+#define GC_MARK_RECLAIMABLE	1
+#define GC_MARK_DIRTY		2
+#define GC_MARK_METADATA	3
+#define GC_SECTORS_USED_SIZE	13
+#define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
+BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
+
+#include "journal.h"
+#include "stats.h"
+struct search;
+struct btree;
+struct keybuf;
+
+struct keybuf_key {
+	struct rb_node		node;
+	BKEY_PADDED(key);
+	void			*private;
+};
+
+struct keybuf {
+	struct bkey		last_scanned;
+	spinlock_t		lock;
+
+	/*
+	 * Beginning and end of range in rb tree - so that we can skip taking
+	 * lock and checking the rb tree when we need to check for overlapping
+	 * keys.
+	 */
+	struct bkey		start;
+	struct bkey		end;
+
+	struct rb_root		keys;
+
+#define KEYBUF_NR		500
+	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
+};
+
+struct bio_split_pool {
+	struct bio_set		*bio_split;
+	mempool_t		*bio_split_hook;
+};
+
+struct bio_split_hook {
+	struct closure		cl;
+	struct bio_split_pool	*p;
+	struct bio		*bio;
+	bio_end_io_t		*bi_end_io;
+	void			*bi_private;
+};
+
+struct bcache_device {
+	struct closure		cl;
+
+	struct kobject		kobj;
+
+	struct cache_set	*c;
+	unsigned		id;
+#define BCACHEDEVNAME_SIZE	12
+	char			name[BCACHEDEVNAME_SIZE];
+
+	struct gendisk		*disk;
+
+	unsigned long		flags;
+#define BCACHE_DEV_CLOSING	0
+#define BCACHE_DEV_DETACHING	1
+#define BCACHE_DEV_UNLINK_DONE	2
+
+	unsigned		nr_stripes;
+	unsigned		stripe_size;
+	atomic_t		*stripe_sectors_dirty;
+	unsigned long		*full_dirty_stripes;
+
+	unsigned long		sectors_dirty_last;
+	long			sectors_dirty_derivative;
+
+	struct bio_set		*bio_split;
+
+	unsigned		data_csum:1;
+
+	int (*cache_miss)(struct btree *, struct search *,
+			  struct bio *, unsigned);
+	int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
+
+	struct bio_split_pool	bio_split_hook;
+};
+
+struct io {
+	/* Used to track sequential IO so it can be skipped */
+	struct hlist_node	hash;
+	struct list_head	lru;
+
+	unsigned long		jiffies;
+	unsigned		sequential;
+	sector_t		last;
+};
+
+struct cached_dev {
+	struct list_head	list;
+	struct bcache_device	disk;
+	struct block_device	*bdev;
+
+	struct cache_sb		sb;
+	struct bio		sb_bio;
+	struct bio_vec		sb_bv[1];
+	struct closure		sb_write;
+	struct semaphore	sb_write_mutex;
+
+	/* Refcount on the cache set. Always nonzero when we're caching. */
+	atomic_t		count;
+	struct work_struct	detach;
+
+	/*
+	 * Device might not be running if it's dirty and the cache set hasn't
+	 * showed up yet.
+	 */
+	atomic_t		running;
+
+	/*
+	 * Writes take a shared lock from start to finish; scanning for dirty
+	 * data to refill the rb tree requires an exclusive lock.
+	 */
+	struct rw_semaphore	writeback_lock;
+
+	/*
+	 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+	 * data in the cache. Protected by writeback_lock; must have an
+	 * shared lock to set and exclusive lock to clear.
+	 */
+	atomic_t		has_dirty;
+
+	struct bch_ratelimit	writeback_rate;
+	struct delayed_work	writeback_rate_update;
+
+	/*
+	 * Internal to the writeback code, so read_dirty() can keep track of
+	 * where it's at.
+	 */
+	sector_t		last_read;
+
+	/* Limit number of writeback bios in flight */
+	struct semaphore	in_flight;
+	struct task_struct	*writeback_thread;
+
+	struct keybuf		writeback_keys;
+
+	/* For tracking sequential IO */
+#define RECENT_IO_BITS	7
+#define RECENT_IO	(1 << RECENT_IO_BITS)
+	struct io		io[RECENT_IO];
+	struct hlist_head	io_hash[RECENT_IO + 1];
+	struct list_head	io_lru;
+	spinlock_t		io_lock;
+
+	struct cache_accounting	accounting;
+
+	/* The rest of this all shows up in sysfs */
+	unsigned		sequential_cutoff;
+	unsigned		readahead;
+
+	unsigned		verify:1;
+	unsigned		bypass_torture_test:1;
+
+	unsigned		partial_stripes_expensive:1;
+	unsigned		writeback_metadata:1;
+	unsigned		writeback_running:1;
+	unsigned char		writeback_percent;
+	unsigned		writeback_delay;
+
+	uint64_t		writeback_rate_target;
+	int64_t			writeback_rate_proportional;
+	int64_t			writeback_rate_derivative;
+	int64_t			writeback_rate_change;
+
+	unsigned		writeback_rate_update_seconds;
+	unsigned		writeback_rate_d_term;
+	unsigned		writeback_rate_p_term_inverse;
+};
+
+enum alloc_reserve {
+	RESERVE_BTREE,
+	RESERVE_PRIO,
+	RESERVE_MOVINGGC,
+	RESERVE_NONE,
+	RESERVE_NR,
+};
+
+struct cache {
+	struct cache_set	*set;
+	struct cache_sb		sb;
+	struct bio		sb_bio;
+	struct bio_vec		sb_bv[1];
+
+	struct kobject		kobj;
+	struct block_device	*bdev;
+
+	struct task_struct	*alloc_thread;
+
+	struct closure		prio;
+	struct prio_set		*disk_buckets;
+
+	/*
+	 * When allocating new buckets, prio_write() gets first dibs - since we
+	 * may not be allocate at all without writing priorities and gens.
+	 * prio_buckets[] contains the last buckets we wrote priorities to (so
+	 * gc can mark them as metadata), prio_next[] contains the buckets
+	 * allocated for the next prio write.
+	 */
+	uint64_t		*prio_buckets;
+	uint64_t		*prio_last_buckets;
+
+	/*
+	 * free: Buckets that are ready to be used
+	 *
+	 * free_inc: Incoming buckets - these are buckets that currently have
+	 * cached data in them, and we can't reuse them until after we write
+	 * their new gen to disk. After prio_write() finishes writing the new
+	 * gens/prios, they'll be moved to the free list (and possibly discarded
+	 * in the process)
+	 */
+	DECLARE_FIFO(long, free)[RESERVE_NR];
+	DECLARE_FIFO(long, free_inc);
+
+	size_t			fifo_last_bucket;
+
+	/* Allocation stuff: */
+	struct bucket		*buckets;
+
+	DECLARE_HEAP(struct bucket *, heap);
+
+	/*
+	 * If nonzero, we know we aren't going to find any buckets to invalidate
+	 * until a gc finishes - otherwise we could pointlessly burn a ton of
+	 * cpu
+	 */
+	unsigned		invalidate_needs_gc:1;
+
+	bool			discard; /* Get rid of? */
+
+	struct journal_device	journal;
+
+	/* The rest of this all shows up in sysfs */
+#define IO_ERROR_SHIFT		20
+	atomic_t		io_errors;
+	atomic_t		io_count;
+
+	atomic_long_t		meta_sectors_written;
+	atomic_long_t		btree_sectors_written;
+	atomic_long_t		sectors_written;
+
+	struct bio_split_pool	bio_split_hook;
+};
+
+struct gc_stat {
+	size_t			nodes;
+	size_t			key_bytes;
+
+	size_t			nkeys;
+	uint64_t		data;	/* sectors */
+	unsigned		in_use; /* percent */
+};
+
+/*
+ * Flag bits, for how the cache set is shutting down, and what phase it's at:
+ *
+ * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
+ * all the backing devices first (their cached data gets invalidated, and they
+ * won't automatically reattach).
+ *
+ * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
+ * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
+ * flushing dirty data).
+ */
+#define CACHE_SET_UNREGISTERING		0
+#define	CACHE_SET_STOPPING		1
+
+struct cache_set {
+	struct closure		cl;
+
+	struct list_head	list;
+	struct kobject		kobj;
+	struct kobject		internal;
+	struct dentry		*debug;
+	struct cache_accounting accounting;
+
+	unsigned long		flags;
+
+	struct cache_sb		sb;
+
+	struct cache		*cache[MAX_CACHES_PER_SET];
+	struct cache		*cache_by_alloc[MAX_CACHES_PER_SET];
+	int			caches_loaded;
+
+	struct bcache_device	**devices;
+	struct list_head	cached_devs;
+	uint64_t		cached_dev_sectors;
+	struct closure		caching;
+
+	struct closure		sb_write;
+	struct semaphore	sb_write_mutex;
+
+	mempool_t		*search;
+	mempool_t		*bio_meta;
+	struct bio_set		*bio_split;
+
+	/* For the btree cache */
+	struct shrinker		shrink;
+
+	/* For the btree cache and anything allocation related */
+	struct mutex		bucket_lock;
+
+	/* log2(bucket_size), in sectors */
+	unsigned short		bucket_bits;
+
+	/* log2(block_size), in sectors */
+	unsigned short		block_bits;
+
+	/*
+	 * Default number of pages for a new btree node - may be less than a
+	 * full bucket
+	 */
+	unsigned		btree_pages;
+
+	/*
+	 * Lists of struct btrees; lru is the list for structs that have memory
+	 * allocated for actual btree node, freed is for structs that do not.
+	 *
+	 * We never free a struct btree, except on shutdown - we just put it on
+	 * the btree_cache_freed list and reuse it later. This simplifies the
+	 * code, and it doesn't cost us much memory as the memory usage is
+	 * dominated by buffers that hold the actual btree node data and those
+	 * can be freed - and the number of struct btrees allocated is
+	 * effectively bounded.
+	 *
+	 * btree_cache_freeable effectively is a small cache - we use it because
+	 * high order page allocations can be rather expensive, and it's quite
+	 * common to delete and allocate btree nodes in quick succession. It
+	 * should never grow past ~2-3 nodes in practice.
+	 */
+	struct list_head	btree_cache;
+	struct list_head	btree_cache_freeable;
+	struct list_head	btree_cache_freed;
+
+	/* Number of elements in btree_cache + btree_cache_freeable lists */
+	unsigned		btree_cache_used;
+
+	/*
+	 * If we need to allocate memory for a new btree node and that
+	 * allocation fails, we can cannibalize another node in the btree cache
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
+	 */
+	wait_queue_head_t	btree_cache_wait;
+	struct task_struct	*btree_cache_alloc_lock;
+
+	/*
+	 * When we free a btree node, we increment the gen of the bucket the
+	 * node is in - but we can't rewrite the prios and gens until we
+	 * finished whatever it is we were doing, otherwise after a crash the
+	 * btree node would be freed but for say a split, we might not have the
+	 * pointers to the new nodes inserted into the btree yet.
+	 *
+	 * This is a refcount that blocks prio_write() until the new keys are
+	 * written.
+	 */
+	atomic_t		prio_blocked;
+	wait_queue_head_t	bucket_wait;
+
+	/*
+	 * For any bio we don't skip we subtract the number of sectors from
+	 * rescale; when it hits 0 we rescale all the bucket priorities.
+	 */
+	atomic_t		rescale;
+	/*
+	 * When we invalidate buckets, we use both the priority and the amount
+	 * of good data to determine which buckets to reuse first - to weight
+	 * those together consistently we keep track of the smallest nonzero
+	 * priority of any bucket.
+	 */
+	uint16_t		min_prio;
+
+	/*
+	 * max(gen - last_gc) for all buckets. When it gets too big we have to gc
+	 * to keep gens from wrapping around.
+	 */
+	uint8_t			need_gc;
+	struct gc_stat		gc_stats;
+	size_t			nbuckets;
+
+	struct task_struct	*gc_thread;
+	/* Where in the btree gc currently is */
+	struct bkey		gc_done;
+
+	/*
+	 * The allocation code needs gc_mark in struct bucket to be correct, but
+	 * it's not while a gc is in progress. Protected by bucket_lock.
+	 */
+	int			gc_mark_valid;
+
+	/* Counts how many sectors bio_insert has added to the cache */
+	atomic_t		sectors_to_gc;
+
+	wait_queue_head_t	moving_gc_wait;
+	struct keybuf		moving_gc_keys;
+	/* Number of moving GC bios in flight */
+	struct semaphore	moving_in_flight;
+
+	struct workqueue_struct	*moving_gc_wq;
+
+	struct btree		*root;
+
+#ifdef CONFIG_BCACHE_DEBUG
+	struct btree		*verify_data;
+	struct bset		*verify_ondisk;
+	struct mutex		verify_lock;
+#endif
+
+	unsigned		nr_uuids;
+	struct uuid_entry	*uuids;
+	BKEY_PADDED(uuid_bucket);
+	struct closure		uuid_write;
+	struct semaphore	uuid_write_mutex;
+
+	/*
+	 * A btree node on disk could have too many bsets for an iterator to fit
+	 * on the stack - have to dynamically allocate them
+	 */
+	mempool_t		*fill_iter;
+
+	struct bset_sort_state	sort;
+
+	/* List of buckets we're currently writing data to */
+	struct list_head	data_buckets;
+	spinlock_t		data_bucket_lock;
+
+	struct journal		journal;
+
+#define CONGESTED_MAX		1024
+	unsigned		congested_last_us;
+	atomic_t		congested;
+
+	/* The rest of this all shows up in sysfs */
+	unsigned		congested_read_threshold_us;
+	unsigned		congested_write_threshold_us;
+
+	struct time_stats	btree_gc_time;
+	struct time_stats	btree_split_time;
+	struct time_stats	btree_read_time;
+
+	atomic_long_t		cache_read_races;
+	atomic_long_t		writeback_keys_done;
+	atomic_long_t		writeback_keys_failed;
+
+	enum			{
+		ON_ERROR_UNREGISTER,
+		ON_ERROR_PANIC,
+	}			on_error;
+	unsigned		error_limit;
+	unsigned		error_decay;
+
+	unsigned short		journal_delay_ms;
+	bool			expensive_debug_checks;
+	unsigned		verify:1;
+	unsigned		key_merging_disabled:1;
+	unsigned		gc_always_rewrite:1;
+	unsigned		shrinker_disabled:1;
+	unsigned		copy_gc_enabled:1;
+
+#define BUCKET_HASH_BITS	12
+	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+};
+
+struct bbio {
+	unsigned		submit_time_us;
+	union {
+		struct bkey	key;
+		uint64_t	_pad[3];
+		/*
+		 * We only need pad = 3 here because we only ever carry around a
+		 * single pointer - i.e. the pointer we're doing io to/from.
+		 */
+	};
+	struct bio		bio;
+};
+
+#define BTREE_PRIO		USHRT_MAX
+#define INITIAL_PRIO		32768U
+
+#define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)
+#define btree_blocks(b)							\
+	((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
+
+#define btree_default_blocks(c)						\
+	((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
+
+#define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
+#define bucket_bytes(c)		((c)->sb.bucket_size << 9)
+#define block_bytes(c)		((c)->sb.block_size << 9)
+
+#define prios_per_bucket(c)				\
+	((bucket_bytes(c) - sizeof(struct prio_set)) /	\
+	 sizeof(struct bucket_disk))
+#define prio_buckets(c)					\
+	DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
+
+static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
+{
+	return s >> c->bucket_bits;
+}
+
+static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
+{
+	return ((sector_t) b) << c->bucket_bits;
+}
+
+static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
+{
+	return s & (c->sb.bucket_size - 1);
+}
+
+static inline struct cache *PTR_CACHE(struct cache_set *c,
+				      const struct bkey *k,
+				      unsigned ptr)
+{
+	return c->cache[PTR_DEV(k, ptr)];
+}
+
+static inline size_t PTR_BUCKET_NR(struct cache_set *c,
+				   const struct bkey *k,
+				   unsigned ptr)
+{
+	return sector_to_bucket(c, PTR_OFFSET(k, ptr));
+}
+
+static inline struct bucket *PTR_BUCKET(struct cache_set *c,
+					const struct bkey *k,
+					unsigned ptr)
+{
+	return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
+}
+
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+	uint8_t r = a - b;
+	return r > 128U ? 0 : r;
+}
+
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+				unsigned i)
+{
+	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
+}
+
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+				 unsigned i)
+{
+	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+/* Btree key macros */
+
+/*
+ * This is used for various on disk data structures - cache_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first 8 bytes of these structs
+ */
+#define csum_set(i)							\
+	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\
+		  ((void *) bset_bkey_last(i)) -			\
+		  (((void *) (i)) + sizeof(uint64_t)))
+
+/* Error handling macros */
+
+#define btree_bug(b, ...)						\
+do {									\
+	if (bch_cache_set_error((b)->c, __VA_ARGS__))			\
+		dump_stack();						\
+} while (0)
+
+#define cache_bug(c, ...)						\
+do {									\
+	if (bch_cache_set_error(c, __VA_ARGS__))			\
+		dump_stack();						\
+} while (0)
+
+#define btree_bug_on(cond, b, ...)					\
+do {									\
+	if (cond)							\
+		btree_bug(b, __VA_ARGS__);				\
+} while (0)
+
+#define cache_bug_on(cond, c, ...)					\
+do {									\
+	if (cond)							\
+		cache_bug(c, __VA_ARGS__);				\
+} while (0)
+
+#define cache_set_err_on(cond, c, ...)					\
+do {									\
+	if (cond)							\
+		bch_cache_set_error(c, __VA_ARGS__);			\
+} while (0)
+
+/* Looping macros */
+
+#define for_each_cache(ca, cs, iter)					\
+	for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)
+
+#define for_each_bucket(b, ca)						\
+	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
+	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
+
+static inline void cached_dev_put(struct cached_dev *dc)
+{
+	if (atomic_dec_and_test(&dc->count))
+		schedule_work(&dc->detach);
+}
+
+static inline bool cached_dev_get(struct cached_dev *dc)
+{
+	if (!atomic_inc_not_zero(&dc->count))
+		return false;
+
+	/* Paired with the mb in cached_dev_attach */
+	smp_mb__after_atomic();
+	return true;
+}
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree (last_gc).
+ */
+
+static inline uint8_t bucket_gc_gen(struct bucket *b)
+{
+	return b->gen - b->last_gc;
+}
+
+#define BUCKET_GC_GEN_MAX	96U
+
+#define kobj_attribute_write(n, fn)					\
+	static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
+
+#define kobj_attribute_rw(n, show, store)				\
+	static struct kobj_attribute ksysfs_##n =			\
+		__ATTR(n, S_IWUSR|S_IRUSR, show, store)
+
+static inline void wake_up_allocators(struct cache_set *c)
+{
+	struct cache *ca;
+	unsigned i;
+
+	for_each_cache(ca, c, i)
+		wake_up_process(ca->alloc_thread);
+}
+
+/* Forward declarations */
+
+void bch_count_io_errors(struct cache *, int, const char *);
+void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
+			      int, const char *);
+void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
+void bch_bbio_free(struct bio *, struct cache_set *);
+struct bio *bch_bbio_alloc(struct cache_set *);
+
+void bch_generic_make_request(struct bio *, struct bio_split_pool *);
+void __bch_submit_bbio(struct bio *, struct cache_set *);
+void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
+
+uint8_t bch_inc_gen(struct cache *, struct bucket *);
+void bch_rescale_priorities(struct cache_set *, int);
+
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
+void bch_bucket_free(struct cache_set *, struct bkey *);
+
+long bch_bucket_alloc(struct cache *, unsigned, bool);
+int __bch_bucket_alloc_set(struct cache_set *, unsigned,
+			   struct bkey *, int, bool);
+int bch_bucket_alloc_set(struct cache_set *, unsigned,
+			 struct bkey *, int, bool);
+bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
+		       unsigned, unsigned, bool);
+
+__printf(2, 3)
+bool bch_cache_set_error(struct cache_set *, const char *, ...);
+
+void bch_prio_write(struct cache *);
+void bch_write_bdev_super(struct cached_dev *, struct closure *);
+
+extern struct workqueue_struct *bcache_wq;
+extern const char * const bch_cache_modes[];
+extern struct mutex bch_register_lock;
+extern struct list_head bch_cache_sets;
+
+extern struct kobj_type bch_cached_dev_ktype;
+extern struct kobj_type bch_flash_dev_ktype;
+extern struct kobj_type bch_cache_set_ktype;
+extern struct kobj_type bch_cache_set_internal_ktype;
+extern struct kobj_type bch_cache_ktype;
+
+void bch_cached_dev_release(struct kobject *);
+void bch_flash_dev_release(struct kobject *);
+void bch_cache_set_release(struct kobject *);
+void bch_cache_release(struct kobject *);
+
+int bch_uuid_write(struct cache_set *);
+void bcache_write_super(struct cache_set *);
+
+int bch_flash_dev_create(struct cache_set *c, uint64_t size);
+
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+void bch_cached_dev_detach(struct cached_dev *);
+void bch_cached_dev_run(struct cached_dev *);
+void bcache_device_stop(struct bcache_device *);
+
+void bch_cache_set_unregister(struct cache_set *);
+void bch_cache_set_stop(struct cache_set *);
+
+struct cache_set *bch_cache_set_alloc(struct cache_sb *);
+void bch_btree_cache_free(struct cache_set *);
+int bch_btree_cache_alloc(struct cache_set *);
+void bch_moving_init_cache_set(struct cache_set *);
+int bch_open_buckets_alloc(struct cache_set *);
+void bch_open_buckets_free(struct cache_set *);
+
+int bch_cache_allocator_start(struct cache *ca);
+
+void bch_debug_exit(void);
+int bch_debug_init(struct kobject *);
+void bch_request_exit(void);
+int bch_request_init(void);
+
+#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
new file mode 100644
index 00000000000..54541641530
--- /dev/null
+++ b/drivers/md/bcache/bset.c
@@ -0,0 +1,1331 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys with in a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include "util.h"
+#include "bset.h"
+
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
+{
+	struct bkey *k, *next;
+
+	for (k = i->start; k < bset_bkey_last(i); k = next) {
+		next = bkey_next(k);
+
+		printk(KERN_ERR "block %u key %u/%u: ", set,
+		       (unsigned) ((u64 *) k - i->d), i->keys);
+
+		if (b->ops->key_dump)
+			b->ops->key_dump(b, k);
+		else
+			printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
+
+		if (next < bset_bkey_last(i) &&
+		    bkey_cmp(k, b->ops->is_extents ?
+			     &START_KEY(next) : next) > 0)
+			printk(KERN_ERR "Key skipped backwards\n");
+	}
+}
+
+void bch_dump_bucket(struct btree_keys *b)
+{
+	unsigned i;
+
+	console_lock();
+	for (i = 0; i <= b->nsets; i++)
+		bch_dump_bset(b, b->set[i].data,
+			      bset_sector_offset(b, b->set[i].data));
+	console_unlock();
+}
+
+int __bch_count_data(struct btree_keys *b)
+{
+	unsigned ret = 0;
+	struct btree_iter iter;
+	struct bkey *k;
+
+	if (b->ops->is_extents)
+		for_each_key(b, k, &iter)
+			ret += KEY_SIZE(k);
+	return ret;
+}
+
+void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
+{
+	va_list args;
+	struct bkey *k, *p = NULL;
+	struct btree_iter iter;
+	const char *err;
+
+	for_each_key(b, k, &iter) {
+		if (b->ops->is_extents) {
+			err = "Keys out of order";
+			if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
+				goto bug;
+
+			if (bch_ptr_invalid(b, k))
+				continue;
+
+			err =  "Overlapping keys";
+			if (p && bkey_cmp(p, &START_KEY(k)) > 0)
+				goto bug;
+		} else {
+			if (bch_ptr_bad(b, k))
+				continue;
+
+			err = "Duplicate keys";
+			if (p && !bkey_cmp(p, k))
+				goto bug;
+		}
+		p = k;
+	}
+#if 0
+	err = "Key larger than btree node key";
+	if (p && bkey_cmp(p, &b->key) > 0)
+		goto bug;
+#endif
+	return;
+bug:
+	bch_dump_bucket(b);
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	panic("bch_check_keys error:  %s:\n", err);
+}
+
+static void bch_btree_iter_next_check(struct btree_iter *iter)
+{
+	struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+	if (next < iter->data->end &&
+	    bkey_cmp(k, iter->b->ops->is_extents ?
+		     &START_KEY(next) : next) > 0) {
+		bch_dump_bucket(iter->b);
+		panic("Key skipped backwards\n");
+	}
+}
+
+#else
+
+static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
+
+#endif
+
+/* Keylists */
+
+int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
+{
+	size_t oldsize = bch_keylist_nkeys(l);
+	size_t newsize = oldsize + u64s;
+	uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
+	uint64_t *new_keys;
+
+	newsize = roundup_pow_of_two(newsize);
+
+	if (newsize <= KEYLIST_INLINE ||
+	    roundup_pow_of_two(oldsize) == newsize)
+		return 0;
+
+	new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
+
+	if (!new_keys)
+		return -ENOMEM;
+
+	if (!old_keys)
+		memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
+
+	l->keys_p = new_keys;
+	l->top_p = new_keys + oldsize;
+
+	return 0;
+}
+
+struct bkey *bch_keylist_pop(struct keylist *l)
+{
+	struct bkey *k = l->keys;
+
+	if (k == l->top)
+		return NULL;
+
+	while (bkey_next(k) != l->top)
+		k = bkey_next(k);
+
+	return l->top = k;
+}
+
+void bch_keylist_pop_front(struct keylist *l)
+{
+	l->top_p -= bkey_u64s(l->keys);
+
+	memmove(l->keys,
+		bkey_next(l->keys),
+		bch_keylist_bytes(l));
+}
+
+/* Key/pointer manipulation */
+
+void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
+			      unsigned i)
+{
+	BUG_ON(i > KEY_PTRS(src));
+
+	/* Only copy the header, key, and one pointer. */
+	memcpy(dest, src, 2 * sizeof(uint64_t));
+	dest->ptr[0] = src->ptr[i];
+	SET_KEY_PTRS(dest, 1);
+	/* We didn't copy the checksum so clear that bit. */
+	SET_KEY_CSUM(dest, 0);
+}
+
+bool __bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+	unsigned i, len = 0;
+
+	if (bkey_cmp(where, &START_KEY(k)) <= 0)
+		return false;
+
+	if (bkey_cmp(where, k) < 0)
+		len = KEY_OFFSET(k) - KEY_OFFSET(where);
+	else
+		bkey_copy_key(k, where);
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
+
+	BUG_ON(len > KEY_SIZE(k));
+	SET_KEY_SIZE(k, len);
+	return true;
+}
+
+bool __bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+	unsigned len = 0;
+
+	if (bkey_cmp(where, k) >= 0)
+		return false;
+
+	BUG_ON(KEY_INODE(where) != KEY_INODE(k));
+
+	if (bkey_cmp(where, &START_KEY(k)) > 0)
+		len = KEY_OFFSET(where) - KEY_START(k);
+
+	bkey_copy_key(k, where);
+
+	BUG_ON(len > KEY_SIZE(k));
+	SET_KEY_SIZE(k, len);
+	return true;
+}
+
+/* Auxiliary search trees */
+
+/* 32 bits total: */
+#define BKEY_MID_BITS		3
+#define BKEY_EXPONENT_BITS	7
+#define BKEY_MANTISSA_BITS	(32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
+#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+	unsigned	exponent:BKEY_EXPONENT_BITS;
+	unsigned	m:BKEY_MID_BITS;
+	unsigned	mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It definites the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliar search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE		128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree_keys *b)
+{
+	return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree_keys *b)
+{
+	return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
+/* Space required for the auxiliary search trees */
+static inline size_t bset_tree_bytes(struct btree_keys *b)
+{
+	return btree_keys_cachelines(b) * sizeof(struct bkey_float);
+}
+
+/* Space required for the prev pointers */
+static inline size_t bset_prev_bytes(struct btree_keys *b)
+{
+	return btree_keys_cachelines(b) * sizeof(uint8_t);
+}
+
+/* Memory allocation */
+
+void bch_btree_keys_free(struct btree_keys *b)
+{
+	struct bset_tree *t = b->set;
+
+	if (bset_prev_bytes(b) < PAGE_SIZE)
+		kfree(t->prev);
+	else
+		free_pages((unsigned long) t->prev,
+			   get_order(bset_prev_bytes(b)));
+
+	if (bset_tree_bytes(b) < PAGE_SIZE)
+		kfree(t->tree);
+	else
+		free_pages((unsigned long) t->tree,
+			   get_order(bset_tree_bytes(b)));
+
+	free_pages((unsigned long) t->data, b->page_order);
+
+	t->prev = NULL;
+	t->tree = NULL;
+	t->data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_free);
+
+int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
+{
+	struct bset_tree *t = b->set;
+
+	BUG_ON(t->data);
+
+	b->page_order = page_order;
+
+	t->data = (void *) __get_free_pages(gfp, b->page_order);
+	if (!t->data)
+		goto err;
+
+	t->tree = bset_tree_bytes(b) < PAGE_SIZE
+		? kmalloc(bset_tree_bytes(b), gfp)
+		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+	if (!t->tree)
+		goto err;
+
+	t->prev = bset_prev_bytes(b) < PAGE_SIZE
+		? kmalloc(bset_prev_bytes(b), gfp)
+		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+	if (!t->prev)
+		goto err;
+
+	return 0;
+err:
+	bch_btree_keys_free(b);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(bch_btree_keys_alloc);
+
+void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
+			 bool *expensive_debug_checks)
+{
+	unsigned i;
+
+	b->ops = ops;
+	b->expensive_debug_checks = expensive_debug_checks;
+	b->nsets = 0;
+	b->last_set_unwritten = 0;
+
+	/* XXX: shouldn't be needed */
+	for (i = 0; i < MAX_BSETS; i++)
+		b->set[i].size = 0;
+	/*
+	 * Second loop starts at 1 because b->keys[0]->data is the memory we
+	 * allocated
+	 */
+	for (i = 1; i < MAX_BSETS; i++)
+		b->set[i].data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_init);
+
+/* Binary tree stuff for auxiliary search trees */
+
+static unsigned inorder_next(unsigned j, unsigned size)
+{
+	if (j * 2 + 1 < size) {
+		j = j * 2 + 1;
+
+		while (j * 2 < size)
+			j *= 2;
+	} else
+		j >>= ffz(j) + 1;
+
+	return j;
+}
+
+static unsigned inorder_prev(unsigned j, unsigned size)
+{
+	if (j * 2 < size) {
+		j = j * 2;
+
+		while (j * 2 + 1 < size)
+			j = j * 2 + 1;
+	} else
+		j >>= ffs(j);
+
+	return j;
+}
+
+/* I have no idea why this code works... and I'm the one who wrote it
+ *
+ * However, I do know what it does:
+ * Given a binary tree constructed in an array (i.e. how you normally implement
+ * a heap), it converts a node in the tree - referenced by array index - to the
+ * index it would have if you did an inorder traversal.
+ *
+ * Also tested for every j, size up to size somewhere around 6 million.
+ *
+ * The binary tree starts at array index 1, not 0
+ * extra is a function of size:
+ *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ */
+static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
+{
+	unsigned b = fls(j);
+	unsigned shift = fls(size - 1) - b;
+
+	j  ^= 1U << (b - 1);
+	j <<= 1;
+	j  |= 1;
+	j <<= shift;
+
+	if (j > extra)
+		j -= (j - extra) >> 1;
+
+	return j;
+}
+
+static unsigned to_inorder(unsigned j, struct bset_tree *t)
+{
+	return __to_inorder(j, t->size, t->extra);
+}
+
+static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
+{
+	unsigned shift;
+
+	if (j > extra)
+		j += j - extra;
+
+	shift = ffs(j);
+
+	j >>= shift;
+	j  |= roundup_pow_of_two(size) >> shift;
+
+	return j;
+}
+
+static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
+{
+	return __inorder_to_tree(j, t->size, t->extra);
+}
+
+#if 0
+void inorder_test(void)
+{
+	unsigned long done = 0;
+	ktime_t start = ktime_get();
+
+	for (unsigned size = 2;
+	     size < 65536000;
+	     size++) {
+		unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+		unsigned i = 1, j = rounddown_pow_of_two(size - 1);
+
+		if (!(size % 4096))
+			printk(KERN_NOTICE "loop %u, %llu per us\n", size,
+			       done / ktime_us_delta(ktime_get(), start));
+
+		while (1) {
+			if (__inorder_to_tree(i, size, extra) != j)
+				panic("size %10u j %10u i %10u", size, j, i);
+
+			if (__to_inorder(j, size, extra) != i)
+				panic("size %10u j %10u i %10u", size, j, i);
+
+			if (j == rounddown_pow_of_two(size) - 1)
+				break;
+
+			BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
+
+			j = inorder_next(j, size);
+			i++;
+		}
+
+		done += size - 1;
+	}
+}
+#endif
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; to_inorder() gives us the cacheline, and then
+ * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
+
+static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
+				      unsigned offset)
+{
+	return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
+{
+	return ((void *) k - (void *) t->data) / BSET_CACHELINE;
+}
+
+static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
+					 unsigned cacheline,
+					 struct bkey *k)
+{
+	return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
+}
+
+static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
+{
+	return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
+}
+
+static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
+{
+	return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table in t->prev.
+ */
+static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
+{
+	return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
+}
+
+static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
+{
+	low >>= shift;
+	low  |= (high << 1) << (63U - shift);
+	return low;
+}
+
+static inline unsigned bfloat_mantissa(const struct bkey *k,
+				       struct bkey_float *f)
+{
+	const uint64_t *p = &k->low - (f->exponent >> 6);
+	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
+}
+
+static void make_bfloat(struct bset_tree *t, unsigned j)
+{
+	struct bkey_float *f = &t->tree[j];
+	struct bkey *m = tree_to_bkey(t, j);
+	struct bkey *p = tree_to_prev_bkey(t, j);
+
+	struct bkey *l = is_power_of_2(j)
+		? t->data->start
+		: tree_to_prev_bkey(t, j >> ffs(j));
+
+	struct bkey *r = is_power_of_2(j + 1)
+		? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
+		: tree_to_bkey(t, j >> (ffz(j) + 1));
+
+	BUG_ON(m < l || m > r);
+	BUG_ON(bkey_next(p) != m);
+
+	if (KEY_INODE(l) != KEY_INODE(r))
+		f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
+	else
+		f->exponent = fls64(r->low ^ l->low);
+
+	f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
+
+	/*
+	 * Setting f->exponent = 127 flags this node as failed, and causes the
+	 * lookup code to fall back to comparing against the original key.
+	 */
+
+	if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
+		f->mantissa = bfloat_mantissa(m, f) - 1;
+	else
+		f->exponent = 127;
+}
+
+static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
+{
+	if (t != b->set) {
+		unsigned j = roundup(t[-1].size,
+				     64 / sizeof(struct bkey_float));
+
+		t->tree = t[-1].tree + j;
+		t->prev = t[-1].prev + j;
+	}
+
+	while (t < b->set + MAX_BSETS)
+		t++->size = 0;
+}
+
+static void bch_bset_build_unwritten_tree(struct btree_keys *b)
+{
+	struct bset_tree *t = bset_tree_last(b);
+
+	BUG_ON(b->last_set_unwritten);
+	b->last_set_unwritten = 1;
+
+	bset_alloc_tree(b, t);
+
+	if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
+		t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
+		t->size = 1;
+	}
+}
+
+void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
+{
+	if (i != b->set->data) {
+		b->set[++b->nsets].data = i;
+		i->seq = b->set->data->seq;
+	} else
+		get_random_bytes(&i->seq, sizeof(uint64_t));
+
+	i->magic	= magic;
+	i->version	= 0;
+	i->keys		= 0;
+
+	bch_bset_build_unwritten_tree(b);
+}
+EXPORT_SYMBOL(bch_bset_init_next);
+
+void bch_bset_build_written_tree(struct btree_keys *b)
+{
+	struct bset_tree *t = bset_tree_last(b);
+	struct bkey *prev = NULL, *k = t->data->start;
+	unsigned j, cacheline = 1;
+
+	b->last_set_unwritten = 0;
+
+	bset_alloc_tree(b, t);
+
+	t->size = min_t(unsigned,
+			bkey_to_cacheline(t, bset_bkey_last(t->data)),
+			b->set->tree + btree_keys_cachelines(b) - t->tree);
+
+	if (t->size < 2) {
+		t->size = 0;
+		return;
+	}
+
+	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+	/* First we figure out where the first key in each cacheline is */
+	for (j = inorder_next(0, t->size);
+	     j;
+	     j = inorder_next(j, t->size)) {
+		while (bkey_to_cacheline(t, k) < cacheline)
+			prev = k, k = bkey_next(k);
+
+		t->prev[j] = bkey_u64s(prev);
+		t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
+	}
+
+	while (bkey_next(k) != bset_bkey_last(t->data))
+		k = bkey_next(k);
+
+	t->end = *k;
+
+	/* Then we build the tree */
+	for (j = inorder_next(0, t->size);
+	     j;
+	     j = inorder_next(j, t->size))
+		make_bfloat(t, j);
+}
+EXPORT_SYMBOL(bch_bset_build_written_tree);
+
+/* Insert */
+
+void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
+{
+	struct bset_tree *t;
+	unsigned inorder, j = 1;
+
+	for (t = b->set; t <= bset_tree_last(b); t++)
+		if (k < bset_bkey_last(t->data))
+			goto found_set;
+
+	BUG();
+found_set:
+	if (!t->size || !bset_written(b, t))
+		return;
+
+	inorder = bkey_to_cacheline(t, k);
+
+	if (k == t->data->start)
+		goto fix_left;
+
+	if (bkey_next(k) == bset_bkey_last(t->data)) {
+		t->end = *k;
+		goto fix_right;
+	}
+
+	j = inorder_to_tree(inorder, t);
+
+	if (j &&
+	    j < t->size &&
+	    k == tree_to_bkey(t, j))
+fix_left:	do {
+			make_bfloat(t, j);
+			j = j * 2;
+		} while (j < t->size);
+
+	j = inorder_to_tree(inorder + 1, t);
+
+	if (j &&
+	    j < t->size &&
+	    k == tree_to_prev_bkey(t, j))
+fix_right:	do {
+			make_bfloat(t, j);
+			j = j * 2 + 1;
+		} while (j < t->size);
+}
+EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
+
+static void bch_bset_fix_lookup_table(struct btree_keys *b,
+				      struct bset_tree *t,
+				      struct bkey *k)
+{
+	unsigned shift = bkey_u64s(k);
+	unsigned j = bkey_to_cacheline(t, k);
+
+	/* We're getting called from btree_split() or btree_gc, just bail out */
+	if (!t->size)
+		return;
+
+	/* k is the key we just inserted; we need to find the entry in the
+	 * lookup table for the first key that is strictly greater than k:
+	 * it's either k's cacheline or the next one
+	 */
+	while (j < t->size &&
+	       table_to_bkey(t, j) <= k)
+		j++;
+
+	/* Adjust all the lookup table entries, and find a new key for any that
+	 * have gotten too big
+	 */
+	for (; j < t->size; j++) {
+		t->prev[j] += shift;
+
+		if (t->prev[j] > 7) {
+			k = table_to_bkey(t, j - 1);
+
+			while (k < cacheline_to_bkey(t, j, 0))
+				k = bkey_next(k);
+
+			t->prev[j] = bkey_to_cacheline_offset(t, j, k);
+		}
+	}
+
+	if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
+		return;
+
+	/* Possibly add a new entry to the end of the lookup table */
+
+	for (k = table_to_bkey(t, t->size - 1);
+	     k != bset_bkey_last(t->data);
+	     k = bkey_next(k))
+		if (t->size == bkey_to_cacheline(t, k)) {
+			t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
+			t->size++;
+		}
+}
+
+/*
+ * Tries to merge l and r: l should be lower than r
+ * Returns true if we were able to merge. If we did merge, l will be the merged
+ * key, r will be untouched.
+ */
+bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
+{
+	if (!b->ops->key_merge)
+		return false;
+
+	/*
+	 * Generic header checks
+	 * Assumes left and right are in order
+	 * Left and right must be exactly aligned
+	 */
+	if (!bch_bkey_equal_header(l, r) ||
+	     bkey_cmp(l, &START_KEY(r)))
+		return false;
+
+	return b->ops->key_merge(b, l, r);
+}
+EXPORT_SYMBOL(bch_bkey_try_merge);
+
+void bch_bset_insert(struct btree_keys *b, struct bkey *where,
+		     struct bkey *insert)
+{
+	struct bset_tree *t = bset_tree_last(b);
+
+	BUG_ON(!b->last_set_unwritten);
+	BUG_ON(bset_byte_offset(b, t->data) +
+	       __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
+	       PAGE_SIZE << b->page_order);
+
+	memmove((uint64_t *) where + bkey_u64s(insert),
+		where,
+		(void *) bset_bkey_last(t->data) - (void *) where);
+
+	t->data->keys += bkey_u64s(insert);
+	bkey_copy(where, insert);
+	bch_bset_fix_lookup_table(b, t, where);
+}
+EXPORT_SYMBOL(bch_bset_insert);
+
+unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
+			      struct bkey *replace_key)
+{
+	unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
+	struct bset *i = bset_tree_last(b)->data;
+	struct bkey *m, *prev = NULL;
+	struct btree_iter iter;
+
+	BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
+
+	m = bch_btree_iter_init(b, &iter, b->ops->is_extents
+				? PRECEDING_KEY(&START_KEY(k))
+				: PRECEDING_KEY(k));
+
+	if (b->ops->insert_fixup(b, k, &iter, replace_key))
+		return status;
+
+	status = BTREE_INSERT_STATUS_INSERT;
+
+	while (m != bset_bkey_last(i) &&
+	       bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
+		prev = m, m = bkey_next(m);
+
+	/* prev is in the tree, if we merge we're done */
+	status = BTREE_INSERT_STATUS_BACK_MERGE;
+	if (prev &&
+	    bch_bkey_try_merge(b, prev, k))
+		goto merged;
+#if 0
+	status = BTREE_INSERT_STATUS_OVERWROTE;
+	if (m != bset_bkey_last(i) &&
+	    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
+		goto copy;
+#endif
+	status = BTREE_INSERT_STATUS_FRONT_MERGE;
+	if (m != bset_bkey_last(i) &&
+	    bch_bkey_try_merge(b, k, m))
+		goto copy;
+
+	bch_bset_insert(b, m, k);
+copy:	bkey_copy(m, k);
+merged:
+	return status;
+}
+EXPORT_SYMBOL(bch_btree_insert_key);
+
+/* Lookup */
+
+struct bset_search_iter {
+	struct bkey *l, *r;
+};
+
+static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
+						     const struct bkey *search)
+{
+	unsigned li = 0, ri = t->size;
+
+	while (li + 1 != ri) {
+		unsigned m = (li + ri) >> 1;
+
+		if (bkey_cmp(table_to_bkey(t, m), search) > 0)
+			ri = m;
+		else
+			li = m;
+	}
+
+	return (struct bset_search_iter) {
+		table_to_bkey(t, li),
+		ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
+	};
+}
+
+static struct bset_search_iter bset_search_tree(struct bset_tree *t,
+						const struct bkey *search)
+{
+	struct bkey *l, *r;
+	struct bkey_float *f;
+	unsigned inorder, j, n = 1;
+
+	do {
+		unsigned p = n << 4;
+		p &= ((int) (p - t->size)) >> 31;
+
+		prefetch(&t->tree[p]);
+
+		j = n;
+		f = &t->tree[j];
+
+		/*
+		 * n = (f->mantissa > bfloat_mantissa())
+		 *	? j * 2
+		 *	: j * 2 + 1;
+		 *
+		 * We need to subtract 1 from f->mantissa for the sign bit trick
+		 * to work  - that's done in make_bfloat()
+		 */
+		if (likely(f->exponent != 127))
+			n = j * 2 + (((unsigned)
+				      (f->mantissa -
+				       bfloat_mantissa(search, f))) >> 31);
+		else
+			n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+				? j * 2
+				: j * 2 + 1;
+	} while (n < t->size);
+
+	inorder = to_inorder(j, t);
+
+	/*
+	 * n would have been the node we recursed to - the low bit tells us if
+	 * we recursed left or recursed right.
+	 */
+	if (n & 1) {
+		l = cacheline_to_bkey(t, inorder, f->m);
+
+		if (++inorder != t->size) {
+			f = &t->tree[inorder_next(j, t->size)];
+			r = cacheline_to_bkey(t, inorder, f->m);
+		} else
+			r = bset_bkey_last(t->data);
+	} else {
+		r = cacheline_to_bkey(t, inorder, f->m);
+
+		if (--inorder) {
+			f = &t->tree[inorder_prev(j, t->size)];
+			l = cacheline_to_bkey(t, inorder, f->m);
+		} else
+			l = t->data->start;
+	}
+
+	return (struct bset_search_iter) {l, r};
+}
+
+struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
+			       const struct bkey *search)
+{
+	struct bset_search_iter i;
+
+	/*
+	 * First, we search for a cacheline, then lastly we do a linear search
+	 * within that cacheline.
+	 *
+	 * To search for the cacheline, there's three different possibilities:
+	 *  * The set is too small to have a search tree, so we just do a linear
+	 *    search over the whole set.
+	 *  * The set is the one we're currently inserting into; keeping a full
+	 *    auxiliary search tree up to date would be too expensive, so we
+	 *    use a much simpler lookup table to do a binary search -
+	 *    bset_search_write_set().
+	 *  * Or we use the auxiliary search tree we constructed earlier -
+	 *    bset_search_tree()
+	 */
+
+	if (unlikely(!t->size)) {
+		i.l = t->data->start;
+		i.r = bset_bkey_last(t->data);
+	} else if (bset_written(b, t)) {
+		/*
+		 * Each node in the auxiliary search tree covers a certain range
+		 * of bits, and keys above and below the set it covers might
+		 * differ outside those bits - so we have to special case the
+		 * start and end - handle that here:
+		 */
+
+		if (unlikely(bkey_cmp(search, &t->end) >= 0))
+			return bset_bkey_last(t->data);
+
+		if (unlikely(bkey_cmp(search, t->data->start) < 0))
+			return t->data->start;
+
+		i = bset_search_tree(t, search);
+	} else {
+		BUG_ON(!b->nsets &&
+		       t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
+
+		i = bset_search_write_set(t, search);
+	}
+
+	if (btree_keys_expensive_checks(b)) {
+		BUG_ON(bset_written(b, t) &&
+		       i.l != t->data->start &&
+		       bkey_cmp(tree_to_prev_bkey(t,
+			  inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
+				search) > 0);
+
+		BUG_ON(i.r != bset_bkey_last(t->data) &&
+		       bkey_cmp(i.r, search) <= 0);
+	}
+
+	while (likely(i.l != i.r) &&
+	       bkey_cmp(i.l, search) <= 0)
+		i.l = bkey_next(i.l);
+
+	return i.l;
+}
+EXPORT_SYMBOL(__bch_bset_search);
+
+/* Btree iterator */
+
+typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
+				 struct btree_iter_set);
+
+static inline bool btree_iter_cmp(struct btree_iter_set l,
+				  struct btree_iter_set r)
+{
+	return bkey_cmp(l.k, r.k) > 0;
+}
+
+static inline bool btree_iter_end(struct btree_iter *iter)
+{
+	return !iter->used;
+}
+
+void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
+			 struct bkey *end)
+{
+	if (k != end)
+		BUG_ON(!heap_add(iter,
+				 ((struct btree_iter_set) { k, end }),
+				 btree_iter_cmp));
+}
+
+static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
+					  struct btree_iter *iter,
+					  struct bkey *search,
+					  struct bset_tree *start)
+{
+	struct bkey *ret = NULL;
+	iter->size = ARRAY_SIZE(iter->data);
+	iter->used = 0;
+
+#ifdef CONFIG_BCACHE_DEBUG
+	iter->b = b;
+#endif
+
+	for (; start <= bset_tree_last(b); start++) {
+		ret = bch_bset_search(b, start, search);
+		bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
+	}
+
+	return ret;
+}
+
+struct bkey *bch_btree_iter_init(struct btree_keys *b,
+				 struct btree_iter *iter,
+				 struct bkey *search)
+{
+	return __bch_btree_iter_init(b, iter, search, b->set);
+}
+EXPORT_SYMBOL(bch_btree_iter_init);
+
+static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
+						 btree_iter_cmp_fn *cmp)
+{
+	struct btree_iter_set unused;
+	struct bkey *ret = NULL;
+
+	if (!btree_iter_end(iter)) {
+		bch_btree_iter_next_check(iter);
+
+		ret = iter->data->k;
+		iter->data->k = bkey_next(iter->data->k);
+
+		if (iter->data->k > iter->data->end) {
+			WARN_ONCE(1, "bset was corrupt!\n");
+			iter->data->k = iter->data->end;
+		}
+
+		if (iter->data->k == iter->data->end)
+			heap_pop(iter, unused, cmp);
+		else
+			heap_sift(iter, 0, cmp);
+	}
+
+	return ret;
+}
+
+struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+{
+	return __bch_btree_iter_next(iter, btree_iter_cmp);
+
+}
+EXPORT_SYMBOL(bch_btree_iter_next);
+
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
+					struct btree_keys *b, ptr_filter_fn fn)
+{
+	struct bkey *ret;
+
+	do {
+		ret = bch_btree_iter_next(iter);
+	} while (ret && fn(b, ret));
+
+	return ret;
+}
+
+/* Mergesort */
+
+void bch_bset_sort_state_free(struct bset_sort_state *state)
+{
+	if (state->pool)
+		mempool_destroy(state->pool);
+}
+
+int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
+{
+	spin_lock_init(&state->time.lock);
+
+	state->page_order = page_order;
+	state->crit_factor = int_sqrt(1 << page_order);
+
+	state->pool = mempool_create_page_pool(1, page_order);
+	if (!state->pool)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(bch_bset_sort_state_init);
+
+static void btree_mergesort(struct btree_keys *b, struct bset *out,
+			    struct btree_iter *iter,
+			    bool fixup, bool remove_stale)
+{
+	int i;
+	struct bkey *k, *last = NULL;
+	BKEY_PADDED(k) tmp;
+	bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
+		? bch_ptr_bad
+		: bch_ptr_invalid;
+
+	/* Heapify the iterator, using our comparison function */
+	for (i = iter->used / 2 - 1; i >= 0; --i)
+		heap_sift(iter, i, b->ops->sort_cmp);
+
+	while (!btree_iter_end(iter)) {
+		if (b->ops->sort_fixup && fixup)
+			k = b->ops->sort_fixup(iter, &tmp.k);
+		else
+			k = NULL;
+
+		if (!k)
+			k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
+
+		if (bad(b, k))
+			continue;
+
+		if (!last) {
+			last = out->start;
+			bkey_copy(last, k);
+		} else if (!bch_bkey_try_merge(b, last, k)) {
+			last = bkey_next(last);
+			bkey_copy(last, k);
+		}
+	}
+
+	out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
+
+	pr_debug("sorted %i keys", out->keys);
+}
+
+static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
+			 unsigned start, unsigned order, bool fixup,
+			 struct bset_sort_state *state)
+{
+	uint64_t start_time;
+	bool used_mempool = false;
+	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+						     order);
+	if (!out) {
+		struct page *outp;
+
+		BUG_ON(order > state->page_order);
+
+		outp = mempool_alloc(state->pool, GFP_NOIO);
+		out = page_address(outp);
+		used_mempool = true;
+		order = state->page_order;
+	}
+
+	start_time = local_clock();
+
+	btree_mergesort(b, out, iter, fixup, false);
+	b->nsets = start;
+
+	if (!start && order == b->page_order) {
+		/*
+		 * Our temporary buffer is the same size as the btree node's
+		 * buffer, we can just swap buffers instead of doing a big
+		 * memcpy()
+		 */
+
+		out->magic	= b->set->data->magic;
+		out->seq	= b->set->data->seq;
+		out->version	= b->set->data->version;
+		swap(out, b->set->data);
+	} else {
+		b->set[start].data->keys = out->keys;
+		memcpy(b->set[start].data->start, out->start,
+		       (void *) bset_bkey_last(out) - (void *) out->start);
+	}
+
+	if (used_mempool)
+		mempool_free(virt_to_page(out), state->pool);
+	else
+		free_pages((unsigned long) out, order);
+
+	bch_bset_build_written_tree(b);
+
+	if (!start)
+		bch_time_stats_update(&state->time, start_time);
+}
+
+void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
+			    struct bset_sort_state *state)
+{
+	size_t order = b->page_order, keys = 0;
+	struct btree_iter iter;
+	int oldsize = bch_count_data(b);
+
+	__bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
+
+	if (start) {
+		unsigned i;
+
+		for (i = start; i <= b->nsets; i++)
+			keys += b->set[i].data->keys;
+
+		order = get_order(__set_bytes(b->set->data, keys));
+	}
+
+	__btree_sort(b, &iter, start, order, false, state);
+
+	EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
+}
+EXPORT_SYMBOL(bch_btree_sort_partial);
+
+void bch_btree_sort_and_fix_extents(struct btree_keys *b,
+				    struct btree_iter *iter,
+				    struct bset_sort_state *state)
+{
+	__btree_sort(b, iter, 0, b->page_order, true, state);
+}
+
+void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
+			 struct bset_sort_state *state)
+{
+	uint64_t start_time = local_clock();
+
+	struct btree_iter iter;
+	bch_btree_iter_init(b, &iter, NULL);
+
+	btree_mergesort(b, new->set->data, &iter, false, true);
+
+	bch_time_stats_update(&state->time, start_time);
+
+	new->set->size = 0; // XXX: why?
+}
+
+#define SORT_CRIT	(4096 / sizeof(uint64_t))
+
+void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
+{
+	unsigned crit = SORT_CRIT;
+	int i;
+
+	/* Don't sort if nothing to do */
+	if (!b->nsets)
+		goto out;
+
+	for (i = b->nsets - 1; i >= 0; --i) {
+		crit *= state->crit_factor;
+
+		if (b->set[i].data->keys < crit) {
+			bch_btree_sort_partial(b, i, state);
+			return;
+		}
+	}
+
+	/* Sort if we'd overflow */
+	if (b->nsets + 1 == MAX_BSETS) {
+		bch_btree_sort(b, state);
+		return;
+	}
+
+out:
+	bch_bset_build_written_tree(b);
+}
+EXPORT_SYMBOL(bch_btree_sort_lazy);
+
+void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
+{
+	unsigned i;
+
+	for (i = 0; i <= b->nsets; i++) {
+		struct bset_tree *t = &b->set[i];
+		size_t bytes = t->data->keys * sizeof(uint64_t);
+		size_t j;
+
+		if (bset_written(b, t)) {
+			stats->sets_written++;
+			stats->bytes_written += bytes;
+
+			stats->floats += t->size - 1;
+
+			for (j = 1; j < t->size; j++)
+				if (t->tree[j].exponent == 127)
+					stats->failed++;
+		} else {
+			stats->sets_unwritten++;
+			stats->bytes_unwritten += bytes;
+		}
+	}
+}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
new file mode 100644
index 00000000000..5f6728d5d4d
--- /dev/null
+++ b/drivers/md/bcache/bset.h
@@ -0,0 +1,566 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "util.h" /* for time_stats */
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bch_ptr_invalid and
+ * bch_ptr_bad().
+ *
+ * bch_ptr_invalid() primarily filters out keys and pointers that would be
+ * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
+ * pointer that occur in normal practice but don't point to real data.
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individial bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that,  we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So we know the key we're looking for is between a and b, and a and b don't
+ * differ higher than bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking.  Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n.  The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fallback to comparing against the
+ * real key. As long as this doesn't happen to often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+struct btree_keys;
+struct btree_iter;
+struct btree_iter_set;
+struct bkey_float;
+
+#define MAX_BSETS		4U
+
+struct bset_tree {
+	/*
+	 * We construct a binary tree in an array as if the array
+	 * started at 1, so that things line up on the same cachelines
+	 * better: see comments in bset.c at cacheline_to_bkey() for
+	 * details
+	 */
+
+	/* size of the binary tree and prev array */
+	unsigned		size;
+
+	/* function of size - precalculated for to_inorder() */
+	unsigned		extra;
+
+	/* copy of the last key in the set */
+	struct bkey		end;
+	struct bkey_float	*tree;
+
+	/*
+	 * The nodes in the bset tree point to specific keys - this
+	 * array holds the sizes of the previous key.
+	 *
+	 * Conceptually it's a member of struct bkey_float, but we want
+	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
+	 * path.
+	 */
+	uint8_t			*prev;
+
+	/* The actual btree node, with pointers to each sorted set */
+	struct bset		*data;
+};
+
+struct btree_keys_ops {
+	bool		(*sort_cmp)(struct btree_iter_set,
+				    struct btree_iter_set);
+	struct bkey	*(*sort_fixup)(struct btree_iter *, struct bkey *);
+	bool		(*insert_fixup)(struct btree_keys *, struct bkey *,
+					struct btree_iter *, struct bkey *);
+	bool		(*key_invalid)(struct btree_keys *,
+				       const struct bkey *);
+	bool		(*key_bad)(struct btree_keys *, const struct bkey *);
+	bool		(*key_merge)(struct btree_keys *,
+				     struct bkey *, struct bkey *);
+	void		(*key_to_text)(char *, size_t, const struct bkey *);
+	void		(*key_dump)(struct btree_keys *, const struct bkey *);
+
+	/*
+	 * Only used for deciding whether to use START_KEY(k) or just the key
+	 * itself in a couple places
+	 */
+	bool		is_extents;
+};
+
+struct btree_keys {
+	const struct btree_keys_ops	*ops;
+	uint8_t			page_order;
+	uint8_t			nsets;
+	unsigned		last_set_unwritten:1;
+	bool			*expensive_debug_checks;
+
+	/*
+	 * Sets of sorted keys - the real btree node - plus a binary search tree
+	 *
+	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+	 * to the memory we have allocated for this btree node. Additionally,
+	 * set[0]->data points to the entire btree node as it exists on disk.
+	 */
+	struct bset_tree	set[MAX_BSETS];
+};
+
+static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
+{
+	return b->set + b->nsets;
+}
+
+static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
+{
+	return t <= b->set + b->nsets - b->last_set_unwritten;
+}
+
+static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
+{
+	return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
+}
+
+static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
+{
+	return ((size_t) i) - ((size_t) b->set->data);
+}
+
+static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
+{
+	return bset_byte_offset(b, i) >> 9;
+}
+
+#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i)		__set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, block_bytes)				\
+	DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
+#define set_blocks(i, block_bytes)				\
+	__set_blocks(i, (i)->keys, block_bytes)
+
+static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
+{
+	struct bset_tree *t = bset_tree_last(b);
+
+	BUG_ON((PAGE_SIZE << b->page_order) <
+	       (bset_byte_offset(b, t->data) + set_bytes(t->data)));
+
+	if (!b->last_set_unwritten)
+		return 0;
+
+	return ((PAGE_SIZE << b->page_order) -
+		(bset_byte_offset(b, t->data) + set_bytes(t->data))) /
+		sizeof(u64);
+}
+
+static inline struct bset *bset_next_set(struct btree_keys *b,
+					 unsigned block_bytes)
+{
+	struct bset *i = bset_tree_last(b)->data;
+
+	return ((void *) i) + roundup(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree_keys *);
+int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
+			 bool *);
+
+void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
+void bch_bset_build_written_tree(struct btree_keys *);
+void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
+bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
+void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
+unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
+			      struct bkey *);
+
+enum {
+	BTREE_INSERT_STATUS_NO_INSERT = 0,
+	BTREE_INSERT_STATUS_INSERT,
+	BTREE_INSERT_STATUS_BACK_MERGE,
+	BTREE_INSERT_STATUS_OVERWROTE,
+	BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
+/* Btree key iteration */
+
+struct btree_iter {
+	size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+	struct btree_keys *b;
+#endif
+	struct btree_iter_set {
+		struct bkey *k, *end;
+	} data[MAX_BSETS];
+};
+
+typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
+
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+					struct btree_keys *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
+				 struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
+			       const struct bkey *);
+
+/*
+ * Returns the first key that is strictly greater than search
+ */
+static inline struct bkey *bch_bset_search(struct btree_keys *b,
+					   struct bset_tree *t,
+					   const struct bkey *search)
+{
+	return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+#define for_each_key_filter(b, k, iter, filter)				\
+	for (bch_btree_iter_init((b), (iter), NULL);			\
+	     ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+
+#define for_each_key(b, k, iter)					\
+	for (bch_btree_iter_init((b), (iter), NULL);			\
+	     ((k) = bch_btree_iter_next(iter));)
+
+/* Sorting */
+
+struct bset_sort_state {
+	mempool_t		*pool;
+
+	unsigned		page_order;
+	unsigned		crit_factor;
+
+	struct time_stats	time;
+};
+
+void bch_bset_sort_state_free(struct bset_sort_state *);
+int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
+void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
+void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
+			 struct bset_sort_state *);
+void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
+				    struct bset_sort_state *);
+void bch_btree_sort_partial(struct btree_keys *, unsigned,
+			    struct bset_sort_state *);
+
+static inline void bch_btree_sort(struct btree_keys *b,
+				  struct bset_sort_state *state)
+{
+	bch_btree_sort_partial(b, 0, state);
+}
+
+struct bset_stats {
+	size_t sets_written, sets_unwritten;
+	size_t bytes_written, bytes_unwritten;
+	size_t floats, failed;
+};
+
+void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
+
+/* Bkey utility code */
+
+#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
+
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+	return bkey_idx(i->start, idx);
+}
+
+static inline void bkey_init(struct bkey *k)
+{
+	*k = ZERO_KEY;
+}
+
+static __always_inline int64_t bkey_cmp(const struct bkey *l,
+					const struct bkey *r)
+{
+	return unlikely(KEY_INODE(l) != KEY_INODE(r))
+		? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
+		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
+}
+
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+			      unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+	BUG_ON(bkey_cmp(where, k) > 0);
+	return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+	return __bch_cut_back(where, k);
+}
+
+#define PRECEDING_KEY(_k)					\
+({								\
+	struct bkey *_ret = NULL;				\
+								\
+	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\
+		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\
+								\
+		if (!_ret->low)					\
+			_ret->high--;				\
+		_ret->low--;					\
+	}							\
+								\
+	_ret;							\
+})
+
+static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
+{
+	return b->ops->key_invalid(b, k);
+}
+
+static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
+{
+	return b->ops->key_bad(b, k);
+}
+
+static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
+				    size_t size, const struct bkey *k)
+{
+	return b->ops->key_to_text(buf, size, k);
+}
+
+static inline bool bch_bkey_equal_header(const struct bkey *l,
+					 const struct bkey *r)
+{
+	return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
+		KEY_PTRS(l) == KEY_PTRS(r) &&
+		KEY_CSUM(l) == KEY_CSUM(l));
+}
+
+/* Keylists */
+
+struct keylist {
+	union {
+		struct bkey		*keys;
+		uint64_t		*keys_p;
+	};
+	union {
+		struct bkey		*top;
+		uint64_t		*top_p;
+	};
+
+	/* Enough room for btree_split's keys without realloc */
+#define KEYLIST_INLINE		16
+	uint64_t		inline_keys[KEYLIST_INLINE];
+};
+
+static inline void bch_keylist_init(struct keylist *l)
+{
+	l->top_p = l->keys_p = l->inline_keys;
+}
+
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+	l->keys = k;
+	l->top = bkey_next(k);
+}
+
+static inline void bch_keylist_push(struct keylist *l)
+{
+	l->top = bkey_next(l->top);
+}
+
+static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
+{
+	bkey_copy(l->top, k);
+	bch_keylist_push(l);
+}
+
+static inline bool bch_keylist_empty(struct keylist *l)
+{
+	return l->top == l->keys;
+}
+
+static inline void bch_keylist_reset(struct keylist *l)
+{
+	l->top = l->keys;
+}
+
+static inline void bch_keylist_free(struct keylist *l)
+{
+	if (l->keys_p != l->inline_keys)
+		kfree(l->keys_p);
+}
+
+static inline size_t bch_keylist_nkeys(struct keylist *l)
+{
+	return l->top_p - l->keys_p;
+}
+
+static inline size_t bch_keylist_bytes(struct keylist *l)
+{
+	return bch_keylist_nkeys(l) * sizeof(uint64_t);
+}
+
+struct bkey *bch_keylist_pop(struct keylist *);
+void bch_keylist_pop_front(struct keylist *);
+int __bch_keylist_realloc(struct keylist *, unsigned);
+
+/* Debug stuff */
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+int __bch_count_data(struct btree_keys *);
+void __bch_check_keys(struct btree_keys *, const char *, ...);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bucket(struct btree_keys *);
+
+#else
+
+static inline int __bch_count_data(struct btree_keys *b) { return -1; }
+static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void bch_dump_bucket(struct btree_keys *b) {}
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+
+#endif
+
+static inline bool btree_keys_expensive_checks(struct btree_keys *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+	return *b->expensive_debug_checks;
+#else
+	return false;
+#endif
+}
+
+static inline int bch_count_data(struct btree_keys *b)
+{
+	return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
+}
+
+#define bch_check_keys(b, ...)						\
+do {									\
+	if (btree_keys_expensive_checks(b))				\
+		__bch_check_keys(b, __VA_ARGS__);			\
+} while (0)
+
+#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644
index 00000000000..7347b610096
--- /dev/null
+++ b/drivers/md/bcache/btree.c
@@ -0,0 +1,2518 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+/*
+ * Todo:
+ * register_bcache: Return errors out to userspace correctly
+ *
+ * Writeback: don't undirty key until after a cache flush
+ *
+ * Create an iterator for key pointers
+ *
+ * On btree write error, mark bucket such that it won't be freed from the cache
+ *
+ * Journalling:
+ *   Check for bad keys in replay
+ *   Propagate barriers
+ *   Refcount journal entries in journal_replay
+ *
+ * Garbage collection:
+ *   Finish incremental gc
+ *   Gc should free old UUIDs, data for invalid UUIDs
+ *
+ * Provide a way to list backing device UUIDs we have data cached for, and
+ * probably how long it's been since we've seen them, and a way to invalidate
+ * dirty data for devices that will never be attached again
+ *
+ * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
+ * that based on that and how much dirty data we have we can keep writeback
+ * from being starved
+ *
+ * Add a tracepoint or somesuch to watch for writeback starvation
+ *
+ * When btree depth > 1 and splitting an interior node, we have to make sure
+ * alloc_bucket() cannot fail. This should be true but is not completely
+ * obvious.
+ *
+ * Plugging?
+ *
+ * If data write is less than hard sector size of ssd, round up offset in open
+ * bucket to the next whole sector
+ *
+ * Superblock needs to be fleshed out for multiple cache devices
+ *
+ * Add a sysfs tunable for the number of writeback IOs in flight
+ *
+ * Add a sysfs tunable for the number of open data buckets
+ *
+ * IO tracking: Can we track when one process is doing io on behalf of another?
+ * IO tracking: Don't use just an average, weigh more recent stuff higher
+ *
+ * Test module load/unload
+ */
+
+#define MAX_NEED_GC		64
+#define MAX_SAVE_PRIO		72
+
+#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
+
+#define PTR_HASH(c, k)							\
+	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+
+#define insert_lock(s, b)	((b)->level <= (s)->lock)
+
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+
+/**
+ * btree - recurse down the btree on a specified key
+ * @fn:		function to call, which will be passed the child node
+ * @key:	key to recurse on
+ * @b:		parent btree node
+ * @op:		pointer to struct btree_op
+ */
+#define btree(fn, key, b, op, ...)					\
+({									\
+	int _r, l = (b)->level - 1;					\
+	bool _w = l <= (op)->lock;					\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
+	if (!IS_ERR(_child)) {						\
+		_child->parent = (b);					\
+		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
+		rw_unlock(_w, _child);					\
+	} else								\
+		_r = PTR_ERR(_child);					\
+	_r;								\
+})
+
+/**
+ * btree_root - call a function on the root of the btree
+ * @fn:		function to call, which will be passed the child node
+ * @c:		cache set
+ * @op:		pointer to struct btree_op
+ */
+#define btree_root(fn, c, op, ...)					\
+({									\
+	int _r = -EINTR;						\
+	do {								\
+		struct btree *_b = (c)->root;				\
+		bool _w = insert_lock(op, _b);				\
+		rw_lock(_w, _b, _b->level);				\
+		if (_b == (c)->root &&					\
+		    _w == insert_lock(op, _b)) {			\
+			_b->parent = NULL;				\
+			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
+		}							\
+		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
+		if (_r == -EINTR)					\
+			schedule();					\
+	} while (_r == -EINTR);						\
+									\
+	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
+	_r;								\
+})
+
+static inline struct bset *write_block(struct btree *b)
+{
+	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+}
+
+static void bch_btree_init_next(struct btree *b)
+{
+	/* If not a leaf node, always sort */
+	if (b->level && b->keys.nsets)
+		bch_btree_sort(&b->keys, &b->c->sort);
+	else
+		bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+	if (b->written < btree_blocks(b))
+		bch_bset_init_next(&b->keys, write_block(b),
+				   bset_magic(&b->c->sb));
+
+}
+
+/* Btree key manipulation */
+
+void bkey_put(struct cache_set *c, struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i))
+			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
+/* Btree IO */
+
+static uint64_t btree_csum_set(struct btree *b, struct bset *i)
+{
+	uint64_t crc = b->key.ptr[0];
+	void *data = (void *) i + 8, *end = bset_bkey_last(i);
+
+	crc = bch_crc64_update(crc, data, end - data);
+	return crc ^ 0xffffffffffffffffULL;
+}
+
+void bch_btree_node_read_done(struct btree *b)
+{
+	const char *err = "bad btree header";
+	struct bset *i = btree_bset_first(b);
+	struct btree_iter *iter;
+
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+	iter->used = 0;
+
+#ifdef CONFIG_BCACHE_DEBUG
+	iter->b = &b->keys;
+#endif
+
+	if (!i->seq)
+		goto err;
+
+	for (;
+	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
+	     i = write_block(b)) {
+		err = "unsupported bset version";
+		if (i->version > BCACHE_BSET_VERSION)
+			goto err;
+
+		err = "bad btree header";
+		if (b->written + set_blocks(i, block_bytes(b->c)) >
+		    btree_blocks(b))
+			goto err;
+
+		err = "bad magic";
+		if (i->magic != bset_magic(&b->c->sb))
+			goto err;
+
+		err = "bad checksum";
+		switch (i->version) {
+		case 0:
+			if (i->csum != csum_set(i))
+				goto err;
+			break;
+		case BCACHE_BSET_VERSION:
+			if (i->csum != btree_csum_set(b, i))
+				goto err;
+			break;
+		}
+
+		err = "empty set";
+		if (i != b->keys.set[0].data && !i->keys)
+			goto err;
+
+		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
+
+		b->written += set_blocks(i, block_bytes(b->c));
+	}
+
+	err = "corrupted btree";
+	for (i = write_block(b);
+	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
+	     i = ((void *) i) + block_bytes(b->c))
+		if (i->seq == b->keys.set[0].data->seq)
+			goto err;
+
+	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
+
+	i = b->keys.set[0].data;
+	err = "short btree key";
+	if (b->keys.set[0].size &&
+	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
+		goto err;
+
+	if (b->written < btree_blocks(b))
+		bch_bset_init_next(&b->keys, write_block(b),
+				   bset_magic(&b->c->sb));
+out:
+	mempool_free(iter, b->c->fill_iter);
+	return;
+err:
+	set_btree_node_io_error(b);
+	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
+			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
+			    bset_block_offset(b, i), i->keys);
+	goto out;
+}
+
+static void btree_node_read_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	closure_put(cl);
+}
+
+static void bch_btree_node_read(struct btree *b)
+{
+	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct bio *bio;
+
+	trace_bcache_btree_read(b);
+
+	closure_init_stack(&cl);
+
+	bio = bch_bbio_alloc(b->c);
+	bio->bi_rw	= REQ_META|READ_SYNC;
+	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
+	bio->bi_end_io	= btree_node_read_endio;
+	bio->bi_private	= &cl;
+
+	bch_bio_map(bio, b->keys.set[0].data);
+
+	bch_submit_bbio(bio, b->c, &b->key, 0);
+	closure_sync(&cl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_btree_node_io_error(b);
+
+	bch_bbio_free(bio, b->c);
+
+	if (btree_node_io_error(b))
+		goto err;
+
+	bch_btree_node_read_done(b);
+	bch_time_stats_update(&b->c->btree_read_time, start_time);
+
+	return;
+err:
+	bch_cache_set_error(b->c, "io error reading bucket %zu",
+			    PTR_BUCKET_NR(b->c, &b->key, 0));
+}
+
+static void btree_complete_write(struct btree *b, struct btree_write *w)
+{
+	if (w->prio_blocked &&
+	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
+		wake_up_allocators(b->c);
+
+	if (w->journal) {
+		atomic_dec_bug(w->journal);
+		__closure_wake_up(&b->c->journal.wait);
+	}
+
+	w->prio_blocked	= 0;
+	w->journal	= NULL;
+}
+
+static void btree_node_write_unlock(struct closure *cl)
+{
+	struct btree *b = container_of(cl, struct btree, io);
+
+	up(&b->io_mutex);
+}
+
+static void __btree_node_write_done(struct closure *cl)
+{
+	struct btree *b = container_of(cl, struct btree, io);
+	struct btree_write *w = btree_prev_write(b);
+
+	bch_bbio_free(b->bio, b->c);
+	b->bio = NULL;
+	btree_complete_write(b, w);
+
+	if (btree_node_dirty(b))
+		schedule_delayed_work(&b->work, 30 * HZ);
+
+	closure_return_with_destructor(cl, btree_node_write_unlock);
+}
+
+static void btree_node_write_done(struct closure *cl)
+{
+	struct btree *b = container_of(cl, struct btree, io);
+	struct bio_vec *bv;
+	int n;
+
+	bio_for_each_segment_all(bv, b->bio, n)
+		__free_page(bv->bv_page);
+
+	__btree_node_write_done(cl);
+}
+
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct btree *b = container_of(cl, struct btree, io);
+
+	if (error)
+		set_btree_node_io_error(b);
+
+	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+	closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
+{
+	struct closure *cl = &b->io;
+	struct bset *i = btree_bset_last(b);
+	BKEY_PADDED(key) k;
+
+	i->version	= BCACHE_BSET_VERSION;
+	i->csum		= btree_csum_set(b, i);
+
+	BUG_ON(b->bio);
+	b->bio = bch_bbio_alloc(b->c);
+
+	b->bio->bi_end_io	= btree_node_write_endio;
+	b->bio->bi_private	= cl;
+	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
+	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
+	bch_bio_map(b->bio, i);
+
+	/*
+	 * If we're appending to a leaf node, we don't technically need FUA -
+	 * this write just needs to be persisted before the next journal write,
+	 * which will be marked FLUSH|FUA.
+	 *
+	 * Similarly if we're writing a new btree root - the pointer is going to
+	 * be in the next journal entry.
+	 *
+	 * But if we're writing a new btree node (that isn't a root) or
+	 * appending to a non leaf btree node, we need either FUA or a flush
+	 * when we write the parent with the new pointer. FUA is cheaper than a
+	 * flush, and writes appending to leaf nodes aren't blocking anything so
+	 * just make all btree node writes FUA to keep things sane.
+	 */
+
+	bkey_copy(&k.key, &b->key);
+	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
+		       bset_sector_offset(&b->keys, i));
+
+	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+		int j;
+		struct bio_vec *bv;
+		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+
+		bio_for_each_segment_all(bv, b->bio, j)
+			memcpy(page_address(bv->bv_page),
+			       base + j * PAGE_SIZE, PAGE_SIZE);
+
+		bch_submit_bbio(b->bio, b->c, &k.key, 0);
+
+		continue_at(cl, btree_node_write_done, NULL);
+	} else {
+		b->bio->bi_vcnt = 0;
+		bch_bio_map(b->bio, i);
+
+		bch_submit_bbio(b->bio, b->c, &k.key, 0);
+
+		closure_sync(cl);
+		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
+	}
+}
+
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+	struct bset *i = btree_bset_last(b);
+
+	lockdep_assert_held(&b->write_lock);
+
+	trace_bcache_btree_write(b);
+
+	BUG_ON(current->bio_list);
+	BUG_ON(b->written >= btree_blocks(b));
+	BUG_ON(b->written && !i->keys);
+	BUG_ON(btree_bset_first(b)->seq != i->seq);
+	bch_check_keys(&b->keys, "writing");
+
+	cancel_delayed_work(&b->work);
+
+	/* If caller isn't waiting for write, parent refcount is cache set */
+	down(&b->io_mutex);
+	closure_init(&b->io, parent ?: &b->c->cl);
+
+	clear_bit(BTREE_NODE_dirty,	 &b->flags);
+	change_bit(BTREE_NODE_write_idx, &b->flags);
+
+	do_btree_node_write(b);
+
+	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
+			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
+
+	b->written += set_blocks(i, block_bytes(b->c));
+}
+
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+	unsigned nsets = b->keys.nsets;
+
+	lockdep_assert_held(&b->lock);
+
+	__bch_btree_node_write(b, parent);
+
+	/*
+	 * do verify if there was more than one set initially (i.e. we did a
+	 * sort) and we sorted down to a single set:
+	 */
+	if (nsets && !b->keys.nsets)
+		bch_btree_verify(b);
+
+	bch_btree_init_next(b);
+}
+
+static void bch_btree_node_write_sync(struct btree *b)
+{
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	mutex_lock(&b->write_lock);
+	bch_btree_node_write(b, &cl);
+	mutex_unlock(&b->write_lock);
+
+	closure_sync(&cl);
+}
+
+static void btree_node_write_work(struct work_struct *w)
+{
+	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
+
+	mutex_lock(&b->write_lock);
+	if (btree_node_dirty(b))
+		__bch_btree_node_write(b, NULL);
+	mutex_unlock(&b->write_lock);
+}
+
+static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
+{
+	struct bset *i = btree_bset_last(b);
+	struct btree_write *w = btree_current_write(b);
+
+	lockdep_assert_held(&b->write_lock);
+
+	BUG_ON(!b->written);
+	BUG_ON(!i->keys);
+
+	if (!btree_node_dirty(b))
+		schedule_delayed_work(&b->work, 30 * HZ);
+
+	set_btree_node_dirty(b);
+
+	if (journal_ref) {
+		if (w->journal &&
+		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
+			atomic_dec_bug(w->journal);
+			w->journal = NULL;
+		}
+
+		if (!w->journal) {
+			w->journal = journal_ref;
+			atomic_inc(w->journal);
+		}
+	}
+
+	/* Force write if set is too big */
+	if (set_bytes(i) > PAGE_SIZE - 48 &&
+	    !current->bio_list)
+		bch_btree_node_write(b, NULL);
+}
+
+/*
+ * Btree in memory cache - allocation/freeing
+ * mca -> memory cache
+ */
+
+#define mca_reserve(c)	(((c->root && c->root->level)		\
+			  ? c->root->level : 1) * 8 + 16)
+#define mca_can_free(c)						\
+	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
+
+static void mca_data_free(struct btree *b)
+{
+	BUG_ON(b->io_mutex.count != 1);
+
+	bch_btree_keys_free(&b->keys);
+
+	b->c->btree_cache_used--;
+	list_move(&b->list, &b->c->btree_cache_freed);
+}
+
+static void mca_bucket_free(struct btree *b)
+{
+	BUG_ON(btree_node_dirty(b));
+
+	b->key.ptr[0] = 0;
+	hlist_del_init_rcu(&b->hash);
+	list_move(&b->list, &b->c->btree_cache_freeable);
+}
+
+static unsigned btree_order(struct bkey *k)
+{
+	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
+}
+
+static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
+{
+	if (!bch_btree_keys_alloc(&b->keys,
+				  max_t(unsigned,
+					ilog2(b->c->btree_pages),
+					btree_order(k)),
+				  gfp)) {
+		b->c->btree_cache_used++;
+		list_move(&b->list, &b->c->btree_cache);
+	} else {
+		list_move(&b->list, &b->c->btree_cache_freed);
+	}
+}
+
+static struct btree *mca_bucket_alloc(struct cache_set *c,
+				      struct bkey *k, gfp_t gfp)
+{
+	struct btree *b = kzalloc(sizeof(struct btree), gfp);
+	if (!b)
+		return NULL;
+
+	init_rwsem(&b->lock);
+	lockdep_set_novalidate_class(&b->lock);
+	mutex_init(&b->write_lock);
+	lockdep_set_novalidate_class(&b->write_lock);
+	INIT_LIST_HEAD(&b->list);
+	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
+	b->c = c;
+	sema_init(&b->io_mutex, 1);
+
+	mca_data_alloc(b, k, gfp);
+	return b;
+}
+
+static int mca_reap(struct btree *b, unsigned min_order, bool flush)
+{
+	struct closure cl;
+
+	closure_init_stack(&cl);
+	lockdep_assert_held(&b->c->bucket_lock);
+
+	if (!down_write_trylock(&b->lock))
+		return -ENOMEM;
+
+	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
+
+	if (b->keys.page_order < min_order)
+		goto out_unlock;
+
+	if (!flush) {
+		if (btree_node_dirty(b))
+			goto out_unlock;
+
+		if (down_trylock(&b->io_mutex))
+			goto out_unlock;
+		up(&b->io_mutex);
+	}
+
+	mutex_lock(&b->write_lock);
+	if (btree_node_dirty(b))
+		__bch_btree_node_write(b, &cl);
+	mutex_unlock(&b->write_lock);
+
+	closure_sync(&cl);
+
+	/* wait for any in flight btree write */
+	down(&b->io_mutex);
+	up(&b->io_mutex);
+
+	return 0;
+out_unlock:
+	rw_unlock(true, b);
+	return -ENOMEM;
+}
+
+static unsigned long bch_mca_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
+{
+	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+	struct btree *b, *t;
+	unsigned long i, nr = sc->nr_to_scan;
+	unsigned long freed = 0;
+
+	if (c->shrinker_disabled)
+		return SHRINK_STOP;
+
+	if (c->btree_cache_alloc_lock)
+		return SHRINK_STOP;
+
+	/* Return -1 if we can't do anything right now */
+	if (sc->gfp_mask & __GFP_IO)
+		mutex_lock(&c->bucket_lock);
+	else if (!mutex_trylock(&c->bucket_lock))
+		return -1;
+
+	/*
+	 * It's _really_ critical that we don't free too many btree nodes - we
+	 * have to always leave ourselves a reserve. The reserve is how we
+	 * guarantee that allocating memory for a new btree node can always
+	 * succeed, so that inserting keys into the btree can always succeed and
+	 * IO can always make forward progress:
+	 */
+	nr /= c->btree_pages;
+	nr = min_t(unsigned long, nr, mca_can_free(c));
+
+	i = 0;
+	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+		if (freed >= nr)
+			break;
+
+		if (++i > 3 &&
+		    !mca_reap(b, 0, false)) {
+			mca_data_free(b);
+			rw_unlock(true, b);
+			freed++;
+		}
+	}
+
+	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
+		if (list_empty(&c->btree_cache))
+			goto out;
+
+		b = list_first_entry(&c->btree_cache, struct btree, list);
+		list_rotate_left(&c->btree_cache);
+
+		if (!b->accessed &&
+		    !mca_reap(b, 0, false)) {
+			mca_bucket_free(b);
+			mca_data_free(b);
+			rw_unlock(true, b);
+			freed++;
+		} else
+			b->accessed = 0;
+	}
+out:
+	mutex_unlock(&c->bucket_lock);
+	return freed;
+}
+
+static unsigned long bch_mca_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+
+	if (c->shrinker_disabled)
+		return 0;
+
+	if (c->btree_cache_alloc_lock)
+		return 0;
+
+	return mca_can_free(c) * c->btree_pages;
+}
+
+void bch_btree_cache_free(struct cache_set *c)
+{
+	struct btree *b;
+	struct closure cl;
+	closure_init_stack(&cl);
+
+	if (c->shrink.list.next)
+		unregister_shrinker(&c->shrink);
+
+	mutex_lock(&c->bucket_lock);
+
+#ifdef CONFIG_BCACHE_DEBUG
+	if (c->verify_data)
+		list_move(&c->verify_data->list, &c->btree_cache);
+
+	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+#endif
+
+	list_splice(&c->btree_cache_freeable,
+		    &c->btree_cache);
+
+	while (!list_empty(&c->btree_cache)) {
+		b = list_first_entry(&c->btree_cache, struct btree, list);
+
+		if (btree_node_dirty(b))
+			btree_complete_write(b, btree_current_write(b));
+		clear_bit(BTREE_NODE_dirty, &b->flags);
+
+		mca_data_free(b);
+	}
+
+	while (!list_empty(&c->btree_cache_freed)) {
+		b = list_first_entry(&c->btree_cache_freed,
+				     struct btree, list);
+		list_del(&b->list);
+		cancel_delayed_work_sync(&b->work);
+		kfree(b);
+	}
+
+	mutex_unlock(&c->bucket_lock);
+}
+
+int bch_btree_cache_alloc(struct cache_set *c)
+{
+	unsigned i;
+
+	for (i = 0; i < mca_reserve(c); i++)
+		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
+			return -ENOMEM;
+
+	list_splice_init(&c->btree_cache,
+			 &c->btree_cache_freeable);
+
+#ifdef CONFIG_BCACHE_DEBUG
+	mutex_init(&c->verify_lock);
+
+	c->verify_ondisk = (void *)
+		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+
+	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+
+	if (c->verify_data &&
+	    c->verify_data->keys.set->data)
+		list_del_init(&c->verify_data->list);
+	else
+		c->verify_data = NULL;
+#endif
+
+	c->shrink.count_objects = bch_mca_count;
+	c->shrink.scan_objects = bch_mca_scan;
+	c->shrink.seeks = 4;
+	c->shrink.batch = c->btree_pages * 2;
+	register_shrinker(&c->shrink);
+
+	return 0;
+}
+
+/* Btree in memory cache - hash table */
+
+static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
+{
+	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
+}
+
+static struct btree *mca_find(struct cache_set *c, struct bkey *k)
+{
+	struct btree *b;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
+		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
+			goto out;
+	b = NULL;
+out:
+	rcu_read_unlock();
+	return b;
+}
+
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+	struct task_struct *old;
+
+	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+	if (old && old != current) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+				     struct bkey *k)
+{
+	struct btree *b;
+
+	trace_bcache_btree_cache_cannibalize(c);
+
+	if (mca_cannibalize_lock(c, op))
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry_reverse(b, &c->btree_cache, list)
+		if (!mca_reap(b, btree_order(k), false))
+			return b;
+
+	list_for_each_entry_reverse(b, &c->btree_cache, list)
+		if (!mca_reap(b, btree_order(k), true))
+			return b;
+
+	WARN(1, "btree cache cannibalize failed\n");
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which a
+ * cannibalize_bucket() will take. This means every time we unlock the root of
+ * the btree, we need to release this lock if we have it held.
+ */
+static void bch_cannibalize_unlock(struct cache_set *c)
+{
+	if (c->btree_cache_alloc_lock == current) {
+		c->btree_cache_alloc_lock = NULL;
+		wake_up(&c->btree_cache_wait);
+	}
+}
+
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+			       struct bkey *k, int level)
+{
+	struct btree *b;
+
+	BUG_ON(current->bio_list);
+
+	lockdep_assert_held(&c->bucket_lock);
+
+	if (mca_find(c, k))
+		return NULL;
+
+	/* btree_free() doesn't free memory; it sticks the node on the end of
+	 * the list. Check if there's any freed nodes there:
+	 */
+	list_for_each_entry(b, &c->btree_cache_freeable, list)
+		if (!mca_reap(b, btree_order(k), false))
+			goto out;
+
+	/* We never free struct btree itself, just the memory that holds the on
+	 * disk node. Check the freed list before allocating a new one:
+	 */
+	list_for_each_entry(b, &c->btree_cache_freed, list)
+		if (!mca_reap(b, 0, false)) {
+			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
+			if (!b->keys.set[0].data)
+				goto err;
+			else
+				goto out;
+		}
+
+	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
+	if (!b)
+		goto err;
+
+	BUG_ON(!down_write_trylock(&b->lock));
+	if (!b->keys.set->data)
+		goto err;
+out:
+	BUG_ON(b->io_mutex.count != 1);
+
+	bkey_copy(&b->key, k);
+	list_move(&b->list, &c->btree_cache);
+	hlist_del_init_rcu(&b->hash);
+	hlist_add_head_rcu(&b->hash, mca_hash(c, k));
+
+	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
+	b->parent	= (void *) ~0UL;
+	b->flags	= 0;
+	b->written	= 0;
+	b->level	= level;
+
+	if (!b->level)
+		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
+				    &b->c->expensive_debug_checks);
+	else
+		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
+				    &b->c->expensive_debug_checks);
+
+	return b;
+err:
+	if (b)
+		rw_unlock(true, b);
+
+	b = mca_cannibalize(c, op, k);
+	if (!IS_ERR(b))
+		goto out;
+
+	return b;
+}
+
+/**
+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ *
+ * The btree node will have either a read or a write lock held, depending on
+ * level and op->lock.
+ */
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+				 struct bkey *k, int level, bool write)
+{
+	int i = 0;
+	struct btree *b;
+
+	BUG_ON(level < 0);
+retry:
+	b = mca_find(c, k);
+
+	if (!b) {
+		if (current->bio_list)
+			return ERR_PTR(-EAGAIN);
+
+		mutex_lock(&c->bucket_lock);
+		b = mca_alloc(c, op, k, level);
+		mutex_unlock(&c->bucket_lock);
+
+		if (!b)
+			goto retry;
+		if (IS_ERR(b))
+			return b;
+
+		bch_btree_node_read(b);
+
+		if (!write)
+			downgrade_write(&b->lock);
+	} else {
+		rw_lock(write, b, level);
+		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
+			rw_unlock(write, b);
+			goto retry;
+		}
+		BUG_ON(b->level != level);
+	}
+
+	b->accessed = 1;
+
+	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
+		prefetch(b->keys.set[i].tree);
+		prefetch(b->keys.set[i].data);
+	}
+
+	for (; i <= b->keys.nsets; i++)
+		prefetch(b->keys.set[i].data);
+
+	if (btree_node_io_error(b)) {
+		rw_unlock(write, b);
+		return ERR_PTR(-EIO);
+	}
+
+	BUG_ON(!b->written);
+
+	return b;
+}
+
+static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+{
+	struct btree *b;
+
+	mutex_lock(&c->bucket_lock);
+	b = mca_alloc(c, NULL, k, level);
+	mutex_unlock(&c->bucket_lock);
+
+	if (!IS_ERR_OR_NULL(b)) {
+		bch_btree_node_read(b);
+		rw_unlock(true, b);
+	}
+}
+
+/* Btree alloc */
+
+static void btree_node_free(struct btree *b)
+{
+	trace_bcache_btree_node_free(b);
+
+	BUG_ON(b == b->c->root);
+
+	mutex_lock(&b->write_lock);
+
+	if (btree_node_dirty(b))
+		btree_complete_write(b, btree_current_write(b));
+	clear_bit(BTREE_NODE_dirty, &b->flags);
+
+	mutex_unlock(&b->write_lock);
+
+	cancel_delayed_work(&b->work);
+
+	mutex_lock(&b->c->bucket_lock);
+	bch_bucket_free(b->c, &b->key);
+	mca_bucket_free(b);
+	mutex_unlock(&b->c->bucket_lock);
+}
+
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				   int level)
+{
+	BKEY_PADDED(key) k;
+	struct btree *b = ERR_PTR(-EAGAIN);
+
+	mutex_lock(&c->bucket_lock);
+retry:
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
+		goto err;
+
+	bkey_put(c, &k.key);
+	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
+
+	b = mca_alloc(c, op, &k.key, level);
+	if (IS_ERR(b))
+		goto err_free;
+
+	if (!b) {
+		cache_bug(c,
+			"Tried to allocate bucket that was in btree cache");
+		goto retry;
+	}
+
+	b->accessed = 1;
+	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+
+	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc(b);
+	return b;
+err_free:
+	bch_bucket_free(c, &k.key);
+err:
+	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc_fail(b);
+	return b;
+}
+
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+						  struct btree_op *op)
+{
+	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+	if (!IS_ERR_OR_NULL(n)) {
+		mutex_lock(&n->write_lock);
+		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
+		bkey_copy_key(&n->key, &b->key);
+		mutex_unlock(&n->write_lock);
+	}
+
+	return n;
+}
+
+static void make_btree_freeing_key(struct btree *b, struct bkey *k)
+{
+	unsigned i;
+
+	mutex_lock(&b->c->bucket_lock);
+
+	atomic_inc(&b->c->prio_blocked);
+
+	bkey_copy(k, &b->key);
+	bkey_copy_key(k, &ZERO_KEY);
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		SET_PTR_GEN(k, i,
+			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+					PTR_BUCKET(b->c, &b->key, i)));
+
+	mutex_unlock(&b->c->bucket_lock);
+}
+
+static int btree_check_reserve(struct btree *b, struct btree_op *op)
+{
+	struct cache_set *c = b->c;
+	struct cache *ca;
+	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
+
+	mutex_lock(&c->bucket_lock);
+
+	for_each_cache(ca, c, i)
+		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+			if (op)
+				prepare_to_wait(&c->btree_cache_wait, &op->wait,
+						TASK_UNINTERRUPTIBLE);
+			mutex_unlock(&c->bucket_lock);
+			return -EINTR;
+		}
+
+	mutex_unlock(&c->bucket_lock);
+
+	return mca_cannibalize_lock(b->c, op);
+}
+
+/* Garbage collection */
+
+static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
+				    struct bkey *k)
+{
+	uint8_t stale = 0;
+	unsigned i;
+	struct bucket *g;
+
+	/*
+	 * ptr_invalid() can't return true for the keys that mark btree nodes as
+	 * freed, but since ptr_bad() returns true we'll never actually use them
+	 * for anything and thus we don't want mark their pointers here
+	 */
+	if (!bkey_cmp(k, &ZERO_KEY))
+		return stale;
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		if (!ptr_available(c, k, i))
+			continue;
+
+		g = PTR_BUCKET(c, k, i);
+
+		if (gen_after(g->last_gc, PTR_GEN(k, i)))
+			g->last_gc = PTR_GEN(k, i);
+
+		if (ptr_stale(c, k, i)) {
+			stale = max(stale, ptr_stale(c, k, i));
+			continue;
+		}
+
+		cache_bug_on(GC_MARK(g) &&
+			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
+			     c, "inconsistent ptrs: mark = %llu, level = %i",
+			     GC_MARK(g), level);
+
+		if (level)
+			SET_GC_MARK(g, GC_MARK_METADATA);
+		else if (KEY_DIRTY(k))
+			SET_GC_MARK(g, GC_MARK_DIRTY);
+		else if (!GC_MARK(g))
+			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
+
+		/* guard against overflow */
+		SET_GC_SECTORS_USED(g, min_t(unsigned,
+					     GC_SECTORS_USED(g) + KEY_SIZE(k),
+					     MAX_GC_SECTORS_USED));
+
+		BUG_ON(!GC_SECTORS_USED(g));
+	}
+
+	return stale;
+}
+
+#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
+
+void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i) &&
+		    !ptr_stale(c, k, i)) {
+			struct bucket *b = PTR_BUCKET(c, k, i);
+
+			b->gen = PTR_GEN(k, i);
+
+			if (level && bkey_cmp(k, &ZERO_KEY))
+				b->prio = BTREE_PRIO;
+			else if (!level && b->prio == BTREE_PRIO)
+				b->prio = INITIAL_PRIO;
+		}
+
+	__bch_btree_mark_key(c, level, k);
+}
+
+static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
+{
+	uint8_t stale = 0;
+	unsigned keys = 0, good_keys = 0;
+	struct bkey *k;
+	struct btree_iter iter;
+	struct bset_tree *t;
+
+	gc->nodes++;
+
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
+		stale = max(stale, btree_mark_key(b, k));
+		keys++;
+
+		if (bch_ptr_bad(&b->keys, k))
+			continue;
+
+		gc->key_bytes += bkey_u64s(k);
+		gc->nkeys++;
+		good_keys++;
+
+		gc->data += KEY_SIZE(k);
+	}
+
+	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
+		btree_bug_on(t->size &&
+			     bset_written(&b->keys, t) &&
+			     bkey_cmp(&b->key, &t->end) < 0,
+			     b, "found short btree key in gc");
+
+	if (b->c->gc_always_rewrite)
+		return true;
+
+	if (stale > 10)
+		return true;
+
+	if ((keys - good_keys) * 2 > keys)
+		return true;
+
+	return false;
+}
+
+#define GC_MERGE_NODES	4U
+
+struct gc_merge_info {
+	struct btree	*b;
+	unsigned	keys;
+};
+
+static int bch_btree_insert_node(struct btree *, struct btree_op *,
+				 struct keylist *, atomic_t *, struct bkey *);
+
+static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
+			     struct gc_stat *gc, struct gc_merge_info *r)
+{
+	unsigned i, nodes = 0, keys = 0, blocks;
+	struct btree *new_nodes[GC_MERGE_NODES];
+	struct keylist keylist;
+	struct closure cl;
+	struct bkey *k;
+
+	bch_keylist_init(&keylist);
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
+	memset(new_nodes, 0, sizeof(new_nodes));
+	closure_init_stack(&cl);
+
+	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
+		keys += r[nodes++].keys;
+
+	blocks = btree_default_blocks(b->c) * 2 / 3;
+
+	if (nodes < 2 ||
+	    __set_blocks(b->keys.set[0].data, keys,
+			 block_bytes(b->c)) > blocks * (nodes - 1))
+		return 0;
+
+	for (i = 0; i < nodes; i++) {
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
+		if (IS_ERR_OR_NULL(new_nodes[i]))
+			goto out_nocoalesce;
+	}
+
+	/*
+	 * We have to check the reserve here, after we've allocated our new
+	 * nodes, to make sure the insert below will succeed - we also check
+	 * before as an optimization to potentially avoid a bunch of expensive
+	 * allocs/sorts
+	 */
+	if (btree_check_reserve(b, NULL))
+		goto out_nocoalesce;
+
+	for (i = 0; i < nodes; i++)
+		mutex_lock(&new_nodes[i]->write_lock);
+
+	for (i = nodes - 1; i > 0; --i) {
+		struct bset *n1 = btree_bset_first(new_nodes[i]);
+		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
+		struct bkey *k, *last = NULL;
+
+		keys = 0;
+
+		if (i > 1) {
+			for (k = n2->start;
+			     k < bset_bkey_last(n2);
+			     k = bkey_next(k)) {
+				if (__set_blocks(n1, n1->keys + keys +
+						 bkey_u64s(k),
+						 block_bytes(b->c)) > blocks)
+					break;
+
+				last = k;
+				keys += bkey_u64s(k);
+			}
+		} else {
+			/*
+			 * Last node we're not getting rid of - we're getting
+			 * rid of the node at r[0]. Have to try and fit all of
+			 * the remaining keys into this node; we can't ensure
+			 * they will always fit due to rounding and variable
+			 * length keys (shouldn't be possible in practice,
+			 * though)
+			 */
+			if (__set_blocks(n1, n1->keys + n2->keys,
+					 block_bytes(b->c)) >
+			    btree_blocks(new_nodes[i]))
+				goto out_nocoalesce;
+
+			keys = n2->keys;
+			/* Take the key of the node we're getting rid of */
+			last = &r->b->key;
+		}
+
+		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+		       btree_blocks(new_nodes[i]));
+
+		if (last)
+			bkey_copy_key(&new_nodes[i]->key, last);
+
+		memcpy(bset_bkey_last(n1),
+		       n2->start,
+		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
+
+		n1->keys += keys;
+		r[i].keys = n1->keys;
+
+		memmove(n2->start,
+			bset_bkey_idx(n2, keys),
+			(void *) bset_bkey_last(n2) -
+			(void *) bset_bkey_idx(n2, keys));
+
+		n2->keys -= keys;
+
+		if (__bch_keylist_realloc(&keylist,
+					  bkey_u64s(&new_nodes[i]->key)))
+			goto out_nocoalesce;
+
+		bch_btree_node_write(new_nodes[i], &cl);
+		bch_keylist_add(&keylist, &new_nodes[i]->key);
+	}
+
+	for (i = 0; i < nodes; i++)
+		mutex_unlock(&new_nodes[i]->write_lock);
+
+	closure_sync(&cl);
+
+	/* We emptied out this node */
+	BUG_ON(btree_bset_first(new_nodes[0])->keys);
+	btree_node_free(new_nodes[0]);
+	rw_unlock(true, new_nodes[0]);
+
+	for (i = 0; i < nodes; i++) {
+		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
+			goto out_nocoalesce;
+
+		make_btree_freeing_key(r[i].b, keylist.top);
+		bch_keylist_push(&keylist);
+	}
+
+	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keylist));
+
+	for (i = 0; i < nodes; i++) {
+		btree_node_free(r[i].b);
+		rw_unlock(true, r[i].b);
+
+		r[i].b = new_nodes[i];
+	}
+
+	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
+	r[nodes - 1].b = ERR_PTR(-EINTR);
+
+	trace_bcache_btree_gc_coalesce(nodes);
+	gc->nodes--;
+
+	bch_keylist_free(&keylist);
+
+	/* Invalidated our iterator */
+	return -EINTR;
+
+out_nocoalesce:
+	closure_sync(&cl);
+	bch_keylist_free(&keylist);
+
+	while ((k = bch_keylist_pop(&keylist)))
+		if (!bkey_cmp(k, &ZERO_KEY))
+			atomic_dec(&b->c->prio_blocked);
+
+	for (i = 0; i < nodes; i++)
+		if (!IS_ERR_OR_NULL(new_nodes[i])) {
+			btree_node_free(new_nodes[i]);
+			rw_unlock(true, new_nodes[i]);
+		}
+	return 0;
+}
+
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+				 struct btree *replace)
+{
+	struct keylist keys;
+	struct btree *n;
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
+	n = btree_node_alloc_replacement(replace, NULL);
+
+	/* recheck reserve after allocating replacement node */
+	if (btree_check_reserve(b, NULL)) {
+		btree_node_free(n);
+		rw_unlock(true, n);
+		return 0;
+	}
+
+	bch_btree_node_write_sync(n);
+
+	bch_keylist_init(&keys);
+	bch_keylist_add(&keys, &n->key);
+
+	make_btree_freeing_key(replace, keys.top);
+	bch_keylist_push(&keys);
+
+	bch_btree_insert_node(b, op, &keys, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keys));
+
+	btree_node_free(replace);
+	rw_unlock(true, n);
+
+	/* Invalidated our iterator */
+	return -EINTR;
+}
+
+static unsigned btree_gc_count_keys(struct btree *b)
+{
+	struct bkey *k;
+	struct btree_iter iter;
+	unsigned ret = 0;
+
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+		ret += bkey_u64s(k);
+
+	return ret;
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+			    struct closure *writes, struct gc_stat *gc)
+{
+	int ret = 0;
+	bool should_rewrite;
+	struct bkey *k;
+	struct btree_iter iter;
+	struct gc_merge_info r[GC_MERGE_NODES];
+	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
+
+	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
+
+	for (i = r; i < r + ARRAY_SIZE(r); i++)
+		i->b = ERR_PTR(-EINTR);
+
+	while (1) {
+		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
+		if (k) {
+			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+						  true);
+			if (IS_ERR(r->b)) {
+				ret = PTR_ERR(r->b);
+				break;
+			}
+
+			r->keys = btree_gc_count_keys(r->b);
+
+			ret = btree_gc_coalesce(b, op, gc, r);
+			if (ret)
+				break;
+		}
+
+		if (!last->b)
+			break;
+
+		if (!IS_ERR(last->b)) {
+			should_rewrite = btree_gc_mark_node(last->b, gc);
+			if (should_rewrite) {
+				ret = btree_gc_rewrite_node(b, op, last->b);
+				if (ret)
+					break;
+			}
+
+			if (last->b->level) {
+				ret = btree_gc_recurse(last->b, op, writes, gc);
+				if (ret)
+					break;
+			}
+
+			bkey_copy_key(&b->c->gc_done, &last->b->key);
+
+			/*
+			 * Must flush leaf nodes before gc ends, since replace
+			 * operations aren't journalled
+			 */
+			mutex_lock(&last->b->write_lock);
+			if (btree_node_dirty(last->b))
+				bch_btree_node_write(last->b, writes);
+			mutex_unlock(&last->b->write_lock);
+			rw_unlock(true, last->b);
+		}
+
+		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
+		r->b = NULL;
+
+		if (need_resched()) {
+			ret = -EAGAIN;
+			break;
+		}
+	}
+
+	for (i = r; i < r + ARRAY_SIZE(r); i++)
+		if (!IS_ERR_OR_NULL(i->b)) {
+			mutex_lock(&i->b->write_lock);
+			if (btree_node_dirty(i->b))
+				bch_btree_node_write(i->b, writes);
+			mutex_unlock(&i->b->write_lock);
+			rw_unlock(true, i->b);
+		}
+
+	return ret;
+}
+
+static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
+			     struct closure *writes, struct gc_stat *gc)
+{
+	struct btree *n = NULL;
+	int ret = 0;
+	bool should_rewrite;
+
+	should_rewrite = btree_gc_mark_node(b, gc);
+	if (should_rewrite) {
+		n = btree_node_alloc_replacement(b, NULL);
+
+		if (!IS_ERR_OR_NULL(n)) {
+			bch_btree_node_write_sync(n);
+
+			bch_btree_set_root(n);
+			btree_node_free(b);
+			rw_unlock(true, n);
+
+			return -EINTR;
+		}
+	}
+
+	__bch_btree_mark_key(b->c, b->level + 1, &b->key);
+
+	if (b->level) {
+		ret = btree_gc_recurse(b, op, writes, gc);
+		if (ret)
+			return ret;
+	}
+
+	bkey_copy_key(&b->c->gc_done, &b->key);
+
+	return ret;
+}
+
+static void btree_gc_start(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+
+	if (!c->gc_mark_valid)
+		return;
+
+	mutex_lock(&c->bucket_lock);
+
+	c->gc_mark_valid = 0;
+	c->gc_done = ZERO_KEY;
+
+	for_each_cache(ca, c, i)
+		for_each_bucket(b, ca) {
+			b->last_gc = b->gen;
+			if (!atomic_read(&b->pin)) {
+				SET_GC_MARK(b, 0);
+				SET_GC_SECTORS_USED(b, 0);
+			}
+		}
+
+	mutex_unlock(&c->bucket_lock);
+}
+
+static size_t bch_btree_gc_finish(struct cache_set *c)
+{
+	size_t available = 0;
+	struct bucket *b;
+	struct cache *ca;
+	unsigned i;
+
+	mutex_lock(&c->bucket_lock);
+
+	set_gc_sectors(c);
+	c->gc_mark_valid = 1;
+	c->need_gc	= 0;
+
+	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
+		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
+			    GC_MARK_METADATA);
+
+	/* don't reclaim buckets to which writeback keys point */
+	rcu_read_lock();
+	for (i = 0; i < c->nr_uuids; i++) {
+		struct bcache_device *d = c->devices[i];
+		struct cached_dev *dc;
+		struct keybuf_key *w, *n;
+		unsigned j;
+
+		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+		dc = container_of(d, struct cached_dev, disk);
+
+		spin_lock(&dc->writeback_keys.lock);
+		rbtree_postorder_for_each_entry_safe(w, n,
+					&dc->writeback_keys.keys, node)
+			for (j = 0; j < KEY_PTRS(&w->key); j++)
+				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
+					    GC_MARK_DIRTY);
+		spin_unlock(&dc->writeback_keys.lock);
+	}
+	rcu_read_unlock();
+
+	for_each_cache(ca, c, i) {
+		uint64_t *i;
+
+		ca->invalidate_needs_gc = 0;
+
+		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
+			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+
+		for (i = ca->prio_buckets;
+		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
+			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+
+		for_each_bucket(b, ca) {
+			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
+
+			if (atomic_read(&b->pin))
+				continue;
+
+			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+
+			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+				available++;
+		}
+	}
+
+	mutex_unlock(&c->bucket_lock);
+	return available;
+}
+
+static void bch_btree_gc(struct cache_set *c)
+{
+	int ret;
+	unsigned long available;
+	struct gc_stat stats;
+	struct closure writes;
+	struct btree_op op;
+	uint64_t start_time = local_clock();
+
+	trace_bcache_gc_start(c);
+
+	memset(&stats, 0, sizeof(struct gc_stat));
+	closure_init_stack(&writes);
+	bch_btree_op_init(&op, SHRT_MAX);
+
+	btree_gc_start(c);
+
+	do {
+		ret = btree_root(gc_root, c, &op, &writes, &stats);
+		closure_sync(&writes);
+
+		if (ret && ret != -EAGAIN)
+			pr_warn("gc failed!");
+	} while (ret);
+
+	available = bch_btree_gc_finish(c);
+	wake_up_allocators(c);
+
+	bch_time_stats_update(&c->btree_gc_time, start_time);
+
+	stats.key_bytes *= sizeof(uint64_t);
+	stats.data	<<= 9;
+	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
+	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
+
+	trace_bcache_gc_end(c);
+
+	bch_moving_gc(c);
+}
+
+static int bch_gc_thread(void *arg)
+{
+	struct cache_set *c = arg;
+	struct cache *ca;
+	unsigned i;
+
+	while (1) {
+again:
+		bch_btree_gc(c);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		mutex_lock(&c->bucket_lock);
+
+		for_each_cache(ca, c, i)
+			if (ca->invalidate_needs_gc) {
+				mutex_unlock(&c->bucket_lock);
+				set_current_state(TASK_RUNNING);
+				goto again;
+			}
+
+		mutex_unlock(&c->bucket_lock);
+
+		try_to_freeze();
+		schedule();
+	}
+
+	return 0;
+}
+
+int bch_gc_thread_start(struct cache_set *c)
+{
+	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+	if (IS_ERR(c->gc_thread))
+		return PTR_ERR(c->gc_thread);
+
+	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
+	return 0;
+}
+
+/* Initial partial gc */
+
+static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
+{
+	int ret = 0;
+	struct bkey *k, *p = NULL;
+	struct btree_iter iter;
+
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
+		bch_initial_mark_key(b->c, b->level, k);
+
+	bch_initial_mark_key(b->c, b->level + 1, &b->key);
+
+	if (b->level) {
+		bch_btree_iter_init(&b->keys, &iter, NULL);
+
+		do {
+			k = bch_btree_iter_next_filter(&iter, &b->keys,
+						       bch_ptr_bad);
+			if (k)
+				btree_node_prefetch(b->c, k, b->level - 1);
+
+			if (p)
+				ret = btree(check_recurse, p, b, op);
+
+			p = k;
+		} while (p && !ret);
+	}
+
+	return ret;
+}
+
+int bch_btree_check(struct cache_set *c)
+{
+	struct btree_op op;
+
+	bch_btree_op_init(&op, SHRT_MAX);
+
+	return btree_root(check_recurse, c, &op);
+}
+
+void bch_initial_gc_finish(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+
+	bch_btree_gc_finish(c);
+
+	mutex_lock(&c->bucket_lock);
+
+	/*
+	 * We need to put some unused buckets directly on the prio freelist in
+	 * order to get the allocator thread started - it needs freed buckets in
+	 * order to rewrite the prios and gens, and it needs to rewrite prios
+	 * and gens in order to free buckets.
+	 *
+	 * This is only safe for buckets that have no live data in them, which
+	 * there should always be some of.
+	 */
+	for_each_cache(ca, c, i) {
+		for_each_bucket(b, ca) {
+			if (fifo_full(&ca->free[RESERVE_PRIO]))
+				break;
+
+			if (bch_can_invalidate_bucket(ca, b) &&
+			    !GC_MARK(b)) {
+				__bch_invalidate_one_bucket(ca, b);
+				fifo_push(&ca->free[RESERVE_PRIO],
+					  b - ca->buckets);
+			}
+		}
+	}
+
+	mutex_unlock(&c->bucket_lock);
+}
+
+/* Btree insertion */
+
+static bool btree_insert_key(struct btree *b, struct bkey *k,
+			     struct bkey *replace_key)
+{
+	unsigned status;
+
+	BUG_ON(bkey_cmp(k, &b->key) > 0);
+
+	status = bch_btree_insert_key(&b->keys, k, replace_key);
+	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
+		bch_check_keys(&b->keys, "%u for %s", status,
+			       replace_key ? "replace" : "insert");
+
+		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+					      status);
+		return true;
+	} else
+		return false;
+}
+
+static size_t insert_u64s_remaining(struct btree *b)
+{
+	long ret = bch_btree_keys_u64s_remaining(&b->keys);
+
+	/*
+	 * Might land in the middle of an existing extent and have to split it
+	 */
+	if (b->keys.ops->is_extents)
+		ret -= KEY_MAX_U64S;
+
+	return max(ret, 0L);
+}
+
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
+				  struct keylist *insert_keys,
+				  struct bkey *replace_key)
+{
+	bool ret = false;
+	int oldsize = bch_count_data(&b->keys);
+
+	while (!bch_keylist_empty(insert_keys)) {
+		struct bkey *k = insert_keys->keys;
+
+		if (bkey_u64s(k) > insert_u64s_remaining(b))
+			break;
+
+		if (bkey_cmp(k, &b->key) <= 0) {
+			if (!b->level)
+				bkey_put(b->c, k);
+
+			ret |= btree_insert_key(b, k, replace_key);
+			bch_keylist_pop_front(insert_keys);
+		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
+			BKEY_PADDED(key) temp;
+			bkey_copy(&temp.key, insert_keys->keys);
+
+			bch_cut_back(&b->key, &temp.key);
+			bch_cut_front(&b->key, insert_keys->keys);
+
+			ret |= btree_insert_key(b, &temp.key, replace_key);
+			break;
+		} else {
+			break;
+		}
+	}
+
+	if (!ret)
+		op->insert_collision = true;
+
+	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
+
+	BUG_ON(bch_count_data(&b->keys) < oldsize);
+	return ret;
+}
+
+static int btree_split(struct btree *b, struct btree_op *op,
+		       struct keylist *insert_keys,
+		       struct bkey *replace_key)
+{
+	bool split;
+	struct btree *n1, *n2 = NULL, *n3 = NULL;
+	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct keylist parent_keys;
+
+	closure_init_stack(&cl);
+	bch_keylist_init(&parent_keys);
+
+	if (btree_check_reserve(b, op)) {
+		if (!b->level)
+			return -EINTR;
+		else
+			WARN(1, "insufficient reserve for split\n");
+	}
+
+	n1 = btree_node_alloc_replacement(b, op);
+	if (IS_ERR(n1))
+		goto err;
+
+	split = set_blocks(btree_bset_first(n1),
+			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+
+	if (split) {
+		unsigned keys = 0;
+
+		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
+
+		n2 = bch_btree_node_alloc(b->c, op, b->level);
+		if (IS_ERR(n2))
+			goto err_free1;
+
+		if (!b->parent) {
+			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
+			if (IS_ERR(n3))
+				goto err_free2;
+		}
+
+		mutex_lock(&n1->write_lock);
+		mutex_lock(&n2->write_lock);
+
+		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
+
+		/*
+		 * Has to be a linear search because we don't have an auxiliary
+		 * search tree yet
+		 */
+
+		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
+			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
+							keys));
+
+		bkey_copy_key(&n1->key,
+			      bset_bkey_idx(btree_bset_first(n1), keys));
+		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
+
+		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
+		btree_bset_first(n1)->keys = keys;
+
+		memcpy(btree_bset_first(n2)->start,
+		       bset_bkey_last(btree_bset_first(n1)),
+		       btree_bset_first(n2)->keys * sizeof(uint64_t));
+
+		bkey_copy_key(&n2->key, &b->key);
+
+		bch_keylist_add(&parent_keys, &n2->key);
+		bch_btree_node_write(n2, &cl);
+		mutex_unlock(&n2->write_lock);
+		rw_unlock(true, n2);
+	} else {
+		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
+
+		mutex_lock(&n1->write_lock);
+		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
+	}
+
+	bch_keylist_add(&parent_keys, &n1->key);
+	bch_btree_node_write(n1, &cl);
+	mutex_unlock(&n1->write_lock);
+
+	if (n3) {
+		/* Depth increases, make a new root */
+		mutex_lock(&n3->write_lock);
+		bkey_copy_key(&n3->key, &MAX_KEY);
+		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
+		bch_btree_node_write(n3, &cl);
+		mutex_unlock(&n3->write_lock);
+
+		closure_sync(&cl);
+		bch_btree_set_root(n3);
+		rw_unlock(true, n3);
+	} else if (!b->parent) {
+		/* Root filled up but didn't need to be split */
+		closure_sync(&cl);
+		bch_btree_set_root(n1);
+	} else {
+		/* Split a non root node */
+		closure_sync(&cl);
+		make_btree_freeing_key(b, parent_keys.top);
+		bch_keylist_push(&parent_keys);
+
+		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
+		BUG_ON(!bch_keylist_empty(&parent_keys));
+	}
+
+	btree_node_free(b);
+	rw_unlock(true, n1);
+
+	bch_time_stats_update(&b->c->btree_split_time, start_time);
+
+	return 0;
+err_free2:
+	bkey_put(b->c, &n2->key);
+	btree_node_free(n2);
+	rw_unlock(true, n2);
+err_free1:
+	bkey_put(b->c, &n1->key);
+	btree_node_free(n1);
+	rw_unlock(true, n1);
+err:
+	WARN(1, "bcache: btree split failed (level %u)", b->level);
+
+	if (n3 == ERR_PTR(-EAGAIN) ||
+	    n2 == ERR_PTR(-EAGAIN) ||
+	    n1 == ERR_PTR(-EAGAIN))
+		return -EAGAIN;
+
+	return -ENOMEM;
+}
+
+static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
+				 struct keylist *insert_keys,
+				 atomic_t *journal_ref,
+				 struct bkey *replace_key)
+{
+	struct closure cl;
+
+	BUG_ON(b->level && replace_key);
+
+	closure_init_stack(&cl);
+
+	mutex_lock(&b->write_lock);
+
+	if (write_block(b) != btree_bset_last(b) &&
+	    b->keys.last_set_unwritten)
+		bch_btree_init_next(b); /* just wrote a set */
+
+	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
+		mutex_unlock(&b->write_lock);
+		goto split;
+	}
+
+	BUG_ON(write_block(b) != btree_bset_last(b));
+
+	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+		if (!b->level)
+			bch_btree_leaf_dirty(b, journal_ref);
+		else
+			bch_btree_node_write(b, &cl);
+	}
+
+	mutex_unlock(&b->write_lock);
+
+	/* wait for btree node write if necessary, after unlock */
+	closure_sync(&cl);
+
+	return 0;
+split:
+	if (current->bio_list) {
+		op->lock = b->c->root->level + 1;
+		return -EAGAIN;
+	} else if (op->lock <= b->c->root->level) {
+		op->lock = b->c->root->level + 1;
+		return -EINTR;
+	} else {
+		/* Invalidated all iterators */
+		int ret = btree_split(b, op, insert_keys, replace_key);
+
+		if (bch_keylist_empty(insert_keys))
+			return 0;
+		else if (!ret)
+			return -EINTR;
+		return ret;
+	}
+}
+
+int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
+			       struct bkey *check_key)
+{
+	int ret = -EINTR;
+	uint64_t btree_ptr = b->key.ptr[0];
+	unsigned long seq = b->seq;
+	struct keylist insert;
+	bool upgrade = op->lock == -1;
+
+	bch_keylist_init(&insert);
+
+	if (upgrade) {
+		rw_unlock(false, b);
+		rw_lock(true, b, b->level);
+
+		if (b->key.ptr[0] != btree_ptr ||
+		    b->seq != seq + 1)
+			goto out;
+	}
+
+	SET_KEY_PTRS(check_key, 1);
+	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
+
+	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
+
+	bch_keylist_add(&insert, check_key);
+
+	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
+
+	BUG_ON(!ret && !bch_keylist_empty(&insert));
+out:
+	if (upgrade)
+		downgrade_write(&b->lock);
+	return ret;
+}
+
+struct btree_insert_op {
+	struct btree_op	op;
+	struct keylist	*keys;
+	atomic_t	*journal_ref;
+	struct bkey	*replace_key;
+};
+
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+{
+	struct btree_insert_op *op = container_of(b_op,
+					struct btree_insert_op, op);
+
+	int ret = bch_btree_insert_node(b, &op->op, op->keys,
+					op->journal_ref, op->replace_key);
+	if (ret && !bch_keylist_empty(op->keys))
+		return ret;
+	else
+		return MAP_DONE;
+}
+
+int bch_btree_insert(struct cache_set *c, struct keylist *keys,
+		     atomic_t *journal_ref, struct bkey *replace_key)
+{
+	struct btree_insert_op op;
+	int ret = 0;
+
+	BUG_ON(current->bio_list);
+	BUG_ON(bch_keylist_empty(keys));
+
+	bch_btree_op_init(&op.op, 0);
+	op.keys		= keys;
+	op.journal_ref	= journal_ref;
+	op.replace_key	= replace_key;
+
+	while (!ret && !bch_keylist_empty(keys)) {
+		op.op.lock = 0;
+		ret = bch_btree_map_leaf_nodes(&op.op, c,
+					       &START_KEY(keys->keys),
+					       btree_insert_fn);
+	}
+
+	if (ret) {
+		struct bkey *k;
+
+		pr_err("error %i", ret);
+
+		while ((k = bch_keylist_pop(keys)))
+			bkey_put(c, k);
+	} else if (op.op.insert_collision)
+		ret = -ESRCH;
+
+	return ret;
+}
+
+void bch_btree_set_root(struct btree *b)
+{
+	unsigned i;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	trace_bcache_btree_set_root(b);
+
+	BUG_ON(!b->written);
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++)
+		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
+
+	mutex_lock(&b->c->bucket_lock);
+	list_del_init(&b->list);
+	mutex_unlock(&b->c->bucket_lock);
+
+	b->c->root = b;
+
+	bch_journal_meta(b->c, &cl);
+	closure_sync(&cl);
+}
+
+/* Map across nodes or keys */
+
+static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
+				       struct bkey *from,
+				       btree_map_nodes_fn *fn, int flags)
+{
+	int ret = MAP_CONTINUE;
+
+	if (b->level) {
+		struct bkey *k;
+		struct btree_iter iter;
+
+		bch_btree_iter_init(&b->keys, &iter, from);
+
+		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+						       bch_ptr_bad))) {
+			ret = btree(map_nodes_recurse, k, b,
+				    op, from, fn, flags);
+			from = NULL;
+
+			if (ret != MAP_CONTINUE)
+				return ret;
+		}
+	}
+
+	if (!b->level || flags == MAP_ALL_NODES)
+		ret = fn(op, b);
+
+	return ret;
+}
+
+int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
+{
+	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
+}
+
+static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+				      struct bkey *from, btree_map_keys_fn *fn,
+				      int flags)
+{
+	int ret = MAP_CONTINUE;
+	struct bkey *k;
+	struct btree_iter iter;
+
+	bch_btree_iter_init(&b->keys, &iter, from);
+
+	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+		ret = !b->level
+			? fn(op, b, k)
+			: btree(map_keys_recurse, k, b, op, from, fn, flags);
+		from = NULL;
+
+		if (ret != MAP_CONTINUE)
+			return ret;
+	}
+
+	if (!b->level && (flags & MAP_END_KEY))
+		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
+				     KEY_OFFSET(&b->key), 0));
+
+	return ret;
+}
+
+int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
+		       struct bkey *from, btree_map_keys_fn *fn, int flags)
+{
+	return btree_root(map_keys_recurse, c, op, from, fn, flags);
+}
+
+/* Keybuf code */
+
+static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
+{
+	/* Overlapping keys compare equal */
+	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
+		return -1;
+	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
+		return 1;
+	return 0;
+}
+
+static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
+					    struct keybuf_key *r)
+{
+	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
+}
+
+struct refill {
+	struct btree_op	op;
+	unsigned	nr_found;
+	struct keybuf	*buf;
+	struct bkey	*end;
+	keybuf_pred_fn	*pred;
+};
+
+static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
+			    struct bkey *k)
+{
+	struct refill *refill = container_of(op, struct refill, op);
+	struct keybuf *buf = refill->buf;
+	int ret = MAP_CONTINUE;
+
+	if (bkey_cmp(k, refill->end) >= 0) {
+		ret = MAP_DONE;
+		goto out;
+	}
+
+	if (!KEY_SIZE(k)) /* end key */
+		goto out;
+
+	if (refill->pred(buf, k)) {
+		struct keybuf_key *w;
+
+		spin_lock(&buf->lock);
+
+		w = array_alloc(&buf->freelist);
+		if (!w) {
+			spin_unlock(&buf->lock);
+			return MAP_DONE;
+		}
+
+		w->private = NULL;
+		bkey_copy(&w->key, k);
+
+		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+			array_free(&buf->freelist, w);
+		else
+			refill->nr_found++;
+
+		if (array_freelist_empty(&buf->freelist))
+			ret = MAP_DONE;
+
+		spin_unlock(&buf->lock);
+	}
+out:
+	buf->last_scanned = *k;
+	return ret;
+}
+
+void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
+		       struct bkey *end, keybuf_pred_fn *pred)
+{
+	struct bkey start = buf->last_scanned;
+	struct refill refill;
+
+	cond_resched();
+
+	bch_btree_op_init(&refill.op, -1);
+	refill.nr_found	= 0;
+	refill.buf	= buf;
+	refill.end	= end;
+	refill.pred	= pred;
+
+	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
+			   refill_keybuf_fn, MAP_END_KEY);
+
+	trace_bcache_keyscan(refill.nr_found,
+			     KEY_INODE(&start), KEY_OFFSET(&start),
+			     KEY_INODE(&buf->last_scanned),
+			     KEY_OFFSET(&buf->last_scanned));
+
+	spin_lock(&buf->lock);
+
+	if (!RB_EMPTY_ROOT(&buf->keys)) {
+		struct keybuf_key *w;
+		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+		buf->start	= START_KEY(&w->key);
+
+		w = RB_LAST(&buf->keys, struct keybuf_key, node);
+		buf->end	= w->key;
+	} else {
+		buf->start	= MAX_KEY;
+		buf->end	= MAX_KEY;
+	}
+
+	spin_unlock(&buf->lock);
+}
+
+static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+	rb_erase(&w->node, &buf->keys);
+	array_free(&buf->freelist, w);
+}
+
+void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+	spin_lock(&buf->lock);
+	__bch_keybuf_del(buf, w);
+	spin_unlock(&buf->lock);
+}
+
+bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
+				  struct bkey *end)
+{
+	bool ret = false;
+	struct keybuf_key *p, *w, s;
+	s.key = *start;
+
+	if (bkey_cmp(end, &buf->start) <= 0 ||
+	    bkey_cmp(start, &buf->end) >= 0)
+		return false;
+
+	spin_lock(&buf->lock);
+	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
+
+	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
+		p = w;
+		w = RB_NEXT(w, node);
+
+		if (p->private)
+			ret = true;
+		else
+			__bch_keybuf_del(buf, p);
+	}
+
+	spin_unlock(&buf->lock);
+	return ret;
+}
+
+struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
+{
+	struct keybuf_key *w;
+	spin_lock(&buf->lock);
+
+	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+
+	while (w && w->private)
+		w = RB_NEXT(w, node);
+
+	if (w)
+		w->private = ERR_PTR(-EINTR);
+
+	spin_unlock(&buf->lock);
+	return w;
+}
+
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
+					  struct keybuf *buf,
+					  struct bkey *end,
+					  keybuf_pred_fn *pred)
+{
+	struct keybuf_key *ret;
+
+	while (1) {
+		ret = bch_keybuf_next(buf);
+		if (ret)
+			break;
+
+		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
+			pr_debug("scan finished");
+			break;
+		}
+
+		bch_refill_keybuf(c, buf, end, pred);
+	}
+
+	return ret;
+}
+
+void bch_keybuf_init(struct keybuf *buf)
+{
+	buf->last_scanned	= MAX_KEY;
+	buf->keys		= RB_ROOT;
+
+	spin_lock_init(&buf->lock);
+	array_allocator_init(&buf->freelist);
+}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
new file mode 100644
index 00000000000..91dfa5e6968
--- /dev/null
+++ b/drivers/md/bcache/btree.h
@@ -0,0 +1,309 @@
+#ifndef _BCACHE_BTREE_H
+#define _BCACHE_BTREE_H
+
+/*
+ * THE BTREE:
+ *
+ * At a high level, bcache's btree is relatively standard b+ tree. All keys and
+ * pointers are in the leaves; interior nodes only have pointers to the child
+ * nodes.
+ *
+ * In the interior nodes, a struct bkey always points to a child btree node, and
+ * the key is the highest key in the child node - except that the highest key in
+ * an interior node is always MAX_KEY. The size field refers to the size on disk
+ * of the child node - this would allow us to have variable sized btree nodes
+ * (handy for keeping the depth of the btree 1 by expanding just the root).
+ *
+ * Btree nodes are themselves log structured, but this is hidden fairly
+ * thoroughly. Btree nodes on disk will in practice have extents that overlap
+ * (because they were written at different times), but in memory we never have
+ * overlapping extents - when we read in a btree node from disk, the first thing
+ * we do is resort all the sets of keys with a mergesort, and in the same pass
+ * we check for overlapping extents and adjust them appropriately.
+ *
+ * struct btree_op is a central interface to the btree code. It's used for
+ * specifying read vs. write locking, and the embedded closure is used for
+ * waiting on IO or reserve memory.
+ *
+ * BTREE CACHE:
+ *
+ * Btree nodes are cached in memory; traversing the btree might require reading
+ * in btree nodes which is handled mostly transparently.
+ *
+ * bch_btree_node_get() looks up a btree node in the cache and reads it in from
+ * disk if necessary. This function is almost never called directly though - the
+ * btree() macro is used to get a btree node, call some function on it, and
+ * unlock the node after the function returns.
+ *
+ * The root is special cased - it's taken out of the cache's lru (thus pinning
+ * it in memory), so we can find the root of the btree by just dereferencing a
+ * pointer instead of looking it up in the cache. This makes locking a bit
+ * tricky, since the root pointer is protected by the lock in the btree node it
+ * points to - the btree_root() macro handles this.
+ *
+ * In various places we must be able to allocate memory for multiple btree nodes
+ * in order to make forward progress. To do this we use the btree cache itself
+ * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
+ * cache we can reuse. We can't allow more than one thread to be doing this at a
+ * time, so there's a lock, implemented by a pointer to the btree_op closure -
+ * this allows the btree_root() macro to implicitly release this lock.
+ *
+ * BTREE IO:
+ *
+ * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
+ * this.
+ *
+ * For writing, we have two btree_write structs embeddded in struct btree - one
+ * write in flight, and one being set up, and we toggle between them.
+ *
+ * Writing is done with a single function -  bch_btree_write() really serves two
+ * different purposes and should be broken up into two different functions. When
+ * passing now = false, it merely indicates that the node is now dirty - calling
+ * it ensures that the dirty keys will be written at some point in the future.
+ *
+ * When passing now = true, bch_btree_write() causes a write to happen
+ * "immediately" (if there was already a write in flight, it'll cause the write
+ * to happen as soon as the previous write completes). It returns immediately
+ * though - but it takes a refcount on the closure in struct btree_op you passed
+ * to it, so a closure_sync() later can be used to wait for the write to
+ * complete.
+ *
+ * This is handy because btree_split() and garbage collection can issue writes
+ * in parallel, reducing the amount of time they have to hold write locks.
+ *
+ * LOCKING:
+ *
+ * When traversing the btree, we may need write locks starting at some level -
+ * inserting a key into the btree will typically only require a write lock on
+ * the leaf node.
+ *
+ * This is specified with the lock field in struct btree_op; lock = 0 means we
+ * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
+ * checks this field and returns the node with the appropriate lock held.
+ *
+ * If, after traversing the btree, the insertion code discovers it has to split
+ * then it must restart from the root and take new locks - to do this it changes
+ * the lock field and returns -EINTR, which causes the btree_root() macro to
+ * loop.
+ *
+ * Handling cache misses require a different mechanism for upgrading to a write
+ * lock. We do cache lookups with only a read lock held, but if we get a cache
+ * miss and we wish to insert this data into the cache, we have to insert a
+ * placeholder key to detect races - otherwise, we could race with a write and
+ * overwrite the data that was just written to the cache with stale data from
+ * the backing device.
+ *
+ * For this we use a sequence number that write locks and unlocks increment - to
+ * insert the check key it unlocks the btree node and then takes a write lock,
+ * and fails if the sequence number doesn't match.
+ */
+
+#include "bset.h"
+#include "debug.h"
+
+struct btree_write {
+	atomic_t		*journal;
+
+	/* If btree_split() frees a btree node, it writes a new pointer to that
+	 * btree node indicating it was freed; it takes a refcount on
+	 * c->prio_blocked because we can't write the gens until the new
+	 * pointer is on disk. This allows btree_write_endio() to release the
+	 * refcount that btree_split() took.
+	 */
+	int			prio_blocked;
+};
+
+struct btree {
+	/* Hottest entries first */
+	struct hlist_node	hash;
+
+	/* Key/pointer for this btree node */
+	BKEY_PADDED(key);
+
+	/* Single bit - set when accessed, cleared by shrinker */
+	unsigned long		accessed;
+	unsigned long		seq;
+	struct rw_semaphore	lock;
+	struct cache_set	*c;
+	struct btree		*parent;
+
+	struct mutex		write_lock;
+
+	unsigned long		flags;
+	uint16_t		written;	/* would be nice to kill */
+	uint8_t			level;
+
+	struct btree_keys	keys;
+
+	/* For outstanding btree writes, used as a lock - protects write_idx */
+	struct closure		io;
+	struct semaphore	io_mutex;
+
+	struct list_head	list;
+	struct delayed_work	work;
+
+	struct btree_write	writes[2];
+	struct bio		*bio;
+};
+
+#define BTREE_FLAG(flag)						\
+static inline bool btree_node_ ## flag(struct btree *b)			\
+{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
+									\
+static inline void set_btree_node_ ## flag(struct btree *b)		\
+{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
+
+enum btree_flags {
+	BTREE_NODE_io_error,
+	BTREE_NODE_dirty,
+	BTREE_NODE_write_idx,
+};
+
+BTREE_FLAG(io_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(write_idx);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+	return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+	return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+	return b->keys.set->data;
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+	return bset_tree_last(&b->keys)->data;
+}
+
+static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
+{
+	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
+}
+
+static inline void set_gc_sectors(struct cache_set *c)
+{
+	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
+}
+
+void bkey_put(struct cache_set *c, struct bkey *k);
+
+/* Looping macros */
+
+#define for_each_cached_btree(b, c, iter)				\
+	for (iter = 0;							\
+	     iter < ARRAY_SIZE((c)->bucket_hash);			\
+	     iter++)							\
+		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
+
+/* Recursing down the btree */
+
+struct btree_op {
+	/* for waiting on btree reserve in btree_split() */
+	wait_queue_t		wait;
+
+	/* Btree level at which we start taking write locks */
+	short			lock;
+
+	unsigned		insert_collision:1;
+};
+
+static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
+{
+	memset(op, 0, sizeof(struct btree_op));
+	init_wait(&op->wait);
+	op->lock = write_lock_level;
+}
+
+static inline void rw_lock(bool w, struct btree *b, int level)
+{
+	w ? down_write_nested(&b->lock, level + 1)
+	  : down_read_nested(&b->lock, level + 1);
+	if (w)
+		b->seq++;
+}
+
+static inline void rw_unlock(bool w, struct btree *b)
+{
+	if (w)
+		b->seq++;
+	(w ? up_write : up_read)(&b->lock);
+}
+
+void bch_btree_node_read_done(struct btree *);
+void __bch_btree_node_write(struct btree *, struct closure *);
+void bch_btree_node_write(struct btree *, struct closure *);
+
+void bch_btree_set_root(struct btree *);
+struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+				 struct bkey *, int, bool);
+
+int bch_btree_insert_check_key(struct btree *, struct btree_op *,
+			       struct bkey *);
+int bch_btree_insert(struct cache_set *, struct keylist *,
+		     atomic_t *, struct bkey *);
+
+int bch_gc_thread_start(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
+void bch_moving_gc(struct cache_set *);
+int bch_btree_check(struct cache_set *);
+void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
+
+static inline void wake_up_gc(struct cache_set *c)
+{
+	if (c->gc_thread)
+		wake_up_process(c->gc_thread);
+}
+
+#define MAP_DONE	0
+#define MAP_CONTINUE	1
+
+#define MAP_ALL_NODES	0
+#define MAP_LEAF_NODES	1
+
+#define MAP_END_KEY	1
+
+typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
+int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
+			  struct bkey *, btree_map_nodes_fn *, int);
+
+static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+				      struct bkey *from, btree_map_nodes_fn *fn)
+{
+	return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
+}
+
+static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
+					   struct cache_set *c,
+					   struct bkey *from,
+					   btree_map_nodes_fn *fn)
+{
+	return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
+}
+
+typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
+				struct bkey *);
+int bch_btree_map_keys(struct btree_op *, struct cache_set *,
+		       struct bkey *, btree_map_keys_fn *, int);
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
+
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *,
+		       struct bkey *, keybuf_pred_fn *);
+bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
+				  struct bkey *);
+void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
+struct keybuf_key *bch_keybuf_next(struct keybuf *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
+					  struct bkey *, keybuf_pred_fn *);
+
+#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
new file mode 100644
index 00000000000..7a228de95fd
--- /dev/null
+++ b/drivers/md/bcache/closure.c
@@ -0,0 +1,222 @@
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include "closure.h"
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+	int r = flags & CLOSURE_REMAINING_MASK;
+
+	BUG_ON(flags & CLOSURE_GUARD_MASK);
+	BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+
+	/* Must deliver precisely one wakeup */
+	if (r == 1 && (flags & CLOSURE_SLEEPING))
+		wake_up_process(cl->task);
+
+	if (!r) {
+		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+			atomic_set(&cl->remaining,
+				   CLOSURE_REMAINING_INITIALIZER);
+			closure_queue(cl);
+		} else {
+			struct closure *parent = cl->parent;
+			closure_fn *destructor = cl->fn;
+
+			closure_debug_destroy(cl);
+
+			if (destructor)
+				destructor(cl);
+
+			if (parent)
+				closure_put(parent);
+		}
+	}
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL(closure_sub);
+
+/**
+ * closure_put - decrement a closure's refcount
+ */
+void closure_put(struct closure *cl)
+{
+	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL(closure_put);
+
+/**
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+	struct llist_node *list;
+	struct closure *cl;
+	struct llist_node *reverse = NULL;
+
+	list = llist_del_all(&wait_list->list);
+
+	/* We first reverse the list to preserve FIFO ordering and fairness */
+
+	while (list) {
+		struct llist_node *t = list;
+		list = llist_next(list);
+
+		t->next = reverse;
+		reverse = t;
+	}
+
+	/* Then do the wakeups */
+
+	while (reverse) {
+		cl = container_of(reverse, struct closure, list);
+		reverse = llist_next(reverse);
+
+		closure_set_waiting(cl, 0);
+		closure_sub(cl, CLOSURE_WAITING + 1);
+	}
+}
+EXPORT_SYMBOL(__closure_wake_up);
+
+/**
+ * closure_wait - add a closure to a waitlist
+ *
+ * @waitlist will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+		return false;
+
+	closure_set_waiting(cl, _RET_IP_);
+	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+	llist_add(&cl->list, &waitlist->list);
+
+	return true;
+}
+EXPORT_SYMBOL(closure_wait);
+
+/**
+ * closure_sync - sleep until a closure a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+void closure_sync(struct closure *cl)
+{
+	while (1) {
+		__closure_start_sleep(cl);
+		closure_set_ret_ip(cl);
+
+		if ((atomic_read(&cl->remaining) &
+		     CLOSURE_REMAINING_MASK) == 1)
+			break;
+
+		schedule();
+	}
+
+	__closure_end_sleep(cl);
+}
+EXPORT_SYMBOL(closure_sync);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+	unsigned long flags;
+
+	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+	cl->magic = CLOSURE_MAGIC_ALIVE;
+
+	spin_lock_irqsave(&closure_list_lock, flags);
+	list_add(&cl->all, &closure_list);
+	spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+	unsigned long flags;
+
+	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+	cl->magic = CLOSURE_MAGIC_DEAD;
+
+	spin_lock_irqsave(&closure_list_lock, flags);
+	list_del(&cl->all);
+	spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+	struct closure *cl;
+	spin_lock_irq(&closure_list_lock);
+
+	list_for_each_entry(cl, &closure_list, all) {
+		int r = atomic_read(&cl->remaining);
+
+		seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+			   cl, (void *) cl->ip, cl->fn, cl->parent,
+			   r & CLOSURE_REMAINING_MASK);
+
+		seq_printf(f, "%s%s%s%s\n",
+			   test_bit(WORK_STRUCT_PENDING,
+				    work_data_bits(&cl->work)) ? "Q" : "",
+			   r & CLOSURE_RUNNING	? "R" : "",
+			   r & CLOSURE_STACK	? "S" : "",
+			   r & CLOSURE_SLEEPING	? "Sl" : "");
+
+		if (r & CLOSURE_WAITING)
+			seq_printf(f, " W %pF\n",
+				   (void *) cl->waiting_on);
+
+		seq_printf(f, "\n");
+	}
+
+	spin_unlock_irq(&closure_list_lock);
+	return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= debug_seq_open,
+	.read		= seq_read,
+	.release	= single_release
+};
+
+void __init closure_debug_init(void)
+{
+	debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+}
+
+#endif
+
+MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
new file mode 100644
index 00000000000..a08e3eeac3c
--- /dev/null
+++ b/drivers/md/bcache/closure.h
@@ -0,0 +1,386 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ *   continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, is a macro that returns the calling function.
+ * There's good reason for this.
+ *
+ * To use safely closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio, int error)
+ * {
+ *	closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+	struct llist_head	list;
+};
+
+enum closure_state {
+	/*
+	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+	 * the thread that owns the closure, and cleared by the thread that's
+	 * waking up the closure.
+	 *
+	 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
+	 * - indicates that cl->task is valid and closure_put() may wake it up.
+	 * Only set or cleared by the thread that owns the closure.
+	 *
+	 * The rest are for debugging and don't affect behaviour:
+	 *
+	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+	 * closure_init() and when closure_put() runs then next function), and
+	 * must be cleared before remaining hits 0. Primarily to help guard
+	 * against incorrect usage and accidentally transferring references.
+	 * continue_at() and closure_return() clear it for you, if you're doing
+	 * something unusual you can use closure_set_dead() which also helps
+	 * annotate where references are being transferred.
+	 *
+	 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
+	 * closure with this flag set
+	 */
+
+	CLOSURE_BITS_START	= (1 << 23),
+	CLOSURE_DESTRUCTOR	= (1 << 23),
+	CLOSURE_WAITING		= (1 << 25),
+	CLOSURE_SLEEPING	= (1 << 27),
+	CLOSURE_RUNNING		= (1 << 29),
+	CLOSURE_STACK		= (1 << 31),
+};
+
+#define CLOSURE_GUARD_MASK					\
+	((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING|	\
+	  CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+
+#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
+
+struct closure {
+	union {
+		struct {
+			struct workqueue_struct *wq;
+			struct task_struct	*task;
+			struct llist_node	list;
+			closure_fn		*fn;
+		};
+		struct work_struct	work;
+	};
+
+	struct closure		*parent;
+
+	atomic_t		remaining;
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#define CLOSURE_MAGIC_DEAD	0xc054dead
+#define CLOSURE_MAGIC_ALIVE	0xc054a11e
+
+	unsigned		magic;
+	struct list_head	all;
+	unsigned long		ip;
+	unsigned long		waiting_on;
+#endif
+};
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void closure_sync(struct closure *cl);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+void closure_debug_init(void);
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_init(void) {}
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+	cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+	cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+	cl->waiting_on = f;
+#endif
+}
+
+static inline void __closure_end_sleep(struct closure *cl)
+{
+	__set_current_state(TASK_RUNNING);
+
+	if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
+		atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void __closure_start_sleep(struct closure *cl)
+{
+	closure_set_ip(cl);
+	cl->task = current;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+
+	if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+		atomic_add(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+				  struct workqueue_struct *wq)
+{
+	BUG_ON(object_is_on_stack(cl));
+	closure_set_ip(cl);
+	cl->fn = fn;
+	cl->wq = wq;
+	/* between atomic_dec() in closure_put() */
+	smp_mb__before_atomic();
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+	struct workqueue_struct *wq = cl->wq;
+	if (wq) {
+		INIT_WORK(&cl->work, cl->work.func);
+		BUG_ON(!queue_work(wq, &cl->work));
+	} else
+		cl->fn(cl);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+	BUG_ON((atomic_inc_return(&cl->remaining) &
+		CLOSURE_REMAINING_MASK) <= 1);
+#else
+	atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl:		closure to initialize
+ * @parent:	parent of the new closure. cl will take a refcount on it for its
+ *		lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+	memset(cl, 0, sizeof(struct closure));
+	cl->parent = parent;
+	if (parent)
+		closure_get(parent);
+
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+	closure_debug_create(cl);
+	closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+	memset(cl, 0, sizeof(struct closure));
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+	smp_mb();
+	__closure_wake_up(list);
+}
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * NOTE: This macro expands to a return in the calling function!
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ */
+#define continue_at(_cl, _fn, _wq)					\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
+	return;								\
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl)	continue_at((_cl), NULL, NULL)
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * NOTE: like continue_at(), this macro expands to a return in the caller!
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq)				\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_queue(_cl);						\
+	return;								\
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure, with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor)		\
+do {									\
+	set_closure_fn(_cl, _destructor, NULL);				\
+	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
+	return;								\
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+				struct workqueue_struct *wq,
+				struct closure *parent)
+{
+	closure_init(cl, parent);
+	continue_at_nobarrier(cl, fn, wq);
+}
+
+#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
new file mode 100644
index 00000000000..8b1f1d5c181
--- /dev/null
+++ b/drivers/md/bcache/debug.c
@@ -0,0 +1,252 @@
+/*
+ * Assorted bcache debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *debug;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define for_each_written_bset(b, start, i)				\
+	for (i = (start);						\
+	     (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
+	     i->seq == (start)->seq;					\
+	     i = (void *) i + set_blocks(i, block_bytes(b->c)) *	\
+		 block_bytes(b->c))
+
+void bch_btree_verify(struct btree *b)
+{
+	struct btree *v = b->c->verify_data;
+	struct bset *ondisk, *sorted, *inmemory;
+	struct bio *bio;
+
+	if (!b->c->verify || !b->c->verify_ondisk)
+		return;
+
+	down(&b->io_mutex);
+	mutex_lock(&b->c->verify_lock);
+
+	ondisk = b->c->verify_ondisk;
+	sorted = b->c->verify_data->keys.set->data;
+	inmemory = b->keys.set->data;
+
+	bkey_copy(&v->key, &b->key);
+	v->written = 0;
+	v->level = b->level;
+	v->keys.ops = b->keys.ops;
+
+	bio = bch_bbio_alloc(b->c);
+	bio->bi_bdev		= PTR_CACHE(b->c, &b->key, 0)->bdev;
+	bio->bi_iter.bi_sector	= PTR_OFFSET(&b->key, 0);
+	bio->bi_iter.bi_size	= KEY_SIZE(&v->key) << 9;
+	bch_bio_map(bio, sorted);
+
+	submit_bio_wait(REQ_META|READ_SYNC, bio);
+	bch_bbio_free(bio, b->c);
+
+	memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
+
+	bch_btree_node_read_done(v);
+	sorted = v->keys.set->data;
+
+	if (inmemory->keys != sorted->keys ||
+	    memcmp(inmemory->start,
+		   sorted->start,
+		   (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+		struct bset *i;
+		unsigned j;
+
+		console_lock();
+
+		printk(KERN_ERR "*** in memory:\n");
+		bch_dump_bset(&b->keys, inmemory, 0);
+
+		printk(KERN_ERR "*** read back in:\n");
+		bch_dump_bset(&v->keys, sorted, 0);
+
+		for_each_written_bset(b, ondisk, i) {
+			unsigned block = ((void *) i - (void *) ondisk) /
+				block_bytes(b->c);
+
+			printk(KERN_ERR "*** on disk block %u:\n", block);
+			bch_dump_bset(&b->keys, i, block);
+		}
+
+		printk(KERN_ERR "*** block %zu not written\n",
+		       ((void *) i - (void *) ondisk) / block_bytes(b->c));
+
+		for (j = 0; j < inmemory->keys; j++)
+			if (inmemory->d[j] != sorted->d[j])
+				break;
+
+		printk(KERN_ERR "b->written %u\n", b->written);
+
+		console_unlock();
+		panic("verify failed at %u\n", j);
+	}
+
+	mutex_unlock(&b->c->verify_lock);
+	up(&b->io_mutex);
+}
+
+void bch_data_verify(struct cached_dev *dc, struct bio *bio)
+{
+	char name[BDEVNAME_SIZE];
+	struct bio *check;
+	struct bio_vec bv, *bv2;
+	struct bvec_iter iter;
+	int i;
+
+	check = bio_clone(bio, GFP_NOIO);
+	if (!check)
+		return;
+
+	if (bio_alloc_pages(check, GFP_NOIO))
+		goto out_put;
+
+	submit_bio_wait(READ_SYNC, check);
+
+	bio_for_each_segment(bv, bio, iter) {
+		void *p1 = kmap_atomic(bv.bv_page);
+		void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
+
+		cache_set_err_on(memcmp(p1 + bv.bv_offset,
+					p2 + bv.bv_offset,
+					bv.bv_len),
+				 dc->disk.c,
+				 "verify failed at dev %s sector %llu",
+				 bdevname(dc->bdev, name),
+				 (uint64_t) bio->bi_iter.bi_sector);
+
+		kunmap_atomic(p1);
+	}
+
+	bio_for_each_segment_all(bv2, check, i)
+		__free_page(bv2->bv_page);
+out_put:
+	bio_put(check);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: cache set refcounting */
+
+struct dump_iterator {
+	char			buf[PAGE_SIZE];
+	size_t			bytes;
+	struct cache_set	*c;
+	struct keybuf		keys;
+};
+
+static bool dump_pred(struct keybuf *buf, struct bkey *k)
+{
+	return true;
+}
+
+static ssize_t bch_dump_read(struct file *file, char __user *buf,
+			     size_t size, loff_t *ppos)
+{
+	struct dump_iterator *i = file->private_data;
+	ssize_t ret = 0;
+	char kbuf[80];
+
+	while (size) {
+		struct keybuf_key *w;
+		unsigned bytes = min(i->bytes, size);
+
+		int err = copy_to_user(buf, i->buf, bytes);
+		if (err)
+			return err;
+
+		ret	 += bytes;
+		buf	 += bytes;
+		size	 -= bytes;
+		i->bytes -= bytes;
+		memmove(i->buf, i->buf + bytes, i->bytes);
+
+		if (i->bytes)
+			break;
+
+		w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
+		if (!w)
+			break;
+
+		bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
+		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
+		bch_keybuf_del(&i->keys, w);
+	}
+
+	return ret;
+}
+
+static int bch_dump_open(struct inode *inode, struct file *file)
+{
+	struct cache_set *c = inode->i_private;
+	struct dump_iterator *i;
+
+	i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
+	if (!i)
+		return -ENOMEM;
+
+	file->private_data = i;
+	i->c = c;
+	bch_keybuf_init(&i->keys);
+	i->keys.last_scanned = KEY(0, 0, 0);
+
+	return 0;
+}
+
+static int bch_dump_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static const struct file_operations cache_set_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch_dump_open,
+	.read		= bch_dump_read,
+	.release	= bch_dump_release
+};
+
+void bch_debug_init_cache_set(struct cache_set *c)
+{
+	if (!IS_ERR_OR_NULL(debug)) {
+		char name[50];
+		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
+
+		c->debug = debugfs_create_file(name, 0400, debug, c,
+					       &cache_set_debug_ops);
+	}
+}
+
+#endif
+
+void bch_debug_exit(void)
+{
+	if (!IS_ERR_OR_NULL(debug))
+		debugfs_remove_recursive(debug);
+}
+
+int __init bch_debug_init(struct kobject *kobj)
+{
+	int ret = 0;
+
+	debug = debugfs_create_dir("bcache", NULL);
+	return ret;
+}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
new file mode 100644
index 00000000000..1f63c195d24
--- /dev/null
+++ b/drivers/md/bcache/debug.h
@@ -0,0 +1,34 @@
+#ifndef _BCACHE_DEBUG_H
+#define _BCACHE_DEBUG_H
+
+struct bio;
+struct cached_dev;
+struct cache_set;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_btree_verify(struct btree *);
+void bch_data_verify(struct cached_dev *, struct bio *);
+
+#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
+#define key_merging_disabled(c)		((c)->key_merging_disabled)
+#define bypass_torture_test(d)		((d)->bypass_torture_test)
+
+#else /* DEBUG */
+
+static inline void bch_btree_verify(struct btree *b) {}
+static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
+
+#define expensive_debug_checks(c)	0
+#define key_merging_disabled(c)		0
+#define bypass_torture_test(d)		0
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+void bch_debug_init_cache_set(struct cache_set *);
+#else
+static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+#endif
+
+#endif
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 00000000000..3a0de4cf977
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+#include "writeback.h"
+
+static void sort_key_next(struct btree_iter *iter,
+			  struct btree_iter_set *i)
+{
+	i->k = bkey_next(i->k);
+
+	if (i->k == i->end)
+		*i = iter->data[--iter->used];
+}
+
+static bool bch_key_sort_cmp(struct btree_iter_set l,
+			     struct btree_iter_set r)
+{
+	int64_t c = bkey_cmp(l.k, r.k);
+
+	return c ? c > 0 : l.k < r.k;
+}
+
+static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i)) {
+			struct cache *ca = PTR_CACHE(c, k, i);
+			size_t bucket = PTR_BUCKET_NR(c, k, i);
+			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+			if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+			    bucket <  ca->sb.first_bucket ||
+			    bucket >= ca->sb.nbuckets)
+				return true;
+		}
+
+	return false;
+}
+
+/* Common among btree and extent ptrs */
+
+static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i)) {
+			struct cache *ca = PTR_CACHE(c, k, i);
+			size_t bucket = PTR_BUCKET_NR(c, k, i);
+			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+			if (KEY_SIZE(k) + r > c->sb.bucket_size)
+				return "bad, length too big";
+			if (bucket <  ca->sb.first_bucket)
+				return "bad, short offset";
+			if (bucket >= ca->sb.nbuckets)
+				return "bad, offset past end of device";
+			if (ptr_stale(c, k, i))
+				return "stale";
+		}
+
+	if (!bkey_cmp(k, &ZERO_KEY))
+		return "bad, null key";
+	if (!KEY_PTRS(k))
+		return "bad, no pointers";
+	if (!KEY_SIZE(k))
+		return "zeroed key";
+	return "";
+}
+
+void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
+{
+	unsigned i = 0;
+	char *out = buf, *end = buf + size;
+
+#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))
+
+	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		if (i)
+			p(", ");
+
+		if (PTR_DEV(k, i) == PTR_CHECK_DEV)
+			p("check dev");
+		else
+			p("%llu:%llu gen %llu", PTR_DEV(k, i),
+			  PTR_OFFSET(k, i), PTR_GEN(k, i));
+	}
+
+	p("]");
+
+	if (KEY_DIRTY(k))
+		p(" dirty");
+	if (KEY_CSUM(k))
+		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
+#undef p
+}
+
+static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
+{
+	struct btree *b = container_of(keys, struct btree, keys);
+	unsigned j;
+	char buf[80];
+
+	bch_extent_to_text(buf, sizeof(buf), k);
+	printk(" %s", buf);
+
+	for (j = 0; j < KEY_PTRS(k); j++) {
+		size_t n = PTR_BUCKET_NR(b->c, k, j);
+		printk(" bucket %zu", n);
+
+		if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+			printk(" prio %i",
+			       PTR_BUCKET(b->c, k, j)->prio);
+	}
+
+	printk(" %s\n", bch_ptr_status(b->c, k));
+}
+
+/* Btree ptrs */
+
+bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+	char buf[80];
+
+	if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
+		goto bad;
+
+	if (__ptr_invalid(c, k))
+		goto bad;
+
+	return false;
+bad:
+	bch_extent_to_text(buf, sizeof(buf), k);
+	cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
+	return true;
+}
+
+static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	return __bch_btree_ptr_invalid(b->c, k);
+}
+
+static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
+{
+	unsigned i;
+	char buf[80];
+	struct bucket *g;
+
+	if (mutex_trylock(&b->c->bucket_lock)) {
+		for (i = 0; i < KEY_PTRS(k); i++)
+			if (ptr_available(b->c, k, i)) {
+				g = PTR_BUCKET(b->c, k, i);
+
+				if (KEY_DIRTY(k) ||
+				    g->prio != BTREE_PRIO ||
+				    (b->c->gc_mark_valid &&
+				     GC_MARK(g) != GC_MARK_METADATA))
+					goto err;
+			}
+
+		mutex_unlock(&b->c->bucket_lock);
+	}
+
+	return false;
+err:
+	mutex_unlock(&b->c->bucket_lock);
+	bch_extent_to_text(buf, sizeof(buf), k);
+	btree_bug(b,
+"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
+		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+		  g->prio, g->gen, g->last_gc, GC_MARK(g));
+	return true;
+}
+
+static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	unsigned i;
+
+	if (!bkey_cmp(k, &ZERO_KEY) ||
+	    !KEY_PTRS(k) ||
+	    bch_ptr_invalid(bk, k))
+		return true;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (!ptr_available(b->c, k, i) ||
+		    ptr_stale(b->c, k, i))
+			return true;
+
+	if (expensive_debug_checks(b->c) &&
+	    btree_ptr_bad_expensive(b, k))
+		return true;
+
+	return false;
+}
+
+static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
+				       struct bkey *insert,
+				       struct btree_iter *iter,
+				       struct bkey *replace_key)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+
+	if (!KEY_OFFSET(insert))
+		btree_current_write(b)->prio_blocked++;
+
+	return false;
+}
+
+const struct btree_keys_ops bch_btree_keys_ops = {
+	.sort_cmp	= bch_key_sort_cmp,
+	.insert_fixup	= bch_btree_ptr_insert_fixup,
+	.key_invalid	= bch_btree_ptr_invalid,
+	.key_bad	= bch_btree_ptr_bad,
+	.key_to_text	= bch_extent_to_text,
+	.key_dump	= bch_bkey_dump,
+};
+
+/* Extents */
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+static bool bch_extent_sort_cmp(struct btree_iter_set l,
+				struct btree_iter_set r)
+{
+	int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+	return c ? c > 0 : l.k < r.k;
+}
+
+static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
+					  struct bkey *tmp)
+{
+	while (iter->used > 1) {
+		struct btree_iter_set *top = iter->data, *i = top + 1;
+
+		if (iter->used > 2 &&
+		    bch_extent_sort_cmp(i[0], i[1]))
+			i++;
+
+		if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
+			break;
+
+		if (!KEY_SIZE(i->k)) {
+			sort_key_next(iter, i);
+			heap_sift(iter, i - top, bch_extent_sort_cmp);
+			continue;
+		}
+
+		if (top->k > i->k) {
+			if (bkey_cmp(top->k, i->k) >= 0)
+				sort_key_next(iter, i);
+			else
+				bch_cut_front(top->k, i->k);
+
+			heap_sift(iter, i - top, bch_extent_sort_cmp);
+		} else {
+			/* can't happen because of comparison func */
+			BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+
+			if (bkey_cmp(i->k, top->k) < 0) {
+				bkey_copy(tmp, top->k);
+
+				bch_cut_back(&START_KEY(i->k), tmp);
+				bch_cut_front(i->k, top->k);
+				heap_sift(iter, 0, bch_extent_sort_cmp);
+
+				return tmp;
+			} else {
+				bch_cut_back(&START_KEY(i->k), top->k);
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static void bch_subtract_dirty(struct bkey *k,
+			   struct cache_set *c,
+			   uint64_t offset,
+			   int sectors)
+{
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
+					     offset, -sectors);
+}
+
+static bool bch_extent_insert_fixup(struct btree_keys *b,
+				    struct bkey *insert,
+				    struct btree_iter *iter,
+				    struct bkey *replace_key)
+{
+	struct cache_set *c = container_of(b, struct btree, keys)->c;
+
+	uint64_t old_offset;
+	unsigned old_size, sectors_found = 0;
+
+	BUG_ON(!KEY_OFFSET(insert));
+	BUG_ON(!KEY_SIZE(insert));
+
+	while (1) {
+		struct bkey *k = bch_btree_iter_next(iter);
+		if (!k)
+			break;
+
+		if (bkey_cmp(&START_KEY(k), insert) >= 0) {
+			if (KEY_SIZE(k))
+				break;
+			else
+				continue;
+		}
+
+		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
+			continue;
+
+		old_offset = KEY_START(k);
+		old_size = KEY_SIZE(k);
+
+		/*
+		 * We might overlap with 0 size extents; we can't skip these
+		 * because if they're in the set we're inserting to we have to
+		 * adjust them so they don't overlap with the key we're
+		 * inserting. But we don't want to check them for replace
+		 * operations.
+		 */
+
+		if (replace_key && KEY_SIZE(k)) {
+			/*
+			 * k might have been split since we inserted/found the
+			 * key we're replacing
+			 */
+			unsigned i;
+			uint64_t offset = KEY_START(k) -
+				KEY_START(replace_key);
+
+			/* But it must be a subset of the replace key */
+			if (KEY_START(k) < KEY_START(replace_key) ||
+			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
+				goto check_failed;
+
+			/* We didn't find a key that we were supposed to */
+			if (KEY_START(k) > KEY_START(insert) + sectors_found)
+				goto check_failed;
+
+			if (!bch_bkey_equal_header(k, replace_key))
+				goto check_failed;
+
+			/* skip past gen */
+			offset <<= 8;
+
+			BUG_ON(!KEY_PTRS(replace_key));
+
+			for (i = 0; i < KEY_PTRS(replace_key); i++)
+				if (k->ptr[i] != replace_key->ptr[i] + offset)
+					goto check_failed;
+
+			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
+		}
+
+		if (bkey_cmp(insert, k) < 0 &&
+		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
+			/*
+			 * We overlapped in the middle of an existing key: that
+			 * means we have to split the old key. But we have to do
+			 * slightly different things depending on whether the
+			 * old key has been written out yet.
+			 */
+
+			struct bkey *top;
+
+			bch_subtract_dirty(k, c, KEY_START(insert),
+				       KEY_SIZE(insert));
+
+			if (bkey_written(b, k)) {
+				/*
+				 * We insert a new key to cover the top of the
+				 * old key, and the old key is modified in place
+				 * to represent the bottom split.
+				 *
+				 * It's completely arbitrary whether the new key
+				 * is the top or the bottom, but it has to match
+				 * up with what btree_sort_fixup() does - it
+				 * doesn't check for this kind of overlap, it
+				 * depends on us inserting a new key for the top
+				 * here.
+				 */
+				top = bch_bset_search(b, bset_tree_last(b),
+						      insert);
+				bch_bset_insert(b, top, k);
+			} else {
+				BKEY_PADDED(key) temp;
+				bkey_copy(&temp.key, k);
+				bch_bset_insert(b, k, &temp.key);
+				top = bkey_next(k);
+			}
+
+			bch_cut_front(insert, top);
+			bch_cut_back(&START_KEY(insert), k);
+			bch_bset_fix_invalidated_key(b, k);
+			goto out;
+		}
+
+		if (bkey_cmp(insert, k) < 0) {
+			bch_cut_front(insert, k);
+		} else {
+			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
+				old_offset = KEY_START(insert);
+
+			if (bkey_written(b, k) &&
+			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
+				/*
+				 * Completely overwrote, so we don't have to
+				 * invalidate the binary search tree
+				 */
+				bch_cut_front(k, k);
+			} else {
+				__bch_cut_back(&START_KEY(insert), k);
+				bch_bset_fix_invalidated_key(b, k);
+			}
+		}
+
+		bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
+	}
+
+check_failed:
+	if (replace_key) {
+		if (!sectors_found) {
+			return true;
+		} else if (sectors_found < KEY_SIZE(insert)) {
+			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
+				       (KEY_SIZE(insert) - sectors_found));
+			SET_KEY_SIZE(insert, sectors_found);
+		}
+	}
+out:
+	if (KEY_DIRTY(insert))
+		bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
+					     KEY_START(insert),
+					     KEY_SIZE(insert));
+
+	return false;
+}
+
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	char buf[80];
+
+	if (!KEY_SIZE(k))
+		return true;
+
+	if (KEY_SIZE(k) > KEY_OFFSET(k))
+		goto bad;
+
+	if (__ptr_invalid(b->c, k))
+		goto bad;
+
+	return false;
+bad:
+	bch_extent_to_text(buf, sizeof(buf), k);
+	cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+	return true;
+}
+
+static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
+				     unsigned ptr)
+{
+	struct bucket *g = PTR_BUCKET(b->c, k, ptr);
+	char buf[80];
+
+	if (mutex_trylock(&b->c->bucket_lock)) {
+		if (b->c->gc_mark_valid &&
+		    (!GC_MARK(g) ||
+		     GC_MARK(g) == GC_MARK_METADATA ||
+		     (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
+			goto err;
+
+		if (g->prio == BTREE_PRIO)
+			goto err;
+
+		mutex_unlock(&b->c->bucket_lock);
+	}
+
+	return false;
+err:
+	mutex_unlock(&b->c->bucket_lock);
+	bch_extent_to_text(buf, sizeof(buf), k);
+	btree_bug(b,
+"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
+		  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
+		  g->prio, g->gen, g->last_gc, GC_MARK(g));
+	return true;
+}
+
+static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	struct bucket *g;
+	unsigned i, stale;
+
+	if (!KEY_PTRS(k) ||
+	    bch_extent_invalid(bk, k))
+		return true;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (!ptr_available(b->c, k, i))
+			return true;
+
+	if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
+		return false;
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		g = PTR_BUCKET(b->c, k, i);
+		stale = ptr_stale(b->c, k, i);
+
+		btree_bug_on(stale > 96, b,
+			     "key too stale: %i, need_gc %u",
+			     stale, b->c->need_gc);
+
+		btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+			     b, "stale dirty pointer");
+
+		if (stale)
+			return true;
+
+		if (expensive_debug_checks(b->c) &&
+		    bch_extent_bad_expensive(b, k, i))
+			return true;
+	}
+
+	return false;
+}
+
+static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+{
+	return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
+		~((uint64_t)1 << 63);
+}
+
+static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	unsigned i;
+
+	if (key_merging_disabled(b->c))
+		return false;
+
+	for (i = 0; i < KEY_PTRS(l); i++)
+		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
+			return false;
+
+	/* Keys with no pointers aren't restricted to one bucket and could
+	 * overflow KEY_SIZE
+	 */
+	if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
+		SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
+		SET_KEY_SIZE(l, USHRT_MAX);
+
+		bch_cut_front(l, r);
+		return false;
+	}
+
+	if (KEY_CSUM(l)) {
+		if (KEY_CSUM(r))
+			l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
+		else
+			SET_KEY_CSUM(l, 0);
+	}
+
+	SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
+	SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+
+	return true;
+}
+
+const struct btree_keys_ops bch_extent_keys_ops = {
+	.sort_cmp	= bch_extent_sort_cmp,
+	.sort_fixup	= bch_extent_sort_fixup,
+	.insert_fixup	= bch_extent_insert_fixup,
+	.key_invalid	= bch_extent_invalid,
+	.key_bad	= bch_extent_bad,
+	.key_merge	= bch_extent_merge,
+	.key_to_text	= bch_extent_to_text,
+	.key_dump	= bch_bkey_dump,
+	.is_extents	= true,
+};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 00000000000..e4e23409782
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@
+#ifndef _BCACHE_EXTENTS_H
+#define _BCACHE_EXTENTS_H
+
+extern const struct btree_keys_ops bch_btree_keys_ops;
+extern const struct btree_keys_ops bch_extent_keys_ops;
+
+struct bkey;
+struct cache_set;
+
+void bch_extent_to_text(char *, size_t, const struct bkey *);
+bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+
+#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
new file mode 100644
index 00000000000..fa028fa82df
--- /dev/null
+++ b/drivers/md/bcache/io.c
@@ -0,0 +1,243 @@
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "bset.h"
+#include "debug.h"
+
+#include <linux/blkdev.h>
+
+static unsigned bch_bio_max_sectors(struct bio *bio)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	unsigned ret = 0, seg = 0;
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return min(bio_sectors(bio), q->limits.max_discard_sectors);
+
+	bio_for_each_segment(bv, bio, iter) {
+		struct bvec_merge_data bvm = {
+			.bi_bdev	= bio->bi_bdev,
+			.bi_sector	= bio->bi_iter.bi_sector,
+			.bi_size	= ret << 9,
+			.bi_rw		= bio->bi_rw,
+		};
+
+		if (seg == min_t(unsigned, BIO_MAX_PAGES,
+				 queue_max_segments(q)))
+			break;
+
+		if (q->merge_bvec_fn &&
+		    q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
+			break;
+
+		seg++;
+		ret += bv.bv_len >> 9;
+	}
+
+	ret = min(ret, queue_max_sectors(q));
+
+	WARN_ON(!ret);
+	ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9);
+
+	return ret;
+}
+
+static void bch_bio_submit_split_done(struct closure *cl)
+{
+	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+	s->bio->bi_end_io = s->bi_end_io;
+	s->bio->bi_private = s->bi_private;
+	bio_endio_nodec(s->bio, 0);
+
+	closure_debug_destroy(&s->cl);
+	mempool_free(s, s->p->bio_split_hook);
+}
+
+static void bch_bio_submit_split_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+	if (error)
+		clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
+
+	bio_put(bio);
+	closure_put(cl);
+}
+
+void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
+{
+	struct bio_split_hook *s;
+	struct bio *n;
+
+	if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
+		goto submit;
+
+	if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
+		goto submit;
+
+	s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
+	closure_init(&s->cl, NULL);
+
+	s->bio		= bio;
+	s->p		= p;
+	s->bi_end_io	= bio->bi_end_io;
+	s->bi_private	= bio->bi_private;
+	bio_get(bio);
+
+	do {
+		n = bio_next_split(bio, bch_bio_max_sectors(bio),
+				   GFP_NOIO, s->p->bio_split);
+
+		n->bi_end_io	= bch_bio_submit_split_endio;
+		n->bi_private	= &s->cl;
+
+		closure_get(&s->cl);
+		generic_make_request(n);
+	} while (n != bio);
+
+	continue_at(&s->cl, bch_bio_submit_split_done, NULL);
+submit:
+	generic_make_request(bio);
+}
+
+/* Bios with headers */
+
+void bch_bbio_free(struct bio *bio, struct cache_set *c)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+	mempool_free(b, c->bio_meta);
+}
+
+struct bio *bch_bbio_alloc(struct cache_set *c)
+{
+	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+	struct bio *bio = &b->bio;
+
+	bio_init(bio);
+	bio->bi_flags		|= BIO_POOL_NONE << BIO_POOL_OFFSET;
+	bio->bi_max_vecs	 = bucket_pages(c);
+	bio->bi_io_vec		 = bio->bi_inline_vecs;
+
+	return bio;
+}
+
+void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+
+	bio->bi_iter.bi_sector	= PTR_OFFSET(&b->key, 0);
+	bio->bi_bdev		= PTR_CACHE(c, &b->key, 0)->bdev;
+
+	b->submit_time_us = local_clock_us();
+	closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
+}
+
+void bch_submit_bbio(struct bio *bio, struct cache_set *c,
+		     struct bkey *k, unsigned ptr)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+	bch_bkey_copy_single_ptr(&b->key, k, ptr);
+	__bch_submit_bbio(bio, c);
+}
+
+/* IO errors */
+
+void bch_count_io_errors(struct cache *ca, int error, const char *m)
+{
+	/*
+	 * The halflife of an error is:
+	 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
+	 */
+
+	if (ca->set->error_decay) {
+		unsigned count = atomic_inc_return(&ca->io_count);
+
+		while (count > ca->set->error_decay) {
+			unsigned errors;
+			unsigned old = count;
+			unsigned new = count - ca->set->error_decay;
+
+			/*
+			 * First we subtract refresh from count; each time we
+			 * succesfully do so, we rescale the errors once:
+			 */
+
+			count = atomic_cmpxchg(&ca->io_count, old, new);
+
+			if (count == old) {
+				count = new;
+
+				errors = atomic_read(&ca->io_errors);
+				do {
+					old = errors;
+					new = ((uint64_t) errors * 127) / 128;
+					errors = atomic_cmpxchg(&ca->io_errors,
+								old, new);
+				} while (old != errors);
+			}
+		}
+	}
+
+	if (error) {
+		char buf[BDEVNAME_SIZE];
+		unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
+						    &ca->io_errors);
+		errors >>= IO_ERROR_SHIFT;
+
+		if (errors < ca->set->error_limit)
+			pr_err("%s: IO error on %s, recovering",
+			       bdevname(ca->bdev, buf), m);
+		else
+			bch_cache_set_error(ca->set,
+					    "%s: too many IO errors %s",
+					    bdevname(ca->bdev, buf), m);
+	}
+}
+
+void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
+			      int error, const char *m)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+	struct cache *ca = PTR_CACHE(c, &b->key, 0);
+
+	unsigned threshold = bio->bi_rw & REQ_WRITE
+		? c->congested_write_threshold_us
+		: c->congested_read_threshold_us;
+
+	if (threshold) {
+		unsigned t = local_clock_us();
+
+		int us = t - b->submit_time_us;
+		int congested = atomic_read(&c->congested);
+
+		if (us > (int) threshold) {
+			int ms = us / 1024;
+			c->congested_last_us = t;
+
+			ms = min(ms, CONGESTED_MAX + congested);
+			atomic_sub(ms, &c->congested);
+		} else if (congested < 0)
+			atomic_inc(&c->congested);
+	}
+
+	bch_count_io_errors(ca, error, m);
+}
+
+void bch_bbio_endio(struct cache_set *c, struct bio *bio,
+		    int error, const char *m)
+{
+	struct closure *cl = bio->bi_private;
+
+	bch_bbio_count_io_errors(c, bio, error, m);
+	bio_put(bio);
+	closure_put(cl);
+}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
new file mode 100644
index 00000000000..59e82021b5b
--- /dev/null
+++ b/drivers/md/bcache/journal.c
@@ -0,0 +1,815 @@
+/*
+ * bcache journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+#include <trace/events/bcache.h>
+
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
+
+static void journal_read_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	closure_put(cl);
+}
+
+static int journal_read_bucket(struct cache *ca, struct list_head *list,
+			       unsigned bucket_index)
+{
+	struct journal_device *ja = &ca->journal;
+	struct bio *bio = &ja->bio;
+
+	struct journal_replay *i;
+	struct jset *j, *data = ca->set->journal.w[0].data;
+	struct closure cl;
+	unsigned len, left, offset = 0;
+	int ret = 0;
+	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+	closure_init_stack(&cl);
+
+	pr_debug("reading %u", bucket_index);
+
+	while (offset < ca->sb.bucket_size) {
+reread:		left = ca->sb.bucket_size - offset;
+		len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
+
+		bio_reset(bio);
+		bio->bi_iter.bi_sector	= bucket + offset;
+		bio->bi_bdev	= ca->bdev;
+		bio->bi_rw	= READ;
+		bio->bi_iter.bi_size	= len << 9;
+
+		bio->bi_end_io	= journal_read_endio;
+		bio->bi_private = &cl;
+		bch_bio_map(bio, data);
+
+		closure_bio_submit(bio, &cl, ca);
+		closure_sync(&cl);
+
+		/* This function could be simpler now since we no longer write
+		 * journal entries that overlap bucket boundaries; this means
+		 * the start of a bucket will always have a valid journal entry
+		 * if it has any journal entries at all.
+		 */
+
+		j = data;
+		while (len) {
+			struct list_head *where;
+			size_t blocks, bytes = set_bytes(j);
+
+			if (j->magic != jset_magic(&ca->sb)) {
+				pr_debug("%u: bad magic", bucket_index);
+				return ret;
+			}
+
+			if (bytes > left << 9 ||
+			    bytes > PAGE_SIZE << JSET_BITS) {
+				pr_info("%u: too big, %zu bytes, offset %u",
+					bucket_index, bytes, offset);
+				return ret;
+			}
+
+			if (bytes > len << 9)
+				goto reread;
+
+			if (j->csum != csum_set(j)) {
+				pr_info("%u: bad csum, %zu bytes, offset %u",
+					bucket_index, bytes, offset);
+				return ret;
+			}
+
+			blocks = set_blocks(j, block_bytes(ca->set));
+
+			while (!list_empty(list)) {
+				i = list_first_entry(list,
+					struct journal_replay, list);
+				if (i->j.seq >= j->last_seq)
+					break;
+				list_del(&i->list);
+				kfree(i);
+			}
+
+			list_for_each_entry_reverse(i, list, list) {
+				if (j->seq == i->j.seq)
+					goto next_set;
+
+				if (j->seq < i->j.last_seq)
+					goto next_set;
+
+				if (j->seq > i->j.seq) {
+					where = &i->list;
+					goto add;
+				}
+			}
+
+			where = list;
+add:
+			i = kmalloc(offsetof(struct journal_replay, j) +
+				    bytes, GFP_KERNEL);
+			if (!i)
+				return -ENOMEM;
+			memcpy(&i->j, j, bytes);
+			list_add(&i->list, where);
+			ret = 1;
+
+			ja->seq[bucket_index] = j->seq;
+next_set:
+			offset	+= blocks * ca->sb.block_size;
+			len	-= blocks * ca->sb.block_size;
+			j = ((void *) j) + blocks * block_bytes(ca);
+		}
+	}
+
+	return ret;
+}
+
+int bch_journal_read(struct cache_set *c, struct list_head *list)
+{
+#define read_bucket(b)							\
+	({								\
+		int ret = journal_read_bucket(ca, list, b);		\
+		__set_bit(b, bitmap);					\
+		if (ret < 0)						\
+			return ret;					\
+		ret;							\
+	})
+
+	struct cache *ca;
+	unsigned iter;
+
+	for_each_cache(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+		unsigned i, l, r, m;
+		uint64_t seq;
+
+		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+		pr_debug("%u journal buckets", ca->sb.njournal_buckets);
+
+		/*
+		 * Read journal buckets ordered by golden ratio hash to quickly
+		 * find a sequence of buckets with valid journal entries
+		 */
+		for (i = 0; i < ca->sb.njournal_buckets; i++) {
+			l = (i * 2654435769U) % ca->sb.njournal_buckets;
+
+			if (test_bit(l, bitmap))
+				break;
+
+			if (read_bucket(l))
+				goto bsearch;
+		}
+
+		/*
+		 * If that fails, check all the buckets we haven't checked
+		 * already
+		 */
+		pr_debug("falling back to linear search");
+
+		for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
+		     l < ca->sb.njournal_buckets;
+		     l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
+			if (read_bucket(l))
+				goto bsearch;
+
+		if (list_empty(list))
+			continue;
+bsearch:
+		/* Binary search */
+		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+		pr_debug("starting binary search, l %u r %u", l, r);
+
+		while (l + 1 < r) {
+			seq = list_entry(list->prev, struct journal_replay,
+					 list)->j.seq;
+
+			m = (l + r) >> 1;
+			read_bucket(m);
+
+			if (seq != list_entry(list->prev, struct journal_replay,
+					      list)->j.seq)
+				l = m;
+			else
+				r = m;
+		}
+
+		/*
+		 * Read buckets in reverse order until we stop finding more
+		 * journal entries
+		 */
+		pr_debug("finishing up: m %u njournal_buckets %u",
+			 m, ca->sb.njournal_buckets);
+		l = m;
+
+		while (1) {
+			if (!l--)
+				l = ca->sb.njournal_buckets - 1;
+
+			if (l == m)
+				break;
+
+			if (test_bit(l, bitmap))
+				continue;
+
+			if (!read_bucket(l))
+				break;
+		}
+
+		seq = 0;
+
+		for (i = 0; i < ca->sb.njournal_buckets; i++)
+			if (ja->seq[i] > seq) {
+				seq = ja->seq[i];
+				/*
+				 * When journal_reclaim() goes to allocate for
+				 * the first time, it'll use the bucket after
+				 * ja->cur_idx
+				 */
+				ja->cur_idx = i;
+				ja->last_idx = ja->discard_idx = (i + 1) %
+					ca->sb.njournal_buckets;
+
+			}
+	}
+
+	if (!list_empty(list))
+		c->journal.seq = list_entry(list->prev,
+					    struct journal_replay,
+					    list)->j.seq;
+
+	return 0;
+#undef read_bucket
+}
+
+void bch_journal_mark(struct cache_set *c, struct list_head *list)
+{
+	atomic_t p = { 0 };
+	struct bkey *k;
+	struct journal_replay *i;
+	struct journal *j = &c->journal;
+	uint64_t last = j->seq;
+
+	/*
+	 * journal.pin should never fill up - we never write a journal
+	 * entry when it would fill up. But if for some reason it does, we
+	 * iterate over the list in reverse order so that we can just skip that
+	 * refcount instead of bugging.
+	 */
+
+	list_for_each_entry_reverse(i, list, list) {
+		BUG_ON(last < i->j.seq);
+		i->pin = NULL;
+
+		while (last-- != i->j.seq)
+			if (fifo_free(&j->pin) > 1) {
+				fifo_push_front(&j->pin, p);
+				atomic_set(&fifo_front(&j->pin), 0);
+			}
+
+		if (fifo_free(&j->pin) > 1) {
+			fifo_push_front(&j->pin, p);
+			i->pin = &fifo_front(&j->pin);
+			atomic_set(i->pin, 1);
+		}
+
+		for (k = i->j.start;
+		     k < bset_bkey_last(&i->j);
+		     k = bkey_next(k)) {
+			unsigned j;
+
+			for (j = 0; j < KEY_PTRS(k); j++)
+				if (ptr_available(c, k, j))
+					atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+
+			bch_initial_mark_key(c, 0, k);
+		}
+	}
+}
+
+int bch_journal_replay(struct cache_set *s, struct list_head *list)
+{
+	int ret = 0, keys = 0, entries = 0;
+	struct bkey *k;
+	struct journal_replay *i =
+		list_entry(list->prev, struct journal_replay, list);
+
+	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+	struct keylist keylist;
+
+	list_for_each_entry(i, list, list) {
+		BUG_ON(i->pin && atomic_read(i->pin) != 1);
+
+		cache_set_err_on(n != i->j.seq, s,
+"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				 n, i->j.seq - 1, start, end);
+
+		for (k = i->j.start;
+		     k < bset_bkey_last(&i->j);
+		     k = bkey_next(k)) {
+			trace_bcache_journal_replay_key(k);
+
+			bch_keylist_init_single(&keylist, k);
+
+			ret = bch_btree_insert(s, &keylist, i->pin, NULL);
+			if (ret)
+				goto err;
+
+			BUG_ON(!bch_keylist_empty(&keylist));
+			keys++;
+
+			cond_resched();
+		}
+
+		if (i->pin)
+			atomic_dec(i->pin);
+		n = i->j.seq + 1;
+		entries++;
+	}
+
+	pr_info("journal replay done, %i keys in %i entries, seq %llu",
+		keys, entries, end);
+err:
+	while (!list_empty(list)) {
+		i = list_first_entry(list, struct journal_replay, list);
+		list_del(&i->list);
+		kfree(i);
+	}
+
+	return ret;
+}
+
+/* Journalling */
+
+static void btree_flush_write(struct cache_set *c)
+{
+	/*
+	 * Try to find the btree node with that references the oldest journal
+	 * entry, best is our current candidate and is locked if non NULL:
+	 */
+	struct btree *b, *best;
+	unsigned i;
+retry:
+	best = NULL;
+
+	for_each_cached_btree(b, c, i)
+		if (btree_current_write(b)->journal) {
+			if (!best)
+				best = b;
+			else if (journal_pin_cmp(c,
+					btree_current_write(best)->journal,
+					btree_current_write(b)->journal)) {
+				best = b;
+			}
+		}
+
+	b = best;
+	if (b) {
+		mutex_lock(&b->write_lock);
+		if (!btree_current_write(b)->journal) {
+			mutex_unlock(&b->write_lock);
+			/* We raced */
+			goto retry;
+		}
+
+		__bch_btree_node_write(b, NULL);
+		mutex_unlock(&b->write_lock);
+	}
+}
+
+#define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
+
+static void journal_discard_endio(struct bio *bio, int error)
+{
+	struct journal_device *ja =
+		container_of(bio, struct journal_device, discard_bio);
+	struct cache *ca = container_of(ja, struct cache, journal);
+
+	atomic_set(&ja->discard_in_flight, DISCARD_DONE);
+
+	closure_wake_up(&ca->set->journal.wait);
+	closure_put(&ca->set->cl);
+}
+
+static void journal_discard_work(struct work_struct *work)
+{
+	struct journal_device *ja =
+		container_of(work, struct journal_device, discard_work);
+
+	submit_bio(0, &ja->discard_bio);
+}
+
+static void do_journal_discard(struct cache *ca)
+{
+	struct journal_device *ja = &ca->journal;
+	struct bio *bio = &ja->discard_bio;
+
+	if (!ca->discard) {
+		ja->discard_idx = ja->last_idx;
+		return;
+	}
+
+	switch (atomic_read(&ja->discard_in_flight)) {
+	case DISCARD_IN_FLIGHT:
+		return;
+
+	case DISCARD_DONE:
+		ja->discard_idx = (ja->discard_idx + 1) %
+			ca->sb.njournal_buckets;
+
+		atomic_set(&ja->discard_in_flight, DISCARD_READY);
+		/* fallthrough */
+
+	case DISCARD_READY:
+		if (ja->discard_idx == ja->last_idx)
+			return;
+
+		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
+
+		bio_init(bio);
+		bio->bi_iter.bi_sector	= bucket_to_sector(ca->set,
+						ca->sb.d[ja->discard_idx]);
+		bio->bi_bdev		= ca->bdev;
+		bio->bi_rw		= REQ_WRITE|REQ_DISCARD;
+		bio->bi_max_vecs	= 1;
+		bio->bi_io_vec		= bio->bi_inline_vecs;
+		bio->bi_iter.bi_size	= bucket_bytes(ca);
+		bio->bi_end_io		= journal_discard_endio;
+
+		closure_get(&ca->set->cl);
+		INIT_WORK(&ja->discard_work, journal_discard_work);
+		schedule_work(&ja->discard_work);
+	}
+}
+
+static void journal_reclaim(struct cache_set *c)
+{
+	struct bkey *k = &c->journal.key;
+	struct cache *ca;
+	uint64_t last_seq;
+	unsigned iter, n = 0;
+	atomic_t p;
+
+	while (!atomic_read(&fifo_front(&c->journal.pin)))
+		fifo_pop(&c->journal.pin, p);
+
+	last_seq = last_seq(&c->journal);
+
+	/* Update last_idx */
+
+	for_each_cache(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+
+		while (ja->last_idx != ja->cur_idx &&
+		       ja->seq[ja->last_idx] < last_seq)
+			ja->last_idx = (ja->last_idx + 1) %
+				ca->sb.njournal_buckets;
+	}
+
+	for_each_cache(ca, c, iter)
+		do_journal_discard(ca);
+
+	if (c->journal.blocks_free)
+		goto out;
+
+	/*
+	 * Allocate:
+	 * XXX: Sort by free journal space
+	 */
+
+	for_each_cache(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+
+		/* No space available on this device */
+		if (next == ja->discard_idx)
+			continue;
+
+		ja->cur_idx = next;
+		k->ptr[n++] = PTR(0,
+				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+				  ca->sb.nr_this_dev);
+	}
+
+	bkey_init(k);
+	SET_KEY_PTRS(k, n);
+
+	if (n)
+		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+out:
+	if (!journal_full(&c->journal))
+		__closure_wake_up(&c->journal.wait);
+}
+
+void bch_journal_next(struct journal *j)
+{
+	atomic_t p = { 1 };
+
+	j->cur = (j->cur == j->w)
+		? &j->w[1]
+		: &j->w[0];
+
+	/*
+	 * The fifo_push() needs to happen at the same time as j->seq is
+	 * incremented for last_seq() to be calculated correctly
+	 */
+	BUG_ON(!fifo_push(&j->pin, p));
+	atomic_set(&fifo_back(&j->pin), 1);
+
+	j->cur->data->seq	= ++j->seq;
+	j->cur->dirty		= false;
+	j->cur->need_write	= false;
+	j->cur->data->keys	= 0;
+
+	if (fifo_full(&j->pin))
+		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
+}
+
+static void journal_write_endio(struct bio *bio, int error)
+{
+	struct journal_write *w = bio->bi_private;
+
+	cache_set_err_on(error, w->c, "journal io error");
+	closure_put(&w->c->journal.io);
+}
+
+static void journal_write(struct closure *);
+
+static void journal_write_done(struct closure *cl)
+{
+	struct journal *j = container_of(cl, struct journal, io);
+	struct journal_write *w = (j->cur == j->w)
+		? &j->w[1]
+		: &j->w[0];
+
+	__closure_wake_up(&w->wait);
+	continue_at_nobarrier(cl, journal_write, system_wq);
+}
+
+static void journal_write_unlock(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+
+	c->journal.io_in_flight = 0;
+	spin_unlock(&c->journal.lock);
+}
+
+static void journal_write_unlocked(struct closure *cl)
+	__releases(c->journal.lock)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	struct cache *ca;
+	struct journal_write *w = c->journal.cur;
+	struct bkey *k = &c->journal.key;
+	unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+		c->sb.block_size;
+
+	struct bio *bio;
+	struct bio_list list;
+	bio_list_init(&list);
+
+	if (!w->need_write) {
+		closure_return_with_destructor(cl, journal_write_unlock);
+	} else if (journal_full(&c->journal)) {
+		journal_reclaim(c);
+		spin_unlock(&c->journal.lock);
+
+		btree_flush_write(c);
+		continue_at(cl, journal_write, system_wq);
+	}
+
+	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
+
+	w->data->btree_level = c->root->level;
+
+	bkey_copy(&w->data->btree_root, &c->root->key);
+	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+
+	for_each_cache(ca, c, i)
+		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+
+	w->data->magic		= jset_magic(&c->sb);
+	w->data->version	= BCACHE_JSET_VERSION;
+	w->data->last_seq	= last_seq(&c->journal);
+	w->data->csum		= csum_set(w->data);
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		ca = PTR_CACHE(c, k, i);
+		bio = &ca->journal.bio;
+
+		atomic_long_add(sectors, &ca->meta_sectors_written);
+
+		bio_reset(bio);
+		bio->bi_iter.bi_sector	= PTR_OFFSET(k, i);
+		bio->bi_bdev	= ca->bdev;
+		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
+		bio->bi_iter.bi_size = sectors << 9;
+
+		bio->bi_end_io	= journal_write_endio;
+		bio->bi_private = w;
+		bch_bio_map(bio, w->data);
+
+		trace_bcache_journal_write(bio);
+		bio_list_add(&list, bio);
+
+		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
+
+		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+	}
+
+	atomic_dec_bug(&fifo_back(&c->journal.pin));
+	bch_journal_next(&c->journal);
+	journal_reclaim(c);
+
+	spin_unlock(&c->journal.lock);
+
+	while ((bio = bio_list_pop(&list)))
+		closure_bio_submit(bio, cl, c->cache[0]);
+
+	continue_at(cl, journal_write_done, NULL);
+}
+
+static void journal_write(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+
+	spin_lock(&c->journal.lock);
+	journal_write_unlocked(cl);
+}
+
+static void journal_try_write(struct cache_set *c)
+	__releases(c->journal.lock)
+{
+	struct closure *cl = &c->journal.io;
+	struct journal_write *w = c->journal.cur;
+
+	w->need_write = true;
+
+	if (!c->journal.io_in_flight) {
+		c->journal.io_in_flight = 1;
+		closure_call(cl, journal_write_unlocked, NULL, &c->cl);
+	} else {
+		spin_unlock(&c->journal.lock);
+	}
+}
+
+static struct journal_write *journal_wait_for_write(struct cache_set *c,
+						    unsigned nkeys)
+{
+	size_t sectors;
+	struct closure cl;
+	bool wait = false;
+
+	closure_init_stack(&cl);
+
+	spin_lock(&c->journal.lock);
+
+	while (1) {
+		struct journal_write *w = c->journal.cur;
+
+		sectors = __set_blocks(w->data, w->data->keys + nkeys,
+				       block_bytes(c)) * c->sb.block_size;
+
+		if (sectors <= min_t(size_t,
+				     c->journal.blocks_free * c->sb.block_size,
+				     PAGE_SECTORS << JSET_BITS))
+			return w;
+
+		if (wait)
+			closure_wait(&c->journal.wait, &cl);
+
+		if (!journal_full(&c->journal)) {
+			if (wait)
+				trace_bcache_journal_entry_full(c);
+
+			/*
+			 * XXX: If we were inserting so many keys that they
+			 * won't fit in an _empty_ journal write, we'll
+			 * deadlock. For now, handle this in
+			 * bch_keylist_realloc() - but something to think about.
+			 */
+			BUG_ON(!w->data->keys);
+
+			journal_try_write(c); /* unlocks */
+		} else {
+			if (wait)
+				trace_bcache_journal_full(c);
+
+			journal_reclaim(c);
+			spin_unlock(&c->journal.lock);
+
+			btree_flush_write(c);
+		}
+
+		closure_sync(&cl);
+		spin_lock(&c->journal.lock);
+		wait = true;
+	}
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+	struct cache_set *c = container_of(to_delayed_work(work),
+					   struct cache_set,
+					   journal.work);
+	spin_lock(&c->journal.lock);
+	if (c->journal.cur->dirty)
+		journal_try_write(c);
+	else
+		spin_unlock(&c->journal.lock);
+}
+
+/*
+ * Entry point to the journalling code - bio_insert() and btree_invalidate()
+ * pass bch_journal() a list of keys to be journalled, and then
+ * bch_journal() hands those same keys off to btree_insert_async()
+ */
+
+atomic_t *bch_journal(struct cache_set *c,
+		      struct keylist *keys,
+		      struct closure *parent)
+{
+	struct journal_write *w;
+	atomic_t *ret;
+
+	if (!CACHE_SYNC(&c->sb))
+		return NULL;
+
+	w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
+
+	memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
+	w->data->keys += bch_keylist_nkeys(keys);
+
+	ret = &fifo_back(&c->journal.pin);
+	atomic_inc(ret);
+
+	if (parent) {
+		closure_wait(&w->wait, parent);
+		journal_try_write(c);
+	} else if (!w->dirty) {
+		w->dirty = true;
+		schedule_delayed_work(&c->journal.work,
+				      msecs_to_jiffies(c->journal_delay_ms));
+		spin_unlock(&c->journal.lock);
+	} else {
+		spin_unlock(&c->journal.lock);
+	}
+
+
+	return ret;
+}
+
+void bch_journal_meta(struct cache_set *c, struct closure *cl)
+{
+	struct keylist keys;
+	atomic_t *ref;
+
+	bch_keylist_init(&keys);
+
+	ref = bch_journal(c, &keys, cl);
+	if (ref)
+		atomic_dec_bug(ref);
+}
+
+void bch_journal_free(struct cache_set *c)
+{
+	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
+	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
+	free_fifo(&c->journal.pin);
+}
+
+int bch_journal_alloc(struct cache_set *c)
+{
+	struct journal *j = &c->journal;
+
+	spin_lock_init(&j->lock);
+	INIT_DELAYED_WORK(&j->work, journal_write_work);
+
+	c->journal_delay_ms = 100;
+
+	j->w[0].c = c;
+	j->w[1].c = c;
+
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
+	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
new file mode 100644
index 00000000000..e3c39457afb
--- /dev/null
+++ b/drivers/md/bcache/journal.h
@@ -0,0 +1,179 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The journal is treated as a circular buffer of buckets - a journal entry
+ * never spans two buckets. This means (not implemented yet) we can resize the
+ * journal at runtime, and will be needed for bcache on raw flash support.
+ *
+ * Journal entries contain a list of keys, ordered by the time they were
+ * inserted; thus journal replay just has to reinsert the keys.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
+ * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
+ * from cache misses, which don't have to be journaled, and for writeback and
+ * moving gc we work around it by flushing the btree to disk before updating the
+ * gc information. But it is a potential issue with incremental garbage
+ * collection, and it's fragile.
+ *
+ * OPEN JOURNAL ENTRIES:
+ *
+ * Each journal entry contains, in the header, the sequence number of the last
+ * journal entry still open - i.e. that has keys that haven't been flushed to
+ * disk in the btree.
+ *
+ * We track this by maintaining a refcount for every open journal entry, in a
+ * fifo; each entry in the fifo corresponds to a particular journal
+ * entry/sequence number. When the refcount at the tail of the fifo goes to
+ * zero, we pop it off - thus, the size of the fifo tells us the number of open
+ * journal entries
+ *
+ * We take a refcount on a journal entry when we add some keys to a journal
+ * entry that we're going to insert (held by struct btree_op), and then when we
+ * insert those keys into the btree the btree write we're setting up takes a
+ * copy of that refcount (held by struct btree_write). That refcount is dropped
+ * when the btree write completes.
+ *
+ * A struct btree_write can only hold a refcount on a single journal entry, but
+ * might contain keys for many journal entries - we handle this by making sure
+ * it always has a refcount on the _oldest_ journal entry of all the journal
+ * entries it has keys for.
+ *
+ * JOURNAL RECLAIM:
+ *
+ * As mentioned previously, our fifo of refcounts tells us the number of open
+ * journal entries; from that and the current journal sequence number we compute
+ * last_seq - the oldest journal entry we still need. We write last_seq in each
+ * journal entry, and we also have to keep track of where it exists on disk so
+ * we don't overwrite it when we loop around the journal.
+ *
+ * To do that we track, for each journal bucket, the sequence number of the
+ * newest journal entry it contains - if we don't need that journal entry we
+ * don't need anything in that bucket anymore. From that we track the last
+ * journal bucket we still need; all this is tracked in struct journal_device
+ * and updated by journal_reclaim().
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+	struct list_head	list;
+	atomic_t		*pin;
+	struct jset		j;
+};
+
+/*
+ * We put two of these in struct journal; we used them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_write {
+	struct jset		*data;
+#define JSET_BITS		3
+
+	struct cache_set	*c;
+	struct closure_waitlist	wait;
+	bool			dirty;
+	bool			need_write;
+};
+
+/* Embedded in struct cache_set */
+struct journal {
+	spinlock_t		lock;
+	/* used when waiting because the journal was full */
+	struct closure_waitlist	wait;
+	struct closure		io;
+	int			io_in_flight;
+	struct delayed_work	work;
+
+	/* Number of blocks free in the bucket(s) we're currently writing to */
+	unsigned		blocks_free;
+	uint64_t		seq;
+	DECLARE_FIFO(atomic_t, pin);
+
+	BKEY_PADDED(key);
+
+	struct journal_write	w[2], *cur;
+};
+
+/*
+ * Embedded in struct cache. First three fields refer to the array of journal
+ * buckets, in cache_sb.
+ */
+struct journal_device {
+	/*
+	 * For each journal bucket, contains the max sequence number of the
+	 * journal writes it contains - so we know when a bucket can be reused.
+	 */
+	uint64_t		seq[SB_JOURNAL_BUCKETS];
+
+	/* Journal bucket we're currently writing to */
+	unsigned		cur_idx;
+
+	/* Last journal bucket that still contains an open journal entry */
+	unsigned		last_idx;
+
+	/* Next journal bucket to be discarded */
+	unsigned		discard_idx;
+
+#define DISCARD_READY		0
+#define DISCARD_IN_FLIGHT	1
+#define DISCARD_DONE		2
+	/* 1 - discard in flight, -1 - discard completed */
+	atomic_t		discard_in_flight;
+
+	struct work_struct	discard_work;
+	struct bio		discard_bio;
+	struct bio_vec		discard_bv;
+
+	/* Bio for journal reads/writes to this device */
+	struct bio		bio;
+	struct bio_vec		bv[8];
+};
+
+#define journal_pin_cmp(c, l, r)				\
+	(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
+
+#define JOURNAL_PIN	20000
+
+#define journal_full(j)						\
+	(!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
+
+struct closure;
+struct cache_set;
+struct btree_op;
+struct keylist;
+
+atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
+void bch_journal_next(struct journal *);
+void bch_journal_mark(struct cache_set *, struct list_head *);
+void bch_journal_meta(struct cache_set *, struct closure *);
+int bch_journal_read(struct cache_set *, struct list_head *);
+int bch_journal_replay(struct cache_set *, struct list_head *);
+
+void bch_journal_free(struct cache_set *);
+int bch_journal_alloc(struct cache_set *);
+
+#endif /* _BCACHE_JOURNAL_H */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
new file mode 100644
index 00000000000..cd7490311e5
--- /dev/null
+++ b/drivers/md/bcache/movinggc.c
@@ -0,0 +1,256 @@
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <trace/events/bcache.h>
+
+struct moving_io {
+	struct closure		cl;
+	struct keybuf_key	*w;
+	struct data_insert_op	op;
+	struct bbio		bio;
+};
+
+static bool moving_pred(struct keybuf *buf, struct bkey *k)
+{
+	struct cache_set *c = container_of(buf, struct cache_set,
+					   moving_gc_keys);
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i) &&
+		    GC_MOVE(PTR_BUCKET(c, k, i)))
+			return true;
+
+	return false;
+}
+
+/* Moving GC - IO loop */
+
+static void moving_io_destructor(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	kfree(io);
+}
+
+static void write_moving_finish(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct bio *bio = &io->bio.bio;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i)
+		__free_page(bv->bv_page);
+
+	if (io->op.replace_collision)
+		trace_bcache_gc_copy_collision(&io->w->key);
+
+	bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);
+
+	up(&io->op.c->moving_in_flight);
+
+	closure_return_with_destructor(cl, moving_io_destructor);
+}
+
+static void read_moving_endio(struct bio *bio, int error)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+	struct moving_io *io = container_of(bio->bi_private,
+					    struct moving_io, cl);
+
+	if (error)
+		io->op.error = error;
+	else if (!KEY_DIRTY(&b->key) &&
+		 ptr_stale(io->op.c, &b->key, 0)) {
+		io->op.error = -EINTR;
+	}
+
+	bch_bbio_endio(io->op.c, bio, error, "reading data to move");
+}
+
+static void moving_init(struct moving_io *io)
+{
+	struct bio *bio = &io->bio.bio;
+
+	bio_init(bio);
+	bio_get(bio);
+	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+	bio->bi_iter.bi_size	= KEY_SIZE(&io->w->key) << 9;
+	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&io->w->key),
+					       PAGE_SECTORS);
+	bio->bi_private		= &io->cl;
+	bio->bi_io_vec		= bio->bi_inline_vecs;
+	bch_bio_map(bio, NULL);
+}
+
+static void write_moving(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct data_insert_op *op = &io->op;
+
+	if (!op->error) {
+		moving_init(io);
+
+		io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
+		op->write_prio		= 1;
+		op->bio			= &io->bio.bio;
+
+		op->writeback		= KEY_DIRTY(&io->w->key);
+		op->csum		= KEY_CSUM(&io->w->key);
+
+		bkey_copy(&op->replace_key, &io->w->key);
+		op->replace		= true;
+
+		closure_call(&op->cl, bch_data_insert, NULL, cl);
+	}
+
+	continue_at(cl, write_moving_finish, op->wq);
+}
+
+static void read_moving_submit(struct closure *cl)
+{
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct bio *bio = &io->bio.bio;
+
+	bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
+
+	continue_at(cl, write_moving, io->op.wq);
+}
+
+static void read_moving(struct cache_set *c)
+{
+	struct keybuf_key *w;
+	struct moving_io *io;
+	struct bio *bio;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	/* XXX: if we error, background writeback could stall indefinitely */
+
+	while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
+		w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
+					   &MAX_KEY, moving_pred);
+		if (!w)
+			break;
+
+		if (ptr_stale(c, &w->key, 0)) {
+			bch_keybuf_del(&c->moving_gc_keys, w);
+			continue;
+		}
+
+		io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
+			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+			     GFP_KERNEL);
+		if (!io)
+			goto err;
+
+		w->private	= io;
+		io->w		= w;
+		io->op.inode	= KEY_INODE(&w->key);
+		io->op.c	= c;
+		io->op.wq	= c->moving_gc_wq;
+
+		moving_init(io);
+		bio = &io->bio.bio;
+
+		bio->bi_rw	= READ;
+		bio->bi_end_io	= read_moving_endio;
+
+		if (bio_alloc_pages(bio, GFP_KERNEL))
+			goto err;
+
+		trace_bcache_gc_copy(&w->key);
+
+		down(&c->moving_in_flight);
+		closure_call(&io->cl, read_moving_submit, NULL, &cl);
+	}
+
+	if (0) {
+err:		if (!IS_ERR_OR_NULL(w->private))
+			kfree(w->private);
+
+		bch_keybuf_del(&c->moving_gc_keys, w);
+	}
+
+	closure_sync(&cl);
+}
+
+static bool bucket_cmp(struct bucket *l, struct bucket *r)
+{
+	return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+}
+
+static unsigned bucket_heap_top(struct cache *ca)
+{
+	struct bucket *b;
+	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
+}
+
+void bch_moving_gc(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+
+	if (!c->copy_gc_enabled)
+		return;
+
+	mutex_lock(&c->bucket_lock);
+
+	for_each_cache(ca, c, i) {
+		unsigned sectors_to_move = 0;
+		unsigned reserve_sectors = ca->sb.bucket_size *
+			fifo_used(&ca->free[RESERVE_MOVINGGC]);
+
+		ca->heap.used = 0;
+
+		for_each_bucket(b, ca) {
+			if (GC_MARK(b) == GC_MARK_METADATA ||
+			    !GC_SECTORS_USED(b) ||
+			    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+			    atomic_read(&b->pin))
+				continue;
+
+			if (!heap_full(&ca->heap)) {
+				sectors_to_move += GC_SECTORS_USED(b);
+				heap_add(&ca->heap, b, bucket_cmp);
+			} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+				sectors_to_move -= bucket_heap_top(ca);
+				sectors_to_move += GC_SECTORS_USED(b);
+
+				ca->heap.data[0] = b;
+				heap_sift(&ca->heap, 0, bucket_cmp);
+			}
+		}
+
+		while (sectors_to_move > reserve_sectors) {
+			heap_pop(&ca->heap, b, bucket_cmp);
+			sectors_to_move -= GC_SECTORS_USED(b);
+		}
+
+		while (heap_pop(&ca->heap, b, bucket_cmp))
+			SET_GC_MOVE(b, 1);
+	}
+
+	mutex_unlock(&c->bucket_lock);
+
+	c->moving_gc_keys.last_scanned = ZERO_KEY;
+
+	read_moving(c);
+}
+
+void bch_moving_init_cache_set(struct cache_set *c)
+{
+	bch_keybuf_init(&c->moving_gc_keys);
+	sema_init(&c->moving_in_flight, 64);
+}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
new file mode 100644
index 00000000000..15fff4f68a7
--- /dev/null
+++ b/drivers/md/bcache/request.c
@@ -0,0 +1,1159 @@
+/*
+ * Main bcache entry point - handle a read or a write request and decide what to
+ * do with it; the make_request functions are called by the block layer.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+
+#include <trace/events/bcache.h>
+
+#define CUTOFF_CACHE_ADD	95
+#define CUTOFF_CACHE_READA	90
+
+struct kmem_cache *bch_search_cache;
+
+static void bch_data_insert_start(struct closure *);
+
+static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+{
+	return BDEV_CACHE_MODE(&dc->sb);
+}
+
+static bool verify(struct cached_dev *dc, struct bio *bio)
+{
+	return dc->verify;
+}
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	uint64_t csum = 0;
+
+	bio_for_each_segment(bv, bio, iter) {
+		void *d = kmap(bv.bv_page) + bv.bv_offset;
+		csum = bch_crc64_update(csum, d, bv.bv_len);
+		kunmap(bv.bv_page);
+	}
+
+	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
+}
+
+/* Insert data into cache */
+
+static void bch_data_insert_keys(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	atomic_t *journal_ref = NULL;
+	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+	int ret;
+
+	/*
+	 * If we're looping, might already be waiting on
+	 * another journal write - can't wait on more than one journal write at
+	 * a time
+	 *
+	 * XXX: this looks wrong
+	 */
+#if 0
+	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
+		closure_sync(&s->cl);
+#endif
+
+	if (!op->replace)
+		journal_ref = bch_journal(op->c, &op->insert_keys,
+					  op->flush_journal ? cl : NULL);
+
+	ret = bch_btree_insert(op->c, &op->insert_keys,
+			       journal_ref, replace_key);
+	if (ret == -ESRCH) {
+		op->replace_collision = true;
+	} else if (ret) {
+		op->error		= -ENOMEM;
+		op->insert_data_done	= true;
+	}
+
+	if (journal_ref)
+		atomic_dec_bug(journal_ref);
+
+	if (!op->insert_data_done)
+		continue_at(cl, bch_data_insert_start, op->wq);
+
+	bch_keylist_free(&op->insert_keys);
+	closure_return(cl);
+}
+
+static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
+			       struct cache_set *c)
+{
+	size_t oldsize = bch_keylist_nkeys(l);
+	size_t newsize = oldsize + u64s;
+
+	/*
+	 * The journalling code doesn't handle the case where the keys to insert
+	 * is bigger than an empty write: If we just return -ENOMEM here,
+	 * bio_insert() and bio_invalidate() will insert the keys created so far
+	 * and finish the rest when the keylist is empty.
+	 */
+	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+		return -ENOMEM;
+
+	return __bch_keylist_realloc(l, u64s);
+}
+
+static void bch_data_invalidate(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct bio *bio = op->bio;
+
+	pr_debug("invalidating %i sectors from %llu",
+		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
+
+	while (bio_sectors(bio)) {
+		unsigned sectors = min(bio_sectors(bio),
+				       1U << (KEY_SIZE_BITS - 1));
+
+		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
+			goto out;
+
+		bio->bi_iter.bi_sector	+= sectors;
+		bio->bi_iter.bi_size	-= sectors << 9;
+
+		bch_keylist_add(&op->insert_keys,
+				&KEY(op->inode, bio->bi_iter.bi_sector, sectors));
+	}
+
+	op->insert_data_done = true;
+	bio_put(bio);
+out:
+	continue_at(cl, bch_data_insert_keys, op->wq);
+}
+
+static void bch_data_insert_error(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+	/*
+	 * Our data write just errored, which means we've got a bunch of keys to
+	 * insert that point to data that wasn't succesfully written.
+	 *
+	 * We don't have to insert those keys but we still have to invalidate
+	 * that region of the cache - so, if we just strip off all the pointers
+	 * from the keys we'll accomplish just that.
+	 */
+
+	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
+
+	while (src != op->insert_keys.top) {
+		struct bkey *n = bkey_next(src);
+
+		SET_KEY_PTRS(src, 0);
+		memmove(dst, src, bkey_bytes(src));
+
+		dst = bkey_next(dst);
+		src = n;
+	}
+
+	op->insert_keys.top = dst;
+
+	bch_data_insert_keys(cl);
+}
+
+static void bch_data_insert_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+	if (error) {
+		/* TODO: We could try to recover from this. */
+		if (op->writeback)
+			op->error = error;
+		else if (!op->replace)
+			set_closure_fn(cl, bch_data_insert_error, op->wq);
+		else
+			set_closure_fn(cl, NULL, NULL);
+	}
+
+	bch_bbio_endio(op->c, bio, error, "writing data to cache");
+}
+
+static void bch_data_insert_start(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct bio *bio = op->bio, *n;
+
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
+		set_gc_sectors(op->c);
+		wake_up_gc(op->c);
+	}
+
+	if (op->bypass)
+		return bch_data_invalidate(cl);
+
+	/*
+	 * Journal writes are marked REQ_FLUSH; if the original write was a
+	 * flush, it'll wait on the journal write.
+	 */
+	bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
+
+	do {
+		unsigned i;
+		struct bkey *k;
+		struct bio_set *split = op->c->bio_split;
+
+		/* 1 for the device pointer and 1 for the chksum */
+		if (bch_keylist_realloc(&op->insert_keys,
+					3 + (op->csum ? 1 : 0),
+					op->c))
+			continue_at(cl, bch_data_insert_keys, op->wq);
+
+		k = op->insert_keys.top;
+		bkey_init(k);
+		SET_KEY_INODE(k, op->inode);
+		SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
+
+		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
+				       op->write_point, op->write_prio,
+				       op->writeback))
+			goto err;
+
+		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+
+		n->bi_end_io	= bch_data_insert_endio;
+		n->bi_private	= cl;
+
+		if (op->writeback) {
+			SET_KEY_DIRTY(k, true);
+
+			for (i = 0; i < KEY_PTRS(k); i++)
+				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
+					    GC_MARK_DIRTY);
+		}
+
+		SET_KEY_CSUM(k, op->csum);
+		if (KEY_CSUM(k))
+			bio_csum(n, k);
+
+		trace_bcache_cache_insert(k);
+		bch_keylist_push(&op->insert_keys);
+
+		n->bi_rw |= REQ_WRITE;
+		bch_submit_bbio(n, op->c, k, 0);
+	} while (n != bio);
+
+	op->insert_data_done = true;
+	continue_at(cl, bch_data_insert_keys, op->wq);
+err:
+	/* bch_alloc_sectors() blocks if s->writeback = true */
+	BUG_ON(op->writeback);
+
+	/*
+	 * But if it's not a writeback write we'd rather just bail out if
+	 * there aren't any buckets ready to write to - it might take awhile and
+	 * we might be starving btree writes for gc or something.
+	 */
+
+	if (!op->replace) {
+		/*
+		 * Writethrough write: We can't complete the write until we've
+		 * updated the index. But we don't want to delay the write while
+		 * we wait for buckets to be freed up, so just invalidate the
+		 * rest of the write.
+		 */
+		op->bypass = true;
+		return bch_data_invalidate(cl);
+	} else {
+		/*
+		 * From a cache miss, we can just insert the keys for the data
+		 * we have written or bail out if we didn't do anything.
+		 */
+		op->insert_data_done = true;
+		bio_put(bio);
+
+		if (!bch_keylist_empty(&op->insert_keys))
+			continue_at(cl, bch_data_insert_keys, op->wq);
+		else
+			closure_return(cl);
+	}
+}
+
+/**
+ * bch_data_insert - stick some data in the cache
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data had to be fragmented there will be multiple keys); after the
+ * data is written it calls bch_journal, and after the keys have been added to
+ * the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * and op->inode is used for the key inode.
+ *
+ * If s->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by s->cache_bio and op->inode.
+ */
+void bch_data_insert(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+	trace_bcache_write(op->bio, op->writeback, op->bypass);
+
+	bch_keylist_init(&op->insert_keys);
+	bio_get(op->bio);
+	bch_data_insert_start(cl);
+}
+
+/* Congested? */
+
+unsigned bch_get_congested(struct cache_set *c)
+{
+	int i;
+	long rand;
+
+	if (!c->congested_read_threshold_us &&
+	    !c->congested_write_threshold_us)
+		return 0;
+
+	i = (local_clock_us() - c->congested_last_us) / 1024;
+	if (i < 0)
+		return 0;
+
+	i += atomic_read(&c->congested);
+	if (i >= 0)
+		return 0;
+
+	i += CONGESTED_MAX;
+
+	if (i > 0)
+		i = fract_exp_two(i, 6);
+
+	rand = get_random_int();
+	i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+	return i > 0 ? i : 1;
+}
+
+static void add_sequential(struct task_struct *t)
+{
+	ewma_add(t->sequential_io_avg,
+		 t->sequential_io, 8, 0);
+
+	t->sequential_io = 0;
+}
+
+static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
+{
+	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
+}
+
+static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
+{
+	struct cache_set *c = dc->disk.c;
+	unsigned mode = cache_mode(dc, bio);
+	unsigned sectors, congested = bch_get_congested(c);
+	struct task_struct *task = current;
+	struct io *i;
+
+	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
+	    (bio->bi_rw & REQ_DISCARD))
+		goto skip;
+
+	if (mode == CACHE_MODE_NONE ||
+	    (mode == CACHE_MODE_WRITEAROUND &&
+	     (bio->bi_rw & REQ_WRITE)))
+		goto skip;
+
+	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
+	    bio_sectors(bio) & (c->sb.block_size - 1)) {
+		pr_debug("skipping unaligned io");
+		goto skip;
+	}
+
+	if (bypass_torture_test(dc)) {
+		if ((get_random_int() & 3) == 3)
+			goto skip;
+		else
+			goto rescale;
+	}
+
+	if (!congested && !dc->sequential_cutoff)
+		goto rescale;
+
+	if (!congested &&
+	    mode == CACHE_MODE_WRITEBACK &&
+	    (bio->bi_rw & REQ_WRITE) &&
+	    (bio->bi_rw & REQ_SYNC))
+		goto rescale;
+
+	spin_lock(&dc->io_lock);
+
+	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+		if (i->last == bio->bi_iter.bi_sector &&
+		    time_before(jiffies, i->jiffies))
+			goto found;
+
+	i = list_first_entry(&dc->io_lru, struct io, lru);
+
+	add_sequential(task);
+	i->sequential = 0;
+found:
+	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+		i->sequential	+= bio->bi_iter.bi_size;
+
+	i->last			 = bio_end_sector(bio);
+	i->jiffies		 = jiffies + msecs_to_jiffies(5000);
+	task->sequential_io	 = i->sequential;
+
+	hlist_del(&i->hash);
+	hlist_add_head(&i->hash, iohash(dc, i->last));
+	list_move_tail(&i->lru, &dc->io_lru);
+
+	spin_unlock(&dc->io_lock);
+
+	sectors = max(task->sequential_io,
+		      task->sequential_io_avg) >> 9;
+
+	if (dc->sequential_cutoff &&
+	    sectors >= dc->sequential_cutoff >> 9) {
+		trace_bcache_bypass_sequential(bio);
+		goto skip;
+	}
+
+	if (congested && sectors >= congested) {
+		trace_bcache_bypass_congested(bio);
+		goto skip;
+	}
+
+rescale:
+	bch_rescale_priorities(c, bio_sectors(bio));
+	return false;
+skip:
+	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
+	return true;
+}
+
+/* Cache lookup */
+
+struct search {
+	/* Stack frame for bio_complete */
+	struct closure		cl;
+
+	struct bbio		bio;
+	struct bio		*orig_bio;
+	struct bio		*cache_miss;
+	struct bcache_device	*d;
+
+	unsigned		insert_bio_sectors;
+	unsigned		recoverable:1;
+	unsigned		write:1;
+	unsigned		read_dirty_data:1;
+
+	unsigned long		start_time;
+
+	struct btree_op		op;
+	struct data_insert_op	iop;
+};
+
+static void bch_cache_read_endio(struct bio *bio, int error)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+	struct closure *cl = bio->bi_private;
+	struct search *s = container_of(cl, struct search, cl);
+
+	/*
+	 * If the bucket was reused while our bio was in flight, we might have
+	 * read the wrong data. Set s->error but not error so it doesn't get
+	 * counted against the cache device, but we'll still reread the data
+	 * from the backing device.
+	 */
+
+	if (error)
+		s->iop.error = error;
+	else if (!KEY_DIRTY(&b->key) &&
+		 ptr_stale(s->iop.c, &b->key, 0)) {
+		atomic_long_inc(&s->iop.c->cache_read_races);
+		s->iop.error = -EINTR;
+	}
+
+	bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
+}
+
+/*
+ * Read from a single key, handling the initial cache miss if the key starts in
+ * the middle of the bio
+ */
+static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
+{
+	struct search *s = container_of(op, struct search, op);
+	struct bio *n, *bio = &s->bio.bio;
+	struct bkey *bio_key;
+	unsigned ptr;
+
+	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
+		return MAP_CONTINUE;
+
+	if (KEY_INODE(k) != s->iop.inode ||
+	    KEY_START(k) > bio->bi_iter.bi_sector) {
+		unsigned bio_sectors = bio_sectors(bio);
+		unsigned sectors = KEY_INODE(k) == s->iop.inode
+			? min_t(uint64_t, INT_MAX,
+				KEY_START(k) - bio->bi_iter.bi_sector)
+			: INT_MAX;
+
+		int ret = s->d->cache_miss(b, s, bio, sectors);
+		if (ret != MAP_CONTINUE)
+			return ret;
+
+		/* if this was a complete miss we shouldn't get here */
+		BUG_ON(bio_sectors <= sectors);
+	}
+
+	if (!KEY_SIZE(k))
+		return MAP_CONTINUE;
+
+	/* XXX: figure out best pointer - for multiple cache devices */
+	ptr = 0;
+
+	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+
+	if (KEY_DIRTY(k))
+		s->read_dirty_data = true;
+
+	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
+				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
+			   GFP_NOIO, s->d->bio_split);
+
+	bio_key = &container_of(n, struct bbio, bio)->key;
+	bch_bkey_copy_single_ptr(bio_key, k, ptr);
+
+	bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
+	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
+
+	n->bi_end_io	= bch_cache_read_endio;
+	n->bi_private	= &s->cl;
+
+	/*
+	 * The bucket we're reading from might be reused while our bio
+	 * is in flight, and we could then end up reading the wrong
+	 * data.
+	 *
+	 * We guard against this by checking (in cache_read_endio()) if
+	 * the pointer is stale again; if so, we treat it as an error
+	 * and reread from the backing device (but we don't pass that
+	 * error up anywhere).
+	 */
+
+	__bch_submit_bbio(n, b->c);
+	return n == bio ? MAP_DONE : MAP_CONTINUE;
+}
+
+static void cache_lookup(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, iop.cl);
+	struct bio *bio = &s->bio.bio;
+	int ret;
+
+	bch_btree_op_init(&s->op, -1);
+
+	ret = bch_btree_map_keys(&s->op, s->iop.c,
+				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
+				 cache_lookup_fn, MAP_END_KEY);
+	if (ret == -EAGAIN)
+		continue_at(cl, cache_lookup, bcache_wq);
+
+	closure_return(cl);
+}
+
+/* Common code for the make_request functions */
+
+static void request_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+
+	if (error) {
+		struct search *s = container_of(cl, struct search, cl);
+		s->iop.error = error;
+		/* Only cache read errors are recoverable */
+		s->recoverable = false;
+	}
+
+	bio_put(bio);
+	closure_put(cl);
+}
+
+static void bio_complete(struct search *s)
+{
+	if (s->orig_bio) {
+		int cpu, rw = bio_data_dir(s->orig_bio);
+		unsigned long duration = jiffies - s->start_time;
+
+		cpu = part_stat_lock();
+		part_round_stats(cpu, &s->d->disk->part0);
+		part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
+		part_stat_unlock();
+
+		trace_bcache_request_end(s->d, s->orig_bio);
+		bio_endio(s->orig_bio, s->iop.error);
+		s->orig_bio = NULL;
+	}
+}
+
+static void do_bio_hook(struct search *s, struct bio *orig_bio)
+{
+	struct bio *bio = &s->bio.bio;
+
+	bio_init(bio);
+	__bio_clone_fast(bio, orig_bio);
+	bio->bi_end_io		= request_endio;
+	bio->bi_private		= &s->cl;
+
+	atomic_set(&bio->bi_cnt, 3);
+}
+
+static void search_free(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	bio_complete(s);
+
+	if (s->iop.bio)
+		bio_put(s->iop.bio);
+
+	closure_debug_destroy(cl);
+	mempool_free(s, s->d->c->search);
+}
+
+static inline struct search *search_alloc(struct bio *bio,
+					  struct bcache_device *d)
+{
+	struct search *s;
+
+	s = mempool_alloc(d->c->search, GFP_NOIO);
+
+	closure_init(&s->cl, NULL);
+	do_bio_hook(s, bio);
+
+	s->orig_bio		= bio;
+	s->cache_miss		= NULL;
+	s->d			= d;
+	s->recoverable		= 1;
+	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
+	s->read_dirty_data	= 0;
+	s->start_time		= jiffies;
+
+	s->iop.c		= d->c;
+	s->iop.bio		= NULL;
+	s->iop.inode		= d->id;
+	s->iop.write_point	= hash_long((unsigned long) current, 16);
+	s->iop.write_prio	= 0;
+	s->iop.error		= 0;
+	s->iop.flags		= 0;
+	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+	s->iop.wq		= bcache_wq;
+
+	return s;
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+	search_free(cl);
+	cached_dev_put(dc);
+}
+
+/* Process reads */
+
+static void cached_dev_cache_miss_done(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+
+	if (s->iop.replace_collision)
+		bch_mark_cache_miss_collision(s->iop.c, s->d);
+
+	if (s->iop.bio) {
+		int i;
+		struct bio_vec *bv;
+
+		bio_for_each_segment_all(bv, s->iop.bio, i)
+			__free_page(bv->bv_page);
+	}
+
+	cached_dev_bio_complete(cl);
+}
+
+static void cached_dev_read_error(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct bio *bio = &s->bio.bio;
+
+	if (s->recoverable) {
+		/* Retry from the backing device: */
+		trace_bcache_read_retry(s->orig_bio);
+
+		s->iop.error = 0;
+		do_bio_hook(s, s->orig_bio);
+
+		/* XXX: invalidate cache */
+
+		closure_bio_submit(bio, cl, s->d);
+	}
+
+	continue_at(cl, cached_dev_cache_miss_done, NULL);
+}
+
+static void cached_dev_read_done(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+	/*
+	 * We had a cache miss; cache_bio now contains data ready to be inserted
+	 * into the cache.
+	 *
+	 * First, we copy the data we just read from cache_bio's bounce buffers
+	 * to the buffers the original bio pointed to:
+	 */
+
+	if (s->iop.bio) {
+		bio_reset(s->iop.bio);
+		s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
+		s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
+		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
+		bch_bio_map(s->iop.bio, NULL);
+
+		bio_copy_data(s->cache_miss, s->iop.bio);
+
+		bio_put(s->cache_miss);
+		s->cache_miss = NULL;
+	}
+
+	if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
+		bch_data_verify(dc, s->orig_bio);
+
+	bio_complete(s);
+
+	if (s->iop.bio &&
+	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
+		BUG_ON(!s->iop.replace);
+		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+	}
+
+	continue_at(cl, cached_dev_cache_miss_done, NULL);
+}
+
+static void cached_dev_read_done_bh(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+	bch_mark_cache_accounting(s->iop.c, s->d,
+				  !s->cache_miss, s->iop.bypass);
+	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
+
+	if (s->iop.error)
+		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
+	else if (s->iop.bio || verify(dc, &s->bio.bio))
+		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
+	else
+		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
+}
+
+static int cached_dev_cache_miss(struct btree *b, struct search *s,
+				 struct bio *bio, unsigned sectors)
+{
+	int ret = MAP_CONTINUE;
+	unsigned reada = 0;
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+	struct bio *miss, *cache_bio;
+
+	if (s->cache_miss || s->iop.bypass) {
+		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
+		goto out_submit;
+	}
+
+	if (!(bio->bi_rw & REQ_RAHEAD) &&
+	    !(bio->bi_rw & REQ_META) &&
+	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+		reada = min_t(sector_t, dc->readahead >> 9,
+			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
+
+	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
+
+	s->iop.replace_key = KEY(s->iop.inode,
+				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
+				 s->insert_bio_sectors);
+
+	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
+	if (ret)
+		return ret;
+
+	s->iop.replace = true;
+
+	miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+
+	/* btree_search_recurse()'s btree iterator is no good anymore */
+	ret = miss == bio ? MAP_DONE : -EINTR;
+
+	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
+			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
+			dc->disk.bio_split);
+	if (!cache_bio)
+		goto out_submit;
+
+	cache_bio->bi_iter.bi_sector	= miss->bi_iter.bi_sector;
+	cache_bio->bi_bdev		= miss->bi_bdev;
+	cache_bio->bi_iter.bi_size	= s->insert_bio_sectors << 9;
+
+	cache_bio->bi_end_io	= request_endio;
+	cache_bio->bi_private	= &s->cl;
+
+	bch_bio_map(cache_bio, NULL);
+	if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
+		goto out_put;
+
+	if (reada)
+		bch_mark_cache_readahead(s->iop.c, s->d);
+
+	s->cache_miss	= miss;
+	s->iop.bio	= cache_bio;
+	bio_get(cache_bio);
+	closure_bio_submit(cache_bio, &s->cl, s->d);
+
+	return ret;
+out_put:
+	bio_put(cache_bio);
+out_submit:
+	miss->bi_end_io		= request_endio;
+	miss->bi_private	= &s->cl;
+	closure_bio_submit(miss, &s->cl, s->d);
+	return ret;
+}
+
+static void cached_dev_read(struct cached_dev *dc, struct search *s)
+{
+	struct closure *cl = &s->cl;
+
+	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
+	continue_at(cl, cached_dev_read_done_bh, NULL);
+}
+
+/* Process writes */
+
+static void cached_dev_write_complete(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+	up_read_non_owner(&dc->writeback_lock);
+	cached_dev_bio_complete(cl);
+}
+
+static void cached_dev_write(struct cached_dev *dc, struct search *s)
+{
+	struct closure *cl = &s->cl;
+	struct bio *bio = &s->bio.bio;
+	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
+	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
+
+	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
+
+	down_read_non_owner(&dc->writeback_lock);
+	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
+		/*
+		 * We overlap with some dirty data undergoing background
+		 * writeback, force this write to writeback
+		 */
+		s->iop.bypass = false;
+		s->iop.writeback = true;
+	}
+
+	/*
+	 * Discards aren't _required_ to do anything, so skipping if
+	 * check_overlapping returned true is ok
+	 *
+	 * But check_overlapping drops dirty keys for which io hasn't started,
+	 * so we still want to call it.
+	 */
+	if (bio->bi_rw & REQ_DISCARD)
+		s->iop.bypass = true;
+
+	if (should_writeback(dc, s->orig_bio,
+			     cache_mode(dc, bio),
+			     s->iop.bypass)) {
+		s->iop.bypass = false;
+		s->iop.writeback = true;
+	}
+
+	if (s->iop.bypass) {
+		s->iop.bio = s->orig_bio;
+		bio_get(s->iop.bio);
+
+		if (!(bio->bi_rw & REQ_DISCARD) ||
+		    blk_queue_discard(bdev_get_queue(dc->bdev)))
+			closure_bio_submit(bio, cl, s->d);
+	} else if (s->iop.writeback) {
+		bch_writeback_add(dc);
+		s->iop.bio = bio;
+
+		if (bio->bi_rw & REQ_FLUSH) {
+			/* Also need to send a flush to the backing device */
+			struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
+							     dc->disk.bio_split);
+
+			flush->bi_rw	= WRITE_FLUSH;
+			flush->bi_bdev	= bio->bi_bdev;
+			flush->bi_end_io = request_endio;
+			flush->bi_private = cl;
+
+			closure_bio_submit(flush, cl, s->d);
+		}
+	} else {
+		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+
+		closure_bio_submit(bio, cl, s->d);
+	}
+
+	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+	continue_at(cl, cached_dev_write_complete, NULL);
+}
+
+static void cached_dev_nodata(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct bio *bio = &s->bio.bio;
+
+	if (s->iop.flush_journal)
+		bch_journal_meta(s->iop.c, cl);
+
+	/* If it's a flush, we send the flush to the backing device too */
+	closure_bio_submit(bio, cl, s->d);
+
+	continue_at(cl, cached_dev_bio_complete, NULL);
+}
+
+/* Cached devices - read & write stuff */
+
+static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct search *s;
+	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	int cpu, rw = bio_data_dir(bio);
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
+	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
+
+	bio->bi_bdev = dc->bdev;
+	bio->bi_iter.bi_sector += dc->sb.data_offset;
+
+	if (cached_dev_get(dc)) {
+		s = search_alloc(bio, d);
+		trace_bcache_request_start(s->d, bio);
+
+		if (!bio->bi_iter.bi_size) {
+			/*
+			 * can't call bch_journal_meta from under
+			 * generic_make_request
+			 */
+			continue_at_nobarrier(&s->cl,
+					      cached_dev_nodata,
+					      bcache_wq);
+		} else {
+			s->iop.bypass = check_should_bypass(dc, bio);
+
+			if (rw)
+				cached_dev_write(dc, s);
+			else
+				cached_dev_read(dc, s);
+		}
+	} else {
+		if ((bio->bi_rw & REQ_DISCARD) &&
+		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
+			bio_endio(bio, 0);
+		else
+			bch_generic_make_request(bio, &d->bio_split_hook);
+	}
+}
+
+static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+			    unsigned int cmd, unsigned long arg)
+{
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+}
+
+static int cached_dev_congested(void *data, int bits)
+{
+	struct bcache_device *d = data;
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	struct request_queue *q = bdev_get_queue(dc->bdev);
+	int ret = 0;
+
+	if (bdi_congested(&q->backing_dev_info, bits))
+		return 1;
+
+	if (cached_dev_get(dc)) {
+		unsigned i;
+		struct cache *ca;
+
+		for_each_cache(ca, d->c, i) {
+			q = bdev_get_queue(ca->bdev);
+			ret |= bdi_congested(&q->backing_dev_info, bits);
+		}
+
+		cached_dev_put(dc);
+	}
+
+	return ret;
+}
+
+void bch_cached_dev_request_init(struct cached_dev *dc)
+{
+	struct gendisk *g = dc->disk.disk;
+
+	g->queue->make_request_fn		= cached_dev_make_request;
+	g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+	dc->disk.cache_miss			= cached_dev_cache_miss;
+	dc->disk.ioctl				= cached_dev_ioctl;
+}
+
+/* Flash backed devices */
+
+static int flash_dev_cache_miss(struct btree *b, struct search *s,
+				struct bio *bio, unsigned sectors)
+{
+	unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
+
+	swap(bio->bi_iter.bi_size, bytes);
+	zero_fill_bio(bio);
+	swap(bio->bi_iter.bi_size, bytes);
+
+	bio_advance(bio, bytes);
+
+	if (!bio->bi_iter.bi_size)
+		return MAP_DONE;
+
+	return MAP_CONTINUE;
+}
+
+static void flash_dev_nodata(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+
+	if (s->iop.flush_journal)
+		bch_journal_meta(s->iop.c, cl);
+
+	continue_at(cl, search_free, NULL);
+}
+
+static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct search *s;
+	struct closure *cl;
+	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+	int cpu, rw = bio_data_dir(bio);
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
+	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
+
+	s = search_alloc(bio, d);
+	cl = &s->cl;
+	bio = &s->bio.bio;
+
+	trace_bcache_request_start(s->d, bio);
+
+	if (!bio->bi_iter.bi_size) {
+		/*
+		 * can't call bch_journal_meta from under
+		 * generic_make_request
+		 */
+		continue_at_nobarrier(&s->cl,
+				      flash_dev_nodata,
+				      bcache_wq);
+	} else if (rw) {
+		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
+					&KEY(d->id, bio->bi_iter.bi_sector, 0),
+					&KEY(d->id, bio_end_sector(bio), 0));
+
+		s->iop.bypass		= (bio->bi_rw & REQ_DISCARD) != 0;
+		s->iop.writeback	= true;
+		s->iop.bio		= bio;
+
+		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+	} else {
+		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
+	}
+
+	continue_at(cl, search_free, NULL);
+}
+
+static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
+			   unsigned int cmd, unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static int flash_dev_congested(void *data, int bits)
+{
+	struct bcache_device *d = data;
+	struct request_queue *q;
+	struct cache *ca;
+	unsigned i;
+	int ret = 0;
+
+	for_each_cache(ca, d->c, i) {
+		q = bdev_get_queue(ca->bdev);
+		ret |= bdi_congested(&q->backing_dev_info, bits);
+	}
+
+	return ret;
+}
+
+void bch_flash_dev_request_init(struct bcache_device *d)
+{
+	struct gendisk *g = d->disk;
+
+	g->queue->make_request_fn		= flash_dev_make_request;
+	g->queue->backing_dev_info.congested_fn = flash_dev_congested;
+	d->cache_miss				= flash_dev_cache_miss;
+	d->ioctl				= flash_dev_ioctl;
+}
+
+void bch_request_exit(void)
+{
+	if (bch_search_cache)
+		kmem_cache_destroy(bch_search_cache);
+}
+
+int __init bch_request_init(void)
+{
+	bch_search_cache = KMEM_CACHE(search, 0);
+	if (!bch_search_cache)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
new file mode 100644
index 00000000000..1ff36875c2b
--- /dev/null
+++ b/drivers/md/bcache/request.h
@@ -0,0 +1,43 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+
+struct data_insert_op {
+	struct closure		cl;
+	struct cache_set	*c;
+	struct bio		*bio;
+	struct workqueue_struct *wq;
+
+	unsigned		inode;
+	uint16_t		write_point;
+	uint16_t		write_prio;
+	short			error;
+
+	union {
+		uint16_t	flags;
+
+	struct {
+		unsigned	bypass:1;
+		unsigned	writeback:1;
+		unsigned	flush_journal:1;
+		unsigned	csum:1;
+
+		unsigned	replace:1;
+		unsigned	replace_collision:1;
+
+		unsigned	insert_data_done:1;
+	};
+	};
+
+	struct keylist		insert_keys;
+	BKEY_PADDED(replace_key);
+};
+
+unsigned bch_get_congested(struct cache_set *);
+void bch_data_insert(struct closure *cl);
+
+void bch_cached_dev_request_init(struct cached_dev *dc);
+void bch_flash_dev_request_init(struct bcache_device *d);
+
+extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+
+#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
new file mode 100644
index 00000000000..0ca072c20d0
--- /dev/null
+++ b/drivers/md/bcache/stats.c
@@ -0,0 +1,241 @@
+/*
+ * bcache stats code
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "stats.h"
+#include "btree.h"
+#include "sysfs.h"
+
+/*
+ * We keep absolute totals of various statistics, and addionally a set of three
+ * rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * accounting_rescale[] is how many times the timer has to go off before we
+ * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
+ * and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
+
+static const unsigned DAY_RESCALE		= 288;
+static const unsigned HOUR_RESCALE		= 12;
+static const unsigned FIVE_MINUTE_RESCALE	= 1;
+static const unsigned accounting_delay		= (HZ * 300) / 22;
+static const unsigned accounting_weight		= 32;
+
+/* sysfs reading/writing */
+
+read_attribute(cache_hits);
+read_attribute(cache_misses);
+read_attribute(cache_bypass_hits);
+read_attribute(cache_bypass_misses);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_readaheads);
+read_attribute(cache_miss_collisions);
+read_attribute(bypassed);
+
+SHOW(bch_stats)
+{
+	struct cache_stats *s =
+		container_of(kobj, struct cache_stats, kobj);
+#define var(stat)		(s->stat >> 16)
+	var_print(cache_hits);
+	var_print(cache_misses);
+	var_print(cache_bypass_hits);
+	var_print(cache_bypass_misses);
+
+	sysfs_print(cache_hit_ratio,
+		    DIV_SAFE(var(cache_hits) * 100,
+			     var(cache_hits) + var(cache_misses)));
+
+	var_print(cache_readaheads);
+	var_print(cache_miss_collisions);
+	sysfs_hprint(bypassed,	var(sectors_bypassed) << 9);
+#undef var
+	return 0;
+}
+
+STORE(bch_stats)
+{
+	return size;
+}
+
+static void bch_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_stats_files[] = {
+	&sysfs_cache_hits,
+	&sysfs_cache_misses,
+	&sysfs_cache_bypass_hits,
+	&sysfs_cache_bypass_misses,
+	&sysfs_cache_hit_ratio,
+	&sysfs_cache_readaheads,
+	&sysfs_cache_miss_collisions,
+	&sysfs_bypassed,
+	NULL
+};
+static KTYPE(bch_stats);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+				   struct kobject *parent)
+{
+	int ret = kobject_add(&acc->total.kobj, parent,
+			      "stats_total");
+	ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
+				 "stats_five_minute");
+	ret = ret ?: kobject_add(&acc->hour.kobj, parent,
+				 "stats_hour");
+	ret = ret ?: kobject_add(&acc->day.kobj, parent,
+				 "stats_day");
+	return ret;
+}
+
+void bch_cache_accounting_clear(struct cache_accounting *acc)
+{
+	memset(&acc->total.cache_hits,
+	       0,
+	       sizeof(unsigned long) * 7);
+}
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc)
+{
+	kobject_put(&acc->total.kobj);
+	kobject_put(&acc->five_minute.kobj);
+	kobject_put(&acc->hour.kobj);
+	kobject_put(&acc->day.kobj);
+
+	atomic_set(&acc->closing, 1);
+	if (del_timer_sync(&acc->timer))
+		closure_return(&acc->cl);
+}
+
+/* EWMA scaling */
+
+static void scale_stat(unsigned long *stat)
+{
+	*stat =  ewma_add(*stat, 0, accounting_weight, 0);
+}
+
+static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
+{
+	if (++stats->rescale == rescale_at) {
+		stats->rescale = 0;
+		scale_stat(&stats->cache_hits);
+		scale_stat(&stats->cache_misses);
+		scale_stat(&stats->cache_bypass_hits);
+		scale_stat(&stats->cache_bypass_misses);
+		scale_stat(&stats->cache_readaheads);
+		scale_stat(&stats->cache_miss_collisions);
+		scale_stat(&stats->sectors_bypassed);
+	}
+}
+
+static void scale_accounting(unsigned long data)
+{
+	struct cache_accounting *acc = (struct cache_accounting *) data;
+
+#define move_stat(name) do {						\
+	unsigned t = atomic_xchg(&acc->collector.name, 0);		\
+	t <<= 16;							\
+	acc->five_minute.name += t;					\
+	acc->hour.name += t;						\
+	acc->day.name += t;						\
+	acc->total.name += t;						\
+} while (0)
+
+	move_stat(cache_hits);
+	move_stat(cache_misses);
+	move_stat(cache_bypass_hits);
+	move_stat(cache_bypass_misses);
+	move_stat(cache_readaheads);
+	move_stat(cache_miss_collisions);
+	move_stat(sectors_bypassed);
+
+	scale_stats(&acc->total, 0);
+	scale_stats(&acc->day, DAY_RESCALE);
+	scale_stats(&acc->hour, HOUR_RESCALE);
+	scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
+
+	acc->timer.expires += accounting_delay;
+
+	if (!atomic_read(&acc->closing))
+		add_timer(&acc->timer);
+	else
+		closure_return(&acc->cl);
+}
+
+static void mark_cache_stats(struct cache_stat_collector *stats,
+			     bool hit, bool bypass)
+{
+	if (!bypass)
+		if (hit)
+			atomic_inc(&stats->cache_hits);
+		else
+			atomic_inc(&stats->cache_misses);
+	else
+		if (hit)
+			atomic_inc(&stats->cache_bypass_hits);
+		else
+			atomic_inc(&stats->cache_bypass_misses);
+}
+
+void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
+			       bool hit, bool bypass)
+{
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	mark_cache_stats(&dc->accounting.collector, hit, bypass);
+	mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
+{
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	atomic_inc(&dc->accounting.collector.cache_readaheads);
+	atomic_inc(&c->accounting.collector.cache_readaheads);
+}
+
+void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
+{
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	atomic_inc(&dc->accounting.collector.cache_miss_collisions);
+	atomic_inc(&c->accounting.collector.cache_miss_collisions);
+}
+
+void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
+			       int sectors)
+{
+	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+	atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+			       struct closure *parent)
+{
+	kobject_init(&acc->total.kobj,		&bch_stats_ktype);
+	kobject_init(&acc->five_minute.kobj,	&bch_stats_ktype);
+	kobject_init(&acc->hour.kobj,		&bch_stats_ktype);
+	kobject_init(&acc->day.kobj,		&bch_stats_ktype);
+
+	closure_init(&acc->cl, parent);
+	init_timer(&acc->timer);
+	acc->timer.expires	= jiffies + accounting_delay;
+	acc->timer.data		= (unsigned long) acc;
+	acc->timer.function	= scale_accounting;
+	add_timer(&acc->timer);
+}
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
new file mode 100644
index 00000000000..adbff141c88
--- /dev/null
+++ b/drivers/md/bcache/stats.h
@@ -0,0 +1,61 @@
+#ifndef _BCACHE_STATS_H_
+#define _BCACHE_STATS_H_
+
+struct cache_stat_collector {
+	atomic_t cache_hits;
+	atomic_t cache_misses;
+	atomic_t cache_bypass_hits;
+	atomic_t cache_bypass_misses;
+	atomic_t cache_readaheads;
+	atomic_t cache_miss_collisions;
+	atomic_t sectors_bypassed;
+};
+
+struct cache_stats {
+	struct kobject		kobj;
+
+	unsigned long cache_hits;
+	unsigned long cache_misses;
+	unsigned long cache_bypass_hits;
+	unsigned long cache_bypass_misses;
+	unsigned long cache_readaheads;
+	unsigned long cache_miss_collisions;
+	unsigned long sectors_bypassed;
+
+	unsigned		rescale;
+};
+
+struct cache_accounting {
+	struct closure		cl;
+	struct timer_list	timer;
+	atomic_t		closing;
+
+	struct cache_stat_collector collector;
+
+	struct cache_stats total;
+	struct cache_stats five_minute;
+	struct cache_stats hour;
+	struct cache_stats day;
+};
+
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+			       struct closure *parent);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+				   struct kobject *parent);
+
+void bch_cache_accounting_clear(struct cache_accounting *acc);
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc);
+
+void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *,
+			       bool, bool);
+void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *);
+void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *);
+void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int);
+
+#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
new file mode 100644
index 00000000000..926ded8ccbf
--- /dev/null
+++ b/drivers/md/bcache/super.c
@@ -0,0 +1,2102 @@
+/*
+ * bcache setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/sysfs.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+static const char bcache_magic[] = {
+	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
+	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
+};
+
+static const char invalid_uuid[] = {
+	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
+	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+const char * const bch_cache_modes[] = {
+	"default",
+	"writethrough",
+	"writeback",
+	"writearound",
+	"none",
+	NULL
+};
+
+static struct kobject *bcache_kobj;
+struct mutex bch_register_lock;
+LIST_HEAD(bch_cache_sets);
+static LIST_HEAD(uncached_devices);
+
+static int bcache_major;
+static DEFINE_IDA(bcache_minor);
+static wait_queue_head_t unregister_wait;
+struct workqueue_struct *bcache_wq;
+
+#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
+
+static void bio_split_pool_free(struct bio_split_pool *p)
+{
+	if (p->bio_split_hook)
+		mempool_destroy(p->bio_split_hook);
+
+	if (p->bio_split)
+		bioset_free(p->bio_split);
+}
+
+static int bio_split_pool_init(struct bio_split_pool *p)
+{
+	p->bio_split = bioset_create(4, 0);
+	if (!p->bio_split)
+		return -ENOMEM;
+
+	p->bio_split_hook = mempool_create_kmalloc_pool(4,
+				sizeof(struct bio_split_hook));
+	if (!p->bio_split_hook)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Superblock */
+
+static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+			      struct page **res)
+{
+	const char *err;
+	struct cache_sb *s;
+	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
+	unsigned i;
+
+	if (!bh)
+		return "IO error";
+
+	s = (struct cache_sb *) bh->b_data;
+
+	sb->offset		= le64_to_cpu(s->offset);
+	sb->version		= le64_to_cpu(s->version);
+
+	memcpy(sb->magic,	s->magic, 16);
+	memcpy(sb->uuid,	s->uuid, 16);
+	memcpy(sb->set_uuid,	s->set_uuid, 16);
+	memcpy(sb->label,	s->label, SB_LABEL_SIZE);
+
+	sb->flags		= le64_to_cpu(s->flags);
+	sb->seq			= le64_to_cpu(s->seq);
+	sb->last_mount		= le32_to_cpu(s->last_mount);
+	sb->first_bucket	= le16_to_cpu(s->first_bucket);
+	sb->keys		= le16_to_cpu(s->keys);
+
+	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
+		sb->d[i] = le64_to_cpu(s->d[i]);
+
+	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+		 sb->version, sb->flags, sb->seq, sb->keys);
+
+	err = "Not a bcache superblock";
+	if (sb->offset != SB_SECTOR)
+		goto err;
+
+	if (memcmp(sb->magic, bcache_magic, 16))
+		goto err;
+
+	err = "Too many journal buckets";
+	if (sb->keys > SB_JOURNAL_BUCKETS)
+		goto err;
+
+	err = "Bad checksum";
+	if (s->csum != csum_set(s))
+		goto err;
+
+	err = "Bad UUID";
+	if (bch_is_zero(sb->uuid, 16))
+		goto err;
+
+	sb->block_size	= le16_to_cpu(s->block_size);
+
+	err = "Superblock block size smaller than device block size";
+	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
+		goto err;
+
+	switch (sb->version) {
+	case BCACHE_SB_VERSION_BDEV:
+		sb->data_offset	= BDEV_DATA_START_DEFAULT;
+		break;
+	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+		sb->data_offset	= le64_to_cpu(s->data_offset);
+
+		err = "Bad data offset";
+		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
+			goto err;
+
+		break;
+	case BCACHE_SB_VERSION_CDEV:
+	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
+		sb->nbuckets	= le64_to_cpu(s->nbuckets);
+		sb->block_size	= le16_to_cpu(s->block_size);
+		sb->bucket_size	= le16_to_cpu(s->bucket_size);
+
+		sb->nr_in_set	= le16_to_cpu(s->nr_in_set);
+		sb->nr_this_dev	= le16_to_cpu(s->nr_this_dev);
+
+		err = "Too many buckets";
+		if (sb->nbuckets > LONG_MAX)
+			goto err;
+
+		err = "Not enough buckets";
+		if (sb->nbuckets < 1 << 7)
+			goto err;
+
+		err = "Bad block/bucket size";
+		if (!is_power_of_2(sb->block_size) ||
+		    sb->block_size > PAGE_SECTORS ||
+		    !is_power_of_2(sb->bucket_size) ||
+		    sb->bucket_size < PAGE_SECTORS)
+			goto err;
+
+		err = "Invalid superblock: device too small";
+		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
+			goto err;
+
+		err = "Bad UUID";
+		if (bch_is_zero(sb->set_uuid, 16))
+			goto err;
+
+		err = "Bad cache device number in set";
+		if (!sb->nr_in_set ||
+		    sb->nr_in_set <= sb->nr_this_dev ||
+		    sb->nr_in_set > MAX_CACHES_PER_SET)
+			goto err;
+
+		err = "Journal buckets not sequential";
+		for (i = 0; i < sb->keys; i++)
+			if (sb->d[i] != sb->first_bucket + i)
+				goto err;
+
+		err = "Too many journal buckets";
+		if (sb->first_bucket + sb->keys > sb->nbuckets)
+			goto err;
+
+		err = "Invalid superblock: first bucket comes before end of super";
+		if (sb->first_bucket * sb->bucket_size < 16)
+			goto err;
+
+		break;
+	default:
+		err = "Unsupported superblock version";
+		goto err;
+	}
+
+	sb->last_mount = get_seconds();
+	err = NULL;
+
+	get_page(bh->b_page);
+	*res = bh->b_page;
+err:
+	put_bh(bh);
+	return err;
+}
+
+static void write_bdev_super_endio(struct bio *bio, int error)
+{
+	struct cached_dev *dc = bio->bi_private;
+	/* XXX: error checking */
+
+	closure_put(&dc->sb_write);
+}
+
+static void __write_super(struct cache_sb *sb, struct bio *bio)
+{
+	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+	unsigned i;
+
+	bio->bi_iter.bi_sector	= SB_SECTOR;
+	bio->bi_rw		= REQ_SYNC|REQ_META;
+	bio->bi_iter.bi_size	= SB_SIZE;
+	bch_bio_map(bio, NULL);
+
+	out->offset		= cpu_to_le64(sb->offset);
+	out->version		= cpu_to_le64(sb->version);
+
+	memcpy(out->uuid,	sb->uuid, 16);
+	memcpy(out->set_uuid,	sb->set_uuid, 16);
+	memcpy(out->label,	sb->label, SB_LABEL_SIZE);
+
+	out->flags		= cpu_to_le64(sb->flags);
+	out->seq		= cpu_to_le64(sb->seq);
+
+	out->last_mount		= cpu_to_le32(sb->last_mount);
+	out->first_bucket	= cpu_to_le16(sb->first_bucket);
+	out->keys		= cpu_to_le16(sb->keys);
+
+	for (i = 0; i < sb->keys; i++)
+		out->d[i] = cpu_to_le64(sb->d[i]);
+
+	out->csum = csum_set(out);
+
+	pr_debug("ver %llu, flags %llu, seq %llu",
+		 sb->version, sb->flags, sb->seq);
+
+	submit_bio(REQ_WRITE, bio);
+}
+
+static void bch_write_bdev_super_unlock(struct closure *cl)
+{
+	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+
+	up(&dc->sb_write_mutex);
+}
+
+void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
+{
+	struct closure *cl = &dc->sb_write;
+	struct bio *bio = &dc->sb_bio;
+
+	down(&dc->sb_write_mutex);
+	closure_init(cl, parent);
+
+	bio_reset(bio);
+	bio->bi_bdev	= dc->bdev;
+	bio->bi_end_io	= write_bdev_super_endio;
+	bio->bi_private = dc;
+
+	closure_get(cl);
+	__write_super(&dc->sb, bio);
+
+	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
+}
+
+static void write_super_endio(struct bio *bio, int error)
+{
+	struct cache *ca = bio->bi_private;
+
+	bch_count_io_errors(ca, error, "writing superblock");
+	closure_put(&ca->set->sb_write);
+}
+
+static void bcache_write_super_unlock(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+
+	up(&c->sb_write_mutex);
+}
+
+void bcache_write_super(struct cache_set *c)
+{
+	struct closure *cl = &c->sb_write;
+	struct cache *ca;
+	unsigned i;
+
+	down(&c->sb_write_mutex);
+	closure_init(cl, &c->cl);
+
+	c->sb.seq++;
+
+	for_each_cache(ca, c, i) {
+		struct bio *bio = &ca->sb_bio;
+
+		ca->sb.version		= BCACHE_SB_VERSION_CDEV_WITH_UUID;
+		ca->sb.seq		= c->sb.seq;
+		ca->sb.last_mount	= c->sb.last_mount;
+
+		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
+
+		bio_reset(bio);
+		bio->bi_bdev	= ca->bdev;
+		bio->bi_end_io	= write_super_endio;
+		bio->bi_private = ca;
+
+		closure_get(cl);
+		__write_super(&ca->sb, bio);
+	}
+
+	closure_return_with_destructor(cl, bcache_write_super_unlock);
+}
+
+/* UUID io */
+
+static void uuid_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+
+	cache_set_err_on(error, c, "accessing uuids");
+	bch_bbio_free(bio, c);
+	closure_put(cl);
+}
+
+static void uuid_io_unlock(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+
+	up(&c->uuid_write_mutex);
+}
+
+static void uuid_io(struct cache_set *c, unsigned long rw,
+		    struct bkey *k, struct closure *parent)
+{
+	struct closure *cl = &c->uuid_write;
+	struct uuid_entry *u;
+	unsigned i;
+	char buf[80];
+
+	BUG_ON(!parent);
+	down(&c->uuid_write_mutex);
+	closure_init(cl, parent);
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		struct bio *bio = bch_bbio_alloc(c);
+
+		bio->bi_rw	= REQ_SYNC|REQ_META|rw;
+		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
+
+		bio->bi_end_io	= uuid_endio;
+		bio->bi_private = cl;
+		bch_bio_map(bio, c->uuids);
+
+		bch_submit_bbio(bio, c, k, i);
+
+		if (!(rw & WRITE))
+			break;
+	}
+
+	bch_extent_to_text(buf, sizeof(buf), k);
+	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
+
+	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
+		if (!bch_is_zero(u->uuid, 16))
+			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
+				 u - c->uuids, u->uuid, u->label,
+				 u->first_reg, u->last_reg, u->invalidated);
+
+	closure_return_with_destructor(cl, uuid_io_unlock);
+}
+
+static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
+{
+	struct bkey *k = &j->uuid_bucket;
+
+	if (__bch_btree_ptr_invalid(c, k))
+		return "bad uuid pointer";
+
+	bkey_copy(&c->uuid_bucket, k);
+	uuid_io(c, READ_SYNC, k, cl);
+
+	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
+		struct uuid_entry_v0	*u0 = (void *) c->uuids;
+		struct uuid_entry	*u1 = (void *) c->uuids;
+		int i;
+
+		closure_sync(cl);
+
+		/*
+		 * Since the new uuid entry is bigger than the old, we have to
+		 * convert starting at the highest memory address and work down
+		 * in order to do it in place
+		 */
+
+		for (i = c->nr_uuids - 1;
+		     i >= 0;
+		     --i) {
+			memcpy(u1[i].uuid,	u0[i].uuid, 16);
+			memcpy(u1[i].label,	u0[i].label, 32);
+
+			u1[i].first_reg		= u0[i].first_reg;
+			u1[i].last_reg		= u0[i].last_reg;
+			u1[i].invalidated	= u0[i].invalidated;
+
+			u1[i].flags	= 0;
+			u1[i].sectors	= 0;
+		}
+	}
+
+	return NULL;
+}
+
+static int __uuid_write(struct cache_set *c)
+{
+	BKEY_PADDED(key) k;
+	struct closure cl;
+	closure_init_stack(&cl);
+
+	lockdep_assert_held(&bch_register_lock);
+
+	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
+		return 1;
+
+	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
+	uuid_io(c, REQ_WRITE, &k.key, &cl);
+	closure_sync(&cl);
+
+	bkey_copy(&c->uuid_bucket, &k.key);
+	bkey_put(c, &k.key);
+	return 0;
+}
+
+int bch_uuid_write(struct cache_set *c)
+{
+	int ret = __uuid_write(c);
+
+	if (!ret)
+		bch_journal_meta(c, NULL);
+
+	return ret;
+}
+
+static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
+{
+	struct uuid_entry *u;
+
+	for (u = c->uuids;
+	     u < c->uuids + c->nr_uuids; u++)
+		if (!memcmp(u->uuid, uuid, 16))
+			return u;
+
+	return NULL;
+}
+
+static struct uuid_entry *uuid_find_empty(struct cache_set *c)
+{
+	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+	return uuid_find(c, zero_uuid);
+}
+
+/*
+ * Bucket priorities/gens:
+ *
+ * For each bucket, we store on disk its
+   * 8 bit gen
+   * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written, if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, and in as many buckets are required
+ * to fit them all. The buckets we use to store them form a list; the journal
+ * header points to the first bucket, the first bucket points to the second
+ * bucket, et cetera.
+ *
+ * This code is used by the allocation code; periodically (whenever it runs out
+ * of buckets to allocate from) the allocation code will invalidate some
+ * buckets, but it can't use those buckets until their new gens are safely on
+ * disk.
+ */
+
+static void prio_endio(struct bio *bio, int error)
+{
+	struct cache *ca = bio->bi_private;
+
+	cache_set_err_on(error, ca->set, "accessing priorities");
+	bch_bbio_free(bio, ca->set);
+	closure_put(&ca->prio);
+}
+
+static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
+{
+	struct closure *cl = &ca->prio;
+	struct bio *bio = bch_bbio_alloc(ca->set);
+
+	closure_init_stack(cl);
+
+	bio->bi_iter.bi_sector	= bucket * ca->sb.bucket_size;
+	bio->bi_bdev		= ca->bdev;
+	bio->bi_rw		= REQ_SYNC|REQ_META|rw;
+	bio->bi_iter.bi_size	= bucket_bytes(ca);
+
+	bio->bi_end_io	= prio_endio;
+	bio->bi_private = ca;
+	bch_bio_map(bio, ca->disk_buckets);
+
+	closure_bio_submit(bio, &ca->prio, ca);
+	closure_sync(cl);
+}
+
+void bch_prio_write(struct cache *ca)
+{
+	int i;
+	struct bucket *b;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	lockdep_assert_held(&ca->set->bucket_lock);
+
+	ca->disk_buckets->seq++;
+
+	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
+			&ca->meta_sectors_written);
+
+	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+
+	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+		long bucket;
+		struct prio_set *p = ca->disk_buckets;
+		struct bucket_disk *d = p->data;
+		struct bucket_disk *end = d + prios_per_bucket(ca);
+
+		for (b = ca->buckets + i * prios_per_bucket(ca);
+		     b < ca->buckets + ca->sb.nbuckets && d < end;
+		     b++, d++) {
+			d->prio = cpu_to_le16(b->prio);
+			d->gen = b->gen;
+		}
+
+		p->next_bucket	= ca->prio_buckets[i + 1];
+		p->magic	= pset_magic(&ca->sb);
+		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+
+		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
+		BUG_ON(bucket == -1);
+
+		mutex_unlock(&ca->set->bucket_lock);
+		prio_io(ca, bucket, REQ_WRITE);
+		mutex_lock(&ca->set->bucket_lock);
+
+		ca->prio_buckets[i] = bucket;
+		atomic_dec_bug(&ca->buckets[bucket].pin);
+	}
+
+	mutex_unlock(&ca->set->bucket_lock);
+
+	bch_journal_meta(ca->set, &cl);
+	closure_sync(&cl);
+
+	mutex_lock(&ca->set->bucket_lock);
+
+	/*
+	 * Don't want the old priorities to get garbage collected until after we
+	 * finish writing the new ones, and they're journalled
+	 */
+	for (i = 0; i < prio_buckets(ca); i++) {
+		if (ca->prio_last_buckets[i])
+			__bch_bucket_free(ca,
+				&ca->buckets[ca->prio_last_buckets[i]]);
+
+		ca->prio_last_buckets[i] = ca->prio_buckets[i];
+	}
+}
+
+static void prio_read(struct cache *ca, uint64_t bucket)
+{
+	struct prio_set *p = ca->disk_buckets;
+	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
+	struct bucket *b;
+	unsigned bucket_nr = 0;
+
+	for (b = ca->buckets;
+	     b < ca->buckets + ca->sb.nbuckets;
+	     b++, d++) {
+		if (d == end) {
+			ca->prio_buckets[bucket_nr] = bucket;
+			ca->prio_last_buckets[bucket_nr] = bucket;
+			bucket_nr++;
+
+			prio_io(ca, bucket, READ_SYNC);
+
+			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
+				pr_warn("bad csum reading priorities");
+
+			if (p->magic != pset_magic(&ca->sb))
+				pr_warn("bad magic reading priorities");
+
+			bucket = p->next_bucket;
+			d = p->data;
+		}
+
+		b->prio = le16_to_cpu(d->prio);
+		b->gen = b->last_gc = d->gen;
+	}
+}
+
+/* Bcache device */
+
+static int open_dev(struct block_device *b, fmode_t mode)
+{
+	struct bcache_device *d = b->bd_disk->private_data;
+	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
+		return -ENXIO;
+
+	closure_get(&d->cl);
+	return 0;
+}
+
+static void release_dev(struct gendisk *b, fmode_t mode)
+{
+	struct bcache_device *d = b->private_data;
+	closure_put(&d->cl);
+}
+
+static int ioctl_dev(struct block_device *b, fmode_t mode,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct bcache_device *d = b->bd_disk->private_data;
+	return d->ioctl(d, mode, cmd, arg);
+}
+
+static const struct block_device_operations bcache_ops = {
+	.open		= open_dev,
+	.release	= release_dev,
+	.ioctl		= ioctl_dev,
+	.owner		= THIS_MODULE,
+};
+
+void bcache_device_stop(struct bcache_device *d)
+{
+	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+		closure_queue(&d->cl);
+}
+
+static void bcache_device_unlink(struct bcache_device *d)
+{
+	lockdep_assert_held(&bch_register_lock);
+
+	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
+		unsigned i;
+		struct cache *ca;
+
+		sysfs_remove_link(&d->c->kobj, d->name);
+		sysfs_remove_link(&d->kobj, "cache");
+
+		for_each_cache(ca, d->c, i)
+			bd_unlink_disk_holder(ca->bdev, d->disk);
+	}
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+			       const char *name)
+{
+	unsigned i;
+	struct cache *ca;
+
+	for_each_cache(ca, d->c, i)
+		bd_link_disk_holder(ca->bdev, d->disk);
+
+	snprintf(d->name, BCACHEDEVNAME_SIZE,
+		 "%s%u", name, d->id);
+
+	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
+	     "Couldn't create device <-> cache set symlinks");
+}
+
+static void bcache_device_detach(struct bcache_device *d)
+{
+	lockdep_assert_held(&bch_register_lock);
+
+	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
+		struct uuid_entry *u = d->c->uuids + d->id;
+
+		SET_UUID_FLASH_ONLY(u, 0);
+		memcpy(u->uuid, invalid_uuid, 16);
+		u->invalidated = cpu_to_le32(get_seconds());
+		bch_uuid_write(d->c);
+	}
+
+	bcache_device_unlink(d);
+
+	d->c->devices[d->id] = NULL;
+	closure_put(&d->c->caching);
+	d->c = NULL;
+}
+
+static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
+				 unsigned id)
+{
+	BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
+
+	d->id = id;
+	d->c = c;
+	c->devices[id] = d;
+
+	closure_get(&c->caching);
+}
+
+static void bcache_device_free(struct bcache_device *d)
+{
+	lockdep_assert_held(&bch_register_lock);
+
+	pr_info("%s stopped", d->disk->disk_name);
+
+	if (d->c)
+		bcache_device_detach(d);
+	if (d->disk && d->disk->flags & GENHD_FL_UP)
+		del_gendisk(d->disk);
+	if (d->disk && d->disk->queue)
+		blk_cleanup_queue(d->disk->queue);
+	if (d->disk) {
+		ida_simple_remove(&bcache_minor, d->disk->first_minor);
+		put_disk(d->disk);
+	}
+
+	bio_split_pool_free(&d->bio_split_hook);
+	if (d->bio_split)
+		bioset_free(d->bio_split);
+	if (is_vmalloc_addr(d->full_dirty_stripes))
+		vfree(d->full_dirty_stripes);
+	else
+		kfree(d->full_dirty_stripes);
+	if (is_vmalloc_addr(d->stripe_sectors_dirty))
+		vfree(d->stripe_sectors_dirty);
+	else
+		kfree(d->stripe_sectors_dirty);
+
+	closure_debug_destroy(&d->cl);
+}
+
+static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+			      sector_t sectors)
+{
+	struct request_queue *q;
+	size_t n;
+	int minor;
+
+	if (!d->stripe_size)
+		d->stripe_size = 1 << 31;
+
+	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+
+	if (!d->nr_stripes ||
+	    d->nr_stripes > INT_MAX ||
+	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+		pr_err("nr_stripes too large");
+		return -ENOMEM;
+	}
+
+	n = d->nr_stripes * sizeof(atomic_t);
+	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+		? kzalloc(n, GFP_KERNEL)
+		: vzalloc(n);
+	if (!d->stripe_sectors_dirty)
+		return -ENOMEM;
+
+	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+	d->full_dirty_stripes = n < PAGE_SIZE << 6
+		? kzalloc(n, GFP_KERNEL)
+		: vzalloc(n);
+	if (!d->full_dirty_stripes)
+		return -ENOMEM;
+
+	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
+	if (minor < 0)
+		return minor;
+
+	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	    bio_split_pool_init(&d->bio_split_hook) ||
+	    !(d->disk = alloc_disk(1))) {
+		ida_simple_remove(&bcache_minor, minor);
+		return -ENOMEM;
+	}
+
+	set_capacity(d->disk, sectors);
+	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+
+	d->disk->major		= bcache_major;
+	d->disk->first_minor	= minor;
+	d->disk->fops		= &bcache_ops;
+	d->disk->private_data	= d;
+
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+
+	blk_queue_make_request(q, NULL);
+	d->disk->queue			= q;
+	q->queuedata			= d;
+	q->backing_dev_info.congested_data = d;
+	q->limits.max_hw_sectors	= UINT_MAX;
+	q->limits.max_sectors		= UINT_MAX;
+	q->limits.max_segment_size	= UINT_MAX;
+	q->limits.max_segments		= BIO_MAX_PAGES;
+	q->limits.max_discard_sectors	= UINT_MAX;
+	q->limits.discard_granularity	= 512;
+	q->limits.io_min		= block_size;
+	q->limits.logical_block_size	= block_size;
+	q->limits.physical_block_size	= block_size;
+	set_bit(QUEUE_FLAG_NONROT,	&d->disk->queue->queue_flags);
+	set_bit(QUEUE_FLAG_DISCARD,	&d->disk->queue->queue_flags);
+
+	blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
+
+	return 0;
+}
+
+/* Cached device */
+
+static void calc_cached_dev_sectors(struct cache_set *c)
+{
+	uint64_t sectors = 0;
+	struct cached_dev *dc;
+
+	list_for_each_entry(dc, &c->cached_devs, list)
+		sectors += bdev_sectors(dc->bdev);
+
+	c->cached_dev_sectors = sectors;
+}
+
+void bch_cached_dev_run(struct cached_dev *dc)
+{
+	struct bcache_device *d = &dc->disk;
+	char buf[SB_LABEL_SIZE + 1];
+	char *env[] = {
+		"DRIVER=bcache",
+		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
+		NULL,
+		NULL,
+	};
+
+	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+	buf[SB_LABEL_SIZE] = '\0';
+	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
+
+	if (atomic_xchg(&dc->running, 1))
+		return;
+
+	if (!d->c &&
+	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
+		struct closure cl;
+		closure_init_stack(&cl);
+
+		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
+		bch_write_bdev_super(dc, &cl);
+		closure_sync(&cl);
+	}
+
+	add_disk(d->disk);
+	bd_link_disk_holder(dc->bdev, dc->disk.disk);
+	/* won't show up in the uevent file, use udevadm monitor -e instead
+	 * only class / kset properties are persistent */
+	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
+	kfree(env[1]);
+	kfree(env[2]);
+
+	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+		pr_debug("error creating sysfs link");
+}
+
+static void cached_dev_detach_finish(struct work_struct *w)
+{
+	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
+	char buf[BDEVNAME_SIZE];
+	struct closure cl;
+	closure_init_stack(&cl);
+
+	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
+	BUG_ON(atomic_read(&dc->count));
+
+	mutex_lock(&bch_register_lock);
+
+	memset(&dc->sb.set_uuid, 0, 16);
+	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
+
+	bch_write_bdev_super(dc, &cl);
+	closure_sync(&cl);
+
+	bcache_device_detach(&dc->disk);
+	list_move(&dc->list, &uncached_devices);
+
+	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+
+	mutex_unlock(&bch_register_lock);
+
+	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
+
+	/* Drop ref we took in cached_dev_detach() */
+	closure_put(&dc->disk.cl);
+}
+
+void bch_cached_dev_detach(struct cached_dev *dc)
+{
+	lockdep_assert_held(&bch_register_lock);
+
+	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
+		return;
+
+	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+		return;
+
+	/*
+	 * Block the device from being closed and freed until we're finished
+	 * detaching
+	 */
+	closure_get(&dc->disk.cl);
+
+	bch_writeback_queue(dc);
+	cached_dev_put(dc);
+}
+
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+{
+	uint32_t rtime = cpu_to_le32(get_seconds());
+	struct uuid_entry *u;
+	char buf[BDEVNAME_SIZE];
+
+	bdevname(dc->bdev, buf);
+
+	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+		return -ENOENT;
+
+	if (dc->disk.c) {
+		pr_err("Can't attach %s: already attached", buf);
+		return -EINVAL;
+	}
+
+	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
+		pr_err("Can't attach %s: shutting down", buf);
+		return -EINVAL;
+	}
+
+	if (dc->sb.block_size < c->sb.block_size) {
+		/* Will die */
+		pr_err("Couldn't attach %s: block size less than set's block size",
+		       buf);
+		return -EINVAL;
+	}
+
+	u = uuid_find(c, dc->sb.uuid);
+
+	if (u &&
+	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
+	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
+		memcpy(u->uuid, invalid_uuid, 16);
+		u->invalidated = cpu_to_le32(get_seconds());
+		u = NULL;
+	}
+
+	if (!u) {
+		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+			pr_err("Couldn't find uuid for %s in set", buf);
+			return -ENOENT;
+		}
+
+		u = uuid_find_empty(c);
+		if (!u) {
+			pr_err("Not caching %s, no room for UUID", buf);
+			return -EINVAL;
+		}
+	}
+
+	/* Deadlocks since we're called via sysfs...
+	sysfs_remove_file(&dc->kobj, &sysfs_attach);
+	 */
+
+	if (bch_is_zero(u->uuid, 16)) {
+		struct closure cl;
+		closure_init_stack(&cl);
+
+		memcpy(u->uuid, dc->sb.uuid, 16);
+		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
+		u->first_reg = u->last_reg = rtime;
+		bch_uuid_write(c);
+
+		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
+		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+
+		bch_write_bdev_super(dc, &cl);
+		closure_sync(&cl);
+	} else {
+		u->last_reg = rtime;
+		bch_uuid_write(c);
+	}
+
+	bcache_device_attach(&dc->disk, c, u - c->uuids);
+	list_move(&dc->list, &c->cached_devs);
+	calc_cached_dev_sectors(c);
+
+	smp_wmb();
+	/*
+	 * dc->c must be set before dc->count != 0 - paired with the mb in
+	 * cached_dev_get()
+	 */
+	atomic_set(&dc->count, 1);
+
+	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+		bch_sectors_dirty_init(dc);
+		atomic_set(&dc->has_dirty, 1);
+		atomic_inc(&dc->count);
+		bch_writeback_queue(dc);
+	}
+
+	bch_cached_dev_run(dc);
+	bcache_device_link(&dc->disk, c, "bdev");
+
+	pr_info("Caching %s as %s on set %pU",
+		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
+		dc->disk.c->sb.set_uuid);
+	return 0;
+}
+
+void bch_cached_dev_release(struct kobject *kobj)
+{
+	struct cached_dev *dc = container_of(kobj, struct cached_dev,
+					     disk.kobj);
+	kfree(dc);
+	module_put(THIS_MODULE);
+}
+
+static void cached_dev_free(struct closure *cl)
+{
+	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+
+	cancel_delayed_work_sync(&dc->writeback_rate_update);
+	kthread_stop(dc->writeback_thread);
+
+	mutex_lock(&bch_register_lock);
+
+	if (atomic_read(&dc->running))
+		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
+	bcache_device_free(&dc->disk);
+	list_del(&dc->list);
+
+	mutex_unlock(&bch_register_lock);
+
+	if (!IS_ERR_OR_NULL(dc->bdev)) {
+		if (dc->bdev->bd_disk)
+			blk_sync_queue(bdev_get_queue(dc->bdev));
+
+		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+	}
+
+	wake_up(&unregister_wait);
+
+	kobject_put(&dc->disk.kobj);
+}
+
+static void cached_dev_flush(struct closure *cl)
+{
+	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+	struct bcache_device *d = &dc->disk;
+
+	mutex_lock(&bch_register_lock);
+	bcache_device_unlink(d);
+	mutex_unlock(&bch_register_lock);
+
+	bch_cache_accounting_destroy(&dc->accounting);
+	kobject_del(&d->kobj);
+
+	continue_at(cl, cached_dev_free, system_wq);
+}
+
+static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+{
+	int ret;
+	struct io *io;
+	struct request_queue *q = bdev_get_queue(dc->bdev);
+
+	__module_get(THIS_MODULE);
+	INIT_LIST_HEAD(&dc->list);
+	closure_init(&dc->disk.cl, NULL);
+	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
+	INIT_WORK(&dc->detach, cached_dev_detach_finish);
+	sema_init(&dc->sb_write_mutex, 1);
+	INIT_LIST_HEAD(&dc->io_lru);
+	spin_lock_init(&dc->io_lock);
+	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+
+	dc->sequential_cutoff		= 4 << 20;
+
+	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
+		list_add(&io->lru, &dc->io_lru);
+		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
+	}
+
+	dc->disk.stripe_size = q->limits.io_opt >> 9;
+
+	if (dc->disk.stripe_size)
+		dc->partial_stripes_expensive =
+			q->limits.raid_partial_stripes_expensive;
+
+	ret = bcache_device_init(&dc->disk, block_size,
+			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+	if (ret)
+		return ret;
+
+	set_capacity(dc->disk.disk,
+		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+
+	dc->disk.disk->queue->backing_dev_info.ra_pages =
+		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
+		    q->backing_dev_info.ra_pages);
+
+	bch_cached_dev_request_init(dc);
+	bch_cached_dev_writeback_init(dc);
+	return 0;
+}
+
+/* Cached device - bcache superblock */
+
+static void register_bdev(struct cache_sb *sb, struct page *sb_page,
+				 struct block_device *bdev,
+				 struct cached_dev *dc)
+{
+	char name[BDEVNAME_SIZE];
+	const char *err = "cannot allocate memory";
+	struct cache_set *c;
+
+	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
+	dc->bdev = bdev;
+	dc->bdev->bd_holder = dc;
+
+	bio_init(&dc->sb_bio);
+	dc->sb_bio.bi_max_vecs	= 1;
+	dc->sb_bio.bi_io_vec	= dc->sb_bio.bi_inline_vecs;
+	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+	get_page(sb_page);
+
+	if (cached_dev_init(dc, sb->block_size << 9))
+		goto err;
+
+	err = "error creating kobject";
+	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
+			"bcache"))
+		goto err;
+	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
+		goto err;
+
+	pr_info("registered backing device %s", bdevname(bdev, name));
+
+	list_add(&dc->list, &uncached_devices);
+	list_for_each_entry(c, &bch_cache_sets, list)
+		bch_cached_dev_attach(dc, c);
+
+	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
+	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
+		bch_cached_dev_run(dc);
+
+	return;
+err:
+	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
+	bcache_device_stop(&dc->disk);
+}
+
+/* Flash only volumes */
+
+void bch_flash_dev_release(struct kobject *kobj)
+{
+	struct bcache_device *d = container_of(kobj, struct bcache_device,
+					       kobj);
+	kfree(d);
+}
+
+static void flash_dev_free(struct closure *cl)
+{
+	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	bcache_device_free(d);
+	kobject_put(&d->kobj);
+}
+
+static void flash_dev_flush(struct closure *cl)
+{
+	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+	bcache_device_unlink(d);
+	kobject_del(&d->kobj);
+	continue_at(cl, flash_dev_free, system_wq);
+}
+
+static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
+{
+	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
+					  GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	closure_init(&d->cl, NULL);
+	set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+
+	kobject_init(&d->kobj, &bch_flash_dev_ktype);
+
+	if (bcache_device_init(d, block_bytes(c), u->sectors))
+		goto err;
+
+	bcache_device_attach(d, c, u - c->uuids);
+	bch_flash_dev_request_init(d);
+	add_disk(d->disk);
+
+	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+		goto err;
+
+	bcache_device_link(d, c, "volume");
+
+	return 0;
+err:
+	kobject_put(&d->kobj);
+	return -ENOMEM;
+}
+
+static int flash_devs_run(struct cache_set *c)
+{
+	int ret = 0;
+	struct uuid_entry *u;
+
+	for (u = c->uuids;
+	     u < c->uuids + c->nr_uuids && !ret;
+	     u++)
+		if (UUID_FLASH_ONLY(u))
+			ret = flash_dev_run(c, u);
+
+	return ret;
+}
+
+int bch_flash_dev_create(struct cache_set *c, uint64_t size)
+{
+	struct uuid_entry *u;
+
+	if (test_bit(CACHE_SET_STOPPING, &c->flags))
+		return -EINTR;
+
+	u = uuid_find_empty(c);
+	if (!u) {
+		pr_err("Can't create volume, no room for UUID");
+		return -EINVAL;
+	}
+
+	get_random_bytes(u->uuid, 16);
+	memset(u->label, 0, 32);
+	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+
+	SET_UUID_FLASH_ONLY(u, 1);
+	u->sectors = size >> 9;
+
+	bch_uuid_write(c);
+
+	return flash_dev_run(c, u);
+}
+
+/* Cache set */
+
+__printf(2, 3)
+bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
+{
+	va_list args;
+
+	if (c->on_error != ON_ERROR_PANIC &&
+	    test_bit(CACHE_SET_STOPPING, &c->flags))
+		return false;
+
+	/* XXX: we can be called from atomic context
+	acquire_console_sem();
+	*/
+
+	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	printk(", disabling caching\n");
+
+	if (c->on_error == ON_ERROR_PANIC)
+		panic("panic forced after error\n");
+
+	bch_cache_set_unregister(c);
+	return true;
+}
+
+void bch_cache_set_release(struct kobject *kobj)
+{
+	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+	kfree(c);
+	module_put(THIS_MODULE);
+}
+
+static void cache_set_free(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, cl);
+	struct cache *ca;
+	unsigned i;
+
+	if (!IS_ERR_OR_NULL(c->debug))
+		debugfs_remove(c->debug);
+
+	bch_open_buckets_free(c);
+	bch_btree_cache_free(c);
+	bch_journal_free(c);
+
+	for_each_cache(ca, c, i)
+		if (ca)
+			kobject_put(&ca->kobj);
+
+	bch_bset_sort_state_free(&c->sort);
+	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
+
+	if (c->moving_gc_wq)
+		destroy_workqueue(c->moving_gc_wq);
+	if (c->bio_split)
+		bioset_free(c->bio_split);
+	if (c->fill_iter)
+		mempool_destroy(c->fill_iter);
+	if (c->bio_meta)
+		mempool_destroy(c->bio_meta);
+	if (c->search)
+		mempool_destroy(c->search);
+	kfree(c->devices);
+
+	mutex_lock(&bch_register_lock);
+	list_del(&c->list);
+	mutex_unlock(&bch_register_lock);
+
+	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
+	wake_up(&unregister_wait);
+
+	closure_debug_destroy(&c->cl);
+	kobject_put(&c->kobj);
+}
+
+static void cache_set_flush(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, caching);
+	struct cache *ca;
+	struct btree *b;
+	unsigned i;
+
+	bch_cache_accounting_destroy(&c->accounting);
+
+	kobject_put(&c->internal);
+	kobject_del(&c->kobj);
+
+	if (c->gc_thread)
+		kthread_stop(c->gc_thread);
+
+	if (!IS_ERR_OR_NULL(c->root))
+		list_add(&c->root->list, &c->btree_cache);
+
+	/* Should skip this if we're unregistering because of an error */
+	list_for_each_entry(b, &c->btree_cache, list) {
+		mutex_lock(&b->write_lock);
+		if (btree_node_dirty(b))
+			__bch_btree_node_write(b, NULL);
+		mutex_unlock(&b->write_lock);
+	}
+
+	for_each_cache(ca, c, i)
+		if (ca->alloc_thread)
+			kthread_stop(ca->alloc_thread);
+
+	cancel_delayed_work_sync(&c->journal.work);
+	/* flush last journal entry if needed */
+	c->journal.work.work.func(&c->journal.work.work);
+
+	closure_return(cl);
+}
+
+static void __cache_set_unregister(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, caching);
+	struct cached_dev *dc;
+	size_t i;
+
+	mutex_lock(&bch_register_lock);
+
+	for (i = 0; i < c->nr_uuids; i++)
+		if (c->devices[i]) {
+			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+				dc = container_of(c->devices[i],
+						  struct cached_dev, disk);
+				bch_cached_dev_detach(dc);
+			} else {
+				bcache_device_stop(c->devices[i]);
+			}
+		}
+
+	mutex_unlock(&bch_register_lock);
+
+	continue_at(cl, cache_set_flush, system_wq);
+}
+
+void bch_cache_set_stop(struct cache_set *c)
+{
+	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+		closure_queue(&c->caching);
+}
+
+void bch_cache_set_unregister(struct cache_set *c)
+{
+	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
+	bch_cache_set_stop(c);
+}
+
+#define alloc_bucket_pages(gfp, c)			\
+	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+
+struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+{
+	int iter_size;
+	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+	if (!c)
+		return NULL;
+
+	__module_get(THIS_MODULE);
+	closure_init(&c->cl, NULL);
+	set_closure_fn(&c->cl, cache_set_free, system_wq);
+
+	closure_init(&c->caching, &c->cl);
+	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+
+	/* Maybe create continue_at_noreturn() and use it here? */
+	closure_set_stopped(&c->cl);
+	closure_put(&c->cl);
+
+	kobject_init(&c->kobj, &bch_cache_set_ktype);
+	kobject_init(&c->internal, &bch_cache_set_internal_ktype);
+
+	bch_cache_accounting_init(&c->accounting, &c->cl);
+
+	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
+	c->sb.block_size	= sb->block_size;
+	c->sb.bucket_size	= sb->bucket_size;
+	c->sb.nr_in_set		= sb->nr_in_set;
+	c->sb.last_mount	= sb->last_mount;
+	c->bucket_bits		= ilog2(sb->bucket_size);
+	c->block_bits		= ilog2(sb->block_size);
+	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry);
+
+	c->btree_pages		= bucket_pages(c);
+	if (c->btree_pages > BTREE_MAX_PAGES)
+		c->btree_pages = max_t(int, c->btree_pages / 4,
+				       BTREE_MAX_PAGES);
+
+	sema_init(&c->sb_write_mutex, 1);
+	mutex_init(&c->bucket_lock);
+	init_waitqueue_head(&c->btree_cache_wait);
+	init_waitqueue_head(&c->bucket_wait);
+	sema_init(&c->uuid_write_mutex, 1);
+
+	spin_lock_init(&c->btree_gc_time.lock);
+	spin_lock_init(&c->btree_split_time.lock);
+	spin_lock_init(&c->btree_read_time.lock);
+
+	bch_moving_init_cache_set(c);
+
+	INIT_LIST_HEAD(&c->list);
+	INIT_LIST_HEAD(&c->cached_devs);
+	INIT_LIST_HEAD(&c->btree_cache);
+	INIT_LIST_HEAD(&c->btree_cache_freeable);
+	INIT_LIST_HEAD(&c->btree_cache_freed);
+	INIT_LIST_HEAD(&c->data_buckets);
+
+	c->search = mempool_create_slab_pool(32, bch_search_cache);
+	if (!c->search)
+		goto err;
+
+	iter_size = (sb->bucket_size / sb->block_size + 1) *
+		sizeof(struct btree_iter_set);
+
+	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
+	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
+				sizeof(struct bbio) + sizeof(struct bio_vec) *
+				bucket_pages(c))) ||
+	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
+	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+	    !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
+	    bch_journal_alloc(c) ||
+	    bch_btree_cache_alloc(c) ||
+	    bch_open_buckets_alloc(c) ||
+	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
+		goto err;
+
+	c->congested_read_threshold_us	= 2000;
+	c->congested_write_threshold_us	= 20000;
+	c->error_limit	= 8 << IO_ERROR_SHIFT;
+
+	return c;
+err:
+	bch_cache_set_unregister(c);
+	return NULL;
+}
+
+static void run_cache_set(struct cache_set *c)
+{
+	const char *err = "cannot allocate memory";
+	struct cached_dev *dc, *t;
+	struct cache *ca;
+	struct closure cl;
+	unsigned i;
+
+	closure_init_stack(&cl);
+
+	for_each_cache(ca, c, i)
+		c->nbuckets += ca->sb.nbuckets;
+
+	if (CACHE_SYNC(&c->sb)) {
+		LIST_HEAD(journal);
+		struct bkey *k;
+		struct jset *j;
+
+		err = "cannot allocate memory for journal";
+		if (bch_journal_read(c, &journal))
+			goto err;
+
+		pr_debug("btree_journal_read() done");
+
+		err = "no journal entries found";
+		if (list_empty(&journal))
+			goto err;
+
+		j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+		err = "IO error reading priorities";
+		for_each_cache(ca, c, i)
+			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
+
+		/*
+		 * If prio_read() fails it'll call cache_set_error and we'll
+		 * tear everything down right away, but if we perhaps checked
+		 * sooner we could avoid journal replay.
+		 */
+
+		k = &j->btree_root;
+
+		err = "bad btree root";
+		if (__bch_btree_ptr_invalid(c, k))
+			goto err;
+
+		err = "error reading btree root";
+		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
+		if (IS_ERR_OR_NULL(c->root))
+			goto err;
+
+		list_del_init(&c->root->list);
+		rw_unlock(true, c->root);
+
+		err = uuid_read(c, j, &cl);
+		if (err)
+			goto err;
+
+		err = "error in recovery";
+		if (bch_btree_check(c))
+			goto err;
+
+		bch_journal_mark(c, &journal);
+		bch_initial_gc_finish(c);
+		pr_debug("btree_check() done");
+
+		/*
+		 * bcache_journal_next() can't happen sooner, or
+		 * btree_gc_finish() will give spurious errors about last_gc >
+		 * gc_gen - this is a hack but oh well.
+		 */
+		bch_journal_next(&c->journal);
+
+		err = "error starting allocator thread";
+		for_each_cache(ca, c, i)
+			if (bch_cache_allocator_start(ca))
+				goto err;
+
+		/*
+		 * First place it's safe to allocate: btree_check() and
+		 * btree_gc_finish() have to run before we have buckets to
+		 * allocate, and bch_bucket_alloc_set() might cause a journal
+		 * entry to be written so bcache_journal_next() has to be called
+		 * first.
+		 *
+		 * If the uuids were in the old format we have to rewrite them
+		 * before the next journal entry is written:
+		 */
+		if (j->version < BCACHE_JSET_VERSION_UUID)
+			__uuid_write(c);
+
+		bch_journal_replay(c, &journal);
+	} else {
+		pr_notice("invalidating existing data");
+
+		for_each_cache(ca, c, i) {
+			unsigned j;
+
+			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+					      2, SB_JOURNAL_BUCKETS);
+
+			for (j = 0; j < ca->sb.keys; j++)
+				ca->sb.d[j] = ca->sb.first_bucket + j;
+		}
+
+		bch_initial_gc_finish(c);
+
+		err = "error starting allocator thread";
+		for_each_cache(ca, c, i)
+			if (bch_cache_allocator_start(ca))
+				goto err;
+
+		mutex_lock(&c->bucket_lock);
+		for_each_cache(ca, c, i)
+			bch_prio_write(ca);
+		mutex_unlock(&c->bucket_lock);
+
+		err = "cannot allocate new UUID bucket";
+		if (__uuid_write(c))
+			goto err;
+
+		err = "cannot allocate new btree root";
+		c->root = bch_btree_node_alloc(c, NULL, 0);
+		if (IS_ERR_OR_NULL(c->root))
+			goto err;
+
+		mutex_lock(&c->root->write_lock);
+		bkey_copy_key(&c->root->key, &MAX_KEY);
+		bch_btree_node_write(c->root, &cl);
+		mutex_unlock(&c->root->write_lock);
+
+		bch_btree_set_root(c->root);
+		rw_unlock(true, c->root);
+
+		/*
+		 * We don't want to write the first journal entry until
+		 * everything is set up - fortunately journal entries won't be
+		 * written until the SET_CACHE_SYNC() here:
+		 */
+		SET_CACHE_SYNC(&c->sb, true);
+
+		bch_journal_next(&c->journal);
+		bch_journal_meta(c, &cl);
+	}
+
+	err = "error starting gc thread";
+	if (bch_gc_thread_start(c))
+		goto err;
+
+	closure_sync(&cl);
+	c->sb.last_mount = get_seconds();
+	bcache_write_super(c);
+
+	list_for_each_entry_safe(dc, t, &uncached_devices, list)
+		bch_cached_dev_attach(dc, c);
+
+	flash_devs_run(c);
+
+	return;
+err:
+	closure_sync(&cl);
+	/* XXX: test this, it's broken */
+	bch_cache_set_error(c, "%s", err);
+}
+
+static bool can_attach_cache(struct cache *ca, struct cache_set *c)
+{
+	return ca->sb.block_size	== c->sb.block_size &&
+		ca->sb.bucket_size	== c->sb.bucket_size &&
+		ca->sb.nr_in_set	== c->sb.nr_in_set;
+}
+
+static const char *register_cache_set(struct cache *ca)
+{
+	char buf[12];
+	const char *err = "cannot allocate memory";
+	struct cache_set *c;
+
+	list_for_each_entry(c, &bch_cache_sets, list)
+		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
+			if (c->cache[ca->sb.nr_this_dev])
+				return "duplicate cache set member";
+
+			if (!can_attach_cache(ca, c))
+				return "cache sb does not match set";
+
+			if (!CACHE_SYNC(&ca->sb))
+				SET_CACHE_SYNC(&c->sb, false);
+
+			goto found;
+		}
+
+	c = bch_cache_set_alloc(&ca->sb);
+	if (!c)
+		return err;
+
+	err = "error creating kobject";
+	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
+	    kobject_add(&c->internal, &c->kobj, "internal"))
+		goto err;
+
+	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
+		goto err;
+
+	bch_debug_init_cache_set(c);
+
+	list_add(&c->list, &bch_cache_sets);
+found:
+	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
+	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
+	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
+		goto err;
+
+	if (ca->sb.seq > c->sb.seq) {
+		c->sb.version		= ca->sb.version;
+		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
+		c->sb.flags             = ca->sb.flags;
+		c->sb.seq		= ca->sb.seq;
+		pr_debug("set version = %llu", c->sb.version);
+	}
+
+	ca->set = c;
+	ca->set->cache[ca->sb.nr_this_dev] = ca;
+	c->cache_by_alloc[c->caches_loaded++] = ca;
+
+	if (c->caches_loaded == c->sb.nr_in_set)
+		run_cache_set(c);
+
+	return NULL;
+err:
+	bch_cache_set_unregister(c);
+	return err;
+}
+
+/* Cache device */
+
+void bch_cache_release(struct kobject *kobj)
+{
+	struct cache *ca = container_of(kobj, struct cache, kobj);
+	unsigned i;
+
+	if (ca->set)
+		ca->set->cache[ca->sb.nr_this_dev] = NULL;
+
+	bio_split_pool_free(&ca->bio_split_hook);
+
+	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+	kfree(ca->prio_buckets);
+	vfree(ca->buckets);
+
+	free_heap(&ca->heap);
+	free_fifo(&ca->free_inc);
+
+	for (i = 0; i < RESERVE_NR; i++)
+		free_fifo(&ca->free[i]);
+
+	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
+		put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+
+	if (!IS_ERR_OR_NULL(ca->bdev)) {
+		blk_sync_queue(bdev_get_queue(ca->bdev));
+		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+	}
+
+	kfree(ca);
+	module_put(THIS_MODULE);
+}
+
+static int cache_alloc(struct cache_sb *sb, struct cache *ca)
+{
+	size_t free;
+	struct bucket *b;
+
+	__module_get(THIS_MODULE);
+	kobject_init(&ca->kobj, &bch_cache_ktype);
+
+	bio_init(&ca->journal.bio);
+	ca->journal.bio.bi_max_vecs = 8;
+	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
+
+	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
+
+	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+	    !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
+	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
+	    !init_fifo(&ca->free_inc,	free << 2, GFP_KERNEL) ||
+	    !init_heap(&ca->heap,	free << 3, GFP_KERNEL) ||
+	    !(ca->buckets	= vzalloc(sizeof(struct bucket) *
+					  ca->sb.nbuckets)) ||
+	    !(ca->prio_buckets	= kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
+					  2, GFP_KERNEL)) ||
+	    !(ca->disk_buckets	= alloc_bucket_pages(GFP_KERNEL, ca)) ||
+	    bio_split_pool_init(&ca->bio_split_hook))
+		return -ENOMEM;
+
+	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
+
+	for_each_bucket(b, ca)
+		atomic_set(&b->pin, 0);
+
+	return 0;
+}
+
+static void register_cache(struct cache_sb *sb, struct page *sb_page,
+				  struct block_device *bdev, struct cache *ca)
+{
+	char name[BDEVNAME_SIZE];
+	const char *err = "cannot allocate memory";
+
+	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
+	ca->bdev = bdev;
+	ca->bdev->bd_holder = ca;
+
+	bio_init(&ca->sb_bio);
+	ca->sb_bio.bi_max_vecs	= 1;
+	ca->sb_bio.bi_io_vec	= ca->sb_bio.bi_inline_vecs;
+	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+	get_page(sb_page);
+
+	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+		ca->discard = CACHE_DISCARD(&ca->sb);
+
+	if (cache_alloc(sb, ca) != 0)
+		goto err;
+
+	err = "error creating kobject";
+	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
+		goto err;
+
+	mutex_lock(&bch_register_lock);
+	err = register_cache_set(ca);
+	mutex_unlock(&bch_register_lock);
+
+	if (err)
+		goto err;
+
+	pr_info("registered cache device %s", bdevname(bdev, name));
+	return;
+err:
+	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
+	kobject_put(&ca->kobj);
+}
+
+/* Global interfaces/init */
+
+static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
+			       const char *, size_t);
+
+kobj_attribute_write(register,		register_bcache);
+kobj_attribute_write(register_quiet,	register_bcache);
+
+static bool bch_is_open_backing(struct block_device *bdev) {
+	struct cache_set *c, *tc;
+	struct cached_dev *dc, *t;
+
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+			if (dc->bdev == bdev)
+				return true;
+	list_for_each_entry_safe(dc, t, &uncached_devices, list)
+		if (dc->bdev == bdev)
+			return true;
+	return false;
+}
+
+static bool bch_is_open_cache(struct block_device *bdev) {
+	struct cache_set *c, *tc;
+	struct cache *ca;
+	unsigned i;
+
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+		for_each_cache(ca, c, i)
+			if (ca->bdev == bdev)
+				return true;
+	return false;
+}
+
+static bool bch_is_open(struct block_device *bdev) {
+	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
+}
+
+static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+			       const char *buffer, size_t size)
+{
+	ssize_t ret = size;
+	const char *err = "cannot allocate memory";
+	char *path = NULL;
+	struct cache_sb *sb = NULL;
+	struct block_device *bdev = NULL;
+	struct page *sb_page = NULL;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
+	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+		goto err;
+
+	err = "failed to open device";
+	bdev = blkdev_get_by_path(strim(path),
+				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				  sb);
+	if (IS_ERR(bdev)) {
+		if (bdev == ERR_PTR(-EBUSY)) {
+			bdev = lookup_bdev(strim(path));
+			if (!IS_ERR(bdev) && bch_is_open(bdev))
+				err = "device already registered";
+			else
+				err = "device busy";
+		}
+		goto err;
+	}
+
+	err = "failed to set blocksize";
+	if (set_blocksize(bdev, 4096))
+		goto err_close;
+
+	err = read_super(sb, bdev, &sb_page);
+	if (err)
+		goto err_close;
+
+	if (SB_IS_BDEV(sb)) {
+		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+		if (!dc)
+			goto err_close;
+
+		mutex_lock(&bch_register_lock);
+		register_bdev(sb, sb_page, bdev, dc);
+		mutex_unlock(&bch_register_lock);
+	} else {
+		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+		if (!ca)
+			goto err_close;
+
+		register_cache(sb, sb_page, bdev, ca);
+	}
+out:
+	if (sb_page)
+		put_page(sb_page);
+	kfree(sb);
+	kfree(path);
+	module_put(THIS_MODULE);
+	return ret;
+
+err_close:
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+err:
+	if (attr != &ksysfs_register_quiet)
+		pr_info("error opening %s: %s", path, err);
+	ret = -EINVAL;
+	goto out;
+}
+
+static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+	if (code == SYS_DOWN ||
+	    code == SYS_HALT ||
+	    code == SYS_POWER_OFF) {
+		DEFINE_WAIT(wait);
+		unsigned long start = jiffies;
+		bool stopped = false;
+
+		struct cache_set *c, *tc;
+		struct cached_dev *dc, *tdc;
+
+		mutex_lock(&bch_register_lock);
+
+		if (list_empty(&bch_cache_sets) &&
+		    list_empty(&uncached_devices))
+			goto out;
+
+		pr_info("Stopping all devices:");
+
+		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+			bch_cache_set_stop(c);
+
+		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
+			bcache_device_stop(&dc->disk);
+
+		/* What's a condition variable? */
+		while (1) {
+			long timeout = start + 2 * HZ - jiffies;
+
+			stopped = list_empty(&bch_cache_sets) &&
+				list_empty(&uncached_devices);
+
+			if (timeout < 0 || stopped)
+				break;
+
+			prepare_to_wait(&unregister_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			mutex_unlock(&bch_register_lock);
+			schedule_timeout(timeout);
+			mutex_lock(&bch_register_lock);
+		}
+
+		finish_wait(&unregister_wait, &wait);
+
+		if (stopped)
+			pr_info("All devices stopped");
+		else
+			pr_notice("Timeout waiting for devices to be closed");
+out:
+		mutex_unlock(&bch_register_lock);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block reboot = {
+	.notifier_call	= bcache_reboot,
+	.priority	= INT_MAX, /* before any real devices */
+};
+
+static void bcache_exit(void)
+{
+	bch_debug_exit();
+	bch_request_exit();
+	if (bcache_kobj)
+		kobject_put(bcache_kobj);
+	if (bcache_wq)
+		destroy_workqueue(bcache_wq);
+	if (bcache_major)
+		unregister_blkdev(bcache_major, "bcache");
+	unregister_reboot_notifier(&reboot);
+}
+
+static int __init bcache_init(void)
+{
+	static const struct attribute *files[] = {
+		&ksysfs_register.attr,
+		&ksysfs_register_quiet.attr,
+		NULL
+	};
+
+	mutex_init(&bch_register_lock);
+	init_waitqueue_head(&unregister_wait);
+	register_reboot_notifier(&reboot);
+	closure_debug_init();
+
+	bcache_major = register_blkdev(0, "bcache");
+	if (bcache_major < 0)
+		return bcache_major;
+
+	if (!(bcache_wq = create_workqueue("bcache")) ||
+	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
+	    sysfs_create_files(bcache_kobj, files) ||
+	    bch_request_init() ||
+	    bch_debug_init(bcache_kobj))
+		goto err;
+
+	return 0;
+err:
+	bcache_exit();
+	return -ENOMEM;
+}
+
+module_exit(bcache_exit);
+module_init(bcache_init);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
new file mode 100644
index 00000000000..b3ff57d61dd
--- /dev/null
+++ b/drivers/md/bcache/sysfs.c
@@ -0,0 +1,898 @@
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "sysfs.h"
+#include "btree.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+
+static const char * const cache_replacement_policies[] = {
+	"lru",
+	"fifo",
+	"random",
+	NULL
+};
+
+static const char * const error_actions[] = {
+	"unregister",
+	"panic",
+	NULL
+};
+
+write_attribute(attach);
+write_attribute(detach);
+write_attribute(unregister);
+write_attribute(stop);
+write_attribute(clear_stats);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+write_attribute(flash_vol_create);
+
+read_attribute(bucket_size);
+read_attribute(block_size);
+read_attribute(nbuckets);
+read_attribute(tree_depth);
+read_attribute(root_usage_percent);
+read_attribute(priority_stats);
+read_attribute(btree_cache_size);
+read_attribute(btree_cache_max_chain);
+read_attribute(cache_available_percent);
+read_attribute(written);
+read_attribute(btree_written);
+read_attribute(metadata_written);
+read_attribute(active_journal_entries);
+
+sysfs_time_stats_attribute(btree_gc,	sec, ms);
+sysfs_time_stats_attribute(btree_split, sec, us);
+sysfs_time_stats_attribute(btree_sort,	ms,  us);
+sysfs_time_stats_attribute(btree_read,	ms,  us);
+
+read_attribute(btree_nodes);
+read_attribute(btree_used_percent);
+read_attribute(average_key_size);
+read_attribute(dirty_data);
+read_attribute(bset_tree_stats);
+
+read_attribute(state);
+read_attribute(cache_read_races);
+read_attribute(writeback_keys_done);
+read_attribute(writeback_keys_failed);
+read_attribute(io_errors);
+read_attribute(congested);
+rw_attribute(congested_read_threshold_us);
+rw_attribute(congested_write_threshold_us);
+
+rw_attribute(sequential_cutoff);
+rw_attribute(data_csum);
+rw_attribute(cache_mode);
+rw_attribute(writeback_metadata);
+rw_attribute(writeback_running);
+rw_attribute(writeback_percent);
+rw_attribute(writeback_delay);
+rw_attribute(writeback_rate);
+
+rw_attribute(writeback_rate_update_seconds);
+rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_p_term_inverse);
+read_attribute(writeback_rate_debug);
+
+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
+rw_attribute(synchronous);
+rw_attribute(journal_delay_ms);
+rw_attribute(discard);
+rw_attribute(running);
+rw_attribute(label);
+rw_attribute(readahead);
+rw_attribute(errors);
+rw_attribute(io_error_limit);
+rw_attribute(io_error_halflife);
+rw_attribute(verify);
+rw_attribute(bypass_torture_test);
+rw_attribute(key_merging_disabled);
+rw_attribute(gc_always_rewrite);
+rw_attribute(expensive_debug_checks);
+rw_attribute(cache_replacement_policy);
+rw_attribute(btree_shrinker_disabled);
+rw_attribute(copy_gc_enabled);
+rw_attribute(size);
+
+SHOW(__bch_cached_dev)
+{
+	struct cached_dev *dc = container_of(kobj, struct cached_dev,
+					     disk.kobj);
+	const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+
+#define var(stat)		(dc->stat)
+
+	if (attr == &sysfs_cache_mode)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       bch_cache_modes + 1,
+					       BDEV_CACHE_MODE(&dc->sb));
+
+	sysfs_printf(data_csum,		"%i", dc->disk.data_csum);
+	var_printf(verify,		"%i");
+	var_printf(bypass_torture_test,	"%i");
+	var_printf(writeback_metadata,	"%i");
+	var_printf(writeback_running,	"%i");
+	var_print(writeback_delay);
+	var_print(writeback_percent);
+	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
+
+	var_print(writeback_rate_update_seconds);
+	var_print(writeback_rate_d_term);
+	var_print(writeback_rate_p_term_inverse);
+
+	if (attr == &sysfs_writeback_rate_debug) {
+		char rate[20];
+		char dirty[20];
+		char target[20];
+		char proportional[20];
+		char derivative[20];
+		char change[20];
+		s64 next_io;
+
+		bch_hprint(rate,	dc->writeback_rate.rate << 9);
+		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
+		bch_hprint(target,	dc->writeback_rate_target << 9);
+		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
+		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
+		bch_hprint(change,	dc->writeback_rate_change << 9);
+
+		next_io = div64_s64(dc->writeback_rate.next - local_clock(),
+				    NSEC_PER_MSEC);
+
+		return sprintf(buf,
+			       "rate:\t\t%s/sec\n"
+			       "dirty:\t\t%s\n"
+			       "target:\t\t%s\n"
+			       "proportional:\t%s\n"
+			       "derivative:\t%s\n"
+			       "change:\t\t%s/sec\n"
+			       "next io:\t%llims\n",
+			       rate, dirty, target, proportional,
+			       derivative, change, next_io);
+	}
+
+	sysfs_hprint(dirty_data,
+		     bcache_dev_sectors_dirty(&dc->disk) << 9);
+
+	sysfs_hprint(stripe_size,	dc->disk.stripe_size << 9);
+	var_printf(partial_stripes_expensive,	"%u");
+
+	var_hprint(sequential_cutoff);
+	var_hprint(readahead);
+
+	sysfs_print(running,		atomic_read(&dc->running));
+	sysfs_print(state,		states[BDEV_STATE(&dc->sb)]);
+
+	if (attr == &sysfs_label) {
+		memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE + 1] = '\0';
+		strcat(buf, "\n");
+		return strlen(buf);
+	}
+
+#undef var
+	return 0;
+}
+SHOW_LOCKED(bch_cached_dev)
+
+STORE(__cached_dev)
+{
+	struct cached_dev *dc = container_of(kobj, struct cached_dev,
+					     disk.kobj);
+	unsigned v = size;
+	struct cache_set *c;
+	struct kobj_uevent_env *env;
+
+#define d_strtoul(var)		sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
+#define d_strtoi_h(var)		sysfs_hatoi(var, dc->var)
+
+	sysfs_strtoul(data_csum,	dc->disk.data_csum);
+	d_strtoul(verify);
+	d_strtoul(bypass_torture_test);
+	d_strtoul(writeback_metadata);
+	d_strtoul(writeback_running);
+	d_strtoul(writeback_delay);
+
+	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+
+	sysfs_strtoul_clamp(writeback_rate,
+			    dc->writeback_rate.rate, 1, INT_MAX);
+
+	d_strtoul_nonzero(writeback_rate_update_seconds);
+	d_strtoul(writeback_rate_d_term);
+	d_strtoul_nonzero(writeback_rate_p_term_inverse);
+
+	d_strtoi_h(sequential_cutoff);
+	d_strtoi_h(readahead);
+
+	if (attr == &sysfs_clear_stats)
+		bch_cache_accounting_clear(&dc->accounting);
+
+	if (attr == &sysfs_running &&
+	    strtoul_or_return(buf))
+		bch_cached_dev_run(dc);
+
+	if (attr == &sysfs_cache_mode) {
+		ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+
+		if (v < 0)
+			return v;
+
+		if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
+			SET_BDEV_CACHE_MODE(&dc->sb, v);
+			bch_write_bdev_super(dc, NULL);
+		}
+	}
+
+	if (attr == &sysfs_label) {
+		if (size > SB_LABEL_SIZE)
+			return -EINVAL;
+		memcpy(dc->sb.label, buf, size);
+		if (size < SB_LABEL_SIZE)
+			dc->sb.label[size] = '\0';
+		if (size && dc->sb.label[size - 1] == '\n')
+			dc->sb.label[size - 1] = '\0';
+		bch_write_bdev_super(dc, NULL);
+		if (dc->disk.c) {
+			memcpy(dc->disk.c->uuids[dc->disk.id].label,
+			       buf, SB_LABEL_SIZE);
+			bch_uuid_write(dc->disk.c);
+		}
+		env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+		if (!env)
+			return -ENOMEM;
+		add_uevent_var(env, "DRIVER=bcache");
+		add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
+		add_uevent_var(env, "CACHED_LABEL=%s", buf);
+		kobject_uevent_env(
+			&disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
+		kfree(env);
+	}
+
+	if (attr == &sysfs_attach) {
+		if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+			return -EINVAL;
+
+		list_for_each_entry(c, &bch_cache_sets, list) {
+			v = bch_cached_dev_attach(dc, c);
+			if (!v)
+				return size;
+		}
+
+		pr_err("Can't attach %s: cache set not found", buf);
+		size = v;
+	}
+
+	if (attr == &sysfs_detach && dc->disk.c)
+		bch_cached_dev_detach(dc);
+
+	if (attr == &sysfs_stop)
+		bcache_device_stop(&dc->disk);
+
+	return size;
+}
+
+STORE(bch_cached_dev)
+{
+	struct cached_dev *dc = container_of(kobj, struct cached_dev,
+					     disk.kobj);
+
+	mutex_lock(&bch_register_lock);
+	size = __cached_dev_store(kobj, attr, buf, size);
+
+	if (attr == &sysfs_writeback_running)
+		bch_writeback_queue(dc);
+
+	if (attr == &sysfs_writeback_percent)
+		schedule_delayed_work(&dc->writeback_rate_update,
+				      dc->writeback_rate_update_seconds * HZ);
+
+	mutex_unlock(&bch_register_lock);
+	return size;
+}
+
+static struct attribute *bch_cached_dev_files[] = {
+	&sysfs_attach,
+	&sysfs_detach,
+	&sysfs_stop,
+#if 0
+	&sysfs_data_csum,
+#endif
+	&sysfs_cache_mode,
+	&sysfs_writeback_metadata,
+	&sysfs_writeback_running,
+	&sysfs_writeback_delay,
+	&sysfs_writeback_percent,
+	&sysfs_writeback_rate,
+	&sysfs_writeback_rate_update_seconds,
+	&sysfs_writeback_rate_d_term,
+	&sysfs_writeback_rate_p_term_inverse,
+	&sysfs_writeback_rate_debug,
+	&sysfs_dirty_data,
+	&sysfs_stripe_size,
+	&sysfs_partial_stripes_expensive,
+	&sysfs_sequential_cutoff,
+	&sysfs_clear_stats,
+	&sysfs_running,
+	&sysfs_state,
+	&sysfs_label,
+	&sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+	&sysfs_verify,
+	&sysfs_bypass_torture_test,
+#endif
+	NULL
+};
+KTYPE(bch_cached_dev);
+
+SHOW(bch_flash_dev)
+{
+	struct bcache_device *d = container_of(kobj, struct bcache_device,
+					       kobj);
+	struct uuid_entry *u = &d->c->uuids[d->id];
+
+	sysfs_printf(data_csum,	"%i", d->data_csum);
+	sysfs_hprint(size,	u->sectors << 9);
+
+	if (attr == &sysfs_label) {
+		memcpy(buf, u->label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE + 1] = '\0';
+		strcat(buf, "\n");
+		return strlen(buf);
+	}
+
+	return 0;
+}
+
+STORE(__bch_flash_dev)
+{
+	struct bcache_device *d = container_of(kobj, struct bcache_device,
+					       kobj);
+	struct uuid_entry *u = &d->c->uuids[d->id];
+
+	sysfs_strtoul(data_csum,	d->data_csum);
+
+	if (attr == &sysfs_size) {
+		uint64_t v;
+		strtoi_h_or_return(buf, v);
+
+		u->sectors = v >> 9;
+		bch_uuid_write(d->c);
+		set_capacity(d->disk, u->sectors);
+	}
+
+	if (attr == &sysfs_label) {
+		memcpy(u->label, buf, SB_LABEL_SIZE);
+		bch_uuid_write(d->c);
+	}
+
+	if (attr == &sysfs_unregister) {
+		set_bit(BCACHE_DEV_DETACHING, &d->flags);
+		bcache_device_stop(d);
+	}
+
+	return size;
+}
+STORE_LOCKED(bch_flash_dev)
+
+static struct attribute *bch_flash_dev_files[] = {
+	&sysfs_unregister,
+#if 0
+	&sysfs_data_csum,
+#endif
+	&sysfs_label,
+	&sysfs_size,
+	NULL
+};
+KTYPE(bch_flash_dev);
+
+struct bset_stats_op {
+	struct btree_op op;
+	size_t nodes;
+	struct bset_stats stats;
+};
+
+static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
+{
+	struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
+
+	op->nodes++;
+	bch_btree_keys_stats(&b->keys, &op->stats);
+
+	return MAP_CONTINUE;
+}
+
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+	struct bset_stats_op op;
+	int ret;
+
+	memset(&op, 0, sizeof(op));
+	bch_btree_op_init(&op.op, -1);
+
+	ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
+	if (ret < 0)
+		return ret;
+
+	return snprintf(buf, PAGE_SIZE,
+			"btree nodes:		%zu\n"
+			"written sets:		%zu\n"
+			"unwritten sets:		%zu\n"
+			"written key bytes:	%zu\n"
+			"unwritten key bytes:	%zu\n"
+			"floats:			%zu\n"
+			"failed:			%zu\n",
+			op.nodes,
+			op.stats.sets_written, op.stats.sets_unwritten,
+			op.stats.bytes_written, op.stats.bytes_unwritten,
+			op.stats.floats, op.stats.failed);
+}
+
+static unsigned bch_root_usage(struct cache_set *c)
+{
+	unsigned bytes = 0;
+	struct bkey *k;
+	struct btree *b;
+	struct btree_iter iter;
+
+	goto lock_root;
+
+	do {
+		rw_unlock(false, b);
+lock_root:
+		b = c->root;
+		rw_lock(false, b, b->level);
+	} while (b != c->root);
+
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+		bytes += bkey_bytes(k);
+
+	rw_unlock(false, b);
+
+	return (bytes * 100) / btree_bytes(c);
+}
+
+static size_t bch_cache_size(struct cache_set *c)
+{
+	size_t ret = 0;
+	struct btree *b;
+
+	mutex_lock(&c->bucket_lock);
+	list_for_each_entry(b, &c->btree_cache, list)
+		ret += 1 << (b->keys.page_order + PAGE_SHIFT);
+
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
+
+static unsigned bch_cache_max_chain(struct cache_set *c)
+{
+	unsigned ret = 0;
+	struct hlist_head *h;
+
+	mutex_lock(&c->bucket_lock);
+
+	for (h = c->bucket_hash;
+	     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+	     h++) {
+		unsigned i = 0;
+		struct hlist_node *p;
+
+		hlist_for_each(p, h)
+			i++;
+
+		ret = max(ret, i);
+	}
+
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
+
+static unsigned bch_btree_used(struct cache_set *c)
+{
+	return div64_u64(c->gc_stats.key_bytes * 100,
+			 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
+
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+	return c->gc_stats.nkeys
+		? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+		: 0;
+}
+
+SHOW(__bch_cache_set)
+{
+	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+	sysfs_print(synchronous,		CACHE_SYNC(&c->sb));
+	sysfs_print(journal_delay_ms,		c->journal_delay_ms);
+	sysfs_hprint(bucket_size,		bucket_bytes(c));
+	sysfs_hprint(block_size,		block_bytes(c));
+	sysfs_print(tree_depth,			c->root->level);
+	sysfs_print(root_usage_percent,		bch_root_usage(c));
+
+	sysfs_hprint(btree_cache_size,		bch_cache_size(c));
+	sysfs_print(btree_cache_max_chain,	bch_cache_max_chain(c));
+	sysfs_print(cache_available_percent,	100 - c->gc_stats.in_use);
+
+	sysfs_print_time_stats(&c->btree_gc_time,	btree_gc, sec, ms);
+	sysfs_print_time_stats(&c->btree_split_time,	btree_split, sec, us);
+	sysfs_print_time_stats(&c->sort.time,		btree_sort, ms, us);
+	sysfs_print_time_stats(&c->btree_read_time,	btree_read, ms, us);
+
+	sysfs_print(btree_used_percent,	bch_btree_used(c));
+	sysfs_print(btree_nodes,	c->gc_stats.nodes);
+	sysfs_hprint(average_key_size,	bch_average_key_size(c));
+
+	sysfs_print(cache_read_races,
+		    atomic_long_read(&c->cache_read_races));
+
+	sysfs_print(writeback_keys_done,
+		    atomic_long_read(&c->writeback_keys_done));
+	sysfs_print(writeback_keys_failed,
+		    atomic_long_read(&c->writeback_keys_failed));
+
+	if (attr == &sysfs_errors)
+		return bch_snprint_string_list(buf, PAGE_SIZE, error_actions,
+					       c->on_error);
+
+	/* See count_io_errors for why 88 */
+	sysfs_print(io_error_halflife,	c->error_decay * 88);
+	sysfs_print(io_error_limit,	c->error_limit >> IO_ERROR_SHIFT);
+
+	sysfs_hprint(congested,
+		     ((uint64_t) bch_get_congested(c)) << 9);
+	sysfs_print(congested_read_threshold_us,
+		    c->congested_read_threshold_us);
+	sysfs_print(congested_write_threshold_us,
+		    c->congested_write_threshold_us);
+
+	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin));
+	sysfs_printf(verify,			"%i", c->verify);
+	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled);
+	sysfs_printf(expensive_debug_checks,
+		     "%i", c->expensive_debug_checks);
+	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
+	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
+	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
+
+	if (attr == &sysfs_bset_tree_stats)
+		return bch_bset_print_stats(c, buf);
+
+	return 0;
+}
+SHOW_LOCKED(bch_cache_set)
+
+STORE(__bch_cache_set)
+{
+	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+	if (attr == &sysfs_unregister)
+		bch_cache_set_unregister(c);
+
+	if (attr == &sysfs_stop)
+		bch_cache_set_stop(c);
+
+	if (attr == &sysfs_synchronous) {
+		bool sync = strtoul_or_return(buf);
+
+		if (sync != CACHE_SYNC(&c->sb)) {
+			SET_CACHE_SYNC(&c->sb, sync);
+			bcache_write_super(c);
+		}
+	}
+
+	if (attr == &sysfs_flash_vol_create) {
+		int r;
+		uint64_t v;
+		strtoi_h_or_return(buf, v);
+
+		r = bch_flash_dev_create(c, v);
+		if (r)
+			return r;
+	}
+
+	if (attr == &sysfs_clear_stats) {
+		atomic_long_set(&c->writeback_keys_done,	0);
+		atomic_long_set(&c->writeback_keys_failed,	0);
+
+		memset(&c->gc_stats, 0, sizeof(struct gc_stat));
+		bch_cache_accounting_clear(&c->accounting);
+	}
+
+	if (attr == &sysfs_trigger_gc)
+		wake_up_gc(c);
+
+	if (attr == &sysfs_prune_cache) {
+		struct shrink_control sc;
+		sc.gfp_mask = GFP_KERNEL;
+		sc.nr_to_scan = strtoul_or_return(buf);
+		c->shrink.scan_objects(&c->shrink, &sc);
+	}
+
+	sysfs_strtoul(congested_read_threshold_us,
+		      c->congested_read_threshold_us);
+	sysfs_strtoul(congested_write_threshold_us,
+		      c->congested_write_threshold_us);
+
+	if (attr == &sysfs_errors) {
+		ssize_t v = bch_read_string_list(buf, error_actions);
+
+		if (v < 0)
+			return v;
+
+		c->on_error = v;
+	}
+
+	if (attr == &sysfs_io_error_limit)
+		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+
+	/* See count_io_errors() for why 88 */
+	if (attr == &sysfs_io_error_halflife)
+		c->error_decay = strtoul_or_return(buf) / 88;
+
+	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms);
+	sysfs_strtoul(verify,			c->verify);
+	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled);
+	sysfs_strtoul(expensive_debug_checks,	c->expensive_debug_checks);
+	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
+	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
+	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
+
+	return size;
+}
+STORE_LOCKED(bch_cache_set)
+
+SHOW(bch_cache_set_internal)
+{
+	struct cache_set *c = container_of(kobj, struct cache_set, internal);
+	return bch_cache_set_show(&c->kobj, attr, buf);
+}
+
+STORE(bch_cache_set_internal)
+{
+	struct cache_set *c = container_of(kobj, struct cache_set, internal);
+	return bch_cache_set_store(&c->kobj, attr, buf, size);
+}
+
+static void bch_cache_set_internal_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_files[] = {
+	&sysfs_unregister,
+	&sysfs_stop,
+	&sysfs_synchronous,
+	&sysfs_journal_delay_ms,
+	&sysfs_flash_vol_create,
+
+	&sysfs_bucket_size,
+	&sysfs_block_size,
+	&sysfs_tree_depth,
+	&sysfs_root_usage_percent,
+	&sysfs_btree_cache_size,
+	&sysfs_cache_available_percent,
+
+	&sysfs_average_key_size,
+
+	&sysfs_errors,
+	&sysfs_io_error_limit,
+	&sysfs_io_error_halflife,
+	&sysfs_congested,
+	&sysfs_congested_read_threshold_us,
+	&sysfs_congested_write_threshold_us,
+	&sysfs_clear_stats,
+	NULL
+};
+KTYPE(bch_cache_set);
+
+static struct attribute *bch_cache_set_internal_files[] = {
+	&sysfs_active_journal_entries,
+
+	sysfs_time_stats_attribute_list(btree_gc, sec, ms)
+	sysfs_time_stats_attribute_list(btree_split, sec, us)
+	sysfs_time_stats_attribute_list(btree_sort, ms, us)
+	sysfs_time_stats_attribute_list(btree_read, ms, us)
+
+	&sysfs_btree_nodes,
+	&sysfs_btree_used_percent,
+	&sysfs_btree_cache_max_chain,
+
+	&sysfs_bset_tree_stats,
+	&sysfs_cache_read_races,
+	&sysfs_writeback_keys_done,
+	&sysfs_writeback_keys_failed,
+
+	&sysfs_trigger_gc,
+	&sysfs_prune_cache,
+#ifdef CONFIG_BCACHE_DEBUG
+	&sysfs_verify,
+	&sysfs_key_merging_disabled,
+	&sysfs_expensive_debug_checks,
+#endif
+	&sysfs_gc_always_rewrite,
+	&sysfs_btree_shrinker_disabled,
+	&sysfs_copy_gc_enabled,
+	NULL
+};
+KTYPE(bch_cache_set_internal);
+
+SHOW(__bch_cache)
+{
+	struct cache *ca = container_of(kobj, struct cache, kobj);
+
+	sysfs_hprint(bucket_size,	bucket_bytes(ca));
+	sysfs_hprint(block_size,	block_bytes(ca));
+	sysfs_print(nbuckets,		ca->sb.nbuckets);
+	sysfs_print(discard,		ca->discard);
+	sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
+	sysfs_hprint(btree_written,
+		     atomic_long_read(&ca->btree_sectors_written) << 9);
+	sysfs_hprint(metadata_written,
+		     (atomic_long_read(&ca->meta_sectors_written) +
+		      atomic_long_read(&ca->btree_sectors_written)) << 9);
+
+	sysfs_print(io_errors,
+		    atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
+
+	if (attr == &sysfs_cache_replacement_policy)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       cache_replacement_policies,
+					       CACHE_REPLACEMENT(&ca->sb));
+
+	if (attr == &sysfs_priority_stats) {
+		int cmp(const void *l, const void *r)
+		{	return *((uint16_t *) r) - *((uint16_t *) l); }
+
+		struct bucket *b;
+		size_t n = ca->sb.nbuckets, i;
+		size_t unused = 0, available = 0, dirty = 0, meta = 0;
+		uint64_t sum = 0;
+		/* Compute 31 quantiles */
+		uint16_t q[31], *p, *cached;
+		ssize_t ret;
+
+		cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
+		if (!p)
+			return -ENOMEM;
+
+		mutex_lock(&ca->set->bucket_lock);
+		for_each_bucket(b, ca) {
+			if (!GC_SECTORS_USED(b))
+				unused++;
+			if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
+				available++;
+			if (GC_MARK(b) == GC_MARK_DIRTY)
+				dirty++;
+			if (GC_MARK(b) == GC_MARK_METADATA)
+				meta++;
+		}
+
+		for (i = ca->sb.first_bucket; i < n; i++)
+			p[i] = ca->buckets[i].prio;
+		mutex_unlock(&ca->set->bucket_lock);
+
+		sort(p, n, sizeof(uint16_t), cmp, NULL);
+
+		while (n &&
+		       !cached[n - 1])
+			--n;
+
+		unused = ca->sb.nbuckets - n;
+
+		while (cached < p + n &&
+		       *cached == BTREE_PRIO)
+			cached++, n--;
+
+		for (i = 0; i < n; i++)
+			sum += INITIAL_PRIO - cached[i];
+
+		if (n)
+			do_div(sum, n);
+
+		for (i = 0; i < ARRAY_SIZE(q); i++)
+			q[i] = INITIAL_PRIO - cached[n * (i + 1) /
+				(ARRAY_SIZE(q) + 1)];
+
+		vfree(p);
+
+		ret = scnprintf(buf, PAGE_SIZE,
+				"Unused:		%zu%%\n"
+				"Clean:		%zu%%\n"
+				"Dirty:		%zu%%\n"
+				"Metadata:	%zu%%\n"
+				"Average:	%llu\n"
+				"Sectors per Q:	%zu\n"
+				"Quantiles:	[",
+				unused * 100 / (size_t) ca->sb.nbuckets,
+				available * 100 / (size_t) ca->sb.nbuckets,
+				dirty * 100 / (size_t) ca->sb.nbuckets,
+				meta * 100 / (size_t) ca->sb.nbuckets, sum,
+				n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
+
+		for (i = 0; i < ARRAY_SIZE(q); i++)
+			ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+					 "%u ", q[i]);
+		ret--;
+
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
+
+		return ret;
+	}
+
+	return 0;
+}
+SHOW_LOCKED(bch_cache)
+
+STORE(__bch_cache)
+{
+	struct cache *ca = container_of(kobj, struct cache, kobj);
+
+	if (attr == &sysfs_discard) {
+		bool v = strtoul_or_return(buf);
+
+		if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+			ca->discard = v;
+
+		if (v != CACHE_DISCARD(&ca->sb)) {
+			SET_CACHE_DISCARD(&ca->sb, v);
+			bcache_write_super(ca->set);
+		}
+	}
+
+	if (attr == &sysfs_cache_replacement_policy) {
+		ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
+
+		if (v < 0)
+			return v;
+
+		if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
+			mutex_lock(&ca->set->bucket_lock);
+			SET_CACHE_REPLACEMENT(&ca->sb, v);
+			mutex_unlock(&ca->set->bucket_lock);
+
+			bcache_write_super(ca->set);
+		}
+	}
+
+	if (attr == &sysfs_clear_stats) {
+		atomic_long_set(&ca->sectors_written, 0);
+		atomic_long_set(&ca->btree_sectors_written, 0);
+		atomic_long_set(&ca->meta_sectors_written, 0);
+		atomic_set(&ca->io_count, 0);
+		atomic_set(&ca->io_errors, 0);
+	}
+
+	return size;
+}
+STORE_LOCKED(bch_cache)
+
+static struct attribute *bch_cache_files[] = {
+	&sysfs_bucket_size,
+	&sysfs_block_size,
+	&sysfs_nbuckets,
+	&sysfs_priority_stats,
+	&sysfs_discard,
+	&sysfs_written,
+	&sysfs_btree_written,
+	&sysfs_metadata_written,
+	&sysfs_io_errors,
+	&sysfs_clear_stats,
+	&sysfs_cache_replacement_policy,
+	NULL
+};
+KTYPE(bch_cache);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
new file mode 100644
index 00000000000..0526fe92a68
--- /dev/null
+++ b/drivers/md/bcache/sysfs.h
@@ -0,0 +1,110 @@
+#ifndef _BCACHE_SYSFS_H_
+#define _BCACHE_SYSFS_H_
+
+#define KTYPE(type)							\
+struct kobj_type type ## _ktype = {					\
+	.release	= type ## _release,				\
+	.sysfs_ops	= &((const struct sysfs_ops) {			\
+		.show	= type ## _show,				\
+		.store	= type ## _store				\
+	}),								\
+	.default_attrs	= type ## _files				\
+}
+
+#define SHOW(fn)							\
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+			   char *buf)					\
+
+#define STORE(fn)							\
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+			    const char *buf, size_t size)		\
+
+#define SHOW_LOCKED(fn)							\
+SHOW(fn)								\
+{									\
+	ssize_t ret;							\
+	mutex_lock(&bch_register_lock);					\
+	ret = __ ## fn ## _show(kobj, attr, buf);			\
+	mutex_unlock(&bch_register_lock);				\
+	return ret;							\
+}
+
+#define STORE_LOCKED(fn)						\
+STORE(fn)								\
+{									\
+	ssize_t ret;							\
+	mutex_lock(&bch_register_lock);					\
+	ret = __ ## fn ## _store(kobj, attr, buf, size);		\
+	mutex_unlock(&bch_register_lock);				\
+	return ret;							\
+}
+
+#define __sysfs_attribute(_name, _mode)					\
+	static struct attribute sysfs_##_name =				\
+		{ .name = #_name, .mode = _mode }
+
+#define write_attribute(n)	__sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n)	__sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n)		__sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);	\
+} while (0)
+
+#define sysfs_print(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return snprint(buf, PAGE_SIZE, var);			\
+} while (0)
+
+#define sysfs_hprint(file, val)						\
+do {									\
+	if (attr == &sysfs_ ## file) {					\
+		ssize_t ret = bch_hprint(buf, val);			\
+		strcat(buf, "\n");					\
+		return ret + 1;						\
+	}								\
+} while (0)
+
+#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var)		sysfs_print(_var, var(_var))
+#define var_hprint(_var)	sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max)			\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe_clamp(buf, var, min, max)		\
+			?: (ssize_t) size;				\
+} while (0)
+
+#define strtoul_or_return(cp)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (_r)								\
+		return _r;						\
+	_v;								\
+})
+
+#define strtoi_h_or_return(cp, v)					\
+do {									\
+	int _r = strtoi_h(cp, &v);					\
+	if (_r)								\
+		return _r;						\
+} while (0)
+
+#define sysfs_hatoi(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
+} while (0)
+
+#endif  /* _BCACHE_SYSFS_H_ */
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
new file mode 100644
index 00000000000..b7820b0d262
--- /dev/null
+++ b/drivers/md/bcache/trace.c
@@ -0,0 +1,52 @@
+#include "bcache.h"
+#include "btree.h"
+
+#include <linux/blktrace_api.h>
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcache.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
new file mode 100644
index 00000000000..db3ae4c2b22
--- /dev/null
+++ b/drivers/md/bcache/util.c
@@ -0,0 +1,381 @@
+/*
+ * random utiility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base)	simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base)	simple_strtoul(c, end, base)
+
+#define STRTO_H(name, type)					\
+int bch_ ## name ## _h(const char *cp, type *res)		\
+{								\
+	int u = 0;						\
+	char *e;						\
+	type i = simple_ ## name(cp, &e, 10);			\
+								\
+	switch (tolower(*e)) {					\
+	default:						\
+		return -EINVAL;					\
+	case 'y':						\
+	case 'z':						\
+		u++;						\
+	case 'e':						\
+		u++;						\
+	case 'p':						\
+		u++;						\
+	case 't':						\
+		u++;						\
+	case 'g':						\
+		u++;						\
+	case 'm':						\
+		u++;						\
+	case 'k':						\
+		u++;						\
+		if (e++ == cp)					\
+			return -EINVAL;				\
+	case '\n':						\
+	case '\0':						\
+		if (*e == '\n')					\
+			e++;					\
+	}							\
+								\
+	if (*e)							\
+		return -EINVAL;					\
+								\
+	while (u--) {						\
+		if ((type) ~0 > 0 &&				\
+		    (type) ~0 / 1024 <= i)			\
+			return -EINVAL;				\
+		if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||	\
+		    (i < 0 && -ANYSINT_MAX(type) / 1024 > i))	\
+			return -EINVAL;				\
+		i *= 1024;					\
+	}							\
+								\
+	*res = i;						\
+	return 0;						\
+}								\
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+
+ssize_t bch_hprint(char *buf, int64_t v)
+{
+	static const char units[] = "?kMGTPEZY";
+	char dec[4] = "";
+	int u, t = 0;
+
+	for (u = 0; v >= 1024 || v <= -1024; u++) {
+		t = v & ~(~0 << 10);
+		v >>= 10;
+	}
+
+	if (!u)
+		return sprintf(buf, "%llu", v);
+
+	if (v < 100 && v > -100)
+		snprintf(dec, sizeof(dec), ".%i", t / 100);
+
+	return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+			    size_t selected)
+{
+	char *out = buf;
+	size_t i;
+
+	for (i = 0; list[i]; i++)
+		out += snprintf(out, buf + size - out,
+				i == selected ? "[%s] " : "%s ", list[i]);
+
+	out[-1] = '\n';
+	return out - buf;
+}
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[])
+{
+	size_t i;
+	char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	s = strim(d);
+
+	for (i = 0; list[i]; i++)
+		if (!strcmp(list[i], s))
+			break;
+
+	kfree(d);
+
+	if (!list[i])
+		return -EINVAL;
+
+	return i;
+}
+
+bool bch_is_zero(const char *p, size_t n)
+{
+	size_t i;
+
+	for (i = 0; i < n; i++)
+		if (p[i])
+			return false;
+	return true;
+}
+
+int bch_parse_uuid(const char *s, char *uuid)
+{
+	size_t i, j, x;
+	memset(uuid, 0, 16);
+
+	for (i = 0, j = 0;
+	     i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
+	     i++) {
+		x = s[i] | 32;
+
+		switch (x) {
+		case '0'...'9':
+			x -= '0';
+			break;
+		case 'a'...'f':
+			x -= 'a' - 10;
+			break;
+		default:
+			continue;
+		}
+
+		if (!(j & 1))
+			x <<= 4;
+		uuid[j++ >> 1] |= x;
+	}
+	return i;
+}
+
+void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
+{
+	uint64_t now, duration, last;
+
+	spin_lock(&stats->lock);
+
+	now		= local_clock();
+	duration	= time_after64(now, start_time)
+		? now - start_time : 0;
+	last		= time_after64(now, stats->last)
+		? now - stats->last : 0;
+
+	stats->max_duration = max(stats->max_duration, duration);
+
+	if (stats->last) {
+		ewma_add(stats->average_duration, duration, 8, 8);
+
+		if (stats->average_frequency)
+			ewma_add(stats->average_frequency, last, 8, 8);
+		else
+			stats->average_frequency  = last << 8;
+	} else {
+		stats->average_duration  = duration << 8;
+	}
+
+	stats->last = now ?: 1;
+
+	spin_unlock(&stats->lock);
+}
+
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
+{
+	uint64_t now = local_clock();
+
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+	if (time_before64(now + NSEC_PER_SEC, d->next))
+		d->next = now + NSEC_PER_SEC;
+
+	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+		d->next = now - NSEC_PER_SEC * 2;
+
+	return time_after64(d->next, now)
+		? div_u64(d->next - now, NSEC_PER_SEC / HZ)
+		: 0;
+}
+
+void bch_bio_map(struct bio *bio, void *base)
+{
+	size_t size = bio->bi_iter.bi_size;
+	struct bio_vec *bv = bio->bi_io_vec;
+
+	BUG_ON(!bio->bi_iter.bi_size);
+	BUG_ON(bio->bi_vcnt);
+
+	bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
+	goto start;
+
+	for (; size; bio->bi_vcnt++, bv++) {
+		bv->bv_offset	= 0;
+start:		bv->bv_len	= min_t(size_t, PAGE_SIZE - bv->bv_offset,
+					size);
+		if (base) {
+			bv->bv_page = is_vmalloc_addr(base)
+				? vmalloc_to_page(base)
+				: virt_to_page(base);
+
+			base += bv->bv_len;
+		}
+
+		size -= bv->bv_len;
+	}
+}
+
+/*
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
+ * use permitted, subject to terms of PostgreSQL license; see.)
+
+ * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
+ * usual sort of implementation. (See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
+ * If we have no working 64-bit type, then fake it with two 32-bit registers.
+ *
+ * The present implementation is a normal (not "reflected", in Williams'
+ * terms) 64-bit CRC, using initial all-ones register contents and a final
+ * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+*/
+
+static const uint64_t crc_table[256] = {
+	0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
+	0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
+	0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
+	0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
+	0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
+	0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
+	0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
+	0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
+	0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
+	0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
+	0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
+	0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
+	0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
+	0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
+	0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
+	0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
+	0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
+	0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
+	0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
+	0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
+	0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
+	0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
+	0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
+	0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
+	0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
+	0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
+	0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
+	0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
+	0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
+	0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
+	0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
+	0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
+	0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
+	0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
+	0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
+	0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
+	0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
+	0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
+	0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
+	0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
+	0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
+	0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
+	0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
+	0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
+	0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
+	0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
+	0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
+	0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
+	0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
+	0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
+	0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
+	0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
+	0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
+	0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
+	0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
+	0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
+	0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
+	0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
+	0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
+	0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
+	0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
+	0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
+	0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
+	0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
+	0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
+	0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
+	0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
+	0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
+	0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
+	0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
+	0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
+	0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
+	0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
+	0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
+	0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
+	0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
+	0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
+	0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
+	0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
+	0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
+	0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
+	0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
+	0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
+	0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
+	0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
+	0x9AFCE626CE85B507ULL,
+};
+
+uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
+{
+	const unsigned char *data = _data;
+
+	while (len--) {
+		int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
+		crc = crc_table[i] ^ (crc << 8);
+	}
+
+	return crc;
+}
+
+uint64_t bch_crc64(const void *data, size_t len)
+{
+	uint64_t crc = 0xffffffffffffffffULL;
+
+	crc = bch_crc64_update(crc, data, len);
+
+	return crc ^ 0xffffffffffffffffULL;
+}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
new file mode 100644
index 00000000000..ac7d0d1f70d
--- /dev/null
+++ b/drivers/md/bcache/util.h
@@ -0,0 +1,588 @@
+
+#ifndef _BCACHE_UTIL_H
+#define _BCACHE_UTIL_H
+
+#include <linux/blkdev.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/llist.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "closure.h"
+
+#define PAGE_SECTORS		(PAGE_SIZE / 512)
+
+struct closure;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define EBUG_ON(cond)			BUG_ON(cond)
+#define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i)
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond)			do { if (cond); } while (0)
+#define atomic_dec_bug(v)	atomic_dec(v)
+#define atomic_inc_bug(v, i)	atomic_inc(v)
+
+#endif
+
+#define DECLARE_HEAP(type, name)					\
+	struct {							\
+		size_t size, used;					\
+		type *data;						\
+	} name
+
+#define init_heap(heap, _size, gfp)					\
+({									\
+	size_t _bytes;							\
+	(heap)->used = 0;						\
+	(heap)->size = (_size);						\
+	_bytes = (heap)->size * sizeof(*(heap)->data);			\
+	(heap)->data = NULL;						\
+	if (_bytes < KMALLOC_MAX_SIZE)					\
+		(heap)->data = kmalloc(_bytes, (gfp));			\
+	if ((!(heap)->data) && ((gfp) & GFP_KERNEL))			\
+		(heap)->data = vmalloc(_bytes);				\
+	(heap)->data;							\
+})
+
+#define free_heap(heap)							\
+do {									\
+	if (is_vmalloc_addr((heap)->data))				\
+		vfree((heap)->data);					\
+	else								\
+		kfree((heap)->data);					\
+	(heap)->data = NULL;						\
+} while (0)
+
+#define heap_swap(h, i, j)	swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp)						\
+do {									\
+	size_t _r, _j = i;						\
+									\
+	for (; _j * 2 + 1 < (h)->used; _j = _r) {			\
+		_r = _j * 2 + 1;					\
+		if (_r + 1 < (h)->used &&				\
+		    cmp((h)->data[_r], (h)->data[_r + 1]))		\
+			_r++;						\
+									\
+		if (cmp((h)->data[_r], (h)->data[_j]))			\
+			break;						\
+		heap_swap(h, _r, _j);					\
+	}								\
+} while (0)
+
+#define heap_sift_down(h, i, cmp)					\
+do {									\
+	while (i) {							\
+		size_t p = (i - 1) / 2;					\
+		if (cmp((h)->data[i], (h)->data[p]))			\
+			break;						\
+		heap_swap(h, i, p);					\
+		i = p;							\
+	}								\
+} while (0)
+
+#define heap_add(h, d, cmp)						\
+({									\
+	bool _r = !heap_full(h);					\
+	if (_r) {							\
+		size_t _i = (h)->used++;				\
+		(h)->data[_i] = d;					\
+									\
+		heap_sift_down(h, _i, cmp);				\
+		heap_sift(h, _i, cmp);					\
+	}								\
+	_r;								\
+})
+
+#define heap_pop(h, d, cmp)						\
+({									\
+	bool _r = (h)->used;						\
+	if (_r) {							\
+		(d) = (h)->data[0];					\
+		(h)->used--;						\
+		heap_swap(h, 0, (h)->used);				\
+		heap_sift(h, 0, cmp);					\
+	}								\
+	_r;								\
+})
+
+#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)
+
+#define heap_full(h)	((h)->used == (h)->size)
+
+#define DECLARE_FIFO(type, name)					\
+	struct {							\
+		size_t front, back, size, mask;				\
+		type *data;						\
+	} name
+
+#define fifo_for_each(c, fifo, iter)					\
+	for (iter = (fifo)->front;					\
+	     c = (fifo)->data[iter], iter != (fifo)->back;		\
+	     iter = (iter + 1) & (fifo)->mask)
+
+#define __init_fifo(fifo, gfp)						\
+({									\
+	size_t _allocated_size, _bytes;					\
+	BUG_ON(!(fifo)->size);						\
+									\
+	_allocated_size = roundup_pow_of_two((fifo)->size + 1);		\
+	_bytes = _allocated_size * sizeof(*(fifo)->data);		\
+									\
+	(fifo)->mask = _allocated_size - 1;				\
+	(fifo)->front = (fifo)->back = 0;				\
+	(fifo)->data = NULL;						\
+									\
+	if (_bytes < KMALLOC_MAX_SIZE)					\
+		(fifo)->data = kmalloc(_bytes, (gfp));			\
+	if ((!(fifo)->data) && ((gfp) & GFP_KERNEL))			\
+		(fifo)->data = vmalloc(_bytes);				\
+	(fifo)->data;							\
+})
+
+#define init_fifo_exact(fifo, _size, gfp)				\
+({									\
+	(fifo)->size = (_size);						\
+	__init_fifo(fifo, gfp);						\
+})
+
+#define init_fifo(fifo, _size, gfp)					\
+({									\
+	(fifo)->size = (_size);						\
+	if ((fifo)->size > 4)						\
+		(fifo)->size = roundup_pow_of_two((fifo)->size) - 1;	\
+	__init_fifo(fifo, gfp);						\
+})
+
+#define free_fifo(fifo)							\
+do {									\
+	if (is_vmalloc_addr((fifo)->data))				\
+		vfree((fifo)->data);					\
+	else								\
+		kfree((fifo)->data);					\
+	(fifo)->data = NULL;						\
+} while (0)
+
+#define fifo_used(fifo)		(((fifo)->back - (fifo)->front) & (fifo)->mask)
+#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo)	(!fifo_used(fifo))
+#define fifo_full(fifo)		(!fifo_free(fifo))
+
+#define fifo_front(fifo)	((fifo)->data[(fifo)->front])
+#define fifo_back(fifo)							\
+	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_idx(fifo, p)	(((p) - &fifo_front(fifo)) & (fifo)->mask)
+
+#define fifo_push_back(fifo, i)						\
+({									\
+	bool _r = !fifo_full((fifo));					\
+	if (_r) {							\
+		(fifo)->data[(fifo)->back++] = (i);			\
+		(fifo)->back &= (fifo)->mask;				\
+	}								\
+	_r;								\
+})
+
+#define fifo_pop_front(fifo, i)						\
+({									\
+	bool _r = !fifo_empty((fifo));					\
+	if (_r) {							\
+		(i) = (fifo)->data[(fifo)->front++];			\
+		(fifo)->front &= (fifo)->mask;				\
+	}								\
+	_r;								\
+})
+
+#define fifo_push_front(fifo, i)					\
+({									\
+	bool _r = !fifo_full((fifo));					\
+	if (_r) {							\
+		--(fifo)->front;					\
+		(fifo)->front &= (fifo)->mask;				\
+		(fifo)->data[(fifo)->front] = (i);			\
+	}								\
+	_r;								\
+})
+
+#define fifo_pop_back(fifo, i)						\
+({									\
+	bool _r = !fifo_empty((fifo));					\
+	if (_r) {							\
+		--(fifo)->back;						\
+		(fifo)->back &= (fifo)->mask;				\
+		(i) = (fifo)->data[(fifo)->back]			\
+	}								\
+	_r;								\
+})
+
+#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))
+
+#define fifo_swap(l, r)							\
+do {									\
+	swap((l)->front, (r)->front);					\
+	swap((l)->back, (r)->back);					\
+	swap((l)->size, (r)->size);					\
+	swap((l)->mask, (r)->mask);					\
+	swap((l)->data, (r)->data);					\
+} while (0)
+
+#define fifo_move(dest, src)						\
+do {									\
+	typeof(*((dest)->data)) _t;					\
+	while (!fifo_full(dest) &&					\
+	       fifo_pop(src, _t))					\
+		fifo_push(dest, _t);					\
+} while (0)
+
+/*
+ * Simple array based allocator - preallocates a number of elements and you can
+ * never allocate more than that, also has no locking.
+ *
+ * Handy because if you know you only need a fixed number of elements you don't
+ * have to worry about memory allocation failure, and sometimes a mempool isn't
+ * what you want.
+ *
+ * We treat the free elements as entries in a singly linked list, and the
+ * freelist as a stack - allocating and freeing push and pop off the freelist.
+ */
+
+#define DECLARE_ARRAY_ALLOCATOR(type, name, size)			\
+	struct {							\
+		type	*freelist;					\
+		type	data[size];					\
+	} name
+
+#define array_alloc(array)						\
+({									\
+	typeof((array)->freelist) _ret = (array)->freelist;		\
+									\
+	if (_ret)							\
+		(array)->freelist = *((typeof((array)->freelist) *) _ret);\
+									\
+	_ret;								\
+})
+
+#define array_free(array, ptr)						\
+do {									\
+	typeof((array)->freelist) _ptr = ptr;				\
+									\
+	*((typeof((array)->freelist) *) _ptr) = (array)->freelist;	\
+	(array)->freelist = _ptr;					\
+} while (0)
+
+#define array_allocator_init(array)					\
+do {									\
+	typeof((array)->freelist) _i;					\
+									\
+	BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));	\
+	(array)->freelist = NULL;					\
+									\
+	for (_i = (array)->data;					\
+	     _i < (array)->data + ARRAY_SIZE((array)->data);		\
+	     _i++)							\
+		array_free(array, _i);					\
+} while (0)
+
+#define array_freelist_empty(array)	((array)->freelist == NULL)
+
+#define ANYSINT_MAX(t)							\
+	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int bch_strtoint_h(const char *, int *);
+int bch_strtouint_h(const char *, unsigned int *);
+int bch_strtoll_h(const char *, long long *);
+int bch_strtoull_h(const char *, unsigned long long *);
+
+static inline int bch_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+	return bch_strtoint_h(cp, (int *) res);
+#else
+	return bch_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+	return bch_strtouint_h(cp, (unsigned int *) res);
+#else
+	return bch_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res)						\
+	(__builtin_types_compatible_p(typeof(*res), int)		\
+	? bch_strtoint_h(cp, (void *) res)				\
+	: __builtin_types_compatible_p(typeof(*res), long)		\
+	? bch_strtol_h(cp, (void *) res)				\
+	: __builtin_types_compatible_p(typeof(*res), long long)		\
+	? bch_strtoll_h(cp, (void *) res)				\
+	: __builtin_types_compatible_p(typeof(*res), unsigned int)	\
+	? bch_strtouint_h(cp, (void *) res)				\
+	: __builtin_types_compatible_p(typeof(*res), unsigned long)	\
+	? bch_strtoul_h(cp, (void *) res)				\
+	: __builtin_types_compatible_p(typeof(*res), unsigned long long)\
+	? bch_strtoull_h(cp, (void *) res) : -EINVAL)
+
+#define strtoul_safe(cp, var)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r)							\
+		var = _v;						\
+	_r;								\
+})
+
+#define strtoul_safe_clamp(cp, var, min, max)				\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r)							\
+		var = clamp_t(typeof(var), _v, min, max);		\
+	_r;								\
+})
+
+#define snprint(buf, size, var)						\
+	snprintf(buf, size,						\
+		__builtin_types_compatible_p(typeof(var), int)		\
+		     ? "%i\n" :						\
+		__builtin_types_compatible_p(typeof(var), unsigned)	\
+		     ? "%u\n" :						\
+		__builtin_types_compatible_p(typeof(var), long)		\
+		     ? "%li\n" :					\
+		__builtin_types_compatible_p(typeof(var), unsigned long)\
+		     ? "%lu\n" :					\
+		__builtin_types_compatible_p(typeof(var), int64_t)	\
+		     ? "%lli\n" :					\
+		__builtin_types_compatible_p(typeof(var), uint64_t)	\
+		     ? "%llu\n" :					\
+		__builtin_types_compatible_p(typeof(var), const char *)	\
+		     ? "%s\n" : "%i\n", var)
+
+ssize_t bch_hprint(char *buf, int64_t v);
+
+bool bch_is_zero(const char *p, size_t n);
+int bch_parse_uuid(const char *s, char *uuid);
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+			    size_t selected);
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[]);
+
+struct time_stats {
+	spinlock_t	lock;
+	/*
+	 * all fields are in nanoseconds, averages are ewmas stored left shifted
+	 * by 8
+	 */
+	uint64_t	max_duration;
+	uint64_t	average_duration;
+	uint64_t	average_frequency;
+	uint64_t	last;
+};
+
+void bch_time_stats_update(struct time_stats *stats, uint64_t time);
+
+static inline unsigned local_clock_us(void)
+{
+	return local_clock() >> 10;
+}
+
+#define NSEC_PER_ns			1L
+#define NSEC_PER_us			NSEC_PER_USEC
+#define NSEC_PER_ms			NSEC_PER_MSEC
+#define NSEC_PER_sec			NSEC_PER_SEC
+
+#define __print_time_stat(stats, name, stat, units)			\
+	sysfs_print(name ## _ ## stat ## _ ## units,			\
+		    div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
+
+#define sysfs_print_time_stats(stats, name,				\
+			       frequency_units,				\
+			       duration_units)				\
+do {									\
+	__print_time_stat(stats, name,					\
+			  average_frequency,	frequency_units);	\
+	__print_time_stat(stats, name,					\
+			  average_duration,	duration_units);	\
+	__print_time_stat(stats, name,					\
+			  max_duration,		duration_units);	\
+									\
+	sysfs_print(name ## _last_ ## frequency_units, (stats)->last	\
+		    ? div_s64(local_clock() - (stats)->last,		\
+			      NSEC_PER_ ## frequency_units)		\
+		    : -1LL);						\
+} while (0)
+
+#define sysfs_time_stats_attribute(name,				\
+				   frequency_units,			\
+				   duration_units)			\
+read_attribute(name ## _average_frequency_ ## frequency_units);		\
+read_attribute(name ## _average_duration_ ## duration_units);		\
+read_attribute(name ## _max_duration_ ## duration_units);		\
+read_attribute(name ## _last_ ## frequency_units)
+
+#define sysfs_time_stats_attribute_list(name,				\
+					frequency_units,		\
+					duration_units)			\
+&sysfs_ ## name ## _average_frequency_ ## frequency_units,		\
+&sysfs_ ## name ## _average_duration_ ## duration_units,		\
+&sysfs_ ## name ## _max_duration_ ## duration_units,			\
+&sysfs_ ## name ## _last_ ## frequency_units,
+
+#define ewma_add(ewma, val, weight, factor)				\
+({									\
+	(ewma) *= (weight) - 1;						\
+	(ewma) += (val) << factor;					\
+	(ewma) /= (weight);						\
+	(ewma) >> factor;						\
+})
+
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
+	uint64_t		next;
+
+	/*
+	 * Rate at which we want to do work, in units per nanosecond
+	 * The units here correspond to the units passed to bch_next_delay()
+	 */
+	unsigned		rate;
+};
+
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
+{
+	d->next = local_clock();
+}
+
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
+
+#define __DIV_SAFE(n, d, zero)						\
+({									\
+	typeof(n) _n = (n);						\
+	typeof(d) _d = (d);						\
+	_d ? _n / _d : zero;						\
+})
+
+#define DIV_SAFE(n, d)	__DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member)				\
+({									\
+	typeof(ptr) _ptr = ptr;						\
+	_ptr ? container_of(_ptr, type, member) : NULL;			\
+})
+
+#define RB_INSERT(root, new, member, cmp)				\
+({									\
+	__label__ dup;							\
+	struct rb_node **n = &(root)->rb_node, *parent = NULL;		\
+	typeof(new) this;						\
+	int res, ret = -1;						\
+									\
+	while (*n) {							\
+		parent = *n;						\
+		this = container_of(*n, typeof(*(new)), member);	\
+		res = cmp(new, this);					\
+		if (!res)						\
+			goto dup;					\
+		n = res < 0						\
+			? &(*n)->rb_left				\
+			: &(*n)->rb_right;				\
+	}								\
+									\
+	rb_link_node(&(new)->member, parent, n);			\
+	rb_insert_color(&(new)->member, root);				\
+	ret = 0;							\
+dup:									\
+	ret;								\
+})
+
+#define RB_SEARCH(root, search, member, cmp)				\
+({									\
+	struct rb_node *n = (root)->rb_node;				\
+	typeof(&(search)) this, ret = NULL;				\
+	int res;							\
+									\
+	while (n) {							\
+		this = container_of(n, typeof(search), member);		\
+		res = cmp(&(search), this);				\
+		if (!res) {						\
+			ret = this;					\
+			break;						\
+		}							\
+		n = res < 0						\
+			? n->rb_left					\
+			: n->rb_right;					\
+	}								\
+	ret;								\
+})
+
+#define RB_GREATER(root, search, member, cmp)				\
+({									\
+	struct rb_node *n = (root)->rb_node;				\
+	typeof(&(search)) this, ret = NULL;				\
+	int res;							\
+									\
+	while (n) {							\
+		this = container_of(n, typeof(search), member);		\
+		res = cmp(&(search), this);				\
+		if (res < 0) {						\
+			ret = this;					\
+			n = n->rb_left;					\
+		} else							\
+			n = n->rb_right;				\
+	}								\
+	ret;								\
+})
+
+#define RB_FIRST(root, type, member)					\
+	container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member)					\
+	container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member)						\
+	container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member)						\
+	container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+
+/* Does linear interpolation between powers of two */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+	unsigned fract = x & ~(~0 << fract_bits);
+
+	x >>= fract_bits;
+	x   = 1 << x;
+	x  += (x * fract) >> fract_bits;
+
+	return x;
+}
+
+void bch_bio_map(struct bio *bio, void *base);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+	return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl, dev)				\
+do {									\
+	closure_get(cl);						\
+	bch_generic_make_request(bio, &(dev)->bio_split_hook);		\
+} while (0)
+
+uint64_t bch_crc64_update(uint64_t, const void *, size_t);
+uint64_t bch_crc64(const void *, size_t);
+
+#endif /* _BCACHE_UTIL_H */
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
new file mode 100644
index 00000000000..f4300e4c011
--- /dev/null
+++ b/drivers/md/bcache/writeback.c
@@ -0,0 +1,507 @@
+/*
+ * background writeback - scan btree for dirty data and write it to the backing
+ * device
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "writeback.h"
+
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+/* Rate limiting */
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+	struct cache_set *c = dc->disk.c;
+	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+	uint64_t cache_dirty_target =
+		div_u64(cache_sectors * dc->writeback_percent, 100);
+
+	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
+				   c->cached_dev_sectors);
+
+	/* PD controller */
+
+	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
+	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+	int64_t proportional = dirty - target;
+	int64_t change;
+
+	dc->disk.sectors_dirty_last = dirty;
+
+	/* Scale to sectors per second */
+
+	proportional *= dc->writeback_rate_update_seconds;
+	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
+
+	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
+
+	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+			      (dc->writeback_rate_d_term /
+			       dc->writeback_rate_update_seconds) ?: 1, 0);
+
+	derivative *= dc->writeback_rate_d_term;
+	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
+
+	change = proportional + derivative;
+
+	/* Don't increase writeback rate if the device isn't keeping up */
+	if (change > 0 &&
+	    time_after64(local_clock(),
+			 dc->writeback_rate.next + NSEC_PER_MSEC))
+		change = 0;
+
+	dc->writeback_rate.rate =
+		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
+			1, NSEC_PER_MSEC);
+
+	dc->writeback_rate_proportional = proportional;
+	dc->writeback_rate_derivative = derivative;
+	dc->writeback_rate_change = change;
+	dc->writeback_rate_target = target;
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+	struct cached_dev *dc = container_of(to_delayed_work(work),
+					     struct cached_dev,
+					     writeback_rate_update);
+
+	down_read(&dc->writeback_lock);
+
+	if (atomic_read(&dc->has_dirty) &&
+	    dc->writeback_percent)
+		__update_writeback_rate(dc);
+
+	up_read(&dc->writeback_lock);
+
+	schedule_delayed_work(&dc->writeback_rate_update,
+			      dc->writeback_rate_update_seconds * HZ);
+}
+
+static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+{
+	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+	    !dc->writeback_percent)
+		return 0;
+
+	return bch_next_delay(&dc->writeback_rate, sectors);
+}
+
+struct dirty_io {
+	struct closure		cl;
+	struct cached_dev	*dc;
+	struct bio		bio;
+};
+
+static void dirty_init(struct keybuf_key *w)
+{
+	struct dirty_io *io = w->private;
+	struct bio *bio = &io->bio;
+
+	bio_init(bio);
+	if (!io->dc->writeback_percent)
+		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
+	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
+	bio->bi_private		= w;
+	bio->bi_io_vec		= bio->bi_inline_vecs;
+	bch_bio_map(bio, NULL);
+}
+
+static void dirty_io_destructor(struct closure *cl)
+{
+	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	kfree(io);
+}
+
+static void write_dirty_finish(struct closure *cl)
+{
+	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	struct keybuf_key *w = io->bio.bi_private;
+	struct cached_dev *dc = io->dc;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, &io->bio, i)
+		__free_page(bv->bv_page);
+
+	/* This is kind of a dumb way of signalling errors. */
+	if (KEY_DIRTY(&w->key)) {
+		int ret;
+		unsigned i;
+		struct keylist keys;
+
+		bch_keylist_init(&keys);
+
+		bkey_copy(keys.top, &w->key);
+		SET_KEY_DIRTY(keys.top, false);
+		bch_keylist_push(&keys);
+
+		for (i = 0; i < KEY_PTRS(&w->key); i++)
+			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+
+		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
+
+		if (ret)
+			trace_bcache_writeback_collision(&w->key);
+
+		atomic_long_inc(ret
+				? &dc->disk.c->writeback_keys_failed
+				: &dc->disk.c->writeback_keys_done);
+	}
+
+	bch_keybuf_del(&dc->writeback_keys, w);
+	up(&dc->in_flight);
+
+	closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio, int error)
+{
+	struct keybuf_key *w = bio->bi_private;
+	struct dirty_io *io = w->private;
+
+	if (error)
+		SET_KEY_DIRTY(&w->key, false);
+
+	closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	struct keybuf_key *w = io->bio.bi_private;
+
+	dirty_init(w);
+	io->bio.bi_rw		= WRITE;
+	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+	io->bio.bi_bdev		= io->dc->bdev;
+	io->bio.bi_end_io	= dirty_endio;
+
+	closure_bio_submit(&io->bio, cl, &io->dc->disk);
+
+	continue_at(cl, write_dirty_finish, system_wq);
+}
+
+static void read_dirty_endio(struct bio *bio, int error)
+{
+	struct keybuf_key *w = bio->bi_private;
+	struct dirty_io *io = w->private;
+
+	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
+			    error, "reading dirty data from cache");
+
+	dirty_endio(bio, error);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+	closure_bio_submit(&io->bio, cl, &io->dc->disk);
+
+	continue_at(cl, write_dirty, system_wq);
+}
+
+static void read_dirty(struct cached_dev *dc)
+{
+	unsigned delay = 0;
+	struct keybuf_key *w;
+	struct dirty_io *io;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	/*
+	 * XXX: if we error, background writeback just spins. Should use some
+	 * mempools.
+	 */
+
+	while (!kthread_should_stop()) {
+		try_to_freeze();
+
+		w = bch_keybuf_next(&dc->writeback_keys);
+		if (!w)
+			break;
+
+		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
+
+		if (KEY_START(&w->key) != dc->last_read ||
+		    jiffies_to_msecs(delay) > 50)
+			while (!kthread_should_stop() && delay)
+				delay = schedule_timeout_uninterruptible(delay);
+
+		dc->last_read	= KEY_OFFSET(&w->key);
+
+		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
+			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+			     GFP_KERNEL);
+		if (!io)
+			goto err;
+
+		w->private	= io;
+		io->dc		= dc;
+
+		dirty_init(w);
+		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
+						    &w->key, 0)->bdev;
+		io->bio.bi_rw		= READ;
+		io->bio.bi_end_io	= read_dirty_endio;
+
+		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
+			goto err_free;
+
+		trace_bcache_writeback(&w->key);
+
+		down(&dc->in_flight);
+		closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+
+		delay = writeback_delay(dc, KEY_SIZE(&w->key));
+	}
+
+	if (0) {
+err_free:
+		kfree(w->private);
+err:
+		bch_keybuf_del(&dc->writeback_keys, w);
+	}
+
+	/*
+	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+	 * freed) before refilling again
+	 */
+	closure_sync(&cl);
+}
+
+/* Scan for dirty data */
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+				  uint64_t offset, int nr_sectors)
+{
+	struct bcache_device *d = c->devices[inode];
+	unsigned stripe_offset, stripe, sectors_dirty;
+
+	if (!d)
+		return;
+
+	stripe = offset_to_stripe(d, offset);
+	stripe_offset = offset & (d->stripe_size - 1);
+
+	while (nr_sectors) {
+		int s = min_t(unsigned, abs(nr_sectors),
+			      d->stripe_size - stripe_offset);
+
+		if (nr_sectors < 0)
+			s = -s;
+
+		if (stripe >= d->nr_stripes)
+			return;
+
+		sectors_dirty = atomic_add_return(s,
+					d->stripe_sectors_dirty + stripe);
+		if (sectors_dirty == d->stripe_size)
+			set_bit(stripe, d->full_dirty_stripes);
+		else
+			clear_bit(stripe, d->full_dirty_stripes);
+
+		nr_sectors -= s;
+		stripe_offset = 0;
+		stripe++;
+	}
+}
+
+static bool dirty_pred(struct keybuf *buf, struct bkey *k)
+{
+	return KEY_DIRTY(k);
+}
+
+static void refill_full_stripes(struct cached_dev *dc)
+{
+	struct keybuf *buf = &dc->writeback_keys;
+	unsigned start_stripe, stripe, next_stripe;
+	bool wrapped = false;
+
+	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
+
+	if (stripe >= dc->disk.nr_stripes)
+		stripe = 0;
+
+	start_stripe = stripe;
+
+	while (1) {
+		stripe = find_next_bit(dc->disk.full_dirty_stripes,
+				       dc->disk.nr_stripes, stripe);
+
+		if (stripe == dc->disk.nr_stripes)
+			goto next;
+
+		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
+						 dc->disk.nr_stripes, stripe);
+
+		buf->last_scanned = KEY(dc->disk.id,
+					stripe * dc->disk.stripe_size, 0);
+
+		bch_refill_keybuf(dc->disk.c, buf,
+				  &KEY(dc->disk.id,
+				       next_stripe * dc->disk.stripe_size, 0),
+				  dirty_pred);
+
+		if (array_freelist_empty(&buf->freelist))
+			return;
+
+		stripe = next_stripe;
+next:
+		if (wrapped && stripe > start_stripe)
+			return;
+
+		if (stripe == dc->disk.nr_stripes) {
+			stripe = 0;
+			wrapped = true;
+		}
+	}
+}
+
+static bool refill_dirty(struct cached_dev *dc)
+{
+	struct keybuf *buf = &dc->writeback_keys;
+	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
+	bool searched_from_start = false;
+
+	if (dc->partial_stripes_expensive) {
+		refill_full_stripes(dc);
+		if (array_freelist_empty(&buf->freelist))
+			return false;
+	}
+
+	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
+		buf->last_scanned = KEY(dc->disk.id, 0, 0);
+		searched_from_start = true;
+	}
+
+	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
+
+	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+}
+
+static int bch_writeback_thread(void *arg)
+{
+	struct cached_dev *dc = arg;
+	bool searched_full_index;
+
+	while (!kthread_should_stop()) {
+		down_write(&dc->writeback_lock);
+		if (!atomic_read(&dc->has_dirty) ||
+		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
+		     !dc->writeback_running)) {
+			up_write(&dc->writeback_lock);
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			if (kthread_should_stop())
+				return 0;
+
+			try_to_freeze();
+			schedule();
+			continue;
+		}
+
+		searched_full_index = refill_dirty(dc);
+
+		if (searched_full_index &&
+		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+			atomic_set(&dc->has_dirty, 0);
+			cached_dev_put(dc);
+			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+			bch_write_bdev_super(dc, NULL);
+		}
+
+		up_write(&dc->writeback_lock);
+
+		bch_ratelimit_reset(&dc->writeback_rate);
+		read_dirty(dc);
+
+		if (searched_full_index) {
+			unsigned delay = dc->writeback_delay * HZ;
+
+			while (delay &&
+			       !kthread_should_stop() &&
+			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+				delay = schedule_timeout_uninterruptible(delay);
+		}
+	}
+
+	return 0;
+}
+
+/* Init */
+
+struct sectors_dirty_init {
+	struct btree_op	op;
+	unsigned	inode;
+};
+
+static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
+				 struct bkey *k)
+{
+	struct sectors_dirty_init *op = container_of(_op,
+						struct sectors_dirty_init, op);
+	if (KEY_INODE(k) > op->inode)
+		return MAP_DONE;
+
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+					     KEY_START(k), KEY_SIZE(k));
+
+	return MAP_CONTINUE;
+}
+
+void bch_sectors_dirty_init(struct cached_dev *dc)
+{
+	struct sectors_dirty_init op;
+
+	bch_btree_op_init(&op.op, -1);
+	op.inode = dc->disk.id;
+
+	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+			   sectors_dirty_init_fn, 0);
+
+	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+}
+
+int bch_cached_dev_writeback_init(struct cached_dev *dc)
+{
+	sema_init(&dc->in_flight, 64);
+	init_rwsem(&dc->writeback_lock);
+	bch_keybuf_init(&dc->writeback_keys);
+
+	dc->writeback_metadata		= true;
+	dc->writeback_running		= true;
+	dc->writeback_percent		= 10;
+	dc->writeback_delay		= 30;
+	dc->writeback_rate.rate		= 1024;
+
+	dc->writeback_rate_update_seconds = 5;
+	dc->writeback_rate_d_term	= 30;
+	dc->writeback_rate_p_term_inverse = 6000;
+
+	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+					      "bcache_writeback");
+	if (IS_ERR(dc->writeback_thread))
+		return PTR_ERR(dc->writeback_thread);
+
+	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+	schedule_delayed_work(&dc->writeback_rate_update,
+			      dc->writeback_rate_update_seconds * HZ);
+
+	return 0;
+}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
new file mode 100644
index 00000000000..e2f8598937a
--- /dev/null
+++ b/drivers/md/bcache/writeback.h
@@ -0,0 +1,90 @@
+#ifndef _BCACHE_WRITEBACK_H
+#define _BCACHE_WRITEBACK_H
+
+#define CUTOFF_WRITEBACK	40
+#define CUTOFF_WRITEBACK_SYNC	70
+
+static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
+{
+	uint64_t i, ret = 0;
+
+	for (i = 0; i < d->nr_stripes; i++)
+		ret += atomic_read(d->stripe_sectors_dirty + i);
+
+	return ret;
+}
+
+static inline unsigned offset_to_stripe(struct bcache_device *d,
+					uint64_t offset)
+{
+	do_div(offset, d->stripe_size);
+	return offset;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
+					   uint64_t offset,
+					   unsigned nr_sectors)
+{
+	unsigned stripe = offset_to_stripe(&dc->disk, offset);
+
+	while (1) {
+		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
+			return true;
+
+		if (nr_sectors <= dc->disk.stripe_size)
+			return false;
+
+		nr_sectors -= dc->disk.stripe_size;
+		stripe++;
+	}
+}
+
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+				    unsigned cache_mode, bool would_skip)
+{
+	unsigned in_use = dc->disk.c->gc_stats.in_use;
+
+	if (cache_mode != CACHE_MODE_WRITEBACK ||
+	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+	    in_use > CUTOFF_WRITEBACK_SYNC)
+		return false;
+
+	if (dc->partial_stripes_expensive &&
+	    bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
+				    bio_sectors(bio)))
+		return true;
+
+	if (would_skip)
+		return false;
+
+	return bio->bi_rw & REQ_SYNC ||
+		in_use <= CUTOFF_WRITEBACK;
+}
+
+static inline void bch_writeback_queue(struct cached_dev *dc)
+{
+	wake_up_process(dc->writeback_thread);
+}
+
+static inline void bch_writeback_add(struct cached_dev *dc)
+{
+	if (!atomic_read(&dc->has_dirty) &&
+	    !atomic_xchg(&dc->has_dirty, 1)) {
+		atomic_inc(&dc->count);
+
+		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+			/* XXX: should do this synchronously */
+			bch_write_bdev_super(dc, NULL);
+		}
+
+		bch_writeback_queue(dc);
+	}
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
+
+void bch_sectors_dirty_init(struct cached_dev *dc);
+int bch_cached_dev_writeback_init(struct cached_dev *);
+
+#endif
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7155945f8eb..67f8b31e205 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -337,7 +337,7 @@ static int read_page(struct file *file, unsigned long index,
 		     struct page *page)
 {
 	int ret = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct buffer_head *bh;
 	sector_t block;
 
@@ -669,17 +669,13 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store,
 /*
  * return a pointer to the page in the filemap that contains the given bit
  *
- * this lookup is complicated by the fact that the bitmap sb might be exactly
- * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
- * 0 or page 1
  */
 static inline struct page *filemap_get_page(struct bitmap_storage *store,
 					    unsigned long chunk)
 {
 	if (file_page_index(store, chunk) >= store->file_pages)
 		return NULL;
-	return store->filemap[file_page_index(store, chunk)
-			      - file_page_index(store, 0)];
+	return store->filemap[file_page_index(store, chunk)];
 }
 
 static int bitmap_storage_alloc(struct bitmap_storage *store,
@@ -755,7 +751,7 @@ static void bitmap_file_unmap(struct bitmap_storage *store)
 		free_buffers(sb_page);
 
 	if (file) {
-		struct inode *inode = file->f_path.dentry->d_inode;
+		struct inode *inode = file_inode(file);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		fput(file);
 	}
@@ -846,7 +842,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 		set_bit(bit, kaddr);
 	else
-		test_and_set_bit_le(bit, kaddr);
+		set_bit_le(bit, kaddr);
 	kunmap_atomic(kaddr);
 	pr_debug("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
@@ -868,7 +864,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 		clear_bit(bit, paddr);
 	else
-		test_and_clear_bit_le(bit, paddr);
+		clear_bit_le(bit, paddr);
 	kunmap_atomic(paddr);
 	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
 		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
@@ -1635,7 +1631,7 @@ int bitmap_create(struct mddev *mddev)
 	sector_t blocks = mddev->resync_max_sectors;
 	struct file *file = mddev->bitmap_info.file;
 	int err;
-	struct sysfs_dirent *bm = NULL;
+	struct kernfs_node *bm = NULL;
 
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
@@ -1654,9 +1650,9 @@ int bitmap_create(struct mddev *mddev)
 	bitmap->mddev = mddev;
 
 	if (mddev->kobj.sd)
-		bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap");
+		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
 	if (bm) {
-		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear");
+		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
 		sysfs_put(bm);
 	} else
 		bitmap->sysfs_can_clear = NULL;
@@ -1988,7 +1984,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		if (mddev->bitmap_info.file) {
 			struct file *f = mddev->bitmap_info.file;
 			mddev->bitmap_info.file = NULL;
-			restore_bitmap_write_access(f);
 			fput(f);
 		}
 	} else {
@@ -2002,9 +1997,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		} else {
 			int rv;
 			if (buf[0] == '+')
-				rv = strict_strtoll(buf+1, 10, &offset);
+				rv = kstrtoll(buf+1, 10, &offset);
 			else
-				rv = strict_strtoll(buf, 10, &offset);
+				rv = kstrtoll(buf, 10, &offset);
 			if (rv)
 				return rv;
 			if (offset == 0)
@@ -2139,7 +2134,7 @@ static ssize_t
 backlog_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long backlog;
-	int rv = strict_strtoul(buf, 10, &backlog);
+	int rv = kstrtoul(buf, 10, &backlog);
 	if (rv)
 		return rv;
 	if (backlog > COUNTER_MAX)
@@ -2165,7 +2160,7 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
 	unsigned long csize;
 	if (mddev->bitmap)
 		return -EBUSY;
-	rv = strict_strtoul(buf, 10, &csize);
+	rv = kstrtoul(buf, 10, &csize);
 	if (rv)
 		return rv;
 	if (csize < 512 ||
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index df4aeb6ac6f..30210b9c4ef 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -225,7 +225,7 @@ struct bitmap {
 	wait_queue_head_t overflow_wait;
 	wait_queue_head_t behind_wait;
 
-	struct sysfs_dirent *sysfs_can_clear;
+	struct kernfs_node *sysfs_can_clear;
 };
 
 /* the bitmap API */
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index e4e84156745..f752d12081f 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,21 +14,17 @@
 
 /*----------------------------------------------------------------*/
 
-struct dm_bio_prison_cell {
-	struct hlist_node list;
-	struct dm_bio_prison *prison;
-	struct dm_cell_key key;
-	struct bio *holder;
-	struct bio_list bios;
+struct bucket {
+	spinlock_t lock;
+	struct hlist_head cells;
 };
 
 struct dm_bio_prison {
-	spinlock_t lock;
 	mempool_t *cell_pool;
 
 	unsigned nr_buckets;
 	unsigned hash_mask;
-	struct hlist_head *cells;
+	struct bucket *buckets;
 };
 
 /*----------------------------------------------------------------*/
@@ -48,6 +44,12 @@ static uint32_t calc_nr_buckets(unsigned nr_cells)
 
 static struct kmem_cache *_cell_cache;
 
+static void init_bucket(struct bucket *b)
+{
+	spin_lock_init(&b->lock);
+	INIT_HLIST_HEAD(&b->cells);
+}
+
 /*
  * @nr_cells should be the number of cells you want in use _concurrently_.
  * Don't confuse it with the number of distinct keys.
@@ -57,13 +59,12 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
 	unsigned i;
 	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
 	size_t len = sizeof(struct dm_bio_prison) +
-		(sizeof(struct hlist_head) * nr_buckets);
+		(sizeof(struct bucket) * nr_buckets);
 	struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
 
 	if (!prison)
 		return NULL;
 
-	spin_lock_init(&prison->lock);
 	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
 	if (!prison->cell_pool) {
 		kfree(prison);
@@ -72,9 +73,9 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
 
 	prison->nr_buckets = nr_buckets;
 	prison->hash_mask = nr_buckets - 1;
-	prison->cells = (struct hlist_head *) (prison + 1);
+	prison->buckets = (struct bucket *) (prison + 1);
 	for (i = 0; i < nr_buckets; i++)
-		INIT_HLIST_HEAD(prison->cells + i);
+		init_bucket(prison->buckets + i);
 
 	return prison;
 }
@@ -87,6 +88,19 @@ void dm_bio_prison_destroy(struct dm_bio_prison *prison)
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
 
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
+{
+	return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
+
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+			     struct dm_bio_prison_cell *cell)
+{
+	mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
+
 static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
 {
 	const unsigned long BIG_PRIME = 4294967291UL;
@@ -102,175 +116,154 @@ static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
 		       (lhs->block == rhs->block);
 }
 
-static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
+static struct bucket *get_bucket(struct dm_bio_prison *prison,
+				 struct dm_cell_key *key)
+{
+	return prison->buckets + hash_key(prison, key);
+}
+
+static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
 						  struct dm_cell_key *key)
 {
 	struct dm_bio_prison_cell *cell;
-	struct hlist_node *tmp;
 
-	hlist_for_each_entry(cell, tmp, bucket, list)
+	hlist_for_each_entry(cell, &b->cells, list)
 		if (keys_equal(&cell->key, key))
 			return cell;
 
 	return NULL;
 }
 
-/*
- * This may block if a new cell needs allocating.  You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
- *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
- */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
-		  struct bio *inmate, struct dm_bio_prison_cell **ref)
+static void __setup_new_cell(struct bucket *b,
+			     struct dm_cell_key *key,
+			     struct bio *holder,
+			     struct dm_bio_prison_cell *cell)
 {
-	int r = 1;
-	unsigned long flags;
-	uint32_t hash = hash_key(prison, key);
-	struct dm_bio_prison_cell *cell, *cell2;
-
-	BUG_ON(hash > prison->nr_buckets);
-
-	spin_lock_irqsave(&prison->lock, flags);
+	memcpy(&cell->key, key, sizeof(cell->key));
+	cell->holder = holder;
+	bio_list_init(&cell->bios);
+	hlist_add_head(&cell->list, &b->cells);
+}
 
-	cell = __search_bucket(prison->cells + hash, key);
-	if (cell) {
-		bio_list_add(&cell->bios, inmate);
-		goto out;
-	}
+static int __bio_detain(struct bucket *b,
+			struct dm_cell_key *key,
+			struct bio *inmate,
+			struct dm_bio_prison_cell *cell_prealloc,
+			struct dm_bio_prison_cell **cell_result)
+{
+	struct dm_bio_prison_cell *cell;
 
-	/*
-	 * Allocate a new cell
-	 */
-	spin_unlock_irqrestore(&prison->lock, flags);
-	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
-	spin_lock_irqsave(&prison->lock, flags);
-
-	/*
-	 * We've been unlocked, so we have to double check that
-	 * nobody else has inserted this cell in the meantime.
-	 */
-	cell = __search_bucket(prison->cells + hash, key);
+	cell = __search_bucket(b, key);
 	if (cell) {
-		mempool_free(cell2, prison->cell_pool);
-		bio_list_add(&cell->bios, inmate);
-		goto out;
+		if (inmate)
+			bio_list_add(&cell->bios, inmate);
+		*cell_result = cell;
+		return 1;
 	}
 
-	/*
-	 * Use new cell.
-	 */
-	cell = cell2;
-
-	cell->prison = prison;
-	memcpy(&cell->key, key, sizeof(cell->key));
-	cell->holder = inmate;
-	bio_list_init(&cell->bios);
-	hlist_add_head(&cell->list, prison->cells + hash);
-
-	r = 0;
+	__setup_new_cell(b, key, inmate, cell_prealloc);
+	*cell_result = cell_prealloc;
+	return 0;
+}
 
-out:
-	spin_unlock_irqrestore(&prison->lock, flags);
+static int bio_detain(struct dm_bio_prison *prison,
+		      struct dm_cell_key *key,
+		      struct bio *inmate,
+		      struct dm_bio_prison_cell *cell_prealloc,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	unsigned long flags;
+	struct bucket *b = get_bucket(prison, key);
 
-	*ref = cell;
+	spin_lock_irqsave(&b->lock, flags);
+	r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
+	spin_unlock_irqrestore(&b->lock, flags);
 
 	return r;
 }
+
+int dm_bio_detain(struct dm_bio_prison *prison,
+		  struct dm_cell_key *key,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell *cell_prealloc,
+		  struct dm_bio_prison_cell **cell_result)
+{
+	return bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+}
 EXPORT_SYMBOL_GPL(dm_bio_detain);
 
+int dm_get_cell(struct dm_bio_prison *prison,
+		struct dm_cell_key *key,
+		struct dm_bio_prison_cell *cell_prealloc,
+		struct dm_bio_prison_cell **cell_result)
+{
+	return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
+}
+EXPORT_SYMBOL_GPL(dm_get_cell);
+
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release(struct dm_bio_prison_cell *cell,
+			   struct bio_list *inmates)
 {
-	struct dm_bio_prison *prison = cell->prison;
-
 	hlist_del(&cell->list);
 
 	if (inmates) {
-		bio_list_add(inmates, cell->holder);
+		if (cell->holder)
+			bio_list_add(inmates, cell->holder);
 		bio_list_merge(inmates, &cell->bios);
 	}
-
-	mempool_free(cell, prison->cell_pool);
 }
 
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
+void dm_cell_release(struct dm_bio_prison *prison,
+		     struct dm_bio_prison_cell *cell,
+		     struct bio_list *bios)
 {
 	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
+	struct bucket *b = get_bucket(prison, &cell->key);
 
-	spin_lock_irqsave(&prison->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 	__cell_release(cell, bios);
-	spin_unlock_irqrestore(&prison->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release);
 
 /*
- * There are a couple of places where we put a bio into a cell briefly
- * before taking it out again.  In these situations we know that no other
- * bio may be in the cell.  This function releases the cell, and also does
- * a sanity check.
- */
-static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-	BUG_ON(cell->holder != bio);
-	BUG_ON(!bio_list_empty(&cell->bios));
-
-	__cell_release(cell, NULL);
-}
-
-void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
-
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release_singleton(cell, bio);
-	spin_unlock_irqrestore(&prison->lock, flags);
-}
-EXPORT_SYMBOL_GPL(dm_cell_release_singleton);
-
-/*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+				     struct bio_list *inmates)
 {
-	struct dm_bio_prison *prison = cell->prison;
-
 	hlist_del(&cell->list);
 	bio_list_merge(inmates, &cell->bios);
-
-	mempool_free(cell, prison->cell_pool);
 }
 
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell,
+			       struct bio_list *inmates)
 {
 	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
+	struct bucket *b = get_bucket(prison, &cell->key);
 
-	spin_lock_irqsave(&prison->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 	__cell_release_no_holder(cell, inmates);
-	spin_unlock_irqrestore(&prison->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
-void dm_cell_error(struct dm_bio_prison_cell *cell)
+void dm_cell_error(struct dm_bio_prison *prison,
+		   struct dm_bio_prison_cell *cell, int error)
 {
-	struct dm_bio_prison *prison = cell->prison;
 	struct bio_list bios;
 	struct bio *bio;
-	unsigned long flags;
 
 	bio_list_init(&bios);
-
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release(cell, &bios);
-	spin_unlock_irqrestore(&prison->lock, flags);
+	dm_cell_release(prison, cell, &bios);
 
 	while ((bio = bio_list_pop(&bios)))
-		bio_io_error(bio);
+		bio_endio(bio, error);
 }
 EXPORT_SYMBOL_GPL(dm_cell_error);
 
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 4e0ac376700..6805a142b75 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -22,7 +22,6 @@
  * subsequently unlocked the bios become available.
  */
 struct dm_bio_prison;
-struct dm_bio_prison_cell;
 
 /* FIXME: this needs to be more abstract */
 struct dm_cell_key {
@@ -31,22 +30,62 @@ struct dm_cell_key {
 	dm_block_t block;
 };
 
+/*
+ * Treat this as opaque, only in header so callers can manage allocation
+ * themselves.
+ */
+struct dm_bio_prison_cell {
+	struct hlist_node list;
+	struct dm_cell_key key;
+	struct bio *holder;
+	struct bio_list bios;
+};
+
 struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
 void dm_bio_prison_destroy(struct dm_bio_prison *prison);
 
 /*
- * This may block if a new cell needs allocating.  You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
+ * These two functions just wrap a mempool.  This is a transitory step:
+ * Eventually all bio prison clients should manage their own cell memory.
  *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called
+ * in interrupt context or passed GFP_NOWAIT.
  */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
-		  struct bio *inmate, struct dm_bio_prison_cell **ref);
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison,
+						    gfp_t gfp);
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+			     struct dm_bio_prison_cell *cell);
 
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
-void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio); // FIXME: bio arg not needed
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
-void dm_cell_error(struct dm_bio_prison_cell *cell);
+/*
+ * Creates, or retrieves a cell for the given key.
+ *
+ * Returns 1 if pre-existing cell returned, zero if new cell created using
+ * @cell_prealloc.
+ */
+int dm_get_cell(struct dm_bio_prison *prison,
+		struct dm_cell_key *key,
+		struct dm_bio_prison_cell *cell_prealloc,
+		struct dm_bio_prison_cell **cell_result);
+
+/*
+ * An atomic op that combines retrieving a cell, and adding a bio to it.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison,
+		  struct dm_cell_key *key,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell *cell_prealloc,
+		  struct dm_bio_prison_cell **cell_result);
+
+void dm_cell_release(struct dm_bio_prison *prison,
+		     struct dm_bio_prison_cell *cell,
+		     struct bio_list *bios);
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell,
+			       struct bio_list *inmates);
+void dm_cell_error(struct dm_bio_prison *prison,
+		   struct dm_bio_prison_cell *cell, int error);
 
 /*----------------------------------------------------------------*/
 
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 3a8cfa2645c..dd364611156 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -17,55 +17,24 @@
  * original bio state.
  */
 
-struct dm_bio_vec_details {
-#if PAGE_SIZE < 65536
-	__u16 bv_len;
-	__u16 bv_offset;
-#else
-	unsigned bv_len;
-	unsigned bv_offset;
-#endif
-};
-
 struct dm_bio_details {
-	sector_t bi_sector;
 	struct block_device *bi_bdev;
-	unsigned int bi_size;
-	unsigned short bi_idx;
 	unsigned long bi_flags;
-	struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
+	struct bvec_iter bi_iter;
 };
 
 static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
 {
-	unsigned i;
-
-	bd->bi_sector = bio->bi_sector;
 	bd->bi_bdev = bio->bi_bdev;
-	bd->bi_size = bio->bi_size;
-	bd->bi_idx = bio->bi_idx;
 	bd->bi_flags = bio->bi_flags;
-
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
-		bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
-	}
+	bd->bi_iter = bio->bi_iter;
 }
 
 static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
 {
-	unsigned i;
-
-	bio->bi_sector = bd->bi_sector;
 	bio->bi_bdev = bd->bi_bdev;
-	bio->bi_size = bd->bi_size;
-	bio->bi_idx = bd->bi_idx;
 	bio->bi_flags = bd->bi_flags;
-
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
-		bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
-	}
+	bio->bi_iter = bd->bi_iter;
 }
 
 #endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 651ca79881d..d724459860d 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -104,6 +104,8 @@ struct dm_bufio_client {
 	struct list_head reserved_buffers;
 	unsigned need_reserved_buffers;
 
+	unsigned minimum_buffers;
+
 	struct hlist_head *cache_hash;
 	wait_queue_head_t free_buffer_wait;
 
@@ -145,6 +147,7 @@ struct dm_buffer {
 	unsigned long state;
 	unsigned long last_accessed;
 	struct dm_bufio_client *c;
+	struct list_head write_list;
 	struct bio bio;
 	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
 };
@@ -319,6 +322,9 @@ static void __cache_size_refresh(void)
 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 			       enum data_mode *data_mode)
 {
+	unsigned noio_flag;
+	void *ptr;
+
 	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
 		*data_mode = DATA_MODE_SLAB;
 		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
@@ -332,7 +338,26 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 	}
 
 	*data_mode = DATA_MODE_VMALLOC;
-	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+
+	/*
+	 * __vmalloc allocates the data pages and auxiliary structures with
+	 * gfp_flags that were specified, but pagetables are always allocated
+	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
+	 *
+	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
+	 * all allocations done by this process (including pagetables) are done
+	 * as if GFP_NOIO was specified.
+	 */
+
+	if (gfp_mask & __GFP_NORETRY)
+		noio_flag = memalloc_noio_save();
+
+	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
+
+	if (gfp_mask & __GFP_NORETRY)
+		memalloc_noio_restore(noio_flag);
+
+	return ptr;
 }
 
 /*
@@ -515,7 +540,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 	bio_init(&b->bio);
 	b->bio.bi_io_vec = b->bio_vec;
 	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
-	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
+	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
 	b->bio.bi_bdev = b->c->bdev;
 	b->bio.bi_end_io = end_io;
 
@@ -582,9 +607,9 @@ static void write_endio(struct bio *bio, int error)
 
 	BUG_ON(!test_bit(B_WRITING, &b->state));
 
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(B_WRITING, &b->state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 
 	wake_up_bit(&b->state, B_WRITING);
 }
@@ -608,7 +633,8 @@ static int do_io_schedule(void *word)
  * - Submit our write and don't wait on it. We set B_WRITING indicating
  *   that there is a write in progress.
  */
-static void __write_dirty_buffer(struct dm_buffer *b)
+static void __write_dirty_buffer(struct dm_buffer *b,
+				 struct list_head *write_list)
 {
 	if (!test_bit(B_DIRTY, &b->state))
 		return;
@@ -617,7 +643,24 @@ static void __write_dirty_buffer(struct dm_buffer *b)
 	wait_on_bit_lock(&b->state, B_WRITING,
 			 do_io_schedule, TASK_UNINTERRUPTIBLE);
 
-	submit_io(b, WRITE, b->block, write_endio);
+	if (!write_list)
+		submit_io(b, WRITE, b->block, write_endio);
+	else
+		list_add_tail(&b->write_list, write_list);
+}
+
+static void __flush_write_list(struct list_head *write_list)
+{
+	struct blk_plug plug;
+	blk_start_plug(&plug);
+	while (!list_empty(write_list)) {
+		struct dm_buffer *b =
+			list_entry(write_list->next, struct dm_buffer, write_list);
+		list_del(&b->write_list);
+		submit_io(b, WRITE, b->block, write_endio);
+		dm_bufio_cond_resched();
+	}
+	blk_finish_plug(&plug);
 }
 
 /*
@@ -633,7 +676,7 @@ static void __make_buffer_clean(struct dm_buffer *b)
 		return;
 
 	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
-	__write_dirty_buffer(b);
+	__write_dirty_buffer(b, NULL);
 	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 }
 
@@ -780,7 +823,8 @@ static void __free_buffer_wake(struct dm_buffer *b)
 	wake_up(&c->free_buffer_wait);
 }
 
-static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
+static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
+					struct list_head *write_list)
 {
 	struct dm_buffer *b, *tmp;
 
@@ -796,7 +840,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
 		if (no_wait && test_bit(B_WRITING, &b->state))
 			return;
 
-		__write_dirty_buffer(b);
+		__write_dirty_buffer(b, write_list);
 		dm_bufio_cond_resched();
 	}
 }
@@ -819,8 +863,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
 	buffers = dm_bufio_cache_size_per_client >>
 		  (c->sectors_per_block_bits + SECTOR_SHIFT);
 
-	if (buffers < DM_BUFIO_MIN_BUFFERS)
-		buffers = DM_BUFIO_MIN_BUFFERS;
+	if (buffers < c->minimum_buffers)
+		buffers = c->minimum_buffers;
 
 	*limit_buffers = buffers;
 	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@ -831,7 +875,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
  * If we are over threshold_buffers, start freeing buffers.
  * If we're over "limit_buffers", block until we get under the limit.
  */
-static void __check_watermark(struct dm_bufio_client *c)
+static void __check_watermark(struct dm_bufio_client *c,
+			      struct list_head *write_list)
 {
 	unsigned long threshold_buffers, limit_buffers;
 
@@ -850,7 +895,7 @@ static void __check_watermark(struct dm_bufio_client *c)
 	}
 
 	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
-		__write_dirty_buffers_async(c, 1);
+		__write_dirty_buffers_async(c, 1, write_list);
 }
 
 /*
@@ -859,9 +904,8 @@ static void __check_watermark(struct dm_bufio_client *c)
 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 {
 	struct dm_buffer *b;
-	struct hlist_node *hn;
 
-	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
+	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
 			     hash_list) {
 		dm_bufio_cond_resched();
 		if (b->block == block)
@@ -876,7 +920,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
  *--------------------------------------------------------------*/
 
 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
-				     enum new_flag nf, int *need_submit)
+				     enum new_flag nf, int *need_submit,
+				     struct list_head *write_list)
 {
 	struct dm_buffer *b, *new_b = NULL;
 
@@ -903,7 +948,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 		goto found_buffer;
 	}
 
-	__check_watermark(c);
+	__check_watermark(c, write_list);
 
 	b = new_b;
 	b->hold_count = 1;
@@ -952,9 +997,9 @@ static void read_endio(struct bio *bio, int error)
 
 	BUG_ON(!test_bit(B_READING, &b->state));
 
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(B_READING, &b->state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 
 	wake_up_bit(&b->state, B_READING);
 }
@@ -971,10 +1016,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
 	int need_submit;
 	struct dm_buffer *b;
 
+	LIST_HEAD(write_list);
+
 	dm_bufio_lock(c);
-	b = __bufio_new(c, block, nf, &need_submit);
+	b = __bufio_new(c, block, nf, &need_submit, &write_list);
 	dm_bufio_unlock(c);
 
+	__flush_write_list(&write_list);
+
 	if (!b)
 		return b;
 
@@ -1026,13 +1075,25 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
 {
 	struct blk_plug plug;
 
+	LIST_HEAD(write_list);
+
+	BUG_ON(dm_bufio_in_request());
+
 	blk_start_plug(&plug);
 	dm_bufio_lock(c);
 
 	for (; n_blocks--; block++) {
 		int need_submit;
 		struct dm_buffer *b;
-		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
+				&write_list);
+		if (unlikely(!list_empty(&write_list))) {
+			dm_bufio_unlock(c);
+			blk_finish_plug(&plug);
+			__flush_write_list(&write_list);
+			blk_start_plug(&plug);
+			dm_bufio_lock(c);
+		}
 		if (unlikely(b != NULL)) {
 			dm_bufio_unlock(c);
 
@@ -1046,7 +1107,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
 				goto flush_plug;
 			dm_bufio_lock(c);
 		}
-
 	}
 
 	dm_bufio_unlock(c);
@@ -1103,11 +1163,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
 
 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
 {
+	LIST_HEAD(write_list);
+
 	BUG_ON(dm_bufio_in_request());
 
 	dm_bufio_lock(c);
-	__write_dirty_buffers_async(c, 0);
+	__write_dirty_buffers_async(c, 0, &write_list);
 	dm_bufio_unlock(c);
+	__flush_write_list(&write_list);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
 
@@ -1124,8 +1187,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
 	unsigned long buffers_processed = 0;
 	struct dm_buffer *b, *tmp;
 
+	LIST_HEAD(write_list);
+
+	dm_bufio_lock(c);
+	__write_dirty_buffers_async(c, 0, &write_list);
+	dm_bufio_unlock(c);
+	__flush_write_list(&write_list);
 	dm_bufio_lock(c);
-	__write_dirty_buffers_async(c, 0);
 
 again:
 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
@@ -1193,7 +1261,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
 int dm_bufio_issue_flush(struct dm_bufio_client *c)
 {
 	struct dm_io_request io_req = {
-		.bi_rw = REQ_FLUSH,
+		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
@@ -1251,7 +1319,7 @@ retry:
 	BUG_ON(!b->hold_count);
 	BUG_ON(test_bit(B_READING, &b->state));
 
-	__write_dirty_buffer(b);
+	__write_dirty_buffer(b, NULL);
 	if (b->hold_count == 1) {
 		wait_on_bit(&b->state, B_WRITING,
 			    do_io_schedule, TASK_UNINTERRUPTIBLE);
@@ -1284,6 +1352,34 @@ retry:
 }
 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
 
+/*
+ * Free the given buffer.
+ *
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
+{
+	struct dm_buffer *b;
+
+	dm_bufio_lock(c);
+
+	b = __find(c, block);
+	if (b && likely(!b->hold_count) && likely(!b->state)) {
+		__unlink_buffer(b);
+		__free_buffer_wake(b);
+	}
+
+	dm_bufio_unlock(c);
+}
+EXPORT_SYMBOL(dm_bufio_forget);
+
+void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
+{
+	c->minimum_buffers = n;
+}
+EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
+
 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
 {
 	return c->block_size;
@@ -1359,62 +1455,75 @@ static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
 				unsigned long max_jiffies)
 {
 	if (jiffies - b->last_accessed < max_jiffies)
-		return 1;
+		return 0;
 
 	if (!(gfp & __GFP_IO)) {
 		if (test_bit(B_READING, &b->state) ||
 		    test_bit(B_WRITING, &b->state) ||
 		    test_bit(B_DIRTY, &b->state))
-			return 1;
+			return 0;
 	}
 
 	if (b->hold_count)
-		return 1;
+		return 0;
 
 	__make_buffer_clean(b);
 	__unlink_buffer(b);
 	__free_buffer_wake(b);
 
-	return 0;
+	return 1;
 }
 
-static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
-		   struct shrink_control *sc)
+static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+		   gfp_t gfp_mask)
 {
 	int l;
 	struct dm_buffer *b, *tmp;
+	long freed = 0;
 
 	for (l = 0; l < LIST_SIZE; l++) {
-		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
-			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
-			    !--nr_to_scan)
-				return;
+		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
+			freed += __cleanup_old_buffer(b, gfp_mask, 0);
+			if (!--nr_to_scan)
+				break;
+		}
 		dm_bufio_cond_resched();
 	}
+	return freed;
 }
 
-static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
+static unsigned long
+dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	struct dm_bufio_client *c =
-	    container_of(shrinker, struct dm_bufio_client, shrinker);
-	unsigned long r;
-	unsigned long nr_to_scan = sc->nr_to_scan;
+	struct dm_bufio_client *c;
+	unsigned long freed;
 
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
 	if (sc->gfp_mask & __GFP_IO)
 		dm_bufio_lock(c);
 	else if (!dm_bufio_trylock(c))
-		return !nr_to_scan ? 0 : -1;
+		return SHRINK_STOP;
+
+	freed  = __scan(c, sc->nr_to_scan, sc->gfp_mask);
+	dm_bufio_unlock(c);
+	return freed;
+}
 
-	if (nr_to_scan)
-		__scan(c, nr_to_scan, sc);
+static unsigned long
+dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct dm_bufio_client *c;
+	unsigned long count;
 
-	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
-	if (r > INT_MAX)
-		r = INT_MAX;
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
+	if (sc->gfp_mask & __GFP_IO)
+		dm_bufio_lock(c);
+	else if (!dm_bufio_trylock(c))
+		return 0;
 
+	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
 	dm_bufio_unlock(c);
-
-	return r;
+	return count;
 }
 
 /*
@@ -1432,7 +1541,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
 	       (block_size & (block_size - 1)));
 
-	c = kmalloc(sizeof(*c), GFP_KERNEL);
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
 	if (!c) {
 		r = -ENOMEM;
 		goto bad_client;
@@ -1467,6 +1576,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	INIT_LIST_HEAD(&c->reserved_buffers);
 	c->need_reserved_buffers = reserved_buffers;
 
+	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
+
 	init_waitqueue_head(&c->free_buffer_wait);
 	c->async_write_error = 0;
 
@@ -1516,7 +1627,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	__cache_size_refresh();
 	mutex_unlock(&dm_bufio_clients_lock);
 
-	c->shrinker.shrink = shrink;
+	c->shrinker.count_objects = dm_bufio_shrink_count;
+	c->shrinker.scan_objects = dm_bufio_shrink_scan;
 	c->shrinker.seeks = 1;
 	c->shrinker.batch = 0;
 	register_shrinker(&c->shrinker);
@@ -1603,7 +1715,7 @@ static void cleanup_old_buffers(void)
 			struct dm_buffer *b;
 			b = list_entry(c->lru[LIST_CLEAN].prev,
 				       struct dm_buffer, lru_list);
-			if (__cleanup_old_buffer(b, 0, max_age * HZ))
+			if (!__cleanup_old_buffer(b, 0, max_age * HZ))
 				break;
 			dm_bufio_cond_resched();
 		}
@@ -1637,6 +1749,11 @@ static int __init dm_bufio_init(void)
 {
 	__u64 mem;
 
+	dm_bufio_allocated_kmem_cache = 0;
+	dm_bufio_allocated_get_free_pages = 0;
+	dm_bufio_allocated_vmalloc = 0;
+	dm_bufio_current_allocated = 0;
+
 	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
 
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index b142946a9e3..c096779a729 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -108,6 +108,18 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c);
  */
 void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
 
+/*
+ * Free the given buffer.
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget(struct dm_bufio_client *c, sector_t block);
+
+/*
+ * Set the minimum number of buffers before cleanup happens.
+ */
+void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n);
+
 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
 sector_t dm_bufio_get_block_number(struct dm_buffer *b);
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
new file mode 100644
index 00000000000..6c9049c51b2
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,48 @@
+#include "dm.h"
+
+/*
+ * The kobject release method must not be placed in the module itself,
+ * otherwise we are subject to module unload races.
+ *
+ * The release method is called when the last reference to the kobject is
+ * dropped. It may be called by any other kernel code that drops the last
+ * reference.
+ *
+ * The release method suffers from module unload race. We may prevent the
+ * module from being unloaded at the start of the release method (using
+ * increased module reference count or synchronizing against the release
+ * method), however there is no way to prevent the module from being
+ * unloaded at the end of the release method.
+ *
+ * If this code were placed in the dm module, the following race may
+ * happen:
+ *  1. Some other process takes a reference to dm kobject
+ *  2. The user issues ioctl function to unload the dm device
+ *  3. dm_sysfs_exit calls kobject_put, however the object is not released
+ *     because of the other reference taken at step 1
+ *  4. dm_sysfs_exit waits on the completion
+ *  5. The other process that took the reference in step 1 drops it,
+ *     dm_kobject_release is called from this process
+ *  6. dm_kobject_release calls complete()
+ *  7. a reschedule happens before dm_kobject_release returns
+ *  8. dm_sysfs_exit continues, the dm device is unloaded, module reference
+ *     count is decremented
+ *  9. The user unloads the dm module
+ * 10. The other process that was rescheduled in step 7 continues to run,
+ *     it is now executing code in unloaded module, so it crashes
+ *
+ * Note that if the process that takes the foreign reference to dm kobject
+ * has a low priority and the system is sufficiently loaded with
+ * higher-priority processes that prevent the low-priority process from
+ * being scheduled long enough, this bug may really happen.
+ *
+ * In order to fix this module unload race, we place the release method
+ * into a helper code that is compiled directly into the kernel.
+ */
+
+void dm_kobject_release(struct kobject *kobj)
+{
+	complete(dm_get_completion_from_kobject(kobj));
+}
+
+EXPORT_SYMBOL(dm_kobject_release);
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 00000000000..aac0e2df06b
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BLOCK_TYPES_H
+#define DM_CACHE_BLOCK_TYPES_H
+
+#include "persistent-data/dm-block-manager.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * It's helpful to get sparse to differentiate between indexes into the
+ * origin device, indexes into the cache device, and indexes into the
+ * discard bitset.
+ */
+
+typedef dm_block_t __bitwise__ dm_oblock_t;
+typedef uint32_t __bitwise__ dm_cblock_t;
+
+static inline dm_oblock_t to_oblock(dm_block_t b)
+{
+	return (__force dm_oblock_t) b;
+}
+
+static inline dm_block_t from_oblock(dm_oblock_t b)
+{
+	return (__force dm_block_t) b;
+}
+
+static inline dm_cblock_t to_cblock(uint32_t b)
+{
+	return (__force dm_cblock_t) b;
+}
+
+static inline uint32_t from_cblock(dm_cblock_t b)
+{
+	return (__force uint32_t) b;
+}
+
+#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 00000000000..d2899e7eb3a
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1299 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-metadata.h"
+
+#include "persistent-data/dm-array.h"
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-disk.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX   "cache metadata"
+
+#define CACHE_SUPERBLOCK_MAGIC 06142003
+#define CACHE_SUPERBLOCK_LOCATION 0
+
+/*
+ * defines a range of metadata versions that this module can handle.
+ */
+#define MIN_CACHE_VERSION 1
+#define MAX_CACHE_VERSION 1
+
+#define CACHE_METADATA_CACHE_SIZE 64
+
+/*
+ *  3 for btree insert +
+ *  2 for btree lookup used within space map
+ */
+#define CACHE_MAX_CONCURRENT_LOCKS 5
+#define SPACE_MAP_ROOT_SIZE 128
+
+enum superblock_flag_bits {
+	/* for spotting crashes that would invalidate the dirty bitset */
+	CLEAN_SHUTDOWN,
+};
+
+/*
+ * Each mapping from cache block -> origin block carries a set of flags.
+ */
+enum mapping_bits {
+	/*
+	 * A valid mapping.  Because we're using an array we clear this
+	 * flag for an non existant mapping.
+	 */
+	M_VALID = 1,
+
+	/*
+	 * The data on the cache is different from that on the origin.
+	 */
+	M_DIRTY = 2
+};
+
+struct cache_disk_superblock {
+	__le32 csum;
+	__le32 flags;
+	__le64 blocknr;
+
+	__u8 uuid[16];
+	__le64 magic;
+	__le32 version;
+
+	__u8 policy_name[CACHE_POLICY_NAME_SIZE];
+	__le32 policy_hint_size;
+
+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+	__le64 mapping_root;
+	__le64 hint_root;
+
+	__le64 discard_root;
+	__le64 discard_block_size;
+	__le64 discard_nr_blocks;
+
+	__le32 data_block_size;
+	__le32 metadata_block_size;
+	__le32 cache_blocks;
+
+	__le32 compat_flags;
+	__le32 compat_ro_flags;
+	__le32 incompat_flags;
+
+	__le32 read_hits;
+	__le32 read_misses;
+	__le32 write_hits;
+	__le32 write_misses;
+
+	__le32 policy_version[CACHE_POLICY_VERSION_SIZE];
+} __packed;
+
+struct dm_cache_metadata {
+	struct block_device *bdev;
+	struct dm_block_manager *bm;
+	struct dm_space_map *metadata_sm;
+	struct dm_transaction_manager *tm;
+
+	struct dm_array_info info;
+	struct dm_array_info hint_info;
+	struct dm_disk_bitset discard_info;
+
+	struct rw_semaphore root_lock;
+	dm_block_t root;
+	dm_block_t hint_root;
+	dm_block_t discard_root;
+
+	sector_t discard_block_size;
+	dm_oblock_t discard_nr_blocks;
+
+	sector_t data_block_size;
+	dm_cblock_t cache_blocks;
+	bool changed:1;
+	bool clean_when_opened:1;
+
+	char policy_name[CACHE_POLICY_NAME_SIZE];
+	unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
+	size_t policy_hint_size;
+	struct dm_cache_statistics stats;
+
+	/*
+	 * Reading the space map root can fail, so we read it into this
+	 * buffer before the superblock is locked and updated.
+	 */
+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+};
+
+/*-------------------------------------------------------------------
+ * superblock validator
+ *-----------------------------------------------------------------*/
+
+#define SUPERBLOCK_CSUM_XOR 9031977
+
+static void sb_prepare_for_write(struct dm_block_validator *v,
+				 struct dm_block *b,
+				 size_t sb_block_size)
+{
+	struct cache_disk_superblock *disk_super = dm_block_data(b);
+
+	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
+	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+						      sb_block_size - sizeof(__le32),
+						      SUPERBLOCK_CSUM_XOR));
+}
+
+static int check_metadata_version(struct cache_disk_superblock *disk_super)
+{
+	uint32_t metadata_version = le32_to_cpu(disk_super->version);
+	if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
+		DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
+		      metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int sb_check(struct dm_block_validator *v,
+		    struct dm_block *b,
+		    size_t sb_block_size)
+{
+	struct cache_disk_superblock *disk_super = dm_block_data(b);
+	__le32 csum_le;
+
+	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
+		DMERR("sb_check failed: blocknr %llu: wanted %llu",
+		      le64_to_cpu(disk_super->blocknr),
+		      (unsigned long long)dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) {
+		DMERR("sb_check failed: magic %llu: wanted %llu",
+		      le64_to_cpu(disk_super->magic),
+		      (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
+		return -EILSEQ;
+	}
+
+	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+					     sb_block_size - sizeof(__le32),
+					     SUPERBLOCK_CSUM_XOR));
+	if (csum_le != disk_super->csum) {
+		DMERR("sb_check failed: csum %u: wanted %u",
+		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
+		return -EILSEQ;
+	}
+
+	return check_metadata_version(disk_super);
+}
+
+static struct dm_block_validator sb_validator = {
+	.name = "superblock",
+	.prepare_for_write = sb_prepare_for_write,
+	.check = sb_check
+};
+
+/*----------------------------------------------------------------*/
+
+static int superblock_read_lock(struct dm_cache_metadata *cmd,
+				struct dm_block **sblock)
+{
+	return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+			       &sb_validator, sblock);
+}
+
+static int superblock_lock_zero(struct dm_cache_metadata *cmd,
+				struct dm_block **sblock)
+{
+	return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+				     &sb_validator, sblock);
+}
+
+static int superblock_lock(struct dm_cache_metadata *cmd,
+			   struct dm_block **sblock)
+{
+	return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+				&sb_validator, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
+{
+	int r;
+	unsigned i;
+	struct dm_block *b;
+	__le64 *data_le, zero = cpu_to_le64(0);
+	unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+	/*
+	 * We can't use a validator here - it may be all zeroes.
+	 */
+	r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
+	if (r)
+		return r;
+
+	data_le = dm_block_data(b);
+	*result = true;
+	for (i = 0; i < sb_block_size; i++) {
+		if (data_le[i] != zero) {
+			*result = false;
+			break;
+		}
+	}
+
+	return dm_bm_unlock(b);
+}
+
+static void __setup_mapping_info(struct dm_cache_metadata *cmd)
+{
+	struct dm_btree_value_type vt;
+
+	vt.context = NULL;
+	vt.size = sizeof(__le64);
+	vt.inc = NULL;
+	vt.dec = NULL;
+	vt.equal = NULL;
+	dm_array_info_init(&cmd->info, cmd->tm, &vt);
+
+	if (cmd->policy_hint_size) {
+		vt.size = sizeof(__le32);
+		dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
+	}
+}
+
+static int __save_sm_root(struct dm_cache_metadata *cmd)
+{
+	int r;
+	size_t metadata_len;
+
+	r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+	if (r < 0)
+		return r;
+
+	return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root,
+			       metadata_len);
+}
+
+static void __copy_sm_root(struct dm_cache_metadata *cmd,
+			   struct cache_disk_superblock *disk_super)
+{
+	memcpy(&disk_super->metadata_space_map_root,
+	       &cmd->metadata_space_map_root,
+	       sizeof(cmd->metadata_space_map_root));
+}
+
+static int __write_initial_superblock(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct cache_disk_superblock *disk_super;
+	sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
+
+	/* FIXME: see if we can lose the max sectors limit */
+	if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
+		bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
+
+	r = dm_tm_pre_commit(cmd->tm);
+	if (r < 0)
+		return r;
+
+	/*
+	 * dm_sm_copy_root() can fail.  So we need to do it before we start
+	 * updating the superblock.
+	 */
+	r = __save_sm_root(cmd);
+	if (r)
+		return r;
+
+	r = superblock_lock_zero(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = 0;
+	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
+	disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
+	disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
+	memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
+	memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
+	disk_super->policy_hint_size = 0;
+
+	__copy_sm_root(cmd, disk_super);
+
+	disk_super->mapping_root = cpu_to_le64(cmd->root);
+	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+	disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+	disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+	disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
+	disk_super->cache_blocks = cpu_to_le32(0);
+
+	disk_super->read_hits = cpu_to_le32(0);
+	disk_super->read_misses = cpu_to_le32(0);
+	disk_super->write_hits = cpu_to_le32(0);
+	disk_super->write_misses = cpu_to_le32(0);
+
+	return dm_tm_commit(cmd->tm, sblock);
+}
+
+static int __format_metadata(struct dm_cache_metadata *cmd)
+{
+	int r;
+
+	r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+				 &cmd->tm, &cmd->metadata_sm);
+	if (r < 0) {
+		DMERR("tm_create_with_sm failed");
+		return r;
+	}
+
+	__setup_mapping_info(cmd);
+
+	r = dm_array_empty(&cmd->info, &cmd->root);
+	if (r < 0)
+		goto bad;
+
+	dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+
+	r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
+	if (r < 0)
+		goto bad;
+
+	cmd->discard_block_size = 0;
+	cmd->discard_nr_blocks = 0;
+
+	r = __write_initial_superblock(cmd);
+	if (r)
+		goto bad;
+
+	cmd->clean_when_opened = true;
+	return 0;
+
+bad:
+	dm_tm_destroy(cmd->tm);
+	dm_sm_destroy(cmd->metadata_sm);
+
+	return r;
+}
+
+static int __check_incompat_features(struct cache_disk_superblock *disk_super,
+				     struct dm_cache_metadata *cmd)
+{
+	uint32_t features;
+
+	features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		DMERR("could not access metadata due to unsupported optional features (%lx).",
+		      (unsigned long)features);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check for read-only metadata to skip the following RDWR checks.
+	 */
+	if (get_disk_ro(cmd->bdev->bd_disk))
+		return 0;
+
+	features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
+	if (features) {
+		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
+		      (unsigned long)features);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int __open_metadata(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct cache_disk_superblock *disk_super;
+	unsigned long sb_flags;
+
+	r = superblock_read_lock(cmd, &sblock);
+	if (r < 0) {
+		DMERR("couldn't read lock superblock");
+		return r;
+	}
+
+	disk_super = dm_block_data(sblock);
+
+	/* Verify the data block size hasn't changed */
+	if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) {
+		DMERR("changing the data block size (from %u to %llu) is not supported",
+		      le32_to_cpu(disk_super->data_block_size),
+		      (unsigned long long)cmd->data_block_size);
+		r = -EINVAL;
+		goto bad;
+	}
+
+	r = __check_incompat_features(disk_super, cmd);
+	if (r < 0)
+		goto bad;
+
+	r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+			       disk_super->metadata_space_map_root,
+			       sizeof(disk_super->metadata_space_map_root),
+			       &cmd->tm, &cmd->metadata_sm);
+	if (r < 0) {
+		DMERR("tm_open_with_sm failed");
+		goto bad;
+	}
+
+	__setup_mapping_info(cmd);
+	dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+	sb_flags = le32_to_cpu(disk_super->flags);
+	cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
+	return dm_bm_unlock(sblock);
+
+bad:
+	dm_bm_unlock(sblock);
+	return r;
+}
+
+static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
+				     bool format_device)
+{
+	int r;
+	bool unformatted = false;
+
+	r = __superblock_all_zeroes(cmd->bm, &unformatted);
+	if (r)
+		return r;
+
+	if (unformatted)
+		return format_device ? __format_metadata(cmd) : -EPERM;
+
+	return __open_metadata(cmd);
+}
+
+static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
+					    bool may_format_device)
+{
+	int r;
+	cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+					  CACHE_METADATA_CACHE_SIZE,
+					  CACHE_MAX_CONCURRENT_LOCKS);
+	if (IS_ERR(cmd->bm)) {
+		DMERR("could not create block manager");
+		return PTR_ERR(cmd->bm);
+	}
+
+	r = __open_or_format_metadata(cmd, may_format_device);
+	if (r)
+		dm_block_manager_destroy(cmd->bm);
+
+	return r;
+}
+
+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
+{
+	dm_sm_destroy(cmd->metadata_sm);
+	dm_tm_destroy(cmd->tm);
+	dm_block_manager_destroy(cmd->bm);
+}
+
+typedef unsigned long (*flags_mutator)(unsigned long);
+
+static void update_flags(struct cache_disk_superblock *disk_super,
+			 flags_mutator mutator)
+{
+	uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags));
+	disk_super->flags = cpu_to_le32(sb_flags);
+}
+
+static unsigned long set_clean_shutdown(unsigned long flags)
+{
+	set_bit(CLEAN_SHUTDOWN, &flags);
+	return flags;
+}
+
+static unsigned long clear_clean_shutdown(unsigned long flags)
+{
+	clear_bit(CLEAN_SHUTDOWN, &flags);
+	return flags;
+}
+
+static void read_superblock_fields(struct dm_cache_metadata *cmd,
+				   struct cache_disk_superblock *disk_super)
+{
+	cmd->root = le64_to_cpu(disk_super->mapping_root);
+	cmd->hint_root = le64_to_cpu(disk_super->hint_root);
+	cmd->discard_root = le64_to_cpu(disk_super->discard_root);
+	cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
+	cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks));
+	cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
+	cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
+	strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+	cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]);
+	cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]);
+	cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);
+	cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
+
+	cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
+	cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
+	cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
+	cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
+
+	cmd->changed = false;
+}
+
+/*
+ * The mutator updates the superblock flags.
+ */
+static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
+				     flags_mutator mutator)
+{
+	int r;
+	struct cache_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	r = superblock_lock(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	update_flags(disk_super, mutator);
+	read_superblock_fields(cmd, disk_super);
+	dm_bm_unlock(sblock);
+
+	return dm_bm_flush(cmd->bm);
+}
+
+static int __begin_transaction(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct cache_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	/*
+	 * We re-read the superblock every time.  Shouldn't need to do this
+	 * really.
+	 */
+	r = superblock_read_lock(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	read_superblock_fields(cmd, disk_super);
+	dm_bm_unlock(sblock);
+
+	return 0;
+}
+
+static int __commit_transaction(struct dm_cache_metadata *cmd,
+				flags_mutator mutator)
+{
+	int r;
+	struct cache_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	/*
+	 * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
+	 */
+	BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
+
+	r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
+			    &cmd->discard_root);
+	if (r)
+		return r;
+
+	r = dm_tm_pre_commit(cmd->tm);
+	if (r < 0)
+		return r;
+
+	r = __save_sm_root(cmd);
+	if (r)
+		return r;
+
+	r = superblock_lock(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+
+	if (mutator)
+		update_flags(disk_super, mutator);
+
+	disk_super->mapping_root = cpu_to_le64(cmd->root);
+	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+	disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+	disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+	disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
+	strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+	disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
+	disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
+	disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
+
+	disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
+	disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
+	disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
+	disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
+	__copy_sm_root(cmd, disk_super);
+
+	return dm_tm_commit(cmd->tm, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The mappings are held in a dm-array that has 64-bit values stored in
+ * little-endian format.  The index is the cblock, the high 48bits of the
+ * value are the oblock and the low 16 bit the flags.
+ */
+#define FLAGS_MASK ((1 << 16) - 1)
+
+static __le64 pack_value(dm_oblock_t block, unsigned flags)
+{
+	uint64_t value = from_oblock(block);
+	value <<= 16;
+	value = value | (flags & FLAGS_MASK);
+	return cpu_to_le64(value);
+}
+
+static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
+{
+	uint64_t value = le64_to_cpu(value_le);
+	uint64_t b = value >> 16;
+	*block = to_oblock(b);
+	*flags = value & FLAGS_MASK;
+}
+
+/*----------------------------------------------------------------*/
+
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+						 sector_t data_block_size,
+						 bool may_format_device,
+						 size_t policy_hint_size)
+{
+	int r;
+	struct dm_cache_metadata *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd) {
+		DMERR("could not allocate metadata struct");
+		return NULL;
+	}
+
+	init_rwsem(&cmd->root_lock);
+	cmd->bdev = bdev;
+	cmd->data_block_size = data_block_size;
+	cmd->cache_blocks = 0;
+	cmd->policy_hint_size = policy_hint_size;
+	cmd->changed = true;
+
+	r = __create_persistent_data_objects(cmd, may_format_device);
+	if (r) {
+		kfree(cmd);
+		return ERR_PTR(r);
+	}
+
+	r = __begin_transaction_flags(cmd, clear_clean_shutdown);
+	if (r < 0) {
+		dm_cache_metadata_close(cmd);
+		return ERR_PTR(r);
+	}
+
+	return cmd;
+}
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
+{
+	__destroy_persistent_data_objects(cmd);
+	kfree(cmd);
+}
+
+/*
+ * Checks that the given cache block is either unmapped or clean.
+ */
+static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
+				   bool *result)
+{
+	int r;
+	__le64 value;
+	dm_oblock_t ob;
+	unsigned flags;
+
+	r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
+	if (r) {
+		DMERR("block_unmapped_or_clean failed");
+		return r;
+	}
+
+	unpack_value(value, &ob, &flags);
+	*result = !((flags & M_VALID) && (flags & M_DIRTY));
+
+	return 0;
+}
+
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+					dm_cblock_t begin, dm_cblock_t end,
+					bool *result)
+{
+	int r;
+	*result = true;
+
+	while (begin != end) {
+		r = block_unmapped_or_clean(cmd, begin, result);
+		if (r)
+			return r;
+
+		if (!*result) {
+			DMERR("cache block %llu is dirty",
+			      (unsigned long long) from_cblock(begin));
+			return 0;
+		}
+
+		begin = to_cblock(from_cblock(begin) + 1);
+	}
+
+	return 0;
+}
+
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
+{
+	int r;
+	bool clean;
+	__le64 null_mapping = pack_value(0, 0);
+
+	down_write(&cmd->root_lock);
+	__dm_bless_for_disk(&null_mapping);
+
+	if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
+		r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
+		if (r) {
+			__dm_unbless_for_disk(&null_mapping);
+			goto out;
+		}
+
+		if (!clean) {
+			DMERR("unable to shrink cache due to dirty blocks");
+			r = -EINVAL;
+			__dm_unbless_for_disk(&null_mapping);
+			goto out;
+		}
+	}
+
+	r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
+			    from_cblock(new_cache_size),
+			    &null_mapping, &cmd->root);
+	if (!r)
+		cmd->cache_blocks = new_cache_size;
+	cmd->changed = true;
+
+out:
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+				   sector_t discard_block_size,
+				   dm_oblock_t new_nr_entries)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = dm_bitset_resize(&cmd->discard_info,
+			     cmd->discard_root,
+			     from_oblock(cmd->discard_nr_blocks),
+			     from_oblock(new_nr_entries),
+			     false, &cmd->discard_root);
+	if (!r) {
+		cmd->discard_block_size = discard_block_size;
+		cmd->discard_nr_blocks = new_nr_entries;
+	}
+
+	cmd->changed = true;
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+{
+	return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
+				 from_oblock(b), &cmd->discard_root);
+}
+
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+{
+	return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
+				   from_oblock(b), &cmd->discard_root);
+}
+
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b,
+			  bool *is_discarded)
+{
+	return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
+				  from_oblock(b), &cmd->discard_root,
+				  is_discarded);
+}
+
+static int __discard(struct dm_cache_metadata *cmd,
+		     dm_oblock_t dblock, bool discard)
+{
+	int r;
+
+	r = (discard ? __set_discard : __clear_discard)(cmd, dblock);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+}
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd,
+			 dm_oblock_t dblock, bool discard)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __discard(cmd, dblock, discard);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+static int __load_discards(struct dm_cache_metadata *cmd,
+			   load_discard_fn fn, void *context)
+{
+	int r = 0;
+	dm_block_t b;
+	bool discard;
+
+	for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) {
+		dm_oblock_t dblock = to_oblock(b);
+
+		if (cmd->clean_when_opened) {
+			r = __is_discarded(cmd, dblock, &discard);
+			if (r)
+				return r;
+		} else
+			discard = false;
+
+		r = fn(context, cmd->discard_block_size, dblock, discard);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+			   load_discard_fn fn, void *context)
+{
+	int r;
+
+	down_read(&cmd->root_lock);
+	r = __load_discards(cmd, fn, context);
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
+{
+	dm_cblock_t r;
+
+	down_read(&cmd->root_lock);
+	r = cmd->cache_blocks;
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+	int r;
+	__le64 value = pack_value(0, 0);
+
+	__dm_bless_for_disk(&value);
+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+			       &value, &cmd->root);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+}
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __remove(cmd, cblock);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+static int __insert(struct dm_cache_metadata *cmd,
+		    dm_cblock_t cblock, dm_oblock_t oblock)
+{
+	int r;
+	__le64 value = pack_value(oblock, M_VALID);
+	__dm_bless_for_disk(&value);
+
+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+			       &value, &cmd->root);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+}
+
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
+			    dm_cblock_t cblock, dm_oblock_t oblock)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __insert(cmd, cblock, oblock);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+struct thunk {
+	load_mapping_fn fn;
+	void *context;
+
+	struct dm_cache_metadata *cmd;
+	bool respect_dirty_flags;
+	bool hints_valid;
+};
+
+static bool policy_unchanged(struct dm_cache_metadata *cmd,
+			     struct dm_cache_policy *policy)
+{
+	const char *policy_name = dm_cache_policy_get_name(policy);
+	const unsigned *policy_version = dm_cache_policy_get_version(policy);
+	size_t policy_hint_size = dm_cache_policy_get_hint_size(policy);
+
+	/*
+	 * Ensure policy names match.
+	 */
+	if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name)))
+		return false;
+
+	/*
+	 * Ensure policy major versions match.
+	 */
+	if (cmd->policy_version[0] != policy_version[0])
+		return false;
+
+	/*
+	 * Ensure policy hint sizes match.
+	 */
+	if (cmd->policy_hint_size != policy_hint_size)
+		return false;
+
+	return true;
+}
+
+static bool hints_array_initialized(struct dm_cache_metadata *cmd)
+{
+	return cmd->hint_root && cmd->policy_hint_size;
+}
+
+static bool hints_array_available(struct dm_cache_metadata *cmd,
+				  struct dm_cache_policy *policy)
+{
+	return cmd->clean_when_opened && policy_unchanged(cmd, policy) &&
+		hints_array_initialized(cmd);
+}
+
+static int __load_mapping(void *context, uint64_t cblock, void *leaf)
+{
+	int r = 0;
+	bool dirty;
+	__le64 value;
+	__le32 hint_value = 0;
+	dm_oblock_t oblock;
+	unsigned flags;
+	struct thunk *thunk = context;
+	struct dm_cache_metadata *cmd = thunk->cmd;
+
+	memcpy(&value, leaf, sizeof(value));
+	unpack_value(value, &oblock, &flags);
+
+	if (flags & M_VALID) {
+		if (thunk->hints_valid) {
+			r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
+					       cblock, &hint_value);
+			if (r && r != -ENODATA)
+				return r;
+		}
+
+		dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
+		r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
+			      dirty, le32_to_cpu(hint_value), thunk->hints_valid);
+	}
+
+	return r;
+}
+
+static int __load_mappings(struct dm_cache_metadata *cmd,
+			   struct dm_cache_policy *policy,
+			   load_mapping_fn fn, void *context)
+{
+	struct thunk thunk;
+
+	thunk.fn = fn;
+	thunk.context = context;
+
+	thunk.cmd = cmd;
+	thunk.respect_dirty_flags = cmd->clean_when_opened;
+	thunk.hints_valid = hints_array_available(cmd, policy);
+
+	return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
+}
+
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+			   struct dm_cache_policy *policy,
+			   load_mapping_fn fn, void *context)
+{
+	int r;
+
+	down_read(&cmd->root_lock);
+	r = __load_mappings(cmd, policy, fn, context);
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
+{
+	int r = 0;
+	__le64 value;
+	dm_oblock_t oblock;
+	unsigned flags;
+
+	memcpy(&value, leaf, sizeof(value));
+	unpack_value(value, &oblock, &flags);
+
+	return r;
+}
+
+static int __dump_mappings(struct dm_cache_metadata *cmd)
+{
+	return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
+}
+
+void dm_cache_dump(struct dm_cache_metadata *cmd)
+{
+	down_read(&cmd->root_lock);
+	__dump_mappings(cmd);
+	up_read(&cmd->root_lock);
+}
+
+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
+{
+	int r;
+
+	down_read(&cmd->root_lock);
+	r = cmd->changed;
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty)
+{
+	int r;
+	unsigned flags;
+	dm_oblock_t oblock;
+	__le64 value;
+
+	r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value);
+	if (r)
+		return r;
+
+	unpack_value(value, &oblock, &flags);
+
+	if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty))
+		/* nothing to be done */
+		return 0;
+
+	value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0));
+	__dm_bless_for_disk(&value);
+
+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+			       &value, &cmd->root);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+
+}
+
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
+		       dm_cblock_t cblock, bool dirty)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __dirty(cmd, cblock, dirty);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
+				 struct dm_cache_statistics *stats)
+{
+	down_read(&cmd->root_lock);
+	*stats = cmd->stats;
+	up_read(&cmd->root_lock);
+}
+
+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
+				 struct dm_cache_statistics *stats)
+{
+	down_write(&cmd->root_lock);
+	cmd->stats = *stats;
+	up_write(&cmd->root_lock);
+}
+
+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
+{
+	int r;
+	flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
+				 clear_clean_shutdown);
+
+	down_write(&cmd->root_lock);
+	r = __commit_transaction(cmd, mutator);
+	if (r)
+		goto out;
+
+	r = __begin_transaction(cmd);
+
+out:
+	up_write(&cmd->root_lock);
+	return r;
+}
+
+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
+					   dm_block_t *result)
+{
+	int r = -EINVAL;
+
+	down_read(&cmd->root_lock);
+	r = dm_sm_get_nr_free(cmd->metadata_sm, result);
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
+				   dm_block_t *result)
+{
+	int r = -EINVAL;
+
+	down_read(&cmd->root_lock);
+	r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+	int r;
+	__le32 value;
+	size_t hint_size;
+	const char *policy_name = dm_cache_policy_get_name(policy);
+	const unsigned *policy_version = dm_cache_policy_get_version(policy);
+
+	if (!policy_name[0] ||
+	    (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
+		return -EINVAL;
+
+	if (!policy_unchanged(cmd, policy)) {
+		strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+		memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
+
+		hint_size = dm_cache_policy_get_hint_size(policy);
+		if (!hint_size)
+			return 0; /* short-circuit hints initialization */
+		cmd->policy_hint_size = hint_size;
+
+		if (cmd->hint_root) {
+			r = dm_array_del(&cmd->hint_info, cmd->hint_root);
+			if (r)
+				return r;
+		}
+
+		r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
+		if (r)
+			return r;
+
+		value = cpu_to_le32(0);
+		__dm_bless_for_disk(&value);
+		r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
+				    from_cblock(cmd->cache_blocks),
+				    &value, &cmd->hint_root);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
+{
+	struct dm_cache_metadata *cmd = context;
+	__le32 value = cpu_to_le32(hint);
+	int r;
+
+	__dm_bless_for_disk(&value);
+
+	r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
+			       from_cblock(cblock), &value, &cmd->hint_root);
+	cmd->changed = true;
+
+	return r;
+}
+
+static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+	int r;
+
+	r = begin_hints(cmd, policy);
+	if (r) {
+		DMERR("begin_hints failed");
+		return r;
+	}
+
+	return policy_walk_mappings(policy, save_hint, cmd);
+}
+
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = write_hints(cmd, policy);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
+{
+	return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
new file mode 100644
index 00000000000..cd70a78623a
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_METADATA_H
+#define DM_CACHE_METADATA_H
+
+#include "dm-cache-block-types.h"
+#include "dm-cache-policy-internal.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_CACHE_METADATA_BLOCK_SIZE 4096
+
+/* FIXME: remove this restriction */
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries.  Each
+ * index entry contains allocation info about 16k metadata blocks.
+ */
+#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Ext[234]-style compat feature flags.
+ *
+ * A new feature which old metadata will still be compatible with should
+ * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful).
+ *
+ * A new feature that is not compatible with old code should define a
+ * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with
+ * that flag.
+ *
+ * A new feature that is not compatible with old code accessing the
+ * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and
+ * guard the relevant code with that flag.
+ *
+ * As these various flags are defined they should be added to the
+ * following masks.
+ */
+#define DM_CACHE_FEATURE_COMPAT_SUPP	  0UL
+#define DM_CACHE_FEATURE_COMPAT_RO_SUPP	  0UL
+#define DM_CACHE_FEATURE_INCOMPAT_SUPP	  0UL
+
+/*
+ * Reopens or creates a new, empty metadata volume.
+ * Returns an ERR_PTR on failure.
+ */
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+						 sector_t data_block_size,
+						 bool may_format_device,
+						 size_t policy_hint_size);
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
+
+/*
+ * The metadata needs to know how many cache blocks there are.  We don't
+ * care about the origin, assuming the core target is giving us valid
+ * origin blocks to map to.
+ */
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+				   sector_t discard_block_size,
+				   dm_oblock_t new_nr_entries);
+
+typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
+			       dm_oblock_t dblock, bool discarded);
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+			   load_discard_fn fn, void *context);
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard);
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd);
+
+typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
+			       dm_cblock_t cblock, bool dirty,
+			       uint32_t hint, bool hint_valid);
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+			   struct dm_cache_policy *policy,
+			   load_mapping_fn fn,
+			   void *context);
+
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
+
+struct dm_cache_statistics {
+	uint32_t read_hits;
+	uint32_t read_misses;
+	uint32_t write_hits;
+	uint32_t write_misses;
+};
+
+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
+				 struct dm_cache_statistics *stats);
+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
+				 struct dm_cache_statistics *stats);
+
+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown);
+
+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
+					   dm_block_t *result);
+
+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
+				   dm_block_t *result);
+
+void dm_cache_dump(struct dm_cache_metadata *cmd);
+
+/*
+ * The policy is invited to save a 32bit hint value for every cblock (eg,
+ * for a hit count).  These are stored against the policy name.  If
+ * policies are changed, then hints will be lost.  If the machine crashes,
+ * hints will be lost.
+ *
+ * The hints are indexed by the cblock, but many policies will not
+ * neccessarily have a fast way of accessing efficiently via cblock.  So
+ * rather than querying the policy for each cblock, we let it walk its data
+ * structures and fill in the hints in whatever order it wishes.
+ */
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
+
+/*
+ * Query method.  Are all the blocks in the cache clean?
+ */
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
new file mode 100644
index 00000000000..b04d1f904d0
--- /dev/null
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * writeback cache policy supporting flushing out dirty cache blocks.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache cleaner"
+
+/* Cache entry struct. */
+struct wb_cache_entry {
+	struct list_head list;
+	struct hlist_node hlist;
+
+	dm_oblock_t oblock;
+	dm_cblock_t cblock;
+	bool dirty:1;
+	bool pending:1;
+};
+
+struct hash {
+	struct hlist_head *table;
+	dm_block_t hash_bits;
+	unsigned nr_buckets;
+};
+
+struct policy {
+	struct dm_cache_policy policy;
+	spinlock_t lock;
+
+	struct list_head free;
+	struct list_head clean;
+	struct list_head clean_pending;
+	struct list_head dirty;
+
+	/*
+	 * We know exactly how many cblocks will be needed,
+	 * so we can allocate them up front.
+	 */
+	dm_cblock_t cache_size, nr_cblocks_allocated;
+	struct wb_cache_entry *cblocks;
+	struct hash chash;
+};
+
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Low-level functions.
+ */
+static unsigned next_power(unsigned n, unsigned min)
+{
+	return roundup_pow_of_two(max(n, min));
+}
+
+static struct policy *to_policy(struct dm_cache_policy *p)
+{
+	return container_of(p, struct policy, policy);
+}
+
+static struct list_head *list_pop(struct list_head *q)
+{
+	struct list_head *r = q->next;
+
+	list_del(r);
+
+	return r;
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Allocate/free various resources. */
+static int alloc_hash(struct hash *hash, unsigned elts)
+{
+	hash->nr_buckets = next_power(elts >> 4, 16);
+	hash->hash_bits = ffs(hash->nr_buckets) - 1;
+	hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
+
+	return hash->table ? 0 : -ENOMEM;
+}
+
+static void free_hash(struct hash *hash)
+{
+	vfree(hash->table);
+}
+
+static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
+{
+	int r = -ENOMEM;
+
+	p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
+	if (p->cblocks) {
+		unsigned u = from_cblock(cache_size);
+
+		while (u--)
+			list_add(&p->cblocks[u].list, &p->free);
+
+		p->nr_cblocks_allocated = 0;
+
+		/* Cache entries hash. */
+		r = alloc_hash(&p->chash, from_cblock(cache_size));
+		if (r)
+			vfree(p->cblocks);
+	}
+
+	return r;
+}
+
+static void free_cache_blocks_and_hash(struct policy *p)
+{
+	free_hash(&p->chash);
+	vfree(p->cblocks);
+}
+
+static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
+{
+	struct wb_cache_entry *e;
+
+	BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
+
+	e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
+	p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
+
+	return e;
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Hash functions (lookup, insert, remove). */
+static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
+{
+	struct hash *hash = &p->chash;
+	unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
+	struct wb_cache_entry *cur;
+	struct hlist_head *bucket = &hash->table[h];
+
+	hlist_for_each_entry(cur, bucket, hlist) {
+		if (cur->oblock == oblock) {
+			/* Move upfront bucket for faster access. */
+			hlist_del(&cur->hlist);
+			hlist_add_head(&cur->hlist, bucket);
+			return cur;
+		}
+	}
+
+	return NULL;
+}
+
+static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
+
+	hlist_add_head(&e->hlist, &p->chash.table[h]);
+}
+
+static void remove_cache_hash_entry(struct wb_cache_entry *e)
+{
+	hlist_del(&e->hlist);
+}
+
+/* Public interface (see dm-cache-policy.h */
+static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
+		  bool can_block, bool can_migrate, bool discarded_oblock,
+		  struct bio *bio, struct policy_result *result)
+{
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e;
+	unsigned long flags;
+
+	result->op = POLICY_MISS;
+
+	if (can_block)
+		spin_lock_irqsave(&p->lock, flags);
+
+	else if (!spin_trylock_irqsave(&p->lock, flags))
+		return -EWOULDBLOCK;
+
+	e = lookup_cache_entry(p, oblock);
+	if (e) {
+		result->op = POLICY_HIT;
+		result->cblock = e->cblock;
+
+	}
+
+	spin_unlock_irqrestore(&p->lock, flags);
+
+	return 0;
+}
+
+static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+	int r;
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e;
+	unsigned long flags;
+
+	if (!spin_trylock_irqsave(&p->lock, flags))
+		return -EWOULDBLOCK;
+
+	e = lookup_cache_entry(p, oblock);
+	if (e) {
+		*cblock = e->cblock;
+		r = 0;
+
+	} else
+		r = -ENOENT;
+
+	spin_unlock_irqrestore(&p->lock, flags);
+
+	return r;
+}
+
+static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
+{
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e;
+
+	e = lookup_cache_entry(p, oblock);
+	BUG_ON(!e);
+
+	if (set) {
+		if (!e->dirty) {
+			e->dirty = true;
+			list_move(&e->list, &p->dirty);
+		}
+
+	} else {
+		if (e->dirty) {
+			e->pending = false;
+			e->dirty = false;
+			list_move(&e->list, &p->clean);
+		}
+	}
+}
+
+static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+	struct policy *p = to_policy(pe);
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->lock, flags);
+	__set_clear_dirty(pe, oblock, true);
+	spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+	struct policy *p = to_policy(pe);
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->lock, flags);
+	__set_clear_dirty(pe, oblock, false);
+	spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
+{
+	insert_cache_hash_entry(p, e);
+	if (e->dirty)
+		list_add(&e->list, &p->dirty);
+	else
+		list_add(&e->list, &p->clean);
+}
+
+static int wb_load_mapping(struct dm_cache_policy *pe,
+			   dm_oblock_t oblock, dm_cblock_t cblock,
+			   uint32_t hint, bool hint_valid)
+{
+	int r;
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e = alloc_cache_entry(p);
+
+	if (e) {
+		e->cblock = cblock;
+		e->oblock = oblock;
+		e->dirty = false; /* blocks default to clean */
+		add_cache_entry(p, e);
+		r = 0;
+
+	} else
+		r = -ENOMEM;
+
+	return r;
+}
+
+static void wb_destroy(struct dm_cache_policy *pe)
+{
+	struct policy *p = to_policy(pe);
+
+	free_cache_blocks_and_hash(p);
+	kfree(p);
+}
+
+static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
+{
+	struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
+
+	BUG_ON(!r);
+
+	remove_cache_hash_entry(r);
+	list_del(&r->list);
+
+	return r;
+}
+
+static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e;
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->lock, flags);
+	e = __wb_force_remove_mapping(p, oblock);
+	list_add_tail(&e->list, &p->free);
+	BUG_ON(!from_cblock(p->nr_cblocks_allocated));
+	p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
+	spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void wb_force_mapping(struct dm_cache_policy *pe,
+				dm_oblock_t current_oblock, dm_oblock_t oblock)
+{
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e;
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->lock, flags);
+	e = __wb_force_remove_mapping(p, current_oblock);
+	e->oblock = oblock;
+	add_cache_entry(p, e);
+	spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
+{
+	struct list_head *l;
+	struct wb_cache_entry *r;
+
+	if (list_empty(&p->dirty))
+		return NULL;
+
+	l = list_pop(&p->dirty);
+	r = container_of(l, struct wb_cache_entry, list);
+	list_add(l, &p->clean_pending);
+
+	return r;
+}
+
+static int wb_writeback_work(struct dm_cache_policy *pe,
+			     dm_oblock_t *oblock,
+			     dm_cblock_t *cblock)
+{
+	int r = -ENOENT;
+	struct policy *p = to_policy(pe);
+	struct wb_cache_entry *e;
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->lock, flags);
+
+	e = get_next_dirty_entry(p);
+	if (e) {
+		*oblock = e->oblock;
+		*cblock = e->cblock;
+		r = 0;
+	}
+
+	spin_unlock_irqrestore(&p->lock, flags);
+
+	return r;
+}
+
+static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
+{
+	return to_policy(pe)->nr_cblocks_allocated;
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct policy *p)
+{
+	p->policy.destroy = wb_destroy;
+	p->policy.map = wb_map;
+	p->policy.lookup = wb_lookup;
+	p->policy.set_dirty = wb_set_dirty;
+	p->policy.clear_dirty = wb_clear_dirty;
+	p->policy.load_mapping = wb_load_mapping;
+	p->policy.walk_mappings = NULL;
+	p->policy.remove_mapping = wb_remove_mapping;
+	p->policy.writeback_work = wb_writeback_work;
+	p->policy.force_mapping = wb_force_mapping;
+	p->policy.residency = wb_residency;
+	p->policy.tick = NULL;
+}
+
+static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
+					 sector_t origin_size,
+					 sector_t cache_block_size)
+{
+	int r;
+	struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
+
+	if (!p)
+		return NULL;
+
+	init_policy_functions(p);
+	INIT_LIST_HEAD(&p->free);
+	INIT_LIST_HEAD(&p->clean);
+	INIT_LIST_HEAD(&p->clean_pending);
+	INIT_LIST_HEAD(&p->dirty);
+
+	p->cache_size = cache_size;
+	spin_lock_init(&p->lock);
+
+	/* Allocate cache entry structs and add them to free list. */
+	r = alloc_cache_blocks_with_hash(p, cache_size);
+	if (!r)
+		return &p->policy;
+
+	kfree(p);
+
+	return NULL;
+}
+/*----------------------------------------------------------------------------*/
+
+static struct dm_cache_policy_type wb_policy_type = {
+	.name = "cleaner",
+	.version = {1, 0, 0},
+	.hint_size = 0,
+	.owner = THIS_MODULE,
+	.create = wb_create
+};
+
+static int __init wb_init(void)
+{
+	int r = dm_cache_policy_register(&wb_policy_type);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+	else
+		DMINFO("version %u.%u.%u loaded",
+		       wb_policy_type.version[0],
+		       wb_policy_type.version[1],
+		       wb_policy_type.version[2]);
+
+	return r;
+}
+
+static void __exit wb_exit(void)
+{
+	dm_cache_policy_unregister(&wb_policy_type);
+}
+
+module_init(wb_init);
+module_exit(wb_exit);
+
+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
new file mode 100644
index 00000000000..2256a1f24f7
--- /dev/null
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_POLICY_INTERNAL_H
+#define DM_CACHE_POLICY_INTERNAL_H
+
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Little inline functions that simplify calling the policy methods.
+ */
+static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+			     bool can_block, bool can_migrate, bool discarded_oblock,
+			     struct bio *bio, struct policy_result *result)
+{
+	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+}
+
+static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+	BUG_ON(!p->lookup);
+	return p->lookup(p, oblock, cblock);
+}
+
+static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	if (p->set_dirty)
+		p->set_dirty(p, oblock);
+}
+
+static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	if (p->clear_dirty)
+		p->clear_dirty(p, oblock);
+}
+
+static inline int policy_load_mapping(struct dm_cache_policy *p,
+				      dm_oblock_t oblock, dm_cblock_t cblock,
+				      uint32_t hint, bool hint_valid)
+{
+	return p->load_mapping(p, oblock, cblock, hint, hint_valid);
+}
+
+static inline int policy_walk_mappings(struct dm_cache_policy *p,
+				      policy_walk_fn fn, void *context)
+{
+	return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
+}
+
+static inline int policy_writeback_work(struct dm_cache_policy *p,
+					dm_oblock_t *oblock,
+					dm_cblock_t *cblock)
+{
+	return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+}
+
+static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	p->remove_mapping(p, oblock);
+}
+
+static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	return p->remove_cblock(p, cblock);
+}
+
+static inline void policy_force_mapping(struct dm_cache_policy *p,
+					dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	return p->force_mapping(p, current_oblock, new_oblock);
+}
+
+static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
+{
+	return p->residency(p);
+}
+
+static inline void policy_tick(struct dm_cache_policy *p)
+{
+	if (p->tick)
+		return p->tick(p);
+}
+
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+{
+	ssize_t sz = 0;
+	if (p->emit_config_values)
+		return p->emit_config_values(p, result, maxlen);
+
+	DMEMIT("0");
+	return 0;
+}
+
+static inline int policy_set_config_value(struct dm_cache_policy *p,
+					  const char *key, const char *value)
+{
+	return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
+ */
+struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
+					       sector_t origin_size, sector_t block_size);
+
+/*
+ * Destroys the policy.  This drops references to the policy module as well
+ * as calling it's destroy method.  So always use this rather than calling
+ * the policy->destroy method directly.
+ */
+void dm_cache_policy_destroy(struct dm_cache_policy *p);
+
+/*
+ * In case we've forgotten.
+ */
+const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
+
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p);
+
+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_POLICY_INTERNAL_H */
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
new file mode 100644
index 00000000000..0e385e40909
--- /dev/null
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -0,0 +1,1333 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache-policy-mq"
+
+static struct kmem_cache *mq_entry_cache;
+
+/*----------------------------------------------------------------*/
+
+static unsigned next_power(unsigned n, unsigned min)
+{
+	return roundup_pow_of_two(max(n, min));
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Large, sequential ios are probably better left on the origin device since
+ * spindles tend to have good bandwidth.
+ *
+ * The io_tracker tries to spot when the io is in one of these sequential
+ * modes.
+ *
+ * Two thresholds to switch between random and sequential io mode are defaulting
+ * as follows and can be adjusted via the constructor and message interfaces.
+ */
+#define RANDOM_THRESHOLD_DEFAULT 4
+#define SEQUENTIAL_THRESHOLD_DEFAULT 512
+
+enum io_pattern {
+	PATTERN_SEQUENTIAL,
+	PATTERN_RANDOM
+};
+
+struct io_tracker {
+	enum io_pattern pattern;
+
+	unsigned nr_seq_samples;
+	unsigned nr_rand_samples;
+	unsigned thresholds[2];
+
+	dm_oblock_t last_end_oblock;
+};
+
+static void iot_init(struct io_tracker *t,
+		     int sequential_threshold, int random_threshold)
+{
+	t->pattern = PATTERN_RANDOM;
+	t->nr_seq_samples = 0;
+	t->nr_rand_samples = 0;
+	t->last_end_oblock = 0;
+	t->thresholds[PATTERN_RANDOM] = random_threshold;
+	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
+}
+
+static enum io_pattern iot_pattern(struct io_tracker *t)
+{
+	return t->pattern;
+}
+
+static void iot_update_stats(struct io_tracker *t, struct bio *bio)
+{
+	if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
+		t->nr_seq_samples++;
+	else {
+		/*
+		 * Just one non-sequential IO is enough to reset the
+		 * counters.
+		 */
+		if (t->nr_seq_samples) {
+			t->nr_seq_samples = 0;
+			t->nr_rand_samples = 0;
+		}
+
+		t->nr_rand_samples++;
+	}
+
+	t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
+}
+
+static void iot_check_for_pattern_switch(struct io_tracker *t)
+{
+	switch (t->pattern) {
+	case PATTERN_SEQUENTIAL:
+		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
+			t->pattern = PATTERN_RANDOM;
+			t->nr_seq_samples = t->nr_rand_samples = 0;
+		}
+		break;
+
+	case PATTERN_RANDOM:
+		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
+			t->pattern = PATTERN_SEQUENTIAL;
+			t->nr_seq_samples = t->nr_rand_samples = 0;
+		}
+		break;
+	}
+}
+
+static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
+{
+	iot_update_stats(t, bio);
+	iot_check_for_pattern_switch(t);
+}
+
+/*----------------------------------------------------------------*/
+
+
+/*
+ * This queue is divided up into different levels.  Allowing us to push
+ * entries to the back of any of the levels.  Think of it as a partially
+ * sorted queue.
+ */
+#define NR_QUEUE_LEVELS 16u
+
+struct queue {
+	struct list_head qs[NR_QUEUE_LEVELS];
+};
+
+static void queue_init(struct queue *q)
+{
+	unsigned i;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++)
+		INIT_LIST_HEAD(q->qs + i);
+}
+
+/*
+ * Checks to see if the queue is empty.
+ * FIXME: reduce cpu usage.
+ */
+static bool queue_empty(struct queue *q)
+{
+	unsigned i;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++)
+		if (!list_empty(q->qs + i))
+			return false;
+
+	return true;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
+{
+	list_add_tail(elt, q->qs + level);
+}
+
+static void queue_remove(struct list_head *elt)
+{
+	list_del(elt);
+}
+
+/*
+ * Shifts all regions down one level.  This has no effect on the order of
+ * the queue.
+ */
+static void queue_shift_down(struct queue *q)
+{
+	unsigned level;
+
+	for (level = 1; level < NR_QUEUE_LEVELS; level++)
+		list_splice_init(q->qs + level, q->qs + level - 1);
+}
+
+/*
+ * Gives us the oldest entry of the lowest popoulated level.  If the first
+ * level is emptied then we shift down one level.
+ */
+static struct list_head *queue_pop(struct queue *q)
+{
+	unsigned level;
+	struct list_head *r;
+
+	for (level = 0; level < NR_QUEUE_LEVELS; level++)
+		if (!list_empty(q->qs + level)) {
+			r = q->qs[level].next;
+			list_del(r);
+
+			/* have we just emptied the bottom level? */
+			if (level == 0 && list_empty(q->qs))
+				queue_shift_down(q);
+
+			return r;
+		}
+
+	return NULL;
+}
+
+static struct list_head *list_pop(struct list_head *lh)
+{
+	struct list_head *r = lh->next;
+
+	BUG_ON(!r);
+	list_del_init(r);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Describes a cache entry.  Used in both the cache and the pre_cache.
+ */
+struct entry {
+	struct hlist_node hlist;
+	struct list_head list;
+	dm_oblock_t oblock;
+
+	/*
+	 * FIXME: pack these better
+	 */
+	bool dirty:1;
+	unsigned hit_count;
+	unsigned generation;
+	unsigned tick;
+};
+
+/*
+ * Rather than storing the cblock in an entry, we allocate all entries in
+ * an array, and infer the cblock from the entry position.
+ *
+ * Free entries are linked together into a list.
+ */
+struct entry_pool {
+	struct entry *entries, *entries_end;
+	struct list_head free;
+	unsigned nr_allocated;
+};
+
+static int epool_init(struct entry_pool *ep, unsigned nr_entries)
+{
+	unsigned i;
+
+	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
+	if (!ep->entries)
+		return -ENOMEM;
+
+	ep->entries_end = ep->entries + nr_entries;
+
+	INIT_LIST_HEAD(&ep->free);
+	for (i = 0; i < nr_entries; i++)
+		list_add(&ep->entries[i].list, &ep->free);
+
+	ep->nr_allocated = 0;
+
+	return 0;
+}
+
+static void epool_exit(struct entry_pool *ep)
+{
+	vfree(ep->entries);
+}
+
+static struct entry *alloc_entry(struct entry_pool *ep)
+{
+	struct entry *e;
+
+	if (list_empty(&ep->free))
+		return NULL;
+
+	e = list_entry(list_pop(&ep->free), struct entry, list);
+	INIT_LIST_HEAD(&e->list);
+	INIT_HLIST_NODE(&e->hlist);
+	ep->nr_allocated++;
+
+	return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
+{
+	struct entry *e = ep->entries + from_cblock(cblock);
+
+	list_del_init(&e->list);
+	INIT_HLIST_NODE(&e->hlist);
+	ep->nr_allocated++;
+
+	return e;
+}
+
+static void free_entry(struct entry_pool *ep, struct entry *e)
+{
+	BUG_ON(!ep->nr_allocated);
+	ep->nr_allocated--;
+	INIT_HLIST_NODE(&e->hlist);
+	list_add(&e->list, &ep->free);
+}
+
+/*
+ * Returns NULL if the entry is free.
+ */
+static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
+{
+	struct entry *e = ep->entries + from_cblock(cblock);
+	return !hlist_unhashed(&e->hlist) ? e : NULL;
+}
+
+static bool epool_empty(struct entry_pool *ep)
+{
+	return list_empty(&ep->free);
+}
+
+static bool in_pool(struct entry_pool *ep, struct entry *e)
+{
+	return e >= ep->entries && e < ep->entries_end;
+}
+
+static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
+{
+	return to_cblock(e - ep->entries);
+}
+
+/*----------------------------------------------------------------*/
+
+struct mq_policy {
+	struct dm_cache_policy policy;
+
+	/* protects everything */
+	struct mutex lock;
+	dm_cblock_t cache_size;
+	struct io_tracker tracker;
+
+	/*
+	 * Entries come from two pools, one of pre-cache entries, and one
+	 * for the cache proper.
+	 */
+	struct entry_pool pre_cache_pool;
+	struct entry_pool cache_pool;
+
+	/*
+	 * We maintain three queues of entries.  The cache proper,
+	 * consisting of a clean and dirty queue, contains the currently
+	 * active mappings.  Whereas the pre_cache tracks blocks that
+	 * are being hit frequently and potential candidates for promotion
+	 * to the cache.
+	 */
+	struct queue pre_cache;
+	struct queue cache_clean;
+	struct queue cache_dirty;
+
+	/*
+	 * Keeps track of time, incremented by the core.  We use this to
+	 * avoid attributing multiple hits within the same tick.
+	 *
+	 * Access to tick_protected should be done with the spin lock held.
+	 * It's copied to tick at the start of the map function (within the
+	 * mutex).
+	 */
+	spinlock_t tick_lock;
+	unsigned tick_protected;
+	unsigned tick;
+
+	/*
+	 * A count of the number of times the map function has been called
+	 * and found an entry in the pre_cache or cache.  Currently used to
+	 * calculate the generation.
+	 */
+	unsigned hit_count;
+
+	/*
+	 * A generation is a longish period that is used to trigger some
+	 * book keeping effects.  eg, decrementing hit counts on entries.
+	 * This is needed to allow the cache to evolve as io patterns
+	 * change.
+	 */
+	unsigned generation;
+	unsigned generation_period; /* in lookups (will probably change) */
+
+	/*
+	 * Entries in the pre_cache whose hit count passes the promotion
+	 * threshold move to the cache proper.  Working out the correct
+	 * value for the promotion_threshold is crucial to this policy.
+	 */
+	unsigned promote_threshold;
+
+	unsigned discard_promote_adjustment;
+	unsigned read_promote_adjustment;
+	unsigned write_promote_adjustment;
+
+	/*
+	 * The hash table allows us to quickly find an entry by origin
+	 * block.  Both pre_cache and cache entries are in here.
+	 */
+	unsigned nr_buckets;
+	dm_block_t hash_bits;
+	struct hlist_head *table;
+};
+
+#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
+#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
+#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Simple hash table implementation.  Should replace with the standard hash
+ * table that's making its way upstream.
+ */
+static void hash_insert(struct mq_policy *mq, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
+
+	hlist_add_head(&e->hlist, mq->table + h);
+}
+
+static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
+{
+	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
+	struct hlist_head *bucket = mq->table + h;
+	struct entry *e;
+
+	hlist_for_each_entry(e, bucket, hlist)
+		if (e->oblock == oblock) {
+			hlist_del(&e->hlist);
+			hlist_add_head(&e->hlist, bucket);
+			return e;
+		}
+
+	return NULL;
+}
+
+static void hash_remove(struct entry *e)
+{
+	hlist_del(&e->hlist);
+}
+
+/*----------------------------------------------------------------*/
+
+static bool any_free_cblocks(struct mq_policy *mq)
+{
+	return !epool_empty(&mq->cache_pool);
+}
+
+static bool any_clean_cblocks(struct mq_policy *mq)
+{
+	return !queue_empty(&mq->cache_clean);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Now we get to the meat of the policy.  This section deals with deciding
+ * when to to add entries to the pre_cache and cache, and move between
+ * them.
+ */
+
+/*
+ * The queue level is based on the log2 of the hit count.
+ */
+static unsigned queue_level(struct entry *e)
+{
+	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
+}
+
+static bool in_cache(struct mq_policy *mq, struct entry *e)
+{
+	return in_pool(&mq->cache_pool, e);
+}
+
+/*
+ * Inserts the entry into the pre_cache or the cache.  Ensures the cache
+ * block is marked as allocated if necc.  Inserts into the hash table.
+ * Sets the tick which records when the entry was last moved about.
+ */
+static void push(struct mq_policy *mq, struct entry *e)
+{
+	e->tick = mq->tick;
+	hash_insert(mq, e);
+
+	if (in_cache(mq, e))
+		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+			   queue_level(e), &e->list);
+	else
+		queue_push(&mq->pre_cache, queue_level(e), &e->list);
+}
+
+/*
+ * Removes an entry from pre_cache or cache.  Removes from the hash table.
+ */
+static void del(struct mq_policy *mq, struct entry *e)
+{
+	queue_remove(&e->list);
+	hash_remove(e);
+}
+
+/*
+ * Like del, except it removes the first entry in the queue (ie. the least
+ * recently used).
+ */
+static struct entry *pop(struct mq_policy *mq, struct queue *q)
+{
+	struct entry *e;
+	struct list_head *h = queue_pop(q);
+
+	if (!h)
+		return NULL;
+
+	e = container_of(h, struct entry, list);
+	hash_remove(e);
+
+	return e;
+}
+
+/*
+ * Has this entry already been updated?
+ */
+static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
+{
+	return mq->tick == e->tick;
+}
+
+/*
+ * The promotion threshold is adjusted every generation.  As are the counts
+ * of the entries.
+ *
+ * At the moment the threshold is taken by averaging the hit counts of some
+ * of the entries in the cache (the first 20 entries across all levels in
+ * ascending order, giving preference to the clean entries at each level).
+ *
+ * We can be much cleverer than this though.  For example, each promotion
+ * could bump up the threshold helping to prevent churn.  Much more to do
+ * here.
+ */
+
+#define MAX_TO_AVERAGE 20
+
+static void check_generation(struct mq_policy *mq)
+{
+	unsigned total = 0, nr = 0, count = 0, level;
+	struct list_head *head;
+	struct entry *e;
+
+	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
+		mq->hit_count = 0;
+		mq->generation++;
+
+		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
+			head = mq->cache_clean.qs + level;
+			list_for_each_entry(e, head, list) {
+				nr++;
+				total += e->hit_count;
+
+				if (++count >= MAX_TO_AVERAGE)
+					break;
+			}
+
+			head = mq->cache_dirty.qs + level;
+			list_for_each_entry(e, head, list) {
+				nr++;
+				total += e->hit_count;
+
+				if (++count >= MAX_TO_AVERAGE)
+					break;
+			}
+		}
+
+		mq->promote_threshold = nr ? total / nr : 1;
+		if (mq->promote_threshold * nr < total)
+			mq->promote_threshold++;
+	}
+}
+
+/*
+ * Whenever we use an entry we bump up it's hit counter, and push it to the
+ * back to it's current level.
+ */
+static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
+{
+	if (updated_this_tick(mq, e))
+		return;
+
+	e->hit_count++;
+	mq->hit_count++;
+	check_generation(mq);
+
+	/* generation adjustment, to stop the counts increasing forever. */
+	/* FIXME: divide? */
+	/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
+	e->generation = mq->generation;
+
+	del(mq, e);
+	push(mq, e);
+}
+
+/*
+ * Demote the least recently used entry from the cache to the pre_cache.
+ * Returns the new cache entry to use, and the old origin block it was
+ * mapped to.
+ *
+ * We drop the hit count on the demoted entry back to 1 to stop it bouncing
+ * straight back into the cache if it's subsequently hit.  There are
+ * various options here, and more experimentation would be good:
+ *
+ * - just forget about the demoted entry completely (ie. don't insert it
+     into the pre_cache).
+ * - divide the hit count rather that setting to some hard coded value.
+ * - set the hit count to a hard coded value other than 1, eg, is it better
+ *   if it goes in at level 2?
+ */
+static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+{
+	struct entry *demoted = pop(mq, &mq->cache_clean);
+
+	if (!demoted)
+		/*
+		 * We could get a block from mq->cache_dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
+
+	*oblock = demoted->oblock;
+	free_entry(&mq->cache_pool, demoted);
+
+	/*
+	 * We used to put the demoted block into the pre-cache, but I think
+	 * it's simpler to just let it work it's way up from zero again.
+	 * Stops blocks flickering in and out of the cache.
+	 */
+
+	return 0;
+}
+
+/*
+ * We modify the basic promotion_threshold depending on the specific io.
+ *
+ * If the origin block has been discarded then there's no cost to copy it
+ * to the cache.
+ *
+ * We bias towards reads, since they can be demoted at no cost if they
+ * haven't been dirtied.
+ */
+static unsigned adjusted_promote_threshold(struct mq_policy *mq,
+					   bool discarded_oblock, int data_dir)
+{
+	if (data_dir == READ)
+		return mq->promote_threshold + mq->read_promote_adjustment;
+
+	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
+		/*
+		 * We don't need to do any copying at all, so give this a
+		 * very low threshold.
+		 */
+		return mq->discard_promote_adjustment;
+	}
+
+	return mq->promote_threshold + mq->write_promote_adjustment;
+}
+
+static bool should_promote(struct mq_policy *mq, struct entry *e,
+			   bool discarded_oblock, int data_dir)
+{
+	return e->hit_count >=
+		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
+}
+
+static int cache_entry_found(struct mq_policy *mq,
+			     struct entry *e,
+			     struct policy_result *result)
+{
+	requeue_and_update_tick(mq, e);
+
+	if (in_cache(mq, e)) {
+		result->op = POLICY_HIT;
+		result->cblock = infer_cblock(&mq->cache_pool, e);
+	}
+
+	return 0;
+}
+
+/*
+ * Moves an entry from the pre_cache to the cache.  The main work is
+ * finding which cache block to use.
+ */
+static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+			      struct policy_result *result)
+{
+	int r;
+	struct entry *new_e;
+
+	/* Ensure there's a free cblock in the cache */
+	if (epool_empty(&mq->cache_pool)) {
+		result->op = POLICY_REPLACE;
+		r = demote_cblock(mq, &result->old_oblock);
+		if (r) {
+			result->op = POLICY_MISS;
+			return 0;
+		}
+	} else
+		result->op = POLICY_NEW;
+
+	new_e = alloc_entry(&mq->cache_pool);
+	BUG_ON(!new_e);
+
+	new_e->oblock = e->oblock;
+	new_e->dirty = false;
+	new_e->hit_count = e->hit_count;
+	new_e->generation = e->generation;
+	new_e->tick = e->tick;
+
+	del(mq, e);
+	free_entry(&mq->pre_cache_pool, e);
+	push(mq, new_e);
+
+	result->cblock = infer_cblock(&mq->cache_pool, new_e);
+
+	return 0;
+}
+
+static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
+				 bool can_migrate, bool discarded_oblock,
+				 int data_dir, struct policy_result *result)
+{
+	int r = 0;
+	bool updated = updated_this_tick(mq, e);
+
+	if ((!discarded_oblock && updated) ||
+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue_and_update_tick(mq, e);
+		result->op = POLICY_MISS;
+
+	} else if (!can_migrate)
+		r = -EWOULDBLOCK;
+
+	else {
+		requeue_and_update_tick(mq, e);
+		r = pre_cache_to_cache(mq, e, result);
+	}
+
+	return r;
+}
+
+static void insert_in_pre_cache(struct mq_policy *mq,
+				dm_oblock_t oblock)
+{
+	struct entry *e = alloc_entry(&mq->pre_cache_pool);
+
+	if (!e)
+		/*
+		 * There's no spare entry structure, so we grab the least
+		 * used one from the pre_cache.
+		 */
+		e = pop(mq, &mq->pre_cache);
+
+	if (unlikely(!e)) {
+		DMWARN("couldn't pop from pre cache");
+		return;
+	}
+
+	e->dirty = false;
+	e->oblock = oblock;
+	e->hit_count = 1;
+	e->generation = mq->generation;
+	push(mq, e);
+}
+
+static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+			    struct policy_result *result)
+{
+	int r;
+	struct entry *e;
+
+	if (epool_empty(&mq->cache_pool)) {
+		result->op = POLICY_REPLACE;
+		r = demote_cblock(mq, &result->old_oblock);
+		if (unlikely(r)) {
+			result->op = POLICY_MISS;
+			insert_in_pre_cache(mq, oblock);
+			return;
+		}
+
+		/*
+		 * This will always succeed, since we've just demoted.
+		 */
+		e = alloc_entry(&mq->cache_pool);
+		BUG_ON(!e);
+
+	} else {
+		e = alloc_entry(&mq->cache_pool);
+		result->op = POLICY_NEW;
+	}
+
+	e->oblock = oblock;
+	e->dirty = false;
+	e->hit_count = 1;
+	e->generation = mq->generation;
+	push(mq, e);
+
+	result->cblock = infer_cblock(&mq->cache_pool, e);
+}
+
+static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
+			  bool can_migrate, bool discarded_oblock,
+			  int data_dir, struct policy_result *result)
+{
+	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
+		if (can_migrate)
+			insert_in_cache(mq, oblock, result);
+		else
+			return -EWOULDBLOCK;
+	} else {
+		insert_in_pre_cache(mq, oblock);
+		result->op = POLICY_MISS;
+	}
+
+	return 0;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether to put in
+ * pre_cache, or cache etc.
+ */
+static int map(struct mq_policy *mq, dm_oblock_t oblock,
+	       bool can_migrate, bool discarded_oblock,
+	       int data_dir, struct policy_result *result)
+{
+	int r = 0;
+	struct entry *e = hash_lookup(mq, oblock);
+
+	if (e && in_cache(mq, e))
+		r = cache_entry_found(mq, e, result);
+
+	else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
+		result->op = POLICY_MISS;
+
+	else if (e)
+		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
+					  data_dir, result);
+
+	else
+		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
+				   data_dir, result);
+
+	if (r == -EWOULDBLOCK)
+		result->op = POLICY_MISS;
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
+{
+	return container_of(p, struct mq_policy, policy);
+}
+
+static void mq_destroy(struct dm_cache_policy *p)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+
+	vfree(mq->table);
+	epool_exit(&mq->cache_pool);
+	epool_exit(&mq->pre_cache_pool);
+	kfree(mq);
+}
+
+static void copy_tick(struct mq_policy *mq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	mq->tick = mq->tick_protected;
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+		  bool can_block, bool can_migrate, bool discarded_oblock,
+		  struct bio *bio, struct policy_result *result)
+{
+	int r;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	result->op = POLICY_MISS;
+
+	if (can_block)
+		mutex_lock(&mq->lock);
+	else if (!mutex_trylock(&mq->lock))
+		return -EWOULDBLOCK;
+
+	copy_tick(mq);
+
+	iot_examine_bio(&mq->tracker, bio);
+	r = map(mq, oblock, can_migrate, discarded_oblock,
+		bio_data_dir(bio), result);
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+	int r;
+	struct mq_policy *mq = to_mq_policy(p);
+	struct entry *e;
+
+	if (!mutex_trylock(&mq->lock))
+		return -EWOULDBLOCK;
+
+	e = hash_lookup(mq, oblock);
+	if (e && in_cache(mq, e)) {
+		*cblock = infer_cblock(&mq->cache_pool, e);
+		r = 0;
+	} else
+		r = -ENOENT;
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
+{
+	struct entry *e;
+
+	e = hash_lookup(mq, oblock);
+	BUG_ON(!e || !in_cache(mq, e));
+
+	del(mq, e);
+	e->dirty = set;
+	push(mq, e);
+}
+
+static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__mq_set_clear_dirty(mq, oblock, true);
+	mutex_unlock(&mq->lock);
+}
+
+static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__mq_set_clear_dirty(mq, oblock, false);
+	mutex_unlock(&mq->lock);
+}
+
+static int mq_load_mapping(struct dm_cache_policy *p,
+			   dm_oblock_t oblock, dm_cblock_t cblock,
+			   uint32_t hint, bool hint_valid)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	struct entry *e;
+
+	e = alloc_particular_entry(&mq->cache_pool, cblock);
+	e->oblock = oblock;
+	e->dirty = false;	/* this gets corrected in a minute */
+	e->hit_count = hint_valid ? hint : 1;
+	e->generation = mq->generation;
+	push(mq, e);
+
+	return 0;
+}
+
+static int mq_save_hints(struct mq_policy *mq, struct queue *q,
+			 policy_walk_fn fn, void *context)
+{
+	int r;
+	unsigned level;
+	struct entry *e;
+
+	for (level = 0; level < NR_QUEUE_LEVELS; level++)
+		list_for_each_entry(e, q->qs + level, list) {
+			r = fn(context, infer_cblock(&mq->cache_pool, e),
+			       e->oblock, e->hit_count);
+			if (r)
+				return r;
+		}
+
+	return 0;
+}
+
+static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+			    void *context)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	int r = 0;
+
+	mutex_lock(&mq->lock);
+
+	r = mq_save_hints(mq, &mq->cache_clean, fn, context);
+	if (!r)
+		r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
+{
+	struct entry *e;
+
+	e = hash_lookup(mq, oblock);
+	BUG_ON(!e || !in_cache(mq, e));
+
+	del(mq, e);
+	free_entry(&mq->cache_pool, e);
+}
+
+static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__remove_mapping(mq, oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
+{
+	struct entry *e = epool_find(&mq->cache_pool, cblock);
+
+	if (!e)
+		return -ENODATA;
+
+	del(mq, e);
+	free_entry(&mq->cache_pool, e);
+
+	return 0;
+}
+
+static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	int r;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __remove_cblock(mq, cblock);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
+			      dm_cblock_t *cblock)
+{
+	struct entry *e = pop(mq, &mq->cache_dirty);
+
+	if (!e)
+		return -ENODATA;
+
+	*oblock = e->oblock;
+	*cblock = infer_cblock(&mq->cache_pool, e);
+	e->dirty = false;
+	push(mq, e);
+
+	return 0;
+}
+
+static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+			     dm_cblock_t *cblock)
+{
+	int r;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __mq_writeback_work(mq, oblock, cblock);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __force_mapping(struct mq_policy *mq,
+			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct entry *e = hash_lookup(mq, current_oblock);
+
+	if (e && in_cache(mq, e)) {
+		del(mq, e);
+		e->oblock = new_oblock;
+		e->dirty = true;
+		push(mq, e);
+	}
+}
+
+static void mq_force_mapping(struct dm_cache_policy *p,
+			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__force_mapping(mq, current_oblock, new_oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t mq_residency(struct dm_cache_policy *p)
+{
+	dm_cblock_t r;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = to_cblock(mq->cache_pool.nr_allocated);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void mq_tick(struct dm_cache_policy *p)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	mq->tick_protected++;
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static int mq_set_config_value(struct dm_cache_policy *p,
+			       const char *key, const char *value)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	unsigned long tmp;
+
+	if (kstrtoul(value, 10, &tmp))
+		return -EINVAL;
+
+	if (!strcasecmp(key, "random_threshold")) {
+		mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
+
+	} else if (!strcasecmp(key, "sequential_threshold")) {
+		mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
+
+	} else if (!strcasecmp(key, "discard_promote_adjustment"))
+		mq->discard_promote_adjustment = tmp;
+
+	else if (!strcasecmp(key, "read_promote_adjustment"))
+		mq->read_promote_adjustment = tmp;
+
+	else if (!strcasecmp(key, "write_promote_adjustment"))
+		mq->write_promote_adjustment = tmp;
+
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+{
+	ssize_t sz = 0;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	DMEMIT("10 random_threshold %u "
+	       "sequential_threshold %u "
+	       "discard_promote_adjustment %u "
+	       "read_promote_adjustment %u "
+	       "write_promote_adjustment %u",
+	       mq->tracker.thresholds[PATTERN_RANDOM],
+	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
+	       mq->discard_promote_adjustment,
+	       mq->read_promote_adjustment,
+	       mq->write_promote_adjustment);
+
+	return 0;
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct mq_policy *mq)
+{
+	mq->policy.destroy = mq_destroy;
+	mq->policy.map = mq_map;
+	mq->policy.lookup = mq_lookup;
+	mq->policy.set_dirty = mq_set_dirty;
+	mq->policy.clear_dirty = mq_clear_dirty;
+	mq->policy.load_mapping = mq_load_mapping;
+	mq->policy.walk_mappings = mq_walk_mappings;
+	mq->policy.remove_mapping = mq_remove_mapping;
+	mq->policy.remove_cblock = mq_remove_cblock;
+	mq->policy.writeback_work = mq_writeback_work;
+	mq->policy.force_mapping = mq_force_mapping;
+	mq->policy.residency = mq_residency;
+	mq->policy.tick = mq_tick;
+	mq->policy.emit_config_values = mq_emit_config_values;
+	mq->policy.set_config_value = mq_set_config_value;
+}
+
+static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
+					 sector_t origin_size,
+					 sector_t cache_block_size)
+{
+	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+	if (!mq)
+		return NULL;
+
+	init_policy_functions(mq);
+	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
+	mq->cache_size = cache_size;
+
+	if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
+		DMERR("couldn't initialize pool of pre-cache entries");
+		goto bad_pre_cache_init;
+	}
+
+	if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
+		DMERR("couldn't initialize pool of cache entries");
+		goto bad_cache_init;
+	}
+
+	mq->tick_protected = 0;
+	mq->tick = 0;
+	mq->hit_count = 0;
+	mq->generation = 0;
+	mq->promote_threshold = 0;
+	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
+	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
+	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
+	mutex_init(&mq->lock);
+	spin_lock_init(&mq->tick_lock);
+
+	queue_init(&mq->pre_cache);
+	queue_init(&mq->cache_clean);
+	queue_init(&mq->cache_dirty);
+
+	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
+
+	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
+	mq->hash_bits = ffs(mq->nr_buckets) - 1;
+	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
+	if (!mq->table)
+		goto bad_alloc_table;
+
+	return &mq->policy;
+
+bad_alloc_table:
+	epool_exit(&mq->cache_pool);
+bad_cache_init:
+	epool_exit(&mq->pre_cache_pool);
+bad_pre_cache_init:
+	kfree(mq);
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type mq_policy_type = {
+	.name = "mq",
+	.version = {1, 2, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = mq_create
+};
+
+static struct dm_cache_policy_type default_policy_type = {
+	.name = "default",
+	.version = {1, 2, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = mq_create,
+	.real = &mq_policy_type
+};
+
+static int __init mq_init(void)
+{
+	int r;
+
+	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
+					   sizeof(struct entry),
+					   __alignof__(struct entry),
+					   0, NULL);
+	if (!mq_entry_cache)
+		goto bad;
+
+	r = dm_cache_policy_register(&mq_policy_type);
+	if (r) {
+		DMERR("register failed %d", r);
+		goto bad_register_mq;
+	}
+
+	r = dm_cache_policy_register(&default_policy_type);
+	if (!r) {
+		DMINFO("version %u.%u.%u loaded",
+		       mq_policy_type.version[0],
+		       mq_policy_type.version[1],
+		       mq_policy_type.version[2]);
+		return 0;
+	}
+
+	DMERR("register failed (as default) %d", r);
+
+	dm_cache_policy_unregister(&mq_policy_type);
+bad_register_mq:
+	kmem_cache_destroy(mq_entry_cache);
+bad:
+	return -ENOMEM;
+}
+
+static void __exit mq_exit(void)
+{
+	dm_cache_policy_unregister(&mq_policy_type);
+	dm_cache_policy_unregister(&default_policy_type);
+
+	kmem_cache_destroy(mq_entry_cache);
+}
+
+module_init(mq_init);
+module_exit(mq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("mq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
new file mode 100644
index 00000000000..c1a3cee99b4
--- /dev/null
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache-policy"
+
+static DEFINE_SPINLOCK(register_lock);
+static LIST_HEAD(register_list);
+
+static struct dm_cache_policy_type *__find_policy(const char *name)
+{
+	struct dm_cache_policy_type *t;
+
+	list_for_each_entry(t, &register_list, list)
+		if (!strcmp(t->name, name))
+			return t;
+
+	return NULL;
+}
+
+static struct dm_cache_policy_type *__get_policy_once(const char *name)
+{
+	struct dm_cache_policy_type *t = __find_policy(name);
+
+	if (t && !try_module_get(t->owner)) {
+		DMWARN("couldn't get module %s", name);
+		t = ERR_PTR(-EINVAL);
+	}
+
+	return t;
+}
+
+static struct dm_cache_policy_type *get_policy_once(const char *name)
+{
+	struct dm_cache_policy_type *t;
+
+	spin_lock(&register_lock);
+	t = __get_policy_once(name);
+	spin_unlock(&register_lock);
+
+	return t;
+}
+
+static struct dm_cache_policy_type *get_policy(const char *name)
+{
+	struct dm_cache_policy_type *t;
+
+	t = get_policy_once(name);
+	if (IS_ERR(t))
+		return NULL;
+
+	if (t)
+		return t;
+
+	request_module("dm-cache-%s", name);
+
+	t = get_policy_once(name);
+	if (IS_ERR(t))
+		return NULL;
+
+	return t;
+}
+
+static void put_policy(struct dm_cache_policy_type *t)
+{
+	module_put(t->owner);
+}
+
+int dm_cache_policy_register(struct dm_cache_policy_type *type)
+{
+	int r;
+
+	/* One size fits all for now */
+	if (type->hint_size != 0 && type->hint_size != 4) {
+		DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size);
+		return -EINVAL;
+	}
+
+	spin_lock(&register_lock);
+	if (__find_policy(type->name)) {
+		DMWARN("attempt to register policy under duplicate name %s", type->name);
+		r = -EINVAL;
+	} else {
+		list_add(&type->list, &register_list);
+		r = 0;
+	}
+	spin_unlock(&register_lock);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_register);
+
+void dm_cache_policy_unregister(struct dm_cache_policy_type *type)
+{
+	spin_lock(&register_lock);
+	list_del_init(&type->list);
+	spin_unlock(&register_lock);
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_unregister);
+
+struct dm_cache_policy *dm_cache_policy_create(const char *name,
+					       dm_cblock_t cache_size,
+					       sector_t origin_size,
+					       sector_t cache_block_size)
+{
+	struct dm_cache_policy *p = NULL;
+	struct dm_cache_policy_type *type;
+
+	type = get_policy(name);
+	if (!type) {
+		DMWARN("unknown policy type");
+		return ERR_PTR(-EINVAL);
+	}
+
+	p = type->create(cache_size, origin_size, cache_block_size);
+	if (!p) {
+		put_policy(type);
+		return ERR_PTR(-ENOMEM);
+	}
+	p->private = type;
+
+	return p;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_create);
+
+void dm_cache_policy_destroy(struct dm_cache_policy *p)
+{
+	struct dm_cache_policy_type *t = p->private;
+
+	p->destroy(p);
+	put_policy(t);
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_destroy);
+
+const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
+{
+	struct dm_cache_policy_type *t = p->private;
+
+	/* if t->real is set then an alias was used (e.g. "default") */
+	if (t->real)
+		return t->real->name;
+
+	return t->name;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
+
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p)
+{
+	struct dm_cache_policy_type *t = p->private;
+
+	return t->version;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_version);
+
+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
+{
+	struct dm_cache_policy_type *t = p->private;
+
+	return t->hint_size;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
new file mode 100644
index 00000000000..f50fe360c54
--- /dev/null
+++ b/drivers/md/dm-cache-policy.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_POLICY_H
+#define DM_CACHE_POLICY_H
+
+#include "dm-cache-block-types.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+/* FIXME: make it clear which methods are optional.  Get debug policy to
+ * double check this at start.
+ */
+
+/*
+ * The cache policy makes the important decisions about which blocks get to
+ * live on the faster cache device.
+ *
+ * When the core target has to remap a bio it calls the 'map' method of the
+ * policy.  This returns an instruction telling the core target what to do.
+ *
+ * POLICY_HIT:
+ *   That block is in the cache.  Remap to the cache and carry on.
+ *
+ * POLICY_MISS:
+ *   This block is on the origin device.  Remap and carry on.
+ *
+ * POLICY_NEW:
+ *   This block is currently on the origin device, but the policy wants to
+ *   move it.  The core should:
+ *
+ *   - hold any further io to this origin block
+ *   - copy the origin to the given cache block
+ *   - release all the held blocks
+ *   - remap the original block to the cache
+ *
+ * POLICY_REPLACE:
+ *   This block is currently on the origin device.  The policy wants to
+ *   move it to the cache, with the added complication that the destination
+ *   cache block needs a writeback first.  The core should:
+ *
+ *   - hold any further io to this origin block
+ *   - hold any further io to the origin block that's being written back
+ *   - writeback
+ *   - copy new block to cache
+ *   - release held blocks
+ *   - remap bio to cache and reissue.
+ *
+ * Should the core run into trouble while processing a POLICY_NEW or
+ * POLICY_REPLACE instruction it will roll back the policies mapping using
+ * remove_mapping() or force_mapping().  These methods must not fail.  This
+ * approach avoids having transactional semantics in the policy (ie, the
+ * core informing the policy when a migration is complete), and hence makes
+ * it easier to write new policies.
+ *
+ * In general policy methods should never block, except in the case of the
+ * map function when can_migrate is set.  So be careful to implement using
+ * bounded, preallocated memory.
+ */
+enum policy_operation {
+	POLICY_HIT,
+	POLICY_MISS,
+	POLICY_NEW,
+	POLICY_REPLACE
+};
+
+/*
+ * This is the instruction passed back to the core target.
+ */
+struct policy_result {
+	enum policy_operation op;
+	dm_oblock_t old_oblock;	/* POLICY_REPLACE */
+	dm_cblock_t cblock;	/* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
+};
+
+typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
+			      dm_oblock_t oblock, uint32_t hint);
+
+/*
+ * The cache policy object.  Just a bunch of methods.  It is envisaged that
+ * this structure will be embedded in a bigger, policy specific structure
+ * (ie. use container_of()).
+ */
+struct dm_cache_policy {
+
+	/*
+	 * FIXME: make it clear which methods are optional, and which may
+	 * block.
+	 */
+
+	/*
+	 * Destroys this object.
+	 */
+	void (*destroy)(struct dm_cache_policy *p);
+
+	/*
+	 * See large comment above.
+	 *
+	 * oblock      - the origin block we're interested in.
+	 *
+	 * can_block - indicates whether the current thread is allowed to
+	 *             block.  -EWOULDBLOCK returned if it can't and would.
+	 *
+	 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
+	 *               instructions.  If denied and the policy would have
+	 *               returned one of these instructions it should
+	 *               return -EWOULDBLOCK.
+	 *
+	 * discarded_oblock - indicates whether the whole origin block is
+	 *               in a discarded state (FIXME: better to tell the
+	 *               policy about this sooner, so it can recycle that
+	 *               cache block if it wants.)
+	 * bio         - the bio that triggered this call.
+	 * result      - gets filled in with the instruction.
+	 *
+	 * May only return 0, or -EWOULDBLOCK (if !can_migrate)
+	 */
+	int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
+		   bool can_block, bool can_migrate, bool discarded_oblock,
+		   struct bio *bio, struct policy_result *result);
+
+	/*
+	 * Sometimes we want to see if a block is in the cache, without
+	 * triggering any update of stats.  (ie. it's not a real hit).
+	 *
+	 * Must not block.
+	 *
+	 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors
+	 * (-EWOULDBLOCK would be typical).
+	 */
+	int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
+
+	void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+	void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+
+	/*
+	 * Called when a cache target is first created.  Used to load a
+	 * mapping from the metadata device into the policy.
+	 */
+	int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
+			    dm_cblock_t cblock, uint32_t hint, bool hint_valid);
+
+	int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
+			     void *context);
+
+	/*
+	 * Override functions used on the error paths of the core target.
+	 * They must succeed.
+	 */
+	void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
+	void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
+			      dm_oblock_t new_oblock);
+
+	/*
+	 * This is called via the invalidate_cblocks message.  It is
+	 * possible the particular cblock has already been removed due to a
+	 * write io in passthrough mode.  In which case this should return
+	 * -ENODATA.
+	 */
+	int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
+
+	/*
+	 * Provide a dirty block to be written back by the core target.
+	 *
+	 * Returns:
+	 *
+	 * 0 and @cblock,@oblock: block to write back provided
+	 *
+	 * -ENODATA: no dirty blocks available
+	 */
+	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+
+	/*
+	 * How full is the cache?
+	 */
+	dm_cblock_t (*residency)(struct dm_cache_policy *p);
+
+	/*
+	 * Because of where we sit in the block layer, we can be asked to
+	 * map a lot of little bios that are all in the same block (no
+	 * queue merging has occurred).  To stop the policy being fooled by
+	 * these the core target sends regular tick() calls to the policy.
+	 * The policy should only count an entry as hit once per tick.
+	 */
+	void (*tick)(struct dm_cache_policy *p);
+
+	/*
+	 * Configuration.
+	 */
+	int (*emit_config_values)(struct dm_cache_policy *p,
+				  char *result, unsigned maxlen);
+	int (*set_config_value)(struct dm_cache_policy *p,
+				const char *key, const char *value);
+
+	/*
+	 * Book keeping ptr for the policy register, not for general use.
+	 */
+	void *private;
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We maintain a little register of the different policy types.
+ */
+#define CACHE_POLICY_NAME_SIZE 16
+#define CACHE_POLICY_VERSION_SIZE 3
+
+struct dm_cache_policy_type {
+	/* For use by the register code only. */
+	struct list_head list;
+
+	/*
+	 * Policy writers should fill in these fields.  The name field is
+	 * what gets passed on the target line to select your policy.
+	 */
+	char name[CACHE_POLICY_NAME_SIZE];
+	unsigned version[CACHE_POLICY_VERSION_SIZE];
+
+	/*
+	 * For use by an alias dm_cache_policy_type to point to the
+	 * real dm_cache_policy_type.
+	 */
+	struct dm_cache_policy_type *real;
+
+	/*
+	 * Policies may store a hint for each each cache block.
+	 * Currently the size of this hint must be 0 or 4 bytes but we
+	 * expect to relax this in future.
+	 */
+	size_t hint_size;
+
+	struct module *owner;
+	struct dm_cache_policy *(*create)(dm_cblock_t cache_size,
+					  sector_t origin_size,
+					  sector_t block_size);
+};
+
+int dm_cache_policy_register(struct dm_cache_policy_type *type);
+void dm_cache_policy_unregister(struct dm_cache_policy_type *type);
+
+/*----------------------------------------------------------------*/
+
+#endif	/* DM_CACHE_POLICY_H */
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 00000000000..2c63326638b
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,3121 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison.h"
+#include "dm-bio-record.h"
+#include "dm-cache-metadata.h"
+
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache"
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
+	"A percentage of time allocated for copying to and/or from cache");
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ *	      either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+static size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static unsigned long *alloc_bitset(unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	return vzalloc(s);
+}
+
+static void clear_bitset(void *bitset, unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	memset(bitset, 0, s);
+}
+
+static void free_bitset(unsigned long *bits)
+{
+	vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * There are a couple of places where we let a bio run, but want to do some
+ * work before calling its endio function.  We do this by temporarily
+ * changing the endio fn.
+ */
+struct dm_hook_info {
+	bio_end_io_t *bi_end_io;
+	void *bi_private;
+};
+
+static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
+			bio_end_io_t *bi_end_io, void *bi_private)
+{
+	h->bi_end_io = bio->bi_end_io;
+	h->bi_private = bio->bi_private;
+
+	bio->bi_end_io = bi_end_io;
+	bio->bi_private = bi_private;
+}
+
+static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
+{
+	bio->bi_end_io = h->bi_end_io;
+	bio->bi_private = h->bi_private;
+
+	/*
+	 * Must bump bi_remaining to allow bio to complete with
+	 * restored bi_end_io.
+	 */
+	atomic_inc(&bio->bi_remaining);
+}
+
+/*----------------------------------------------------------------*/
+
+#define PRISON_CELLS 1024
+#define MIGRATION_POOL_SIZE 128
+#define COMMIT_PERIOD HZ
+#define MIGRATION_COUNT_WINDOW 10
+
+/*
+ * The block size of the device holding cache data must be
+ * between 32KB and 1GB.
+ */
+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
+#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
+
+/*
+ * FIXME: the cache is read/write for the time being.
+ */
+enum cache_metadata_mode {
+	CM_WRITE,		/* metadata may be changed */
+	CM_READ_ONLY,		/* metadata may not be changed */
+};
+
+enum cache_io_mode {
+	/*
+	 * Data is written to cached blocks only.  These blocks are marked
+	 * dirty.  If you lose the cache device you will lose data.
+	 * Potential performance increase for both reads and writes.
+	 */
+	CM_IO_WRITEBACK,
+
+	/*
+	 * Data is written to both cache and origin.  Blocks are never
+	 * dirty.  Potential performance benfit for reads only.
+	 */
+	CM_IO_WRITETHROUGH,
+
+	/*
+	 * A degraded mode useful for various cache coherency situations
+	 * (eg, rolling back snapshots).  Reads and writes always go to the
+	 * origin.  If a write goes to a cached oblock, then the cache
+	 * block is invalidated.
+	 */
+	CM_IO_PASSTHROUGH
+};
+
+struct cache_features {
+	enum cache_metadata_mode mode;
+	enum cache_io_mode io_mode;
+};
+
+struct cache_stats {
+	atomic_t read_hit;
+	atomic_t read_miss;
+	atomic_t write_hit;
+	atomic_t write_miss;
+	atomic_t demotion;
+	atomic_t promotion;
+	atomic_t copies_avoided;
+	atomic_t cache_cell_clash;
+	atomic_t commit_count;
+	atomic_t discard_count;
+};
+
+/*
+ * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
+ * the one-past-the-end value.
+ */
+struct cblock_range {
+	dm_cblock_t begin;
+	dm_cblock_t end;
+};
+
+struct invalidation_request {
+	struct list_head list;
+	struct cblock_range *cblocks;
+
+	atomic_t complete;
+	int err;
+
+	wait_queue_head_t result_wait;
+};
+
+struct cache {
+	struct dm_target *ti;
+	struct dm_target_callbacks callbacks;
+
+	struct dm_cache_metadata *cmd;
+
+	/*
+	 * Metadata is written to this device.
+	 */
+	struct dm_dev *metadata_dev;
+
+	/*
+	 * The slower of the two data devices.  Typically a spindle.
+	 */
+	struct dm_dev *origin_dev;
+
+	/*
+	 * The faster of the two data devices.  Typically an SSD.
+	 */
+	struct dm_dev *cache_dev;
+
+	/*
+	 * Size of the origin device in _complete_ blocks and native sectors.
+	 */
+	dm_oblock_t origin_blocks;
+	sector_t origin_sectors;
+
+	/*
+	 * Size of the cache device in blocks.
+	 */
+	dm_cblock_t cache_size;
+
+	/*
+	 * Fields for converting from sectors to blocks.
+	 */
+	uint32_t sectors_per_block;
+	int sectors_per_block_shift;
+
+	spinlock_t lock;
+	struct bio_list deferred_bios;
+	struct bio_list deferred_flush_bios;
+	struct bio_list deferred_writethrough_bios;
+	struct list_head quiesced_migrations;
+	struct list_head completed_migrations;
+	struct list_head need_commit_migrations;
+	sector_t migration_threshold;
+	wait_queue_head_t migration_wait;
+	atomic_t nr_migrations;
+
+	wait_queue_head_t quiescing_wait;
+	atomic_t quiescing;
+	atomic_t quiescing_ack;
+
+	/*
+	 * cache_size entries, dirty if set
+	 */
+	atomic_t nr_dirty;
+	unsigned long *dirty_bitset;
+
+	/*
+	 * origin_blocks entries, discarded if set.
+	 */
+	dm_oblock_t discard_nr_blocks;
+	unsigned long *discard_bitset;
+
+	/*
+	 * Rather than reconstructing the table line for the status we just
+	 * save it and regurgitate.
+	 */
+	unsigned nr_ctr_args;
+	const char **ctr_args;
+
+	struct dm_kcopyd_client *copier;
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	struct delayed_work waker;
+	unsigned long last_commit_jiffies;
+
+	struct dm_bio_prison *prison;
+	struct dm_deferred_set *all_io_ds;
+
+	mempool_t *migration_pool;
+	struct dm_cache_migration *next_migration;
+
+	struct dm_cache_policy *policy;
+	unsigned policy_nr_args;
+
+	bool need_tick_bio:1;
+	bool sized:1;
+	bool invalidate:1;
+	bool commit_requested:1;
+	bool loaded_mappings:1;
+	bool loaded_discards:1;
+
+	/*
+	 * Cache features such as write-through.
+	 */
+	struct cache_features features;
+
+	struct cache_stats stats;
+
+	/*
+	 * Invalidation fields.
+	 */
+	spinlock_t invalidation_lock;
+	struct list_head invalidation_requests;
+};
+
+struct per_bio_data {
+	bool tick:1;
+	unsigned req_nr:2;
+	struct dm_deferred_entry *all_io_entry;
+	struct dm_hook_info hook_info;
+
+	/*
+	 * writethrough fields.  These MUST remain at the end of this
+	 * structure and the 'cache' member must be the first as it
+	 * is used to determine the offset of the writethrough fields.
+	 */
+	struct cache *cache;
+	dm_cblock_t cblock;
+	struct dm_bio_details bio_details;
+};
+
+struct dm_cache_migration {
+	struct list_head list;
+	struct cache *cache;
+
+	unsigned long start_jiffies;
+	dm_oblock_t old_oblock;
+	dm_oblock_t new_oblock;
+	dm_cblock_t cblock;
+
+	bool err:1;
+	bool writeback:1;
+	bool demote:1;
+	bool promote:1;
+	bool requeue_holder:1;
+	bool invalidate:1;
+
+	struct dm_bio_prison_cell *old_ocell;
+	struct dm_bio_prison_cell *new_ocell;
+};
+
+/*
+ * Processing a bio in the worker thread may require these memory
+ * allocations.  We prealloc to avoid deadlocks (the same worker thread
+ * frees them back to the mempool).
+ */
+struct prealloc {
+	struct dm_cache_migration *mg;
+	struct dm_bio_prison_cell *cell1;
+	struct dm_bio_prison_cell *cell2;
+};
+
+static void wake_worker(struct cache *cache)
+{
+	queue_work(cache->wq, &cache->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+{
+	/* FIXME: change to use a local slab. */
+	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+}
+
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
+{
+	if (!p->mg) {
+		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+		if (!p->mg)
+			return -ENOMEM;
+	}
+
+	if (!p->cell1) {
+		p->cell1 = alloc_prison_cell(cache);
+		if (!p->cell1)
+			return -ENOMEM;
+	}
+
+	if (!p->cell2) {
+		p->cell2 = alloc_prison_cell(cache);
+		if (!p->cell2)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+{
+	if (p->cell2)
+		free_prison_cell(cache, p->cell2);
+
+	if (p->cell1)
+		free_prison_cell(cache, p->cell1);
+
+	if (p->mg)
+		mempool_free(p->mg, cache->migration_pool);
+}
+
+static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+{
+	struct dm_cache_migration *mg = p->mg;
+
+	BUG_ON(!mg);
+	p->mg = NULL;
+
+	return mg;
+}
+
+/*
+ * You must have a cell within the prealloc struct to return.  If not this
+ * function will BUG() rather than returning NULL.
+ */
+static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+{
+	struct dm_bio_prison_cell *r = NULL;
+
+	if (p->cell1) {
+		r = p->cell1;
+		p->cell1 = NULL;
+
+	} else if (p->cell2) {
+		r = p->cell2;
+		p->cell2 = NULL;
+	} else
+		BUG();
+
+	return r;
+}
+
+/*
+ * You can't have more than two cells in a prealloc struct.  BUG() will be
+ * called if you try and overfill.
+ */
+static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+{
+	if (!p->cell2)
+		p->cell2 = cell;
+
+	else if (!p->cell1)
+		p->cell1 = cell;
+
+	else
+		BUG();
+}
+
+/*----------------------------------------------------------------*/
+
+static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
+{
+	key->virtual = 0;
+	key->dev = 0;
+	key->block = from_oblock(oblock);
+}
+
+/*
+ * The caller hands in a preallocated cell, and a free function for it.
+ * The cell will be freed if there's an error, or if it wasn't used because
+ * a cell with that key already exists.
+ */
+typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+
+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
+		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+		      cell_free_fn free_fn, void *free_context,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	struct dm_cell_key key;
+
+	build_key(oblock, &key);
+	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
+	if (r)
+		free_fn(free_context, cell_prealloc);
+
+	return r;
+}
+
+static int get_cell(struct cache *cache,
+		    dm_oblock_t oblock,
+		    struct prealloc *structs,
+		    struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	struct dm_cell_key key;
+	struct dm_bio_prison_cell *cell_prealloc;
+
+	cell_prealloc = prealloc_get_cell(structs);
+
+	build_key(oblock, &key);
+	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
+	if (r)
+		prealloc_put_cell(structs, cell_prealloc);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static bool is_dirty(struct cache *cache, dm_cblock_t b)
+{
+	return test_bit(from_cblock(b), cache->dirty_bitset);
+}
+
+static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
+		atomic_inc(&cache->nr_dirty);
+		policy_set_dirty(cache->policy, oblock);
+	}
+}
+
+static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
+		policy_clear_dirty(cache->policy, oblock);
+		if (atomic_dec_return(&cache->nr_dirty) == 0)
+			dm_table_event(cache->ti->table);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
+static bool block_size_is_power_of_two(struct cache *cache)
+{
+	return cache->sectors_per_block_shift >= 0;
+}
+
+/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
+#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+__always_inline
+#endif
+static dm_block_t block_div(dm_block_t b, uint32_t n)
+{
+	do_div(b, n);
+
+	return b;
+}
+
+static void set_discard(struct cache *cache, dm_oblock_t b)
+{
+	unsigned long flags;
+
+	atomic_inc(&cache->stats.discard_count);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	set_bit(from_oblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void clear_discard(struct cache *cache, dm_oblock_t b)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	clear_bit(from_oblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static bool is_discarded(struct cache *cache, dm_oblock_t b)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	r = test_bit(from_oblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	return r;
+}
+
+static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	r = test_bit(from_oblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static void load_stats(struct cache *cache)
+{
+	struct dm_cache_statistics stats;
+
+	dm_cache_metadata_get_stats(cache->cmd, &stats);
+	atomic_set(&cache->stats.read_hit, stats.read_hits);
+	atomic_set(&cache->stats.read_miss, stats.read_misses);
+	atomic_set(&cache->stats.write_hit, stats.write_hits);
+	atomic_set(&cache->stats.write_miss, stats.write_misses);
+}
+
+static void save_stats(struct cache *cache)
+{
+	struct dm_cache_statistics stats;
+
+	stats.read_hits = atomic_read(&cache->stats.read_hit);
+	stats.read_misses = atomic_read(&cache->stats.read_miss);
+	stats.write_hits = atomic_read(&cache->stats.write_hit);
+	stats.write_misses = atomic_read(&cache->stats.write_miss);
+
+	dm_cache_metadata_set_stats(cache->cmd, &stats);
+}
+
+/*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
+
+/*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
+
+static bool writethrough_mode(struct cache_features *f)
+{
+	return f->io_mode == CM_IO_WRITETHROUGH;
+}
+
+static bool writeback_mode(struct cache_features *f)
+{
+	return f->io_mode == CM_IO_WRITEBACK;
+}
+
+static bool passthrough_mode(struct cache_features *f)
+{
+	return f->io_mode == CM_IO_PASSTHROUGH;
+}
+
+static size_t get_per_bio_data_size(struct cache *cache)
+{
+	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+}
+
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
+{
+	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+	BUG_ON(!pb);
+	return pb;
+}
+
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
+
+	pb->tick = false;
+	pb->req_nr = dm_bio_get_target_bio_nr(bio);
+	pb->all_io_entry = NULL;
+
+	return pb;
+}
+
+/*----------------------------------------------------------------
+ * Remapping
+ *--------------------------------------------------------------*/
+static void remap_to_origin(struct cache *cache, struct bio *bio)
+{
+	bio->bi_bdev = cache->origin_dev->bdev;
+}
+
+static void remap_to_cache(struct cache *cache, struct bio *bio,
+			   dm_cblock_t cblock)
+{
+	sector_t bi_sector = bio->bi_iter.bi_sector;
+	sector_t block = from_cblock(cblock);
+
+	bio->bi_bdev = cache->cache_dev->bdev;
+	if (!block_size_is_power_of_two(cache))
+		bio->bi_iter.bi_sector =
+			(block * cache->sectors_per_block) +
+			sector_div(bi_sector, cache->sectors_per_block);
+	else
+		bio->bi_iter.bi_sector =
+			(block << cache->sectors_per_block_shift) |
+			(bi_sector & (cache->sectors_per_block - 1));
+}
+
+static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	if (cache->need_tick_bio &&
+	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
+		pb->tick = true;
+		cache->need_tick_bio = false;
+	}
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+				  dm_oblock_t oblock)
+{
+	check_if_tick_bio_needed(cache, bio);
+	remap_to_origin(cache, bio);
+	if (bio_data_dir(bio) == WRITE)
+		clear_discard(cache, oblock);
+}
+
+static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
+				 dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	check_if_tick_bio_needed(cache, bio);
+	remap_to_cache(cache, bio, cblock);
+	if (bio_data_dir(bio) == WRITE) {
+		set_dirty(cache, oblock, cblock);
+		clear_discard(cache, oblock);
+	}
+}
+
+static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
+{
+	sector_t block_nr = bio->bi_iter.bi_sector;
+
+	if (!block_size_is_power_of_two(cache))
+		(void) sector_div(block_nr, cache->sectors_per_block);
+	else
+		block_nr >>= cache->sectors_per_block_shift;
+
+	return to_oblock(block_nr);
+}
+
+static int bio_triggers_commit(struct cache *cache, struct bio *bio)
+{
+	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+}
+
+static void issue(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+
+	if (!bio_triggers_commit(cache, bio)) {
+		generic_make_request(bio);
+		return;
+	}
+
+	/*
+	 * Batch together any bios that trigger commits and then issue a
+	 * single commit for them in do_worker().
+	 */
+	spin_lock_irqsave(&cache->lock, flags);
+	cache->commit_requested = true;
+	bio_list_add(&cache->deferred_flush_bios, bio);
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_add(&cache->deferred_writethrough_bios, bio);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void writethrough_endio(struct bio *bio, int err)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+
+	dm_unhook_bio(&pb->hook_info, bio);
+
+	if (err) {
+		bio_endio(bio, err);
+		return;
+	}
+
+	dm_bio_restore(&pb->bio_details, bio);
+	remap_to_cache(pb->cache, bio, pb->cblock);
+
+	/*
+	 * We can't issue this bio directly, since we're in interrupt
+	 * context.  So it gets put on a bio list for processing by the
+	 * worker thread.
+	 */
+	defer_writethrough_bio(pb->cache, bio);
+}
+
+/*
+ * When running in writethrough mode we need to send writes to clean blocks
+ * to both the cache and origin devices.  In future we'd like to clone the
+ * bio and send them in parallel, but for now we're doing them in
+ * series as this is easier.
+ */
+static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+				       dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+
+	pb->cache = cache;
+	pb->cblock = cblock;
+	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
+	dm_bio_record(&pb->bio_details, bio);
+
+	remap_to_origin_clear_discard(pb->cache, bio, oblock);
+}
+
+/*----------------------------------------------------------------
+ * Migration processing
+ *
+ * Migration covers moving data from the origin device to the cache, or
+ * vice versa.
+ *--------------------------------------------------------------*/
+static void free_migration(struct dm_cache_migration *mg)
+{
+	mempool_free(mg, mg->cache->migration_pool);
+}
+
+static void inc_nr_migrations(struct cache *cache)
+{
+	atomic_inc(&cache->nr_migrations);
+}
+
+static void dec_nr_migrations(struct cache *cache)
+{
+	atomic_dec(&cache->nr_migrations);
+
+	/*
+	 * Wake the worker in case we're suspending the target.
+	 */
+	wake_up(&cache->migration_wait);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+			 bool holder)
+{
+	(holder ? dm_cell_release : dm_cell_release_no_holder)
+		(cache->prison, cell, &cache->deferred_bios);
+	free_prison_cell(cache, cell);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+		       bool holder)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	__cell_defer(cache, cell, holder);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void cleanup_migration(struct dm_cache_migration *mg)
+{
+	struct cache *cache = mg->cache;
+	free_migration(mg);
+	dec_nr_migrations(cache);
+}
+
+static void migration_failure(struct dm_cache_migration *mg)
+{
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback) {
+		DMWARN_LIMIT("writeback failed; couldn't copy block");
+		set_dirty(cache, mg->old_oblock, mg->cblock);
+		cell_defer(cache, mg->old_ocell, false);
+
+	} else if (mg->demote) {
+		DMWARN_LIMIT("demotion failed; couldn't copy block");
+		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+
+		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
+		if (mg->promote)
+			cell_defer(cache, mg->new_ocell, true);
+	} else {
+		DMWARN_LIMIT("promotion failed; couldn't copy block");
+		policy_remove_mapping(cache->policy, mg->new_oblock);
+		cell_defer(cache, mg->new_ocell, true);
+	}
+
+	cleanup_migration(mg);
+}
+
+static void migration_success_pre_commit(struct dm_cache_migration *mg)
+{
+	unsigned long flags;
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback) {
+		cell_defer(cache, mg->old_ocell, false);
+		clear_dirty(cache, mg->old_oblock, mg->cblock);
+		cleanup_migration(mg);
+		return;
+
+	} else if (mg->demote) {
+		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
+			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+			policy_force_mapping(cache->policy, mg->new_oblock,
+					     mg->old_oblock);
+			if (mg->promote)
+				cell_defer(cache, mg->new_ocell, true);
+			cleanup_migration(mg);
+			return;
+		}
+	} else {
+		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
+			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+			policy_remove_mapping(cache->policy, mg->new_oblock);
+			cleanup_migration(mg);
+			return;
+		}
+	}
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_add_tail(&mg->list, &cache->need_commit_migrations);
+	cache->commit_requested = true;
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void migration_success_post_commit(struct dm_cache_migration *mg)
+{
+	unsigned long flags;
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback) {
+		DMWARN("writeback unexpectedly triggered commit");
+		return;
+
+	} else if (mg->demote) {
+		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
+
+		if (mg->promote) {
+			mg->demote = false;
+
+			spin_lock_irqsave(&cache->lock, flags);
+			list_add_tail(&mg->list, &cache->quiesced_migrations);
+			spin_unlock_irqrestore(&cache->lock, flags);
+
+		} else {
+			if (mg->invalidate)
+				policy_remove_mapping(cache->policy, mg->old_oblock);
+			cleanup_migration(mg);
+		}
+
+	} else {
+		if (mg->requeue_holder)
+			cell_defer(cache, mg->new_ocell, true);
+		else {
+			bio_endio(mg->new_ocell->holder, 0);
+			cell_defer(cache, mg->new_ocell, false);
+		}
+		clear_dirty(cache, mg->new_oblock, mg->cblock);
+		cleanup_migration(mg);
+	}
+}
+
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+	unsigned long flags;
+	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
+	struct cache *cache = mg->cache;
+
+	if (read_err || write_err)
+		mg->err = true;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_add_tail(&mg->list, &cache->completed_migrations);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void issue_copy_real(struct dm_cache_migration *mg)
+{
+	int r;
+	struct dm_io_region o_region, c_region;
+	struct cache *cache = mg->cache;
+	sector_t cblock = from_cblock(mg->cblock);
+
+	o_region.bdev = cache->origin_dev->bdev;
+	o_region.count = cache->sectors_per_block;
+
+	c_region.bdev = cache->cache_dev->bdev;
+	c_region.sector = cblock * cache->sectors_per_block;
+	c_region.count = cache->sectors_per_block;
+
+	if (mg->writeback || mg->demote) {
+		/* demote */
+		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
+		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
+	} else {
+		/* promote */
+		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
+		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
+	}
+
+	if (r < 0) {
+		DMERR_LIMIT("issuing migration failed");
+		migration_failure(mg);
+	}
+}
+
+static void overwrite_endio(struct bio *bio, int err)
+{
+	struct dm_cache_migration *mg = bio->bi_private;
+	struct cache *cache = mg->cache;
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+	unsigned long flags;
+
+	dm_unhook_bio(&pb->hook_info, bio);
+
+	if (err)
+		mg->err = true;
+
+	mg->requeue_holder = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_add_tail(&mg->list, &cache->completed_migrations);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(mg->cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
+	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+	generic_make_request(bio);
+}
+
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+	return (bio_data_dir(bio) == WRITE) &&
+		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+}
+
+static void avoid_copy(struct dm_cache_migration *mg)
+{
+	atomic_inc(&mg->cache->stats.copies_avoided);
+	migration_success_pre_commit(mg);
+}
+
+static void issue_copy(struct dm_cache_migration *mg)
+{
+	bool avoid;
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback || mg->demote)
+		avoid = !is_dirty(cache, mg->cblock) ||
+			is_discarded_oblock(cache, mg->old_oblock);
+	else {
+		struct bio *bio = mg->new_ocell->holder;
+
+		avoid = is_discarded_oblock(cache, mg->new_oblock);
+
+		if (!avoid && bio_writes_complete_block(cache, bio)) {
+			issue_overwrite(mg, bio);
+			return;
+		}
+	}
+
+	avoid ? avoid_copy(mg) : issue_copy_real(mg);
+}
+
+static void complete_migration(struct dm_cache_migration *mg)
+{
+	if (mg->err)
+		migration_failure(mg);
+	else
+		migration_success_pre_commit(mg);
+}
+
+static void process_migrations(struct cache *cache, struct list_head *head,
+			       void (*fn)(struct dm_cache_migration *))
+{
+	unsigned long flags;
+	struct list_head list;
+	struct dm_cache_migration *mg, *tmp;
+
+	INIT_LIST_HEAD(&list);
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(head, &list);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(mg, tmp, &list, list)
+		fn(mg);
+}
+
+static void __queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+}
+
+static void queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+	unsigned long flags;
+	struct cache *cache = mg->cache;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	__queue_quiesced_migration(mg);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
+{
+	unsigned long flags;
+	struct dm_cache_migration *mg, *tmp;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_for_each_entry_safe(mg, tmp, work, list)
+		__queue_quiesced_migration(mg);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void check_for_quiesced_migrations(struct cache *cache,
+					  struct per_bio_data *pb)
+{
+	struct list_head work;
+
+	if (!pb->all_io_entry)
+		return;
+
+	INIT_LIST_HEAD(&work);
+	if (pb->all_io_entry)
+		dm_deferred_entry_dec(pb->all_io_entry, &work);
+
+	if (!list_empty(&work))
+		queue_quiesced_migrations(cache, &work);
+}
+
+static void quiesce_migration(struct dm_cache_migration *mg)
+{
+	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
+		queue_quiesced_migration(mg);
+}
+
+static void promote(struct cache *cache, struct prealloc *structs,
+		    dm_oblock_t oblock, dm_cblock_t cblock,
+		    struct dm_bio_prison_cell *cell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = false;
+	mg->demote = false;
+	mg->promote = true;
+	mg->requeue_holder = true;
+	mg->invalidate = false;
+	mg->cache = cache;
+	mg->new_oblock = oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = NULL;
+	mg->new_ocell = cell;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+static void writeback(struct cache *cache, struct prealloc *structs,
+		      dm_oblock_t oblock, dm_cblock_t cblock,
+		      struct dm_bio_prison_cell *cell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = true;
+	mg->demote = false;
+	mg->promote = false;
+	mg->requeue_holder = true;
+	mg->invalidate = false;
+	mg->cache = cache;
+	mg->old_oblock = oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = cell;
+	mg->new_ocell = NULL;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+static void demote_then_promote(struct cache *cache, struct prealloc *structs,
+				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
+				dm_cblock_t cblock,
+				struct dm_bio_prison_cell *old_ocell,
+				struct dm_bio_prison_cell *new_ocell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = false;
+	mg->demote = true;
+	mg->promote = true;
+	mg->requeue_holder = true;
+	mg->invalidate = false;
+	mg->cache = cache;
+	mg->old_oblock = old_oblock;
+	mg->new_oblock = new_oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = old_ocell;
+	mg->new_ocell = new_ocell;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+/*
+ * Invalidate a cache entry.  No writeback occurs; any changes in the cache
+ * block are thrown away.
+ */
+static void invalidate(struct cache *cache, struct prealloc *structs,
+		       dm_oblock_t oblock, dm_cblock_t cblock,
+		       struct dm_bio_prison_cell *cell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = false;
+	mg->demote = true;
+	mg->promote = false;
+	mg->requeue_holder = true;
+	mg->invalidate = true;
+	mg->cache = cache;
+	mg->old_oblock = oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = cell;
+	mg->new_ocell = NULL;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+/*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
+static void defer_bio(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_add(&cache->deferred_bios, bio);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void process_flush_bio(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	BUG_ON(bio->bi_iter.bi_size);
+	if (!pb->req_nr)
+		remap_to_origin(cache, bio);
+	else
+		remap_to_cache(cache, bio, 0);
+
+	issue(cache, bio);
+}
+
+/*
+ * People generally discard large parts of a device, eg, the whole device
+ * when formatting.  Splitting these large discards up into cache block
+ * sized ios and then quiescing (always neccessary for discard) takes too
+ * long.
+ *
+ * We keep it simple, and allow any size of discard to come in, and just
+ * mark off blocks on the discard bitset.  No passdown occurs!
+ *
+ * To implement passdown we need to change the bio_prison such that a cell
+ * can have a key that spans many blocks.
+ */
+static void process_discard_bio(struct cache *cache, struct bio *bio)
+{
+	dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
+						  cache->sectors_per_block);
+	dm_block_t end_block = bio_end_sector(bio);
+	dm_block_t b;
+
+	end_block = block_div(end_block, cache->sectors_per_block);
+
+	for (b = start_block; b < end_block; b++)
+		set_discard(cache, to_oblock(b));
+
+	bio_endio(bio, 0);
+}
+
+static bool spare_migration_bandwidth(struct cache *cache)
+{
+	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
+		cache->sectors_per_block;
+	return current_volume < cache->migration_threshold;
+}
+
+static void inc_hit_counter(struct cache *cache, struct bio *bio)
+{
+	atomic_inc(bio_data_dir(bio) == READ ?
+		   &cache->stats.read_hit : &cache->stats.write_hit);
+}
+
+static void inc_miss_counter(struct cache *cache, struct bio *bio)
+{
+	atomic_inc(bio_data_dir(bio) == READ ?
+		   &cache->stats.read_miss : &cache->stats.write_miss);
+}
+
+static void issue_cache_bio(struct cache *cache, struct bio *bio,
+			    struct per_bio_data *pb,
+			    dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+	remap_to_cache_dirty(cache, bio, oblock, cblock);
+	issue(cache, bio);
+}
+
+static void process_bio(struct cache *cache, struct prealloc *structs,
+			struct bio *bio)
+{
+	int r;
+	bool release_cell = true;
+	dm_oblock_t block = get_bio_block(cache, bio);
+	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
+	struct policy_result lookup_result;
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+	bool discarded_block = is_discarded_oblock(cache, block);
+	bool passthrough = passthrough_mode(&cache->features);
+	bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+
+	/*
+	 * Check to see if that block is currently migrating.
+	 */
+	cell_prealloc = prealloc_get_cell(structs);
+	r = bio_detain(cache, block, bio, cell_prealloc,
+		       (cell_free_fn) prealloc_put_cell,
+		       structs, &new_ocell);
+	if (r > 0)
+		return;
+
+	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+		       bio, &lookup_result);
+
+	if (r == -EWOULDBLOCK)
+		/* migration has been denied */
+		lookup_result.op = POLICY_MISS;
+
+	switch (lookup_result.op) {
+	case POLICY_HIT:
+		if (passthrough) {
+			inc_miss_counter(cache, bio);
+
+			/*
+			 * Passthrough always maps to the origin,
+			 * invalidating any cache blocks that are written
+			 * to.
+			 */
+
+			if (bio_data_dir(bio) == WRITE) {
+				atomic_inc(&cache->stats.demotion);
+				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
+				release_cell = false;
+
+			} else {
+				/* FIXME: factor out issue_origin() */
+				pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+				remap_to_origin_clear_discard(cache, bio, block);
+				issue(cache, bio);
+			}
+		} else {
+			inc_hit_counter(cache, bio);
+
+			if (bio_data_dir(bio) == WRITE &&
+			    writethrough_mode(&cache->features) &&
+			    !is_dirty(cache, lookup_result.cblock)) {
+				pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+				issue(cache, bio);
+			} else
+				issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+		}
+
+		break;
+
+	case POLICY_MISS:
+		inc_miss_counter(cache, bio);
+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+		remap_to_origin_clear_discard(cache, bio, block);
+		issue(cache, bio);
+		break;
+
+	case POLICY_NEW:
+		atomic_inc(&cache->stats.promotion);
+		promote(cache, structs, block, lookup_result.cblock, new_ocell);
+		release_cell = false;
+		break;
+
+	case POLICY_REPLACE:
+		cell_prealloc = prealloc_get_cell(structs);
+		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
+			       (cell_free_fn) prealloc_put_cell,
+			       structs, &old_ocell);
+		if (r > 0) {
+			/*
+			 * We have to be careful to avoid lock inversion of
+			 * the cells.  So we back off, and wait for the
+			 * old_ocell to become free.
+			 */
+			policy_force_mapping(cache->policy, block,
+					     lookup_result.old_oblock);
+			atomic_inc(&cache->stats.cache_cell_clash);
+			break;
+		}
+		atomic_inc(&cache->stats.demotion);
+		atomic_inc(&cache->stats.promotion);
+
+		demote_then_promote(cache, structs, lookup_result.old_oblock,
+				    block, lookup_result.cblock,
+				    old_ocell, new_ocell);
+		release_cell = false;
+		break;
+
+	default:
+		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+			    (unsigned) lookup_result.op);
+		bio_io_error(bio);
+	}
+
+	if (release_cell)
+		cell_defer(cache, new_ocell, false);
+}
+
+static int need_commit_due_to_time(struct cache *cache)
+{
+	return jiffies < cache->last_commit_jiffies ||
+	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+static int commit_if_needed(struct cache *cache)
+{
+	int r = 0;
+
+	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
+	    dm_cache_changed_this_transaction(cache->cmd)) {
+		atomic_inc(&cache->stats.commit_count);
+		cache->commit_requested = false;
+		r = dm_cache_commit(cache->cmd, false);
+		cache->last_commit_jiffies = jiffies;
+	}
+
+	return r;
+}
+
+static void process_deferred_bios(struct cache *cache)
+{
+	unsigned long flags;
+	struct bio_list bios;
+	struct bio *bio;
+	struct prealloc structs;
+
+	memset(&structs, 0, sizeof(structs));
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_merge(&bios, &cache->deferred_bios);
+	bio_list_init(&cache->deferred_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	while (!bio_list_empty(&bios)) {
+		/*
+		 * If we've got no free migration structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (prealloc_data_structs(cache, &structs)) {
+			spin_lock_irqsave(&cache->lock, flags);
+			bio_list_merge(&cache->deferred_bios, &bios);
+			spin_unlock_irqrestore(&cache->lock, flags);
+			break;
+		}
+
+		bio = bio_list_pop(&bios);
+
+		if (bio->bi_rw & REQ_FLUSH)
+			process_flush_bio(cache, bio);
+		else if (bio->bi_rw & REQ_DISCARD)
+			process_discard_bio(cache, bio);
+		else
+			process_bio(cache, &structs, bio);
+	}
+
+	prealloc_free_structs(cache, &structs);
+}
+
+static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+{
+	unsigned long flags;
+	struct bio_list bios;
+	struct bio *bio;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_merge(&bios, &cache->deferred_flush_bios);
+	bio_list_init(&cache->deferred_flush_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+}
+
+static void process_deferred_writethrough_bios(struct cache *cache)
+{
+	unsigned long flags;
+	struct bio_list bios;
+	struct bio *bio;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+	bio_list_init(&cache->deferred_writethrough_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		generic_make_request(bio);
+}
+
+static void writeback_some_dirty_blocks(struct cache *cache)
+{
+	int r = 0;
+	dm_oblock_t oblock;
+	dm_cblock_t cblock;
+	struct prealloc structs;
+	struct dm_bio_prison_cell *old_ocell;
+
+	memset(&structs, 0, sizeof(structs));
+
+	while (spare_migration_bandwidth(cache)) {
+		if (prealloc_data_structs(cache, &structs))
+			break;
+
+		r = policy_writeback_work(cache->policy, &oblock, &cblock);
+		if (r)
+			break;
+
+		r = get_cell(cache, oblock, &structs, &old_ocell);
+		if (r) {
+			policy_set_dirty(cache->policy, oblock);
+			break;
+		}
+
+		writeback(cache, &structs, oblock, cblock, old_ocell);
+	}
+
+	prealloc_free_structs(cache, &structs);
+}
+
+/*----------------------------------------------------------------
+ * Invalidations.
+ * Dropping something from the cache *without* writing back.
+ *--------------------------------------------------------------*/
+
+static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
+{
+	int r = 0;
+	uint64_t begin = from_cblock(req->cblocks->begin);
+	uint64_t end = from_cblock(req->cblocks->end);
+
+	while (begin != end) {
+		r = policy_remove_cblock(cache->policy, to_cblock(begin));
+		if (!r) {
+			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
+			if (r)
+				break;
+
+		} else if (r == -ENODATA) {
+			/* harmless, already unmapped */
+			r = 0;
+
+		} else {
+			DMERR("policy_remove_cblock failed");
+			break;
+		}
+
+		begin++;
+        }
+
+	cache->commit_requested = true;
+
+	req->err = r;
+	atomic_set(&req->complete, 1);
+
+	wake_up(&req->result_wait);
+}
+
+static void process_invalidation_requests(struct cache *cache)
+{
+	struct list_head list;
+	struct invalidation_request *req, *tmp;
+
+	INIT_LIST_HEAD(&list);
+	spin_lock(&cache->invalidation_lock);
+	list_splice_init(&cache->invalidation_requests, &list);
+	spin_unlock(&cache->invalidation_lock);
+
+	list_for_each_entry_safe (req, tmp, &list, list)
+		process_invalidation_request(cache, req);
+}
+
+/*----------------------------------------------------------------
+ * Main worker loop
+ *--------------------------------------------------------------*/
+static bool is_quiescing(struct cache *cache)
+{
+	return atomic_read(&cache->quiescing);
+}
+
+static void ack_quiescing(struct cache *cache)
+{
+	if (is_quiescing(cache)) {
+		atomic_inc(&cache->quiescing_ack);
+		wake_up(&cache->quiescing_wait);
+	}
+}
+
+static void wait_for_quiescing_ack(struct cache *cache)
+{
+	wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
+}
+
+static void start_quiescing(struct cache *cache)
+{
+	atomic_inc(&cache->quiescing);
+	wait_for_quiescing_ack(cache);
+}
+
+static void stop_quiescing(struct cache *cache)
+{
+	atomic_set(&cache->quiescing, 0);
+	atomic_set(&cache->quiescing_ack, 0);
+}
+
+static void wait_for_migrations(struct cache *cache)
+{
+	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
+}
+
+static void stop_worker(struct cache *cache)
+{
+	cancel_delayed_work(&cache->waker);
+	flush_workqueue(cache->wq);
+}
+
+static void requeue_deferred_io(struct cache *cache)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	bio_list_merge(&bios, &cache->deferred_bios);
+	bio_list_init(&cache->deferred_bios);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_endio(bio, DM_ENDIO_REQUEUE);
+}
+
+static int more_work(struct cache *cache)
+{
+	if (is_quiescing(cache))
+		return !list_empty(&cache->quiesced_migrations) ||
+			!list_empty(&cache->completed_migrations) ||
+			!list_empty(&cache->need_commit_migrations);
+	else
+		return !bio_list_empty(&cache->deferred_bios) ||
+			!bio_list_empty(&cache->deferred_flush_bios) ||
+			!bio_list_empty(&cache->deferred_writethrough_bios) ||
+			!list_empty(&cache->quiesced_migrations) ||
+			!list_empty(&cache->completed_migrations) ||
+			!list_empty(&cache->need_commit_migrations) ||
+			cache->invalidate;
+}
+
+static void do_worker(struct work_struct *ws)
+{
+	struct cache *cache = container_of(ws, struct cache, worker);
+
+	do {
+		if (!is_quiescing(cache)) {
+			writeback_some_dirty_blocks(cache);
+			process_deferred_writethrough_bios(cache);
+			process_deferred_bios(cache);
+			process_invalidation_requests(cache);
+		}
+
+		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
+		process_migrations(cache, &cache->completed_migrations, complete_migration);
+
+		if (commit_if_needed(cache)) {
+			process_deferred_flush_bios(cache, false);
+
+			/*
+			 * FIXME: rollback metadata or just go into a
+			 * failure mode and error everything
+			 */
+		} else {
+			process_deferred_flush_bios(cache, true);
+			process_migrations(cache, &cache->need_commit_migrations,
+					   migration_success_post_commit);
+		}
+
+		ack_quiescing(cache);
+
+	} while (more_work(cache));
+}
+
+/*
+ * We want to commit periodically so that not too much
+ * unwritten metadata builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+	policy_tick(cache->policy);
+	wake_worker(cache);
+	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
+}
+
+/*----------------------------------------------------------------*/
+
+static int is_congested(struct dm_dev *dev, int bdi_bits)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+	return bdi_congested(&q->backing_dev_info, bdi_bits);
+}
+
+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+	struct cache *cache = container_of(cb, struct cache, callbacks);
+
+	return is_congested(cache->origin_dev, bdi_bits) ||
+		is_congested(cache->cache_dev, bdi_bits);
+}
+
+/*----------------------------------------------------------------
+ * Target methods
+ *--------------------------------------------------------------*/
+
+/*
+ * This function gets called on the error paths of the constructor, so we
+ * have to cope with a partially initialised struct.
+ */
+static void destroy(struct cache *cache)
+{
+	unsigned i;
+
+	if (cache->next_migration)
+		mempool_free(cache->next_migration, cache->migration_pool);
+
+	if (cache->migration_pool)
+		mempool_destroy(cache->migration_pool);
+
+	if (cache->all_io_ds)
+		dm_deferred_set_destroy(cache->all_io_ds);
+
+	if (cache->prison)
+		dm_bio_prison_destroy(cache->prison);
+
+	if (cache->wq)
+		destroy_workqueue(cache->wq);
+
+	if (cache->dirty_bitset)
+		free_bitset(cache->dirty_bitset);
+
+	if (cache->discard_bitset)
+		free_bitset(cache->discard_bitset);
+
+	if (cache->copier)
+		dm_kcopyd_client_destroy(cache->copier);
+
+	if (cache->cmd)
+		dm_cache_metadata_close(cache->cmd);
+
+	if (cache->metadata_dev)
+		dm_put_device(cache->ti, cache->metadata_dev);
+
+	if (cache->origin_dev)
+		dm_put_device(cache->ti, cache->origin_dev);
+
+	if (cache->cache_dev)
+		dm_put_device(cache->ti, cache->cache_dev);
+
+	if (cache->policy)
+		dm_cache_policy_destroy(cache->policy);
+
+	for (i = 0; i < cache->nr_ctr_args ; i++)
+		kfree(cache->ctr_args[i]);
+	kfree(cache->ctr_args);
+
+	kfree(cache);
+}
+
+static void cache_dtr(struct dm_target *ti)
+{
+	struct cache *cache = ti->private;
+
+	destroy(cache);
+}
+
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Construct a cache device mapping.
+ *
+ * cache <metadata dev> <cache dev> <origin dev> <block size>
+ *       <#feature args> [<feature arg>]*
+ *       <policy> <#policy args> [<policy arg>]*
+ *
+ * metadata dev    : fast device holding the persistent metadata
+ * cache dev	   : fast device holding cached data blocks
+ * origin dev	   : slow device holding original data blocks
+ * block size	   : cache unit size in sectors
+ *
+ * #feature args   : number of feature arguments passed
+ * feature args    : writethrough.  (The default is writeback.)
+ *
+ * policy	   : the replacement policy to use
+ * #policy args    : an even number of policy arguments corresponding
+ *		     to key/value pairs passed to the policy
+ * policy args	   : key/value pairs passed to the policy
+ *		     E.g. 'sequential_threshold 1024'
+ *		     See cache-policies.txt for details.
+ *
+ * Optional feature arguments are:
+ *   writethrough  : write through caching that prohibits cache block
+ *		     content from being different from origin block content.
+ *		     Without this argument, the default behaviour is to write
+ *		     back cache block contents later for performance reasons,
+ *		     so they may differ from the corresponding origin blocks.
+ */
+struct cache_args {
+	struct dm_target *ti;
+
+	struct dm_dev *metadata_dev;
+
+	struct dm_dev *cache_dev;
+	sector_t cache_sectors;
+
+	struct dm_dev *origin_dev;
+	sector_t origin_sectors;
+
+	uint32_t block_size;
+
+	const char *policy_name;
+	int policy_argc;
+	const char **policy_argv;
+
+	struct cache_features features;
+};
+
+static void destroy_cache_args(struct cache_args *ca)
+{
+	if (ca->metadata_dev)
+		dm_put_device(ca->ti, ca->metadata_dev);
+
+	if (ca->cache_dev)
+		dm_put_device(ca->ti, ca->cache_dev);
+
+	if (ca->origin_dev)
+		dm_put_device(ca->ti, ca->origin_dev);
+
+	kfree(ca);
+}
+
+static bool at_least_one_arg(struct dm_arg_set *as, char **error)
+{
+	if (!as->argc) {
+		*error = "Insufficient args";
+		return false;
+	}
+
+	return true;
+}
+
+static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
+			      char **error)
+{
+	int r;
+	sector_t metadata_dev_size;
+	char b[BDEVNAME_SIZE];
+
+	if (!at_least_one_arg(as, error))
+		return -EINVAL;
+
+	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+			  &ca->metadata_dev);
+	if (r) {
+		*error = "Error opening metadata device";
+		return r;
+	}
+
+	metadata_dev_size = get_dev_size(ca->metadata_dev);
+	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
+		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+		       bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
+
+	return 0;
+}
+
+static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
+			   char **error)
+{
+	int r;
+
+	if (!at_least_one_arg(as, error))
+		return -EINVAL;
+
+	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+			  &ca->cache_dev);
+	if (r) {
+		*error = "Error opening cache device";
+		return r;
+	}
+	ca->cache_sectors = get_dev_size(ca->cache_dev);
+
+	return 0;
+}
+
+static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
+			    char **error)
+{
+	int r;
+
+	if (!at_least_one_arg(as, error))
+		return -EINVAL;
+
+	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+			  &ca->origin_dev);
+	if (r) {
+		*error = "Error opening origin device";
+		return r;
+	}
+
+	ca->origin_sectors = get_dev_size(ca->origin_dev);
+	if (ca->ti->len > ca->origin_sectors) {
+		*error = "Device size larger than cached device";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
+			    char **error)
+{
+	unsigned long block_size;
+
+	if (!at_least_one_arg(as, error))
+		return -EINVAL;
+
+	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
+	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
+	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
+		*error = "Invalid data block size";
+		return -EINVAL;
+	}
+
+	if (block_size > ca->cache_sectors) {
+		*error = "Data block size is larger than the cache device";
+		return -EINVAL;
+	}
+
+	ca->block_size = block_size;
+
+	return 0;
+}
+
+static void init_features(struct cache_features *cf)
+{
+	cf->mode = CM_WRITE;
+	cf->io_mode = CM_IO_WRITEBACK;
+}
+
+static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
+			  char **error)
+{
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of cache feature arguments"},
+	};
+
+	int r;
+	unsigned argc;
+	const char *arg;
+	struct cache_features *cf = &ca->features;
+
+	init_features(cf);
+
+	r = dm_read_arg_group(_args, as, &argc, error);
+	if (r)
+		return -EINVAL;
+
+	while (argc--) {
+		arg = dm_shift_arg(as);
+
+		if (!strcasecmp(arg, "writeback"))
+			cf->io_mode = CM_IO_WRITEBACK;
+
+		else if (!strcasecmp(arg, "writethrough"))
+			cf->io_mode = CM_IO_WRITETHROUGH;
+
+		else if (!strcasecmp(arg, "passthrough"))
+			cf->io_mode = CM_IO_PASSTHROUGH;
+
+		else {
+			*error = "Unrecognised cache feature requested";
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
+			char **error)
+{
+	static struct dm_arg _args[] = {
+		{0, 1024, "Invalid number of policy arguments"},
+	};
+
+	int r;
+
+	if (!at_least_one_arg(as, error))
+		return -EINVAL;
+
+	ca->policy_name = dm_shift_arg(as);
+
+	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
+	if (r)
+		return -EINVAL;
+
+	ca->policy_argv = (const char **)as->argv;
+	dm_consume_args(as, ca->policy_argc);
+
+	return 0;
+}
+
+static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
+			    char **error)
+{
+	int r;
+	struct dm_arg_set as;
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = parse_metadata_dev(ca, &as, error);
+	if (r)
+		return r;
+
+	r = parse_cache_dev(ca, &as, error);
+	if (r)
+		return r;
+
+	r = parse_origin_dev(ca, &as, error);
+	if (r)
+		return r;
+
+	r = parse_block_size(ca, &as, error);
+	if (r)
+		return r;
+
+	r = parse_features(ca, &as, error);
+	if (r)
+		return r;
+
+	r = parse_policy(ca, &as, error);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct kmem_cache *migration_cache;
+
+#define NOT_CORE_OPTION 1
+
+static int process_config_option(struct cache *cache, const char *key, const char *value)
+{
+	unsigned long tmp;
+
+	if (!strcasecmp(key, "migration_threshold")) {
+		if (kstrtoul(value, 10, &tmp))
+			return -EINVAL;
+
+		cache->migration_threshold = tmp;
+		return 0;
+	}
+
+	return NOT_CORE_OPTION;
+}
+
+static int set_config_value(struct cache *cache, const char *key, const char *value)
+{
+	int r = process_config_option(cache, key, value);
+
+	if (r == NOT_CORE_OPTION)
+		r = policy_set_config_value(cache->policy, key, value);
+
+	if (r)
+		DMWARN("bad config value for %s: %s", key, value);
+
+	return r;
+}
+
+static int set_config_values(struct cache *cache, int argc, const char **argv)
+{
+	int r = 0;
+
+	if (argc & 1) {
+		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
+		return -EINVAL;
+	}
+
+	while (argc) {
+		r = set_config_value(cache, argv[0], argv[1]);
+		if (r)
+			break;
+
+		argc -= 2;
+		argv += 2;
+	}
+
+	return r;
+}
+
+static int create_cache_policy(struct cache *cache, struct cache_args *ca,
+			       char **error)
+{
+	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
+							   cache->cache_size,
+							   cache->origin_sectors,
+							   cache->sectors_per_block);
+	if (IS_ERR(p)) {
+		*error = "Error creating cache's policy";
+		return PTR_ERR(p);
+	}
+	cache->policy = p;
+
+	return 0;
+}
+
+#define DEFAULT_MIGRATION_THRESHOLD 2048
+
+static int cache_create(struct cache_args *ca, struct cache **result)
+{
+	int r = 0;
+	char **error = &ca->ti->error;
+	struct cache *cache;
+	struct dm_target *ti = ca->ti;
+	dm_block_t origin_blocks;
+	struct dm_cache_metadata *cmd;
+	bool may_format = ca->features.mode == CM_WRITE;
+
+	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+	if (!cache)
+		return -ENOMEM;
+
+	cache->ti = ca->ti;
+	ti->private = cache;
+	ti->num_flush_bios = 2;
+	ti->flush_supported = true;
+
+	ti->num_discard_bios = 1;
+	ti->discards_supported = true;
+	ti->discard_zeroes_data_unsupported = true;
+	/* Discard bios must be split on a block boundary */
+	ti->split_discard_bios = true;
+
+	cache->features = ca->features;
+	ti->per_bio_data_size = get_per_bio_data_size(cache);
+
+	cache->callbacks.congested_fn = cache_is_congested;
+	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
+
+	cache->metadata_dev = ca->metadata_dev;
+	cache->origin_dev = ca->origin_dev;
+	cache->cache_dev = ca->cache_dev;
+
+	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
+
+	/* FIXME: factor out this whole section */
+	origin_blocks = cache->origin_sectors = ca->origin_sectors;
+	origin_blocks = block_div(origin_blocks, ca->block_size);
+	cache->origin_blocks = to_oblock(origin_blocks);
+
+	cache->sectors_per_block = ca->block_size;
+	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
+		r = -EINVAL;
+		goto bad;
+	}
+
+	if (ca->block_size & (ca->block_size - 1)) {
+		dm_block_t cache_size = ca->cache_sectors;
+
+		cache->sectors_per_block_shift = -1;
+		cache_size = block_div(cache_size, ca->block_size);
+		cache->cache_size = to_cblock(cache_size);
+	} else {
+		cache->sectors_per_block_shift = __ffs(ca->block_size);
+		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
+	}
+
+	r = create_cache_policy(cache, ca, error);
+	if (r)
+		goto bad;
+
+	cache->policy_nr_args = ca->policy_argc;
+	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
+
+	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
+	if (r) {
+		*error = "Error setting cache policy's config values";
+		goto bad;
+	}
+
+	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
+				     ca->block_size, may_format,
+				     dm_cache_policy_get_hint_size(cache->policy));
+	if (IS_ERR(cmd)) {
+		*error = "Error creating metadata object";
+		r = PTR_ERR(cmd);
+		goto bad;
+	}
+	cache->cmd = cmd;
+
+	if (passthrough_mode(&cache->features)) {
+		bool all_clean;
+
+		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
+		if (r) {
+			*error = "dm_cache_metadata_all_clean() failed";
+			goto bad;
+		}
+
+		if (!all_clean) {
+			*error = "Cannot enter passthrough mode unless all blocks are clean";
+			r = -EINVAL;
+			goto bad;
+		}
+	}
+
+	spin_lock_init(&cache->lock);
+	bio_list_init(&cache->deferred_bios);
+	bio_list_init(&cache->deferred_flush_bios);
+	bio_list_init(&cache->deferred_writethrough_bios);
+	INIT_LIST_HEAD(&cache->quiesced_migrations);
+	INIT_LIST_HEAD(&cache->completed_migrations);
+	INIT_LIST_HEAD(&cache->need_commit_migrations);
+	atomic_set(&cache->nr_migrations, 0);
+	init_waitqueue_head(&cache->migration_wait);
+
+	init_waitqueue_head(&cache->quiescing_wait);
+	atomic_set(&cache->quiescing, 0);
+	atomic_set(&cache->quiescing_ack, 0);
+
+	r = -ENOMEM;
+	atomic_set(&cache->nr_dirty, 0);
+	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
+	if (!cache->dirty_bitset) {
+		*error = "could not allocate dirty bitset";
+		goto bad;
+	}
+	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
+
+	cache->discard_nr_blocks = cache->origin_blocks;
+	cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
+	if (!cache->discard_bitset) {
+		*error = "could not allocate discard bitset";
+		goto bad;
+	}
+	clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
+
+	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+	if (IS_ERR(cache->copier)) {
+		*error = "could not create kcopyd client";
+		r = PTR_ERR(cache->copier);
+		goto bad;
+	}
+
+	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+	if (!cache->wq) {
+		*error = "could not create workqueue for metadata object";
+		goto bad;
+	}
+	INIT_WORK(&cache->worker, do_worker);
+	INIT_DELAYED_WORK(&cache->waker, do_waker);
+	cache->last_commit_jiffies = jiffies;
+
+	cache->prison = dm_bio_prison_create(PRISON_CELLS);
+	if (!cache->prison) {
+		*error = "could not create bio prison";
+		goto bad;
+	}
+
+	cache->all_io_ds = dm_deferred_set_create();
+	if (!cache->all_io_ds) {
+		*error = "could not create all_io deferred set";
+		goto bad;
+	}
+
+	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
+							 migration_cache);
+	if (!cache->migration_pool) {
+		*error = "Error creating cache's migration mempool";
+		goto bad;
+	}
+
+	cache->next_migration = NULL;
+
+	cache->need_tick_bio = true;
+	cache->sized = false;
+	cache->invalidate = false;
+	cache->commit_requested = false;
+	cache->loaded_mappings = false;
+	cache->loaded_discards = false;
+
+	load_stats(cache);
+
+	atomic_set(&cache->stats.demotion, 0);
+	atomic_set(&cache->stats.promotion, 0);
+	atomic_set(&cache->stats.copies_avoided, 0);
+	atomic_set(&cache->stats.cache_cell_clash, 0);
+	atomic_set(&cache->stats.commit_count, 0);
+	atomic_set(&cache->stats.discard_count, 0);
+
+	spin_lock_init(&cache->invalidation_lock);
+	INIT_LIST_HEAD(&cache->invalidation_requests);
+
+	*result = cache;
+	return 0;
+
+bad:
+	destroy(cache);
+	return r;
+}
+
+static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
+{
+	unsigned i;
+	const char **copy;
+
+	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+	for (i = 0; i < argc; i++) {
+		copy[i] = kstrdup(argv[i], GFP_KERNEL);
+		if (!copy[i]) {
+			while (i--)
+				kfree(copy[i]);
+			kfree(copy);
+			return -ENOMEM;
+		}
+	}
+
+	cache->nr_ctr_args = argc;
+	cache->ctr_args = copy;
+
+	return 0;
+}
+
+static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r = -EINVAL;
+	struct cache_args *ca;
+	struct cache *cache = NULL;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	if (!ca) {
+		ti->error = "Error allocating memory for cache";
+		return -ENOMEM;
+	}
+	ca->ti = ti;
+
+	r = parse_cache_args(ca, argc, argv, &ti->error);
+	if (r)
+		goto out;
+
+	r = cache_create(ca, &cache);
+	if (r)
+		goto out;
+
+	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
+	if (r) {
+		destroy(cache);
+		goto out;
+	}
+
+	ti->private = cache;
+
+out:
+	destroy_cache_args(ca);
+	return r;
+}
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+	struct cache *cache = ti->private;
+
+	int r;
+	dm_oblock_t block = get_bio_block(cache, bio);
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	bool can_migrate = false;
+	bool discarded_block;
+	struct dm_bio_prison_cell *cell;
+	struct policy_result lookup_result;
+	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+
+	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
+		/*
+		 * This can only occur if the io goes to a partial block at
+		 * the end of the origin device.  We don't cache these.
+		 * Just remap to the origin and carry on.
+		 */
+		remap_to_origin(cache, bio);
+		return DM_MAPIO_REMAPPED;
+	}
+
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+		defer_bio(cache, bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	/*
+	 * Check to see if that block is currently migrating.
+	 */
+	cell = alloc_prison_cell(cache);
+	if (!cell) {
+		defer_bio(cache, bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	r = bio_detain(cache, block, bio, cell,
+		       (cell_free_fn) free_prison_cell,
+		       cache, &cell);
+	if (r) {
+		if (r < 0)
+			defer_bio(cache, bio);
+
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	discarded_block = is_discarded_oblock(cache, block);
+
+	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
+		       bio, &lookup_result);
+	if (r == -EWOULDBLOCK) {
+		cell_defer(cache, cell, true);
+		return DM_MAPIO_SUBMITTED;
+
+	} else if (r) {
+		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+		bio_io_error(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	r = DM_MAPIO_REMAPPED;
+	switch (lookup_result.op) {
+	case POLICY_HIT:
+		if (passthrough_mode(&cache->features)) {
+			if (bio_data_dir(bio) == WRITE) {
+				/*
+				 * We need to invalidate this block, so
+				 * defer for the worker thread.
+				 */
+				cell_defer(cache, cell, true);
+				r = DM_MAPIO_SUBMITTED;
+
+			} else {
+				pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+				inc_miss_counter(cache, bio);
+				remap_to_origin_clear_discard(cache, bio, block);
+
+				cell_defer(cache, cell, false);
+			}
+
+		} else {
+			inc_hit_counter(cache, bio);
+			pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+			    !is_dirty(cache, lookup_result.cblock))
+				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+			else
+				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+
+			cell_defer(cache, cell, false);
+		}
+		break;
+
+	case POLICY_MISS:
+		inc_miss_counter(cache, bio);
+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+		if (pb->req_nr != 0) {
+			/*
+			 * This is a duplicate writethrough io that is no
+			 * longer needed because the block has been demoted.
+			 */
+			bio_endio(bio, 0);
+			cell_defer(cache, cell, false);
+			return DM_MAPIO_SUBMITTED;
+		} else {
+			remap_to_origin_clear_discard(cache, bio, block);
+			cell_defer(cache, cell, false);
+		}
+		break;
+
+	default:
+		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+			    (unsigned) lookup_result.op);
+		bio_io_error(bio);
+		r = DM_MAPIO_SUBMITTED;
+	}
+
+	return r;
+}
+
+static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+	struct cache *cache = ti->private;
+	unsigned long flags;
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	if (pb->tick) {
+		policy_tick(cache->policy);
+
+		spin_lock_irqsave(&cache->lock, flags);
+		cache->need_tick_bio = true;
+		spin_unlock_irqrestore(&cache->lock, flags);
+	}
+
+	check_for_quiesced_migrations(cache, pb);
+
+	return 0;
+}
+
+static int write_dirty_bitset(struct cache *cache)
+{
+	unsigned i, r;
+
+	for (i = 0; i < from_cblock(cache->cache_size); i++) {
+		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
+				       is_dirty(cache, to_cblock(i)));
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int write_discard_bitset(struct cache *cache)
+{
+	unsigned i, r;
+
+	r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
+					   cache->origin_blocks);
+	if (r) {
+		DMERR("could not resize on-disk discard bitset");
+		return r;
+	}
+
+	for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
+		r = dm_cache_set_discard(cache->cmd, to_oblock(i),
+					 is_discarded(cache, to_oblock(i)));
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+/*
+ * returns true on success
+ */
+static bool sync_metadata(struct cache *cache)
+{
+	int r1, r2, r3, r4;
+
+	r1 = write_dirty_bitset(cache);
+	if (r1)
+		DMERR("could not write dirty bitset");
+
+	r2 = write_discard_bitset(cache);
+	if (r2)
+		DMERR("could not write discard bitset");
+
+	save_stats(cache);
+
+	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+	if (r3)
+		DMERR("could not write hints");
+
+	/*
+	 * If writing the above metadata failed, we still commit, but don't
+	 * set the clean shutdown flag.  This will effectively force every
+	 * dirty bit to be set on reload.
+	 */
+	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+	if (r4)
+		DMERR("could not write cache metadata.  Data loss may occur.");
+
+	return !r1 && !r2 && !r3 && !r4;
+}
+
+static void cache_postsuspend(struct dm_target *ti)
+{
+	struct cache *cache = ti->private;
+
+	start_quiescing(cache);
+	wait_for_migrations(cache);
+	stop_worker(cache);
+	requeue_deferred_io(cache);
+	stop_quiescing(cache);
+
+	(void) sync_metadata(cache);
+}
+
+static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
+			bool dirty, uint32_t hint, bool hint_valid)
+{
+	int r;
+	struct cache *cache = context;
+
+	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+	if (r)
+		return r;
+
+	if (dirty)
+		set_dirty(cache, oblock, cblock);
+	else
+		clear_dirty(cache, oblock, cblock);
+
+	return 0;
+}
+
+static int load_discard(void *context, sector_t discard_block_size,
+			dm_oblock_t oblock, bool discard)
+{
+	struct cache *cache = context;
+
+	if (discard)
+		set_discard(cache, oblock);
+	else
+		clear_discard(cache, oblock);
+
+	return 0;
+}
+
+static dm_cblock_t get_cache_dev_size(struct cache *cache)
+{
+	sector_t size = get_dev_size(cache->cache_dev);
+	(void) sector_div(size, cache->sectors_per_block);
+	return to_cblock(size);
+}
+
+static bool can_resize(struct cache *cache, dm_cblock_t new_size)
+{
+	if (from_cblock(new_size) > from_cblock(cache->cache_size))
+		return true;
+
+	/*
+	 * We can't drop a dirty block when shrinking the cache.
+	 */
+	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
+		new_size = to_cblock(from_cblock(new_size) + 1);
+		if (is_dirty(cache, new_size)) {
+			DMERR("unable to shrink cache; cache block %llu is dirty",
+			      (unsigned long long) from_cblock(new_size));
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
+{
+	int r;
+
+	r = dm_cache_resize(cache->cmd, new_size);
+	if (r) {
+		DMERR("could not resize cache metadata");
+		return r;
+	}
+
+	cache->cache_size = new_size;
+
+	return 0;
+}
+
+static int cache_preresume(struct dm_target *ti)
+{
+	int r = 0;
+	struct cache *cache = ti->private;
+	dm_cblock_t csize = get_cache_dev_size(cache);
+
+	/*
+	 * Check to see if the cache has resized.
+	 */
+	if (!cache->sized) {
+		r = resize_cache_dev(cache, csize);
+		if (r)
+			return r;
+
+		cache->sized = true;
+
+	} else if (csize != cache->cache_size) {
+		if (!can_resize(cache, csize))
+			return -EINVAL;
+
+		r = resize_cache_dev(cache, csize);
+		if (r)
+			return r;
+	}
+
+	if (!cache->loaded_mappings) {
+		r = dm_cache_load_mappings(cache->cmd, cache->policy,
+					   load_mapping, cache);
+		if (r) {
+			DMERR("could not load cache mappings");
+			return r;
+		}
+
+		cache->loaded_mappings = true;
+	}
+
+	if (!cache->loaded_discards) {
+		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
+		if (r) {
+			DMERR("could not load origin discards");
+			return r;
+		}
+
+		cache->loaded_discards = true;
+	}
+
+	return r;
+}
+
+static void cache_resume(struct dm_target *ti)
+{
+	struct cache *cache = ti->private;
+
+	cache->need_tick_bio = true;
+	do_waker(&cache->waker.work);
+}
+
+/*
+ * Status format:
+ *
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <cache block size> <#used cache blocks>/<#total cache blocks>
+ * <#read hits> <#read misses> <#write hits> <#write misses>
+ * <#demotions> <#promotions> <#dirty>
+ * <#features> <features>*
+ * <#core args> <core args>
+ * <policy name> <#policy args> <policy args>*
+ */
+static void cache_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	int r = 0;
+	unsigned i;
+	ssize_t sz = 0;
+	dm_block_t nr_free_blocks_metadata = 0;
+	dm_block_t nr_blocks_metadata = 0;
+	char buf[BDEVNAME_SIZE];
+	struct cache *cache = ti->private;
+	dm_cblock_t residency;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		/* Commit to ensure statistics aren't out-of-date */
+		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
+			r = dm_cache_commit(cache->cmd, false);
+			if (r)
+				DMERR("could not commit metadata for accurate status");
+		}
+
+		r = dm_cache_get_free_metadata_block_count(cache->cmd,
+							   &nr_free_blocks_metadata);
+		if (r) {
+			DMERR("could not get metadata free block count");
+			goto err;
+		}
+
+		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
+		if (r) {
+			DMERR("could not get metadata device size");
+			goto err;
+		}
+
+		residency = policy_residency(cache->policy);
+
+		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
+		       (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+		       (unsigned long long)nr_blocks_metadata,
+		       cache->sectors_per_block,
+		       (unsigned long long) from_cblock(residency),
+		       (unsigned long long) from_cblock(cache->cache_size),
+		       (unsigned) atomic_read(&cache->stats.read_hit),
+		       (unsigned) atomic_read(&cache->stats.read_miss),
+		       (unsigned) atomic_read(&cache->stats.write_hit),
+		       (unsigned) atomic_read(&cache->stats.write_miss),
+		       (unsigned) atomic_read(&cache->stats.demotion),
+		       (unsigned) atomic_read(&cache->stats.promotion),
+		       (unsigned long) atomic_read(&cache->nr_dirty));
+
+		if (writethrough_mode(&cache->features))
+			DMEMIT("1 writethrough ");
+
+		else if (passthrough_mode(&cache->features))
+			DMEMIT("1 passthrough ");
+
+		else if (writeback_mode(&cache->features))
+			DMEMIT("1 writeback ");
+
+		else {
+			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+			goto err;
+		}
+
+		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
+
+		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
+		if (sz < maxlen) {
+			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+			if (r)
+				DMERR("policy_emit_config_values returned %d", r);
+		}
+
+		break;
+
+	case STATUSTYPE_TABLE:
+		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
+		DMEMIT("%s ", buf);
+		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
+		DMEMIT("%s ", buf);
+		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
+		DMEMIT("%s", buf);
+
+		for (i = 0; i < cache->nr_ctr_args - 1; i++)
+			DMEMIT(" %s", cache->ctr_args[i]);
+		if (cache->nr_ctr_args)
+			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
+	}
+
+	return;
+
+err:
+	DMEMIT("Error");
+}
+
+/*
+ * A cache block range can take two forms:
+ *
+ * i) A single cblock, eg. '3456'
+ * ii) A begin and end cblock with dots between, eg. 123-234
+ */
+static int parse_cblock_range(struct cache *cache, const char *str,
+			      struct cblock_range *result)
+{
+	char dummy;
+	uint64_t b, e;
+	int r;
+
+	/*
+	 * Try and parse form (ii) first.
+	 */
+	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
+	if (r < 0)
+		return r;
+
+	if (r == 2) {
+		result->begin = to_cblock(b);
+		result->end = to_cblock(e);
+		return 0;
+	}
+
+	/*
+	 * That didn't work, try form (i).
+	 */
+	r = sscanf(str, "%llu%c", &b, &dummy);
+	if (r < 0)
+		return r;
+
+	if (r == 1) {
+		result->begin = to_cblock(b);
+		result->end = to_cblock(from_cblock(result->begin) + 1u);
+		return 0;
+	}
+
+	DMERR("invalid cblock range '%s'", str);
+	return -EINVAL;
+}
+
+static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
+{
+	uint64_t b = from_cblock(range->begin);
+	uint64_t e = from_cblock(range->end);
+	uint64_t n = from_cblock(cache->cache_size);
+
+	if (b >= n) {
+		DMERR("begin cblock out of range: %llu >= %llu", b, n);
+		return -EINVAL;
+	}
+
+	if (e > n) {
+		DMERR("end cblock out of range: %llu > %llu", e, n);
+		return -EINVAL;
+	}
+
+	if (b >= e) {
+		DMERR("invalid cblock range: %llu >= %llu", b, e);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int request_invalidation(struct cache *cache, struct cblock_range *range)
+{
+	struct invalidation_request req;
+
+	INIT_LIST_HEAD(&req.list);
+	req.cblocks = range;
+	atomic_set(&req.complete, 0);
+	req.err = 0;
+	init_waitqueue_head(&req.result_wait);
+
+	spin_lock(&cache->invalidation_lock);
+	list_add(&req.list, &cache->invalidation_requests);
+	spin_unlock(&cache->invalidation_lock);
+	wake_worker(cache);
+
+	wait_event(req.result_wait, atomic_read(&req.complete));
+	return req.err;
+}
+
+static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
+					      const char **cblock_ranges)
+{
+	int r = 0;
+	unsigned i;
+	struct cblock_range range;
+
+	if (!passthrough_mode(&cache->features)) {
+		DMERR("cache has to be in passthrough mode for invalidation");
+		return -EPERM;
+	}
+
+	for (i = 0; i < count; i++) {
+		r = parse_cblock_range(cache, cblock_ranges[i], &range);
+		if (r)
+			break;
+
+		r = validate_cblock_range(cache, &range);
+		if (r)
+			break;
+
+		/*
+		 * Pass begin and end origin blocks to the worker and wake it.
+		 */
+		r = request_invalidation(cache, &range);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
+/*
+ * Supports
+ *	"<key> <value>"
+ * and
+ *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
+ *
+ * The key migration_threshold is supported by the cache target core.
+ */
+static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct cache *cache = ti->private;
+
+	if (!argc)
+		return -EINVAL;
+
+	if (!strcasecmp(argv[0], "invalidate_cblocks"))
+		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
+
+	if (argc != 2)
+		return -EINVAL;
+
+	return set_config_value(cache, argv[0], argv[1]);
+}
+
+static int cache_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	int r = 0;
+	struct cache *cache = ti->private;
+
+	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
+	if (!r)
+		r = fn(ti, cache->origin_dev, 0, ti->len, data);
+
+	return r;
+}
+
+/*
+ * We assume I/O is going to the origin (which is the volume
+ * more likely to have restrictions e.g. by being striped).
+ * (Looking up the exact location of the data would be expensive
+ * and could always be out of date by the time the bio is submitted.)
+ */
+static int cache_bvec_merge(struct dm_target *ti,
+			    struct bvec_merge_data *bvm,
+			    struct bio_vec *biovec, int max_size)
+{
+	struct cache *cache = ti->private;
+	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = cache->origin_dev->bdev;
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
+{
+	/*
+	 * FIXME: these limits may be incompatible with the cache device
+	 */
+	limits->max_discard_sectors = cache->sectors_per_block;
+	limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
+}
+
+static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct cache *cache = ti->private;
+	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+	/*
+	 * If the system-determined stacked limits are compatible with the
+	 * cache's blocksize (io_opt is a factor) do not override them.
+	 */
+	if (io_opt_sectors < cache->sectors_per_block ||
+	    do_div(io_opt_sectors, cache->sectors_per_block)) {
+		blk_limits_io_min(limits, 0);
+		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+	}
+	set_discard_limits(cache, limits);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct target_type cache_target = {
+	.name = "cache",
+	.version = {1, 4, 0},
+	.module = THIS_MODULE,
+	.ctr = cache_ctr,
+	.dtr = cache_dtr,
+	.map = cache_map,
+	.end_io = cache_end_io,
+	.postsuspend = cache_postsuspend,
+	.preresume = cache_preresume,
+	.resume = cache_resume,
+	.status = cache_status,
+	.message = cache_message,
+	.iterate_devices = cache_iterate_devices,
+	.merge = cache_bvec_merge,
+	.io_hints = cache_io_hints,
+};
+
+static int __init dm_cache_init(void)
+{
+	int r;
+
+	r = dm_register_target(&cache_target);
+	if (r) {
+		DMERR("cache target registration failed: %d", r);
+		return r;
+	}
+
+	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
+	if (!migration_cache) {
+		dm_unregister_target(&cache_target);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __exit dm_cache_exit(void)
+{
+	dm_unregister_target(&cache_target);
+	kmem_cache_destroy(migration_cache);
+}
+
+module_init(dm_cache_init);
+module_exit(dm_cache_exit);
+
+MODULE_DESCRIPTION(DM_NAME " cache target");
+MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bbf459bca61..4cba2d808af 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,8 @@
 /*
- * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ * Copyright (C) 2003 Jana Saout <jana@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
  * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
  *
  * This file is released under the GPL.
  */
@@ -18,7 +19,6 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
-#include <linux/percpu.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -38,12 +38,11 @@ struct convert_context {
 	struct completion restart;
 	struct bio *bio_in;
 	struct bio *bio_out;
-	unsigned int offset_in;
-	unsigned int offset_out;
-	unsigned int idx_in;
-	unsigned int idx_out;
+	struct bvec_iter iter_in;
+	struct bvec_iter iter_out;
 	sector_t cc_sector;
 	atomic_t cc_pending;
+	struct ablkcipher_request *req;
 };
 
 /*
@@ -98,6 +97,13 @@ struct iv_lmk_private {
 	u8 *seed;
 };
 
+#define TCW_WHITENING_SIZE 16
+struct iv_tcw_private {
+	struct crypto_shash *crc32_tfm;
+	u8 *iv_seed;
+	u8 *whitening;
+};
+
 /*
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
@@ -105,15 +111,7 @@ struct iv_lmk_private {
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
 
 /*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
-	struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
  */
 struct crypt_config {
 	struct dm_dev *dev;
@@ -139,16 +137,11 @@ struct crypt_config {
 		struct iv_essiv_private essiv;
 		struct iv_benbi_private benbi;
 		struct iv_lmk_private lmk;
+		struct iv_tcw_private tcw;
 	} iv_gen_private;
 	sector_t iv_offset;
 	unsigned int iv_size;
 
-	/*
-	 * Duplicated per cpu state. Access through
-	 * per_cpu_ptr() only.
-	 */
-	struct crypt_cpu __percpu *cpu;
-
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
 	struct crypto_ablkcipher **tfms;
@@ -171,7 +164,8 @@ struct crypt_config {
 
 	unsigned long flags;
 	unsigned int key_size;
-	unsigned int key_parts;
+	unsigned int key_parts;      /* independent parts in key buffer */
+	unsigned int key_extra_size; /* additional keys length */
 	u8 key[0];
 };
 
@@ -184,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
-	return this_cpu_ptr(cc->cpu);
-}
-
 /*
  * Use this to access cipher attributes that are the same for each CPU.
  */
@@ -230,6 +219,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
  *         version 3: the same as version 2 with additional IV seed
  *                   (it uses 65 keys, last key is used as IV seed)
  *
+ * tcw:  Compatible implementation of the block chaining mode used
+ *       by the TrueCrypt device encryption system (prior to version 4.1).
+ *       For more info see: http://www.truecrypt.org
+ *       It operates on full 512 byte sectors and uses CBC
+ *       with an IV derived from initial key and the sector number.
+ *       In addition, whitening value is applied on every sector, whitening
+ *       is calculated from initial key, sector number and mixed using CRC32.
+ *       Note that this encryption scheme is vulnerable to watermarking attacks
+ *       and should be used for old compatible containers access only.
+ *
  * plumb: unimplemented, see:
  * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
@@ -530,7 +529,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
 		char ctx[crypto_shash_descsize(lmk->hash_tfm)];
 	} sdesc;
 	struct md5_state md5state;
-	u32 buf[4];
+	__le32 buf[4];
 	int i, r;
 
 	sdesc.desc.tfm = lmk->hash_tfm;
@@ -608,6 +607,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
 	return r;
 }
 
+static void crypt_iv_tcw_dtr(struct crypt_config *cc)
+{
+	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+	kzfree(tcw->iv_seed);
+	tcw->iv_seed = NULL;
+	kzfree(tcw->whitening);
+	tcw->whitening = NULL;
+
+	if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
+		crypto_free_shash(tcw->crc32_tfm);
+	tcw->crc32_tfm = NULL;
+}
+
+static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
+			    const char *opts)
+{
+	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+	if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
+		ti->error = "Wrong key size for TCW";
+		return -EINVAL;
+	}
+
+	tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
+	if (IS_ERR(tcw->crc32_tfm)) {
+		ti->error = "Error initializing CRC32 in TCW";
+		return PTR_ERR(tcw->crc32_tfm);
+	}
+
+	tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
+	tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
+	if (!tcw->iv_seed || !tcw->whitening) {
+		crypt_iv_tcw_dtr(cc);
+		ti->error = "Error allocating seed storage in TCW";
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int crypt_iv_tcw_init(struct crypt_config *cc)
+{
+	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+	int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE;
+
+	memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size);
+	memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size],
+	       TCW_WHITENING_SIZE);
+
+	return 0;
+}
+
+static int crypt_iv_tcw_wipe(struct crypt_config *cc)
+{
+	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+	memset(tcw->iv_seed, 0, cc->iv_size);
+	memset(tcw->whitening, 0, TCW_WHITENING_SIZE);
+
+	return 0;
+}
+
+static int crypt_iv_tcw_whitening(struct crypt_config *cc,
+				  struct dm_crypt_request *dmreq,
+				  u8 *data)
+{
+	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+	u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+	u8 buf[TCW_WHITENING_SIZE];
+	struct {
+		struct shash_desc desc;
+		char ctx[crypto_shash_descsize(tcw->crc32_tfm)];
+	} sdesc;
+	int i, r;
+
+	/* xor whitening with sector number */
+	memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
+	crypto_xor(buf, (u8 *)&sector, 8);
+	crypto_xor(&buf[8], (u8 *)&sector, 8);
+
+	/* calculate crc32 for every 32bit part and xor it */
+	sdesc.desc.tfm = tcw->crc32_tfm;
+	sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	for (i = 0; i < 4; i++) {
+		r = crypto_shash_init(&sdesc.desc);
+		if (r)
+			goto out;
+		r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4);
+		if (r)
+			goto out;
+		r = crypto_shash_final(&sdesc.desc, &buf[i * 4]);
+		if (r)
+			goto out;
+	}
+	crypto_xor(&buf[0], &buf[12], 4);
+	crypto_xor(&buf[4], &buf[8], 4);
+
+	/* apply whitening (8 bytes) to whole sector */
+	for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
+		crypto_xor(data + i * 8, buf, 8);
+out:
+	memset(buf, 0, sizeof(buf));
+	return r;
+}
+
+static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
+			    struct dm_crypt_request *dmreq)
+{
+	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+	u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+	u8 *src;
+	int r = 0;
+
+	/* Remove whitening from ciphertext */
+	if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+		src = kmap_atomic(sg_page(&dmreq->sg_in));
+		r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+		kunmap_atomic(src);
+	}
+
+	/* Calculate IV */
+	memcpy(iv, tcw->iv_seed, cc->iv_size);
+	crypto_xor(iv, (u8 *)&sector, 8);
+	if (cc->iv_size > 8)
+		crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+
+	return r;
+}
+
+static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
+			     struct dm_crypt_request *dmreq)
+{
+	u8 *dst;
+	int r;
+
+	if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+		return 0;
+
+	/* Apply whitening on ciphertext */
+	dst = kmap_atomic(sg_page(&dmreq->sg_out));
+	r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+	kunmap_atomic(dst);
+
+	return r;
+}
+
 static struct crypt_iv_operations crypt_iv_plain_ops = {
 	.generator = crypt_iv_plain_gen
 };
@@ -643,6 +789,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = {
 	.post	   = crypt_iv_lmk_post
 };
 
+static struct crypt_iv_operations crypt_iv_tcw_ops = {
+	.ctr	   = crypt_iv_tcw_ctr,
+	.dtr	   = crypt_iv_tcw_dtr,
+	.init	   = crypt_iv_tcw_init,
+	.wipe	   = crypt_iv_tcw_wipe,
+	.generator = crypt_iv_tcw_gen,
+	.post	   = crypt_iv_tcw_post
+};
+
 static void crypt_convert_init(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct bio *bio_out, struct bio *bio_in,
@@ -650,10 +805,10 @@ static void crypt_convert_init(struct crypt_config *cc,
 {
 	ctx->bio_in = bio_in;
 	ctx->bio_out = bio_out;
-	ctx->offset_in = 0;
-	ctx->offset_out = 0;
-	ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
-	ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
+	if (bio_in)
+		ctx->iter_in = bio_in->bi_iter;
+	if (bio_out)
+		ctx->iter_out = bio_out->bi_iter;
 	ctx->cc_sector = sector + cc->iv_offset;
 	init_completion(&ctx->restart);
 }
@@ -681,8 +836,8 @@ static int crypt_convert_block(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct ablkcipher_request *req)
 {
-	struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
-	struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
+	struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
+	struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
 	struct dm_crypt_request *dmreq;
 	u8 *iv;
 	int r;
@@ -693,24 +848,15 @@ static int crypt_convert_block(struct crypt_config *cc,
 	dmreq->iv_sector = ctx->cc_sector;
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
-	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
-		    bv_in->bv_offset + ctx->offset_in);
+	sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
+		    bv_in.bv_offset);
 
 	sg_init_table(&dmreq->sg_out, 1);
-	sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT,
-		    bv_out->bv_offset + ctx->offset_out);
-
-	ctx->offset_in += 1 << SECTOR_SHIFT;
-	if (ctx->offset_in >= bv_in->bv_len) {
-		ctx->offset_in = 0;
-		ctx->idx_in++;
-	}
+	sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
+		    bv_out.bv_offset);
 
-	ctx->offset_out += 1 << SECTOR_SHIFT;
-	if (ctx->offset_out >= bv_out->bv_len) {
-		ctx->offset_out = 0;
-		ctx->idx_out++;
-	}
+	bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
+	bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
 
 	if (cc->iv_gen_ops) {
 		r = cc->iv_gen_ops->generator(cc, iv, dmreq);
@@ -738,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
-	if (!this_cc->req)
-		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	if (!ctx->req)
+		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
-	ablkcipher_request_set_callback(this_cc->req,
+	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
 /*
@@ -756,28 +901,26 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->cc_pending, 1);
 
-	while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
-	      ctx->idx_out < ctx->bio_out->bi_vcnt) {
+	while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
 
 		crypt_alloc_req(cc, ctx);
 
 		atomic_inc(&ctx->cc_pending);
 
-		r = crypt_convert_block(cc, ctx, this_cc->req);
+		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
 		/* async */
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
-			INIT_COMPLETION(ctx->restart);
+			reinit_completion(&ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			this_cc->req = NULL;
+			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
 
@@ -845,7 +988,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 		size -= len;
 	}
 
-	if (!clone->bi_size) {
+	if (!clone->bi_iter.bi_size) {
 		bio_put(clone);
 		return NULL;
 	}
@@ -858,8 +1001,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 	unsigned int i;
 	struct bio_vec *bv;
 
-	for (i = 0; i < clone->bi_vcnt; i++) {
-		bv = bio_iovec_idx(clone, i);
+	bio_for_each_segment_all(bv, clone, i) {
 		BUG_ON(!bv->bv_page);
 		mempool_free(bv->bv_page, cc->page_pool);
 		bv->bv_page = NULL;
@@ -877,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
+	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 
 	return io;
@@ -902,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
+	if (io->ctx.req)
+		mempool_free(io->ctx.req, cc->req_pool);
 	mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
@@ -986,7 +1131,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	crypt_inc_pending(io);
 
 	clone_init(io, clone);
-	clone->bi_sector = cc->start + io->sector;
+	clone->bi_iter.bi_sector = cc->start + io->sector;
 
 	generic_make_request(clone);
 	return 0;
@@ -1032,9 +1177,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	}
 
 	/* crypt_convert should have filled the clone bio */
-	BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
+	BUG_ON(io->ctx.iter_out.bi_size);
 
-	clone->bi_sector = cc->start + io->sector;
+	clone->bi_iter.bi_sector = cc->start + io->sector;
 
 	if (async)
 		kcryptd_queue_io(io);
@@ -1049,7 +1194,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	struct dm_crypt_io *new_io;
 	int crypt_finished;
 	unsigned out_of_pages = 0;
-	unsigned remaining = io->base_bio->bi_size;
+	unsigned remaining = io->base_bio->bi_iter.bi_size;
 	sector_t sector = io->sector;
 	int r;
 
@@ -1071,9 +1216,9 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		}
 
 		io->ctx.bio_out = clone;
-		io->ctx.idx_out = 0;
+		io->ctx.iter_out = clone->bi_iter;
 
-		remaining -= clone->bi_size;
+		remaining -= clone->bi_iter.bi_size;
 		sector += bio_sectors(clone);
 
 		crypt_inc_pending(io);
@@ -1115,8 +1260,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 			crypt_inc_pending(new_io);
 			crypt_convert_init(cc, &new_io->ctx, NULL,
 					   io->base_bio, sector);
-			new_io->ctx.idx_in = io->ctx.idx_in;
-			new_io->ctx.offset_in = io->ctx.offset_in;
+			new_io->ctx.iter_in = io->ctx.iter_in;
 
 			/*
 			 * Fragments after the first use the base_io
@@ -1234,20 +1378,6 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
 	return 0;
 }
 
-/*
- * Encode key into its hex representation
- */
-static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
-{
-	unsigned int i;
-
-	for (i = 0; i < size; i++) {
-		sprintf(hex, "%02x", *key);
-		hex += 2;
-		key++;
-	}
-}
-
 static void crypt_free_tfms(struct crypt_config *cc)
 {
 	unsigned i;
@@ -1289,9 +1419,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 
 static int crypt_setkey_allcpus(struct crypt_config *cc)
 {
-	unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
+	unsigned subkey_size;
 	int err = 0, i, r;
 
+	/* Ignore extra keys (which are used for IV etc) */
+	subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+
 	for (i = 0; i < cc->tfms_count; i++) {
 		r = crypto_ablkcipher_setkey(cc->tfms[i],
 					     cc->key + (i * subkey_size),
@@ -1341,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
-	struct crypt_cpu *cpu_cc;
-	int cpu;
 
 	ti->private = NULL;
 
@@ -1354,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
-	if (cc->cpu)
-		for_each_possible_cpu(cpu) {
-			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-			if (cpu_cc->req)
-				mempool_free(cpu_cc->req, cc->req_pool);
-		}
-
 	crypt_free_tfms(cc);
 
 	if (cc->bs)
@@ -1379,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
-	if (cc->cpu)
-		free_percpu(cc->cpu);
-
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1424,6 +1545,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 		return -EINVAL;
 	}
 	cc->key_parts = cc->tfms_count;
+	cc->key_extra_size = 0;
 
 	cc->cipher = kstrdup(cipher, GFP_KERNEL);
 	if (!cc->cipher)
@@ -1436,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
-				 __alignof__(struct crypt_cpu));
-	if (!cc->cpu) {
-		ti->error = "Cannot allocate per cpu state";
-		goto bad_mem;
-	}
-
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.
@@ -1475,13 +1590,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 		goto bad;
 	}
 
-	/* Initialize and set key */
-	ret = crypt_set_key(cc, key);
-	if (ret < 0) {
-		ti->error = "Error decoding and setting key";
-		goto bad;
-	}
-
 	/* Initialize IV */
 	cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
 	if (cc->iv_size)
@@ -1508,18 +1616,33 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 		cc->iv_gen_ops = &crypt_iv_null_ops;
 	else if (strcmp(ivmode, "lmk") == 0) {
 		cc->iv_gen_ops = &crypt_iv_lmk_ops;
-		/* Version 2 and 3 is recognised according
+		/*
+		 * Version 2 and 3 is recognised according
 		 * to length of provided multi-key string.
 		 * If present (version 3), last key is used as IV seed.
+		 * All keys (including IV seed) are always the same size.
 		 */
-		if (cc->key_size % cc->key_parts)
+		if (cc->key_size % cc->key_parts) {
 			cc->key_parts++;
+			cc->key_extra_size = cc->key_size / cc->key_parts;
+		}
+	} else if (strcmp(ivmode, "tcw") == 0) {
+		cc->iv_gen_ops = &crypt_iv_tcw_ops;
+		cc->key_parts += 2; /* IV + whitening */
+		cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
 	} else {
 		ret = -EINVAL;
 		ti->error = "Invalid IV mode";
 		goto bad;
 	}
 
+	/* Initialize and set key */
+	ret = crypt_set_key(cc, key);
+	if (ret < 0) {
+		ti->error = "Error decoding and setting key";
+		goto bad;
+	}
+
 	/* Allocate IV */
 	if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
 		ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
@@ -1651,7 +1774,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 		if (opt_params == 1 && opt_string &&
 		    !strcasecmp(opt_string, "allow_discards"))
-			ti->num_discard_requests = 1;
+			ti->num_discard_bios = 1;
 		else if (opt_params) {
 			ret = -EINVAL;
 			ti->error = "Invalid feature arguments";
@@ -1660,26 +1783,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ret = -ENOMEM;
-	cc->io_queue = alloc_workqueue("kcryptd_io",
-				       WQ_NON_REENTRANT|
-				       WQ_MEM_RECLAIM,
-				       1);
+	cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
 	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_NON_REENTRANT|
-					  WQ_CPU_INTENSIVE|
-					  WQ_MEM_RECLAIM,
-					  1);
+					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
 	}
 
-	ti->num_flush_requests = 1;
+	ti->num_flush_bios = 1;
 	ti->discard_zeroes_data_unsupported = true;
 
 	return 0;
@@ -1689,8 +1806,7 @@ bad:
 	return ret;
 }
 
-static int crypt_map(struct dm_target *ti, struct bio *bio,
-		     union map_info *map_context)
+static int crypt_map(struct dm_target *ti, struct bio *bio)
 {
 	struct dm_crypt_io *io;
 	struct crypt_config *cc = ti->private;
@@ -1703,11 +1819,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		bio->bi_bdev = cc->dev->bdev;
 		if (bio_sectors(bio))
-			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
+			bio->bi_iter.bi_sector = cc->start +
+				dm_target_offset(ti, bio->bi_iter.bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector));
+	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
@@ -1718,11 +1835,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	return DM_MAPIO_SUBMITTED;
 }
 
-static int crypt_status(struct dm_target *ti, status_type_t type,
-			unsigned status_flags, char *result, unsigned maxlen)
+static void crypt_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct crypt_config *cc = ti->private;
-	unsigned int sz = 0;
+	unsigned i, sz = 0;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -1732,27 +1849,20 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_TABLE:
 		DMEMIT("%s ", cc->cipher_string);
 
-		if (cc->key_size > 0) {
-			if ((maxlen - sz) < ((cc->key_size << 1) + 1))
-				return -ENOMEM;
-
-			crypt_encode_key(result + sz, cc->key, cc->key_size);
-			sz += cc->key_size << 1;
-		} else {
-			if (sz >= maxlen)
-				return -ENOMEM;
-			result[sz++] = '-';
-		}
+		if (cc->key_size > 0)
+			for (i = 0; i < cc->key_size; i++)
+				DMEMIT("%02x", cc->key[i]);
+		else
+			DMEMIT("-");
 
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
 
-		if (ti->num_discard_requests)
+		if (ti->num_discard_bios)
 			DMEMIT(" 1 allow_discards");
 
 		break;
 	}
-	return 0;
 }
 
 static void crypt_postsuspend(struct dm_target *ti)
@@ -1846,7 +1956,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 11, 0},
+	.version = {1, 13, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
@@ -1886,6 +1996,6 @@ static void __exit dm_crypt_exit(void)
 module_init(dm_crypt_init);
 module_exit(dm_crypt_exit);
 
-MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
+MODULE_AUTHOR("Jana Saout <jana@saout.de>");
 MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f53846f9ab5..42c3a27a14c 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,10 +20,10 @@
 struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
 	atomic_t may_delay;
-	mempool_t *delayed_pool;
 
 	struct dm_dev *dev_read;
 	sector_t start_read;
@@ -39,20 +39,16 @@ struct delay_c {
 struct dm_delay_info {
 	struct delay_c *context;
 	struct list_head list;
-	struct bio *bio;
 	unsigned long expires;
 };
 
 static DEFINE_MUTEX(delayed_bios_lock);
 
-static struct workqueue_struct *kdelayd_wq;
-static struct kmem_cache *delayed_cache;
-
 static void handle_delayed_timer(unsigned long data)
 {
 	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -87,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
 	mutex_lock(&delayed_bios_lock);
 	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
 		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+			struct bio *bio = dm_bio_from_per_bio_data(delayed,
+						sizeof(struct dm_delay_info));
 			list_del(&delayed->list);
-			bio_list_add(&flush_bios, delayed->bio);
-			if ((bio_data_dir(delayed->bio) == WRITE))
+			bio_list_add(&flush_bios, bio);
+			if ((bio_data_dir(bio) == WRITE))
 				delayed->context->writes--;
 			else
 				delayed->context->reads--;
-			mempool_free(delayed, dc->delayed_pool);
 			continue;
 		}
 
@@ -185,10 +182,10 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 out:
-	dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
-	if (!dc->delayed_pool) {
-		DMERR("Couldn't create delayed bio pool.");
-		goto bad_dev_write;
+	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+	if (!dc->kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
 	}
 
 	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
@@ -198,12 +195,13 @@ out:
 	mutex_init(&dc->timer_lock);
 	atomic_set(&dc->may_delay, 1);
 
-	ti->num_flush_requests = 1;
-	ti->num_discard_requests = 1;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->per_bio_data_size = sizeof(struct dm_delay_info);
 	ti->private = dc;
 	return 0;
 
-bad_dev_write:
+bad_queue:
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
 bad_dev_read:
@@ -217,14 +215,13 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	flush_workqueue(kdelayd_wq);
+	destroy_workqueue(dc->kdelayd_wq);
 
 	dm_put_device(ti, dc->dev_read);
 
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
 
-	mempool_destroy(dc->delayed_pool);
 	kfree(dc);
 }
 
@@ -236,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
 	if (!delay || !atomic_read(&dc->may_delay))
 		return 1;
 
-	delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
+	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
 
 	delayed->context = dc;
-	delayed->bio = bio;
 	delayed->expires = expires = jiffies + (delay * HZ / 1000);
 
 	mutex_lock(&delayed_bios_lock);
@@ -274,28 +270,28 @@ static void delay_resume(struct dm_target *ti)
 	atomic_set(&dc->may_delay, 1);
 }
 
-static int delay_map(struct dm_target *ti, struct bio *bio,
-		     union map_info *map_context)
+static int delay_map(struct dm_target *ti, struct bio *bio)
 {
 	struct delay_c *dc = ti->private;
 
 	if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
 		bio->bi_bdev = dc->dev_write->bdev;
 		if (bio_sectors(bio))
-			bio->bi_sector = dc->start_write +
-					 dm_target_offset(ti, bio->bi_sector);
+			bio->bi_iter.bi_sector = dc->start_write +
+				dm_target_offset(ti, bio->bi_iter.bi_sector);
 
 		return delay_bio(dc, dc->write_delay, bio);
 	}
 
 	bio->bi_bdev = dc->dev_read->bdev;
-	bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector);
+	bio->bi_iter.bi_sector = dc->start_read +
+		dm_target_offset(ti, bio->bi_iter.bi_sector);
 
 	return delay_bio(dc, dc->read_delay, bio);
 }
 
-static int delay_status(struct dm_target *ti, status_type_t type,
-			unsigned status_flags, char *result, unsigned maxlen)
+static void delay_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct delay_c *dc = ti->private;
 	int sz = 0;
@@ -315,8 +311,6 @@ static int delay_status(struct dm_target *ti, status_type_t type,
 			       dc->write_delay);
 		break;
 	}
-
-	return 0;
 }
 
 static int delay_iterate_devices(struct dm_target *ti,
@@ -338,7 +332,7 @@ out:
 
 static struct target_type delay_target = {
 	.name	     = "delay",
-	.version     = {1, 1, 0},
+	.version     = {1, 2, 1},
 	.module      = THIS_MODULE,
 	.ctr	     = delay_ctr,
 	.dtr	     = delay_dtr,
@@ -351,19 +345,7 @@ static struct target_type delay_target = {
 
 static int __init dm_delay_init(void)
 {
-	int r = -ENOMEM;
-
-	kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!kdelayd_wq) {
-		DMERR("Couldn't start kdelayd");
-		goto bad_queue;
-	}
-
-	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
-	if (!delayed_cache) {
-		DMERR("Couldn't create delayed bio cache.");
-		goto bad_memcache;
-	}
+	int r;
 
 	r = dm_register_target(&delay_target);
 	if (r < 0) {
@@ -374,18 +356,12 @@ static int __init dm_delay_init(void)
 	return 0;
 
 bad_register:
-	kmem_cache_destroy(delayed_cache);
-bad_memcache:
-	destroy_workqueue(kdelayd_wq);
-bad_queue:
 	return r;
 }
 
 static void __exit dm_delay_exit(void)
 {
 	dm_unregister_target(&delay_target);
-	kmem_cache_destroy(delayed_cache);
-	destroy_workqueue(kdelayd_wq);
 }
 
 /* Module hooks */
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
new file mode 100644
index 00000000000..ad913cd4ade
--- /dev/null
+++ b/drivers/md/dm-era-target.c
@@ -0,0 +1,1747 @@
+#include "dm.h"
+#include "persistent-data/dm-transaction-manager.h"
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "era"
+
+#define SUPERBLOCK_LOCATION 0
+#define SUPERBLOCK_MAGIC 2126579579
+#define SUPERBLOCK_CSUM_XOR 146538381
+#define MIN_ERA_VERSION 1
+#define MAX_ERA_VERSION 1
+#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
+#define MIN_BLOCK_SIZE 8
+
+/*----------------------------------------------------------------
+ * Writeset
+ *--------------------------------------------------------------*/
+struct writeset_metadata {
+	uint32_t nr_bits;
+	dm_block_t root;
+};
+
+struct writeset {
+	struct writeset_metadata md;
+
+	/*
+	 * An in core copy of the bits to save constantly doing look ups on
+	 * disk.
+	 */
+	unsigned long *bits;
+};
+
+/*
+ * This does not free off the on disk bitset as this will normally be done
+ * after digesting into the era array.
+ */
+static void writeset_free(struct writeset *ws)
+{
+	vfree(ws->bits);
+}
+
+static int setup_on_disk_bitset(struct dm_disk_bitset *info,
+				unsigned nr_bits, dm_block_t *root)
+{
+	int r;
+
+	r = dm_bitset_empty(info, root);
+	if (r)
+		return r;
+
+	return dm_bitset_resize(info, *root, 0, nr_bits, false, root);
+}
+
+static size_t bitset_size(unsigned nr_bits)
+{
+	return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG);
+}
+
+/*
+ * Allocates memory for the in core bitset.
+ */
+static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
+{
+	ws->md.nr_bits = nr_blocks;
+	ws->md.root = INVALID_WRITESET_ROOT;
+	ws->bits = vzalloc(bitset_size(nr_blocks));
+	if (!ws->bits) {
+		DMERR("%s: couldn't allocate in memory bitset", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Wipes the in-core bitset, and creates a new on disk bitset.
+ */
+static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws)
+{
+	int r;
+
+	memset(ws->bits, 0, bitset_size(ws->md.nr_bits));
+
+	r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
+	if (r) {
+		DMERR("%s: setup_on_disk_bitset failed", __func__);
+		return r;
+	}
+
+	return 0;
+}
+
+static bool writeset_marked(struct writeset *ws, dm_block_t block)
+{
+	return test_bit(block, ws->bits);
+}
+
+static int writeset_marked_on_disk(struct dm_disk_bitset *info,
+				   struct writeset_metadata *m, dm_block_t block,
+				   bool *result)
+{
+	dm_block_t old = m->root;
+
+	/*
+	 * The bitset was flushed when it was archived, so we know there'll
+	 * be no change to the root.
+	 */
+	int r = dm_bitset_test_bit(info, m->root, block, &m->root, result);
+	if (r) {
+		DMERR("%s: dm_bitset_test_bit failed", __func__);
+		return r;
+	}
+
+	BUG_ON(m->root != old);
+
+	return r;
+}
+
+/*
+ * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was.
+ */
+static int writeset_test_and_set(struct dm_disk_bitset *info,
+				 struct writeset *ws, uint32_t block)
+{
+	int r;
+
+	if (!test_and_set_bit(block, ws->bits)) {
+		r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
+		if (r) {
+			/* FIXME: fail mode */
+			return r;
+		}
+
+		return 0;
+	}
+
+	return 1;
+}
+
+/*----------------------------------------------------------------
+ * On disk metadata layout
+ *--------------------------------------------------------------*/
+#define SPACE_MAP_ROOT_SIZE 128
+#define UUID_LEN 16
+
+struct writeset_disk {
+	__le32 nr_bits;
+	__le64 root;
+} __packed;
+
+struct superblock_disk {
+	__le32 csum;
+	__le32 flags;
+	__le64 blocknr;
+
+	__u8 uuid[UUID_LEN];
+	__le64 magic;
+	__le32 version;
+
+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+	__le32 data_block_size;
+	__le32 metadata_block_size;
+	__le32 nr_blocks;
+
+	__le32 current_era;
+	struct writeset_disk current_writeset;
+
+	/*
+	 * Only these two fields are valid within the metadata snapshot.
+	 */
+	__le64 writeset_tree_root;
+	__le64 era_array_root;
+
+	__le64 metadata_snap;
+} __packed;
+
+/*----------------------------------------------------------------
+ * Superblock validation
+ *--------------------------------------------------------------*/
+static void sb_prepare_for_write(struct dm_block_validator *v,
+				 struct dm_block *b,
+				 size_t sb_block_size)
+{
+	struct superblock_disk *disk = dm_block_data(b);
+
+	disk->blocknr = cpu_to_le64(dm_block_location(b));
+	disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
+						sb_block_size - sizeof(__le32),
+						SUPERBLOCK_CSUM_XOR));
+}
+
+static int check_metadata_version(struct superblock_disk *disk)
+{
+	uint32_t metadata_version = le32_to_cpu(disk->version);
+	if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) {
+		DMERR("Era metadata version %u found, but only versions between %u and %u supported.",
+		      metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int sb_check(struct dm_block_validator *v,
+		    struct dm_block *b,
+		    size_t sb_block_size)
+{
+	struct superblock_disk *disk = dm_block_data(b);
+	__le32 csum_le;
+
+	if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) {
+		DMERR("sb_check failed: blocknr %llu: wanted %llu",
+		      le64_to_cpu(disk->blocknr),
+		      (unsigned long long)dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) {
+		DMERR("sb_check failed: magic %llu: wanted %llu",
+		      le64_to_cpu(disk->magic),
+		      (unsigned long long) SUPERBLOCK_MAGIC);
+		return -EILSEQ;
+	}
+
+	csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags,
+					     sb_block_size - sizeof(__le32),
+					     SUPERBLOCK_CSUM_XOR));
+	if (csum_le != disk->csum) {
+		DMERR("sb_check failed: csum %u: wanted %u",
+		      le32_to_cpu(csum_le), le32_to_cpu(disk->csum));
+		return -EILSEQ;
+	}
+
+	return check_metadata_version(disk);
+}
+
+static struct dm_block_validator sb_validator = {
+	.name = "superblock",
+	.prepare_for_write = sb_prepare_for_write,
+	.check = sb_check
+};
+
+/*----------------------------------------------------------------
+ * Low level metadata handling
+ *--------------------------------------------------------------*/
+#define DM_ERA_METADATA_BLOCK_SIZE 4096
+#define DM_ERA_METADATA_CACHE_SIZE 64
+#define ERA_MAX_CONCURRENT_LOCKS 5
+
+struct era_metadata {
+	struct block_device *bdev;
+	struct dm_block_manager *bm;
+	struct dm_space_map *sm;
+	struct dm_transaction_manager *tm;
+
+	dm_block_t block_size;
+	uint32_t nr_blocks;
+
+	uint32_t current_era;
+
+	/*
+	 * We preallocate 2 writesets.  When an era rolls over we
+	 * switch between them. This means the allocation is done at
+	 * preresume time, rather than on the io path.
+	 */
+	struct writeset writesets[2];
+	struct writeset *current_writeset;
+
+	dm_block_t writeset_tree_root;
+	dm_block_t era_array_root;
+
+	struct dm_disk_bitset bitset_info;
+	struct dm_btree_info writeset_tree_info;
+	struct dm_array_info era_array_info;
+
+	dm_block_t metadata_snap;
+
+	/*
+	 * A flag that is set whenever a writeset has been archived.
+	 */
+	bool archived_writesets;
+
+	/*
+	 * Reading the space map root can fail, so we read it into this
+	 * buffer before the superblock is locked and updated.
+	 */
+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+};
+
+static int superblock_read_lock(struct era_metadata *md,
+				struct dm_block **sblock)
+{
+	return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION,
+			       &sb_validator, sblock);
+}
+
+static int superblock_lock_zero(struct era_metadata *md,
+				struct dm_block **sblock)
+{
+	return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION,
+				     &sb_validator, sblock);
+}
+
+static int superblock_lock(struct era_metadata *md,
+			   struct dm_block **sblock)
+{
+	return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION,
+				&sb_validator, sblock);
+}
+
+/* FIXME: duplication with cache and thin */
+static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
+{
+	int r;
+	unsigned i;
+	struct dm_block *b;
+	__le64 *data_le, zero = cpu_to_le64(0);
+	unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+	/*
+	 * We can't use a validator here - it may be all zeroes.
+	 */
+	r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b);
+	if (r)
+		return r;
+
+	data_le = dm_block_data(b);
+	*result = true;
+	for (i = 0; i < sb_block_size; i++) {
+		if (data_le[i] != zero) {
+			*result = false;
+			break;
+		}
+	}
+
+	return dm_bm_unlock(b);
+}
+
+/*----------------------------------------------------------------*/
+
+static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk)
+{
+	disk->nr_bits = cpu_to_le32(core->nr_bits);
+	disk->root = cpu_to_le64(core->root);
+}
+
+static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core)
+{
+	core->nr_bits = le32_to_cpu(disk->nr_bits);
+	core->root = le64_to_cpu(disk->root);
+}
+
+static void ws_inc(void *context, const void *value)
+{
+	struct era_metadata *md = context;
+	struct writeset_disk ws_d;
+	dm_block_t b;
+
+	memcpy(&ws_d, value, sizeof(ws_d));
+	b = le64_to_cpu(ws_d.root);
+
+	dm_tm_inc(md->tm, b);
+}
+
+static void ws_dec(void *context, const void *value)
+{
+	struct era_metadata *md = context;
+	struct writeset_disk ws_d;
+	dm_block_t b;
+
+	memcpy(&ws_d, value, sizeof(ws_d));
+	b = le64_to_cpu(ws_d.root);
+
+	dm_bitset_del(&md->bitset_info, b);
+}
+
+static int ws_eq(void *context, const void *value1, const void *value2)
+{
+	return !memcmp(value1, value2, sizeof(struct writeset_metadata));
+}
+
+/*----------------------------------------------------------------*/
+
+static void setup_writeset_tree_info(struct era_metadata *md)
+{
+	struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type;
+	md->writeset_tree_info.tm = md->tm;
+	md->writeset_tree_info.levels = 1;
+	vt->context = md;
+	vt->size = sizeof(struct writeset_disk);
+	vt->inc = ws_inc;
+	vt->dec = ws_dec;
+	vt->equal = ws_eq;
+}
+
+static void setup_era_array_info(struct era_metadata *md)
+
+{
+	struct dm_btree_value_type vt;
+	vt.context = NULL;
+	vt.size = sizeof(__le32);
+	vt.inc = NULL;
+	vt.dec = NULL;
+	vt.equal = NULL;
+
+	dm_array_info_init(&md->era_array_info, md->tm, &vt);
+}
+
+static void setup_infos(struct era_metadata *md)
+{
+	dm_disk_bitset_init(md->tm, &md->bitset_info);
+	setup_writeset_tree_info(md);
+	setup_era_array_info(md);
+}
+
+/*----------------------------------------------------------------*/
+
+static int create_fresh_metadata(struct era_metadata *md)
+{
+	int r;
+
+	r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION,
+				 &md->tm, &md->sm);
+	if (r < 0) {
+		DMERR("dm_tm_create_with_sm failed");
+		return r;
+	}
+
+	setup_infos(md);
+
+	r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root);
+	if (r) {
+		DMERR("couldn't create new writeset tree");
+		goto bad;
+	}
+
+	r = dm_array_empty(&md->era_array_info, &md->era_array_root);
+	if (r) {
+		DMERR("couldn't create era array");
+		goto bad;
+	}
+
+	return 0;
+
+bad:
+	dm_sm_destroy(md->sm);
+	dm_tm_destroy(md->tm);
+
+	return r;
+}
+
+static int save_sm_root(struct era_metadata *md)
+{
+	int r;
+	size_t metadata_len;
+
+	r = dm_sm_root_size(md->sm, &metadata_len);
+	if (r < 0)
+		return r;
+
+	return dm_sm_copy_root(md->sm, &md->metadata_space_map_root,
+			       metadata_len);
+}
+
+static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk)
+{
+	memcpy(&disk->metadata_space_map_root,
+	       &md->metadata_space_map_root,
+	       sizeof(md->metadata_space_map_root));
+}
+
+/*
+ * Writes a superblock, including the static fields that don't get updated
+ * with every commit (possible optimisation here).  'md' should be fully
+ * constructed when this is called.
+ */
+static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk)
+{
+	disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
+	disk->flags = cpu_to_le32(0ul);
+
+	/* FIXME: can't keep blanking the uuid (uuid is currently unused though) */
+	memset(disk->uuid, 0, sizeof(disk->uuid));
+	disk->version = cpu_to_le32(MAX_ERA_VERSION);
+
+	copy_sm_root(md, disk);
+
+	disk->data_block_size = cpu_to_le32(md->block_size);
+	disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk->nr_blocks = cpu_to_le32(md->nr_blocks);
+	disk->current_era = cpu_to_le32(md->current_era);
+
+	ws_pack(&md->current_writeset->md, &disk->current_writeset);
+	disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root);
+	disk->era_array_root = cpu_to_le64(md->era_array_root);
+	disk->metadata_snap = cpu_to_le64(md->metadata_snap);
+}
+
+static int write_superblock(struct era_metadata *md)
+{
+	int r;
+	struct dm_block *sblock;
+	struct superblock_disk *disk;
+
+	r = save_sm_root(md);
+	if (r) {
+		DMERR("%s: save_sm_root failed", __func__);
+		return r;
+	}
+
+	r = superblock_lock_zero(md, &sblock);
+	if (r)
+		return r;
+
+	disk = dm_block_data(sblock);
+	prepare_superblock(md, disk);
+
+	return dm_tm_commit(md->tm, sblock);
+}
+
+/*
+ * Assumes block_size and the infos are set.
+ */
+static int format_metadata(struct era_metadata *md)
+{
+	int r;
+
+	r = create_fresh_metadata(md);
+	if (r)
+		return r;
+
+	r = write_superblock(md);
+	if (r) {
+		dm_sm_destroy(md->sm);
+		dm_tm_destroy(md->tm);
+		return r;
+	}
+
+	return 0;
+}
+
+static int open_metadata(struct era_metadata *md)
+{
+	int r;
+	struct dm_block *sblock;
+	struct superblock_disk *disk;
+
+	r = superblock_read_lock(md, &sblock);
+	if (r) {
+		DMERR("couldn't read_lock superblock");
+		return r;
+	}
+
+	disk = dm_block_data(sblock);
+	r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
+			       disk->metadata_space_map_root,
+			       sizeof(disk->metadata_space_map_root),
+			       &md->tm, &md->sm);
+	if (r) {
+		DMERR("dm_tm_open_with_sm failed");
+		goto bad;
+	}
+
+	setup_infos(md);
+
+	md->block_size = le32_to_cpu(disk->data_block_size);
+	md->nr_blocks = le32_to_cpu(disk->nr_blocks);
+	md->current_era = le32_to_cpu(disk->current_era);
+
+	md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
+	md->era_array_root = le64_to_cpu(disk->era_array_root);
+	md->metadata_snap = le64_to_cpu(disk->metadata_snap);
+	md->archived_writesets = true;
+
+	return dm_bm_unlock(sblock);
+
+bad:
+	dm_bm_unlock(sblock);
+	return r;
+}
+
+static int open_or_format_metadata(struct era_metadata *md,
+				   bool may_format)
+{
+	int r;
+	bool unformatted = false;
+
+	r = superblock_all_zeroes(md->bm, &unformatted);
+	if (r)
+		return r;
+
+	if (unformatted)
+		return may_format ? format_metadata(md) : -EPERM;
+
+	return open_metadata(md);
+}
+
+static int create_persistent_data_objects(struct era_metadata *md,
+					  bool may_format)
+{
+	int r;
+
+	md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
+					 DM_ERA_METADATA_CACHE_SIZE,
+					 ERA_MAX_CONCURRENT_LOCKS);
+	if (IS_ERR(md->bm)) {
+		DMERR("could not create block manager");
+		return PTR_ERR(md->bm);
+	}
+
+	r = open_or_format_metadata(md, may_format);
+	if (r)
+		dm_block_manager_destroy(md->bm);
+
+	return r;
+}
+
+static void destroy_persistent_data_objects(struct era_metadata *md)
+{
+	dm_sm_destroy(md->sm);
+	dm_tm_destroy(md->tm);
+	dm_block_manager_destroy(md->bm);
+}
+
+/*
+ * This waits until all era_map threads have picked up the new filter.
+ */
+static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset)
+{
+	rcu_assign_pointer(md->current_writeset, new_writeset);
+	synchronize_rcu();
+}
+
+/*----------------------------------------------------------------
+ * Writesets get 'digested' into the main era array.
+ *
+ * We're using a coroutine here so the worker thread can do the digestion,
+ * thus avoiding synchronisation of the metadata.  Digesting a whole
+ * writeset in one go would cause too much latency.
+ *--------------------------------------------------------------*/
+struct digest {
+	uint32_t era;
+	unsigned nr_bits, current_bit;
+	struct writeset_metadata writeset;
+	__le32 value;
+	struct dm_disk_bitset info;
+
+	int (*step)(struct era_metadata *, struct digest *);
+};
+
+static int metadata_digest_lookup_writeset(struct era_metadata *md,
+					   struct digest *d);
+
+static int metadata_digest_remove_writeset(struct era_metadata *md,
+					   struct digest *d)
+{
+	int r;
+	uint64_t key = d->era;
+
+	r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root,
+			    &key, &md->writeset_tree_root);
+	if (r) {
+		DMERR("%s: dm_btree_remove failed", __func__);
+		return r;
+	}
+
+	d->step = metadata_digest_lookup_writeset;
+	return 0;
+}
+
+#define INSERTS_PER_STEP 100
+
+static int metadata_digest_transcribe_writeset(struct era_metadata *md,
+					       struct digest *d)
+{
+	int r;
+	bool marked;
+	unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits);
+
+	for (b = d->current_bit; b < e; b++) {
+		r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked);
+		if (r) {
+			DMERR("%s: writeset_marked_on_disk failed", __func__);
+			return r;
+		}
+
+		if (!marked)
+			continue;
+
+		__dm_bless_for_disk(&d->value);
+		r = dm_array_set_value(&md->era_array_info, md->era_array_root,
+				       b, &d->value, &md->era_array_root);
+		if (r) {
+			DMERR("%s: dm_array_set_value failed", __func__);
+			return r;
+		}
+	}
+
+	if (b == d->nr_bits)
+		d->step = metadata_digest_remove_writeset;
+	else
+		d->current_bit = b;
+
+	return 0;
+}
+
+static int metadata_digest_lookup_writeset(struct era_metadata *md,
+					   struct digest *d)
+{
+	int r;
+	uint64_t key;
+	struct writeset_disk disk;
+
+	r = dm_btree_find_lowest_key(&md->writeset_tree_info,
+				     md->writeset_tree_root, &key);
+	if (r < 0)
+		return r;
+
+	d->era = key;
+
+	r = dm_btree_lookup(&md->writeset_tree_info,
+			    md->writeset_tree_root, &key, &disk);
+	if (r) {
+		if (r == -ENODATA) {
+			d->step = NULL;
+			return 0;
+		}
+
+		DMERR("%s: dm_btree_lookup failed", __func__);
+		return r;
+	}
+
+	ws_unpack(&disk, &d->writeset);
+	d->value = cpu_to_le32(key);
+
+	d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
+	d->current_bit = 0;
+	d->step = metadata_digest_transcribe_writeset;
+
+	return 0;
+}
+
+static int metadata_digest_start(struct era_metadata *md, struct digest *d)
+{
+	if (d->step)
+		return 0;
+
+	memset(d, 0, sizeof(*d));
+
+	/*
+	 * We initialise another bitset info to avoid any caching side
+	 * effects with the previous one.
+	 */
+	dm_disk_bitset_init(md->tm, &d->info);
+	d->step = metadata_digest_lookup_writeset;
+
+	return 0;
+}
+
+/*----------------------------------------------------------------
+ * High level metadata interface.  Target methods should use these, and not
+ * the lower level ones.
+ *--------------------------------------------------------------*/
+static struct era_metadata *metadata_open(struct block_device *bdev,
+					  sector_t block_size,
+					  bool may_format)
+{
+	int r;
+	struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL);
+
+	if (!md)
+		return NULL;
+
+	md->bdev = bdev;
+	md->block_size = block_size;
+
+	md->writesets[0].md.root = INVALID_WRITESET_ROOT;
+	md->writesets[1].md.root = INVALID_WRITESET_ROOT;
+	md->current_writeset = &md->writesets[0];
+
+	r = create_persistent_data_objects(md, may_format);
+	if (r) {
+		kfree(md);
+		return ERR_PTR(r);
+	}
+
+	return md;
+}
+
+static void metadata_close(struct era_metadata *md)
+{
+	destroy_persistent_data_objects(md);
+	kfree(md);
+}
+
+static bool valid_nr_blocks(dm_block_t n)
+{
+	/*
+	 * dm_bitset restricts us to 2^32.  test_bit & co. restrict us
+	 * further to 2^31 - 1
+	 */
+	return n < (1ull << 31);
+}
+
+static int metadata_resize(struct era_metadata *md, void *arg)
+{
+	int r;
+	dm_block_t *new_size = arg;
+	__le32 value;
+
+	if (!valid_nr_blocks(*new_size)) {
+		DMERR("Invalid number of origin blocks %llu",
+		      (unsigned long long) *new_size);
+		return -EINVAL;
+	}
+
+	writeset_free(&md->writesets[0]);
+	writeset_free(&md->writesets[1]);
+
+	r = writeset_alloc(&md->writesets[0], *new_size);
+	if (r) {
+		DMERR("%s: writeset_alloc failed for writeset 0", __func__);
+		return r;
+	}
+
+	r = writeset_alloc(&md->writesets[1], *new_size);
+	if (r) {
+		DMERR("%s: writeset_alloc failed for writeset 1", __func__);
+		return r;
+	}
+
+	value = cpu_to_le32(0u);
+	__dm_bless_for_disk(&value);
+	r = dm_array_resize(&md->era_array_info, md->era_array_root,
+			    md->nr_blocks, *new_size,
+			    &value, &md->era_array_root);
+	if (r) {
+		DMERR("%s: dm_array_resize failed", __func__);
+		return r;
+	}
+
+	md->nr_blocks = *new_size;
+	return 0;
+}
+
+static int metadata_era_archive(struct era_metadata *md)
+{
+	int r;
+	uint64_t keys[1];
+	struct writeset_disk value;
+
+	r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
+			    &md->current_writeset->md.root);
+	if (r) {
+		DMERR("%s: dm_bitset_flush failed", __func__);
+		return r;
+	}
+
+	ws_pack(&md->current_writeset->md, &value);
+	md->current_writeset->md.root = INVALID_WRITESET_ROOT;
+
+	keys[0] = md->current_era;
+	__dm_bless_for_disk(&value);
+	r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root,
+			    keys, &value, &md->writeset_tree_root);
+	if (r) {
+		DMERR("%s: couldn't insert writeset into btree", __func__);
+		/* FIXME: fail mode */
+		return r;
+	}
+
+	md->archived_writesets = true;
+
+	return 0;
+}
+
+static struct writeset *next_writeset(struct era_metadata *md)
+{
+	return (md->current_writeset == &md->writesets[0]) ?
+		&md->writesets[1] : &md->writesets[0];
+}
+
+static int metadata_new_era(struct era_metadata *md)
+{
+	int r;
+	struct writeset *new_writeset = next_writeset(md);
+
+	r = writeset_init(&md->bitset_info, new_writeset);
+	if (r) {
+		DMERR("%s: writeset_init failed", __func__);
+		return r;
+	}
+
+	swap_writeset(md, new_writeset);
+	md->current_era++;
+
+	return 0;
+}
+
+static int metadata_era_rollover(struct era_metadata *md)
+{
+	int r;
+
+	if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
+		r = metadata_era_archive(md);
+		if (r) {
+			DMERR("%s: metadata_archive_era failed", __func__);
+			/* FIXME: fail mode? */
+			return r;
+		}
+	}
+
+	r = metadata_new_era(md);
+	if (r) {
+		DMERR("%s: new era failed", __func__);
+		/* FIXME: fail mode */
+		return r;
+	}
+
+	return 0;
+}
+
+static bool metadata_current_marked(struct era_metadata *md, dm_block_t block)
+{
+	bool r;
+	struct writeset *ws;
+
+	rcu_read_lock();
+	ws = rcu_dereference(md->current_writeset);
+	r = writeset_marked(ws, block);
+	rcu_read_unlock();
+
+	return r;
+}
+
+static int metadata_commit(struct era_metadata *md)
+{
+	int r;
+	struct dm_block *sblock;
+
+	if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) {
+		r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
+				    &md->current_writeset->md.root);
+		if (r) {
+			DMERR("%s: bitset flush failed", __func__);
+			return r;
+		}
+	}
+
+	r = save_sm_root(md);
+	if (r) {
+		DMERR("%s: save_sm_root failed", __func__);
+		return r;
+	}
+
+	r = dm_tm_pre_commit(md->tm);
+	if (r) {
+		DMERR("%s: pre commit failed", __func__);
+		return r;
+	}
+
+	r = superblock_lock(md, &sblock);
+	if (r) {
+		DMERR("%s: superblock lock failed", __func__);
+		return r;
+	}
+
+	prepare_superblock(md, dm_block_data(sblock));
+
+	return dm_tm_commit(md->tm, sblock);
+}
+
+static int metadata_checkpoint(struct era_metadata *md)
+{
+	/*
+	 * For now we just rollover, but later I want to put a check in to
+	 * avoid this if the filter is still pretty fresh.
+	 */
+	return metadata_era_rollover(md);
+}
+
+/*
+ * Metadata snapshots allow userland to access era data.
+ */
+static int metadata_take_snap(struct era_metadata *md)
+{
+	int r, inc;
+	struct dm_block *clone;
+
+	if (md->metadata_snap != SUPERBLOCK_LOCATION) {
+		DMERR("%s: metadata snapshot already exists", __func__);
+		return -EINVAL;
+	}
+
+	r = metadata_era_rollover(md);
+	if (r) {
+		DMERR("%s: era rollover failed", __func__);
+		return r;
+	}
+
+	r = metadata_commit(md);
+	if (r) {
+		DMERR("%s: pre commit failed", __func__);
+		return r;
+	}
+
+	r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION);
+	if (r) {
+		DMERR("%s: couldn't increment superblock", __func__);
+		return r;
+	}
+
+	r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION,
+			       &sb_validator, &clone, &inc);
+	if (r) {
+		DMERR("%s: couldn't shadow superblock", __func__);
+		dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION);
+		return r;
+	}
+	BUG_ON(!inc);
+
+	r = dm_sm_inc_block(md->sm, md->writeset_tree_root);
+	if (r) {
+		DMERR("%s: couldn't inc writeset tree root", __func__);
+		dm_tm_unlock(md->tm, clone);
+		return r;
+	}
+
+	r = dm_sm_inc_block(md->sm, md->era_array_root);
+	if (r) {
+		DMERR("%s: couldn't inc era tree root", __func__);
+		dm_sm_dec_block(md->sm, md->writeset_tree_root);
+		dm_tm_unlock(md->tm, clone);
+		return r;
+	}
+
+	md->metadata_snap = dm_block_location(clone);
+
+	r = dm_tm_unlock(md->tm, clone);
+	if (r) {
+		DMERR("%s: couldn't unlock clone", __func__);
+		md->metadata_snap = SUPERBLOCK_LOCATION;
+		return r;
+	}
+
+	return 0;
+}
+
+static int metadata_drop_snap(struct era_metadata *md)
+{
+	int r;
+	dm_block_t location;
+	struct dm_block *clone;
+	struct superblock_disk *disk;
+
+	if (md->metadata_snap == SUPERBLOCK_LOCATION) {
+		DMERR("%s: no snap to drop", __func__);
+		return -EINVAL;
+	}
+
+	r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone);
+	if (r) {
+		DMERR("%s: couldn't read lock superblock clone", __func__);
+		return r;
+	}
+
+	/*
+	 * Whatever happens now we'll commit with no record of the metadata
+	 * snap.
+	 */
+	md->metadata_snap = SUPERBLOCK_LOCATION;
+
+	disk = dm_block_data(clone);
+	r = dm_btree_del(&md->writeset_tree_info,
+			 le64_to_cpu(disk->writeset_tree_root));
+	if (r) {
+		DMERR("%s: error deleting writeset tree clone", __func__);
+		dm_tm_unlock(md->tm, clone);
+		return r;
+	}
+
+	r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root));
+	if (r) {
+		DMERR("%s: error deleting era array clone", __func__);
+		dm_tm_unlock(md->tm, clone);
+		return r;
+	}
+
+	location = dm_block_location(clone);
+	dm_tm_unlock(md->tm, clone);
+
+	return dm_sm_dec_block(md->sm, location);
+}
+
+struct metadata_stats {
+	dm_block_t used;
+	dm_block_t total;
+	dm_block_t snap;
+	uint32_t era;
+};
+
+static int metadata_get_stats(struct era_metadata *md, void *ptr)
+{
+	int r;
+	struct metadata_stats *s = ptr;
+	dm_block_t nr_free, nr_total;
+
+	r = dm_sm_get_nr_free(md->sm, &nr_free);
+	if (r) {
+		DMERR("dm_sm_get_nr_free returned %d", r);
+		return r;
+	}
+
+	r = dm_sm_get_nr_blocks(md->sm, &nr_total);
+	if (r) {
+		DMERR("dm_pool_get_metadata_dev_size returned %d", r);
+		return r;
+	}
+
+	s->used = nr_total - nr_free;
+	s->total = nr_total;
+	s->snap = md->metadata_snap;
+	s->era = md->current_era;
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+struct era {
+	struct dm_target *ti;
+	struct dm_target_callbacks callbacks;
+
+	struct dm_dev *metadata_dev;
+	struct dm_dev *origin_dev;
+
+	dm_block_t nr_blocks;
+	uint32_t sectors_per_block;
+	int sectors_per_block_shift;
+	struct era_metadata *md;
+
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	spinlock_t deferred_lock;
+	struct bio_list deferred_bios;
+
+	spinlock_t rpc_lock;
+	struct list_head rpc_calls;
+
+	struct digest digest;
+	atomic_t suspended;
+};
+
+struct rpc {
+	struct list_head list;
+
+	int (*fn0)(struct era_metadata *);
+	int (*fn1)(struct era_metadata *, void *);
+	void *arg;
+	int result;
+
+	struct completion complete;
+};
+
+/*----------------------------------------------------------------
+ * Remapping.
+ *---------------------------------------------------------------*/
+static bool block_size_is_power_of_two(struct era *era)
+{
+	return era->sectors_per_block_shift >= 0;
+}
+
+static dm_block_t get_block(struct era *era, struct bio *bio)
+{
+	sector_t block_nr = bio->bi_iter.bi_sector;
+
+	if (!block_size_is_power_of_two(era))
+		(void) sector_div(block_nr, era->sectors_per_block);
+	else
+		block_nr >>= era->sectors_per_block_shift;
+
+	return block_nr;
+}
+
+static void remap_to_origin(struct era *era, struct bio *bio)
+{
+	bio->bi_bdev = era->origin_dev->bdev;
+}
+
+/*----------------------------------------------------------------
+ * Worker thread
+ *--------------------------------------------------------------*/
+static void wake_worker(struct era *era)
+{
+	if (!atomic_read(&era->suspended))
+		queue_work(era->wq, &era->worker);
+}
+
+static void process_old_eras(struct era *era)
+{
+	int r;
+
+	if (!era->digest.step)
+		return;
+
+	r = era->digest.step(era->md, &era->digest);
+	if (r < 0) {
+		DMERR("%s: digest step failed, stopping digestion", __func__);
+		era->digest.step = NULL;
+
+	} else if (era->digest.step)
+		wake_worker(era);
+}
+
+static void process_deferred_bios(struct era *era)
+{
+	int r;
+	struct bio_list deferred_bios, marked_bios;
+	struct bio *bio;
+	bool commit_needed = false;
+	bool failed = false;
+
+	bio_list_init(&deferred_bios);
+	bio_list_init(&marked_bios);
+
+	spin_lock(&era->deferred_lock);
+	bio_list_merge(&deferred_bios, &era->deferred_bios);
+	bio_list_init(&era->deferred_bios);
+	spin_unlock(&era->deferred_lock);
+
+	while ((bio = bio_list_pop(&deferred_bios))) {
+		r = writeset_test_and_set(&era->md->bitset_info,
+					  era->md->current_writeset,
+					  get_block(era, bio));
+		if (r < 0) {
+			/*
+			 * This is bad news, we need to rollback.
+			 * FIXME: finish.
+			 */
+			failed = true;
+
+		} else if (r == 0)
+			commit_needed = true;
+
+		bio_list_add(&marked_bios, bio);
+	}
+
+	if (commit_needed) {
+		r = metadata_commit(era->md);
+		if (r)
+			failed = true;
+	}
+
+	if (failed)
+		while ((bio = bio_list_pop(&marked_bios)))
+			bio_io_error(bio);
+	else
+		while ((bio = bio_list_pop(&marked_bios)))
+			generic_make_request(bio);
+}
+
+static void process_rpc_calls(struct era *era)
+{
+	int r;
+	bool need_commit = false;
+	struct list_head calls;
+	struct rpc *rpc, *tmp;
+
+	INIT_LIST_HEAD(&calls);
+	spin_lock(&era->rpc_lock);
+	list_splice_init(&era->rpc_calls, &calls);
+	spin_unlock(&era->rpc_lock);
+
+	list_for_each_entry_safe(rpc, tmp, &calls, list) {
+		rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg);
+		need_commit = true;
+	}
+
+	if (need_commit) {
+		r = metadata_commit(era->md);
+		if (r)
+			list_for_each_entry_safe(rpc, tmp, &calls, list)
+				rpc->result = r;
+	}
+
+	list_for_each_entry_safe(rpc, tmp, &calls, list)
+		complete(&rpc->complete);
+}
+
+static void kick_off_digest(struct era *era)
+{
+	if (era->md->archived_writesets) {
+		era->md->archived_writesets = false;
+		metadata_digest_start(era->md, &era->digest);
+	}
+}
+
+static void do_work(struct work_struct *ws)
+{
+	struct era *era = container_of(ws, struct era, worker);
+
+	kick_off_digest(era);
+	process_old_eras(era);
+	process_deferred_bios(era);
+	process_rpc_calls(era);
+}
+
+static void defer_bio(struct era *era, struct bio *bio)
+{
+	spin_lock(&era->deferred_lock);
+	bio_list_add(&era->deferred_bios, bio);
+	spin_unlock(&era->deferred_lock);
+
+	wake_worker(era);
+}
+
+/*
+ * Make an rpc call to the worker to change the metadata.
+ */
+static int perform_rpc(struct era *era, struct rpc *rpc)
+{
+	rpc->result = 0;
+	init_completion(&rpc->complete);
+
+	spin_lock(&era->rpc_lock);
+	list_add(&rpc->list, &era->rpc_calls);
+	spin_unlock(&era->rpc_lock);
+
+	wake_worker(era);
+	wait_for_completion(&rpc->complete);
+
+	return rpc->result;
+}
+
+static int in_worker0(struct era *era, int (*fn)(struct era_metadata *))
+{
+	struct rpc rpc;
+	rpc.fn0 = fn;
+	rpc.fn1 = NULL;
+
+	return perform_rpc(era, &rpc);
+}
+
+static int in_worker1(struct era *era,
+		      int (*fn)(struct era_metadata *, void *), void *arg)
+{
+	struct rpc rpc;
+	rpc.fn0 = NULL;
+	rpc.fn1 = fn;
+	rpc.arg = arg;
+
+	return perform_rpc(era, &rpc);
+}
+
+static void start_worker(struct era *era)
+{
+	atomic_set(&era->suspended, 0);
+}
+
+static void stop_worker(struct era *era)
+{
+	atomic_set(&era->suspended, 1);
+	flush_workqueue(era->wq);
+}
+
+/*----------------------------------------------------------------
+ * Target methods
+ *--------------------------------------------------------------*/
+static int dev_is_congested(struct dm_dev *dev, int bdi_bits)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+	return bdi_congested(&q->backing_dev_info, bdi_bits);
+}
+
+static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+	struct era *era = container_of(cb, struct era, callbacks);
+	return dev_is_congested(era->origin_dev, bdi_bits);
+}
+
+static void era_destroy(struct era *era)
+{
+	if (era->md)
+		metadata_close(era->md);
+
+	if (era->wq)
+		destroy_workqueue(era->wq);
+
+	if (era->origin_dev)
+		dm_put_device(era->ti, era->origin_dev);
+
+	if (era->metadata_dev)
+		dm_put_device(era->ti, era->metadata_dev);
+
+	kfree(era);
+}
+
+static dm_block_t calc_nr_blocks(struct era *era)
+{
+	return dm_sector_div_up(era->ti->len, era->sectors_per_block);
+}
+
+static bool valid_block_size(dm_block_t block_size)
+{
+	bool greater_than_zero = block_size > 0;
+	bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0;
+
+	return greater_than_zero && multiple_of_min_block_size;
+}
+
+/*
+ * <metadata dev> <data dev> <data block size (sectors)>
+ */
+static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	char dummy;
+	struct era *era;
+	struct era_metadata *md;
+
+	if (argc != 3) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	era = kzalloc(sizeof(*era), GFP_KERNEL);
+	if (!era) {
+		ti->error = "Error allocating era structure";
+		return -ENOMEM;
+	}
+
+	era->ti = ti;
+
+	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
+	if (r) {
+		ti->error = "Error opening metadata device";
+		era_destroy(era);
+		return -EINVAL;
+	}
+
+	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
+	if (r) {
+		ti->error = "Error opening data device";
+		era_destroy(era);
+		return -EINVAL;
+	}
+
+	r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy);
+	if (r != 1) {
+		ti->error = "Error parsing block size";
+		era_destroy(era);
+		return -EINVAL;
+	}
+
+	r = dm_set_target_max_io_len(ti, era->sectors_per_block);
+	if (r) {
+		ti->error = "could not set max io len";
+		era_destroy(era);
+		return -EINVAL;
+	}
+
+	if (!valid_block_size(era->sectors_per_block)) {
+		ti->error = "Invalid block size";
+		era_destroy(era);
+		return -EINVAL;
+	}
+	if (era->sectors_per_block & (era->sectors_per_block - 1))
+		era->sectors_per_block_shift = -1;
+	else
+		era->sectors_per_block_shift = __ffs(era->sectors_per_block);
+
+	md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true);
+	if (IS_ERR(md)) {
+		ti->error = "Error reading metadata";
+		era_destroy(era);
+		return PTR_ERR(md);
+	}
+	era->md = md;
+
+	era->nr_blocks = calc_nr_blocks(era);
+
+	r = metadata_resize(era->md, &era->nr_blocks);
+	if (r) {
+		ti->error = "couldn't resize metadata";
+		era_destroy(era);
+		return -ENOMEM;
+	}
+
+	era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+	if (!era->wq) {
+		ti->error = "could not create workqueue for metadata object";
+		era_destroy(era);
+		return -ENOMEM;
+	}
+	INIT_WORK(&era->worker, do_work);
+
+	spin_lock_init(&era->deferred_lock);
+	bio_list_init(&era->deferred_bios);
+
+	spin_lock_init(&era->rpc_lock);
+	INIT_LIST_HEAD(&era->rpc_calls);
+
+	ti->private = era;
+	ti->num_flush_bios = 1;
+	ti->flush_supported = true;
+
+	ti->num_discard_bios = 1;
+	ti->discards_supported = true;
+	era->callbacks.congested_fn = era_is_congested;
+	dm_table_add_target_callbacks(ti->table, &era->callbacks);
+
+	return 0;
+}
+
+static void era_dtr(struct dm_target *ti)
+{
+	era_destroy(ti->private);
+}
+
+static int era_map(struct dm_target *ti, struct bio *bio)
+{
+	struct era *era = ti->private;
+	dm_block_t block = get_block(era, bio);
+
+	/*
+	 * All bios get remapped to the origin device.  We do this now, but
+	 * it may not get issued until later.  Depending on whether the
+	 * block is marked in this era.
+	 */
+	remap_to_origin(era, bio);
+
+	/*
+	 * REQ_FLUSH bios carry no data, so we're not interested in them.
+	 */
+	if (!(bio->bi_rw & REQ_FLUSH) &&
+	    (bio_data_dir(bio) == WRITE) &&
+	    !metadata_current_marked(era->md, block)) {
+		defer_bio(era, bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	return DM_MAPIO_REMAPPED;
+}
+
+static void era_postsuspend(struct dm_target *ti)
+{
+	int r;
+	struct era *era = ti->private;
+
+	r = in_worker0(era, metadata_era_archive);
+	if (r) {
+		DMERR("%s: couldn't archive current era", __func__);
+		/* FIXME: fail mode */
+	}
+
+	stop_worker(era);
+}
+
+static int era_preresume(struct dm_target *ti)
+{
+	int r;
+	struct era *era = ti->private;
+	dm_block_t new_size = calc_nr_blocks(era);
+
+	if (era->nr_blocks != new_size) {
+		r = in_worker1(era, metadata_resize, &new_size);
+		if (r)
+			return r;
+
+		era->nr_blocks = new_size;
+	}
+
+	start_worker(era);
+
+	r = in_worker0(era, metadata_new_era);
+	if (r) {
+		DMERR("%s: metadata_era_rollover failed", __func__);
+		return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Status format:
+ *
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <current era> <held metadata root | '-'>
+ */
+static void era_status(struct dm_target *ti, status_type_t type,
+		       unsigned status_flags, char *result, unsigned maxlen)
+{
+	int r;
+	struct era *era = ti->private;
+	ssize_t sz = 0;
+	struct metadata_stats stats;
+	char buf[BDEVNAME_SIZE];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		r = in_worker1(era, metadata_get_stats, &stats);
+		if (r)
+			goto err;
+
+		DMEMIT("%u %llu/%llu %u",
+		       (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+		       (unsigned long long) stats.used,
+		       (unsigned long long) stats.total,
+		       (unsigned) stats.era);
+
+		if (stats.snap != SUPERBLOCK_LOCATION)
+			DMEMIT(" %llu", stats.snap);
+		else
+			DMEMIT(" -");
+		break;
+
+	case STATUSTYPE_TABLE:
+		format_dev_t(buf, era->metadata_dev->bdev->bd_dev);
+		DMEMIT("%s ", buf);
+		format_dev_t(buf, era->origin_dev->bdev->bd_dev);
+		DMEMIT("%s %u", buf, era->sectors_per_block);
+		break;
+	}
+
+	return;
+
+err:
+	DMEMIT("Error");
+}
+
+static int era_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct era *era = ti->private;
+
+	if (argc != 1) {
+		DMERR("incorrect number of message arguments");
+		return -EINVAL;
+	}
+
+	if (!strcasecmp(argv[0], "checkpoint"))
+		return in_worker0(era, metadata_checkpoint);
+
+	if (!strcasecmp(argv[0], "take_metadata_snap"))
+		return in_worker0(era, metadata_take_snap);
+
+	if (!strcasecmp(argv[0], "drop_metadata_snap"))
+		return in_worker0(era, metadata_drop_snap);
+
+	DMERR("unsupported message '%s'", argv[0]);
+	return -EINVAL;
+}
+
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static int era_iterate_devices(struct dm_target *ti,
+			       iterate_devices_callout_fn fn, void *data)
+{
+	struct era *era = ti->private;
+	return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
+}
+
+static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		     struct bio_vec *biovec, int max_size)
+{
+	struct era *era = ti->private;
+	struct request_queue *q = bdev_get_queue(era->origin_dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = era->origin_dev->bdev;
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct era *era = ti->private;
+	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+	/*
+	 * If the system-determined stacked limits are compatible with the
+	 * era device's blocksize (io_opt is a factor) do not override them.
+	 */
+	if (io_opt_sectors < era->sectors_per_block ||
+	    do_div(io_opt_sectors, era->sectors_per_block)) {
+		blk_limits_io_min(limits, 0);
+		blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
+static struct target_type era_target = {
+	.name = "era",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = era_ctr,
+	.dtr = era_dtr,
+	.map = era_map,
+	.postsuspend = era_postsuspend,
+	.preresume = era_preresume,
+	.status = era_status,
+	.message = era_message,
+	.iterate_devices = era_iterate_devices,
+	.merge = era_merge,
+	.io_hints = era_io_hints
+};
+
+static int __init dm_era_init(void)
+{
+	int r;
+
+	r = dm_register_target(&era_target);
+	if (r) {
+		DMERR("era target registration failed: %d", r);
+		return r;
+	}
+
+	return 0;
+}
+
+static void __exit dm_era_exit(void)
+{
+	dm_unregister_target(&era_target);
+}
+
+module_init(dm_era_init);
+module_exit(dm_era_exit);
+
+MODULE_DESCRIPTION(DM_NAME " era target");
+MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index cc15543a6ad..b257e46876d 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -39,6 +39,10 @@ enum feature_flag_bits {
 	DROP_WRITES
 };
 
+struct per_bio_data {
+	bool bio_submitted;
+};
+
 static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 			  struct dm_target *ti)
 {
@@ -172,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc) {
-		ti->error = "Cannot allocate linear context";
+		ti->error = "Cannot allocate context";
 		return -ENOMEM;
 	}
 	fc->start_time = jiffies;
@@ -212,8 +216,9 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	ti->num_flush_requests = 1;
-	ti->num_discard_requests = 1;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->per_bio_data_size = sizeof(struct per_bio_data);
 	ti->private = fc;
 	return 0;
 
@@ -243,7 +248,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
 
 	bio->bi_bdev = fc->dev->bdev;
 	if (bio_sectors(bio))
-		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
+		bio->bi_iter.bi_sector =
+			flakey_map_sector(ti, bio->bi_iter.bi_sector);
 }
 
 static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
@@ -260,16 +266,17 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
 		DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
 			"(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
 			bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
-			(bio_data_dir(bio) == WRITE) ? 'w' : 'r',
-			bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+			(bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw,
+			(unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
 	}
 }
 
-static int flakey_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int flakey_map(struct dm_target *ti, struct bio *bio)
 {
 	struct flakey_c *fc = ti->private;
 	unsigned elapsed;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+	pb->bio_submitted = false;
 
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
@@ -277,7 +284,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
 		/*
 		 * Flag this bio as submitted while down.
 		 */
-		map_context->ll = 1;
+		pb->bio_submitted = true;
 
 		/*
 		 * Map reads as normal.
@@ -314,17 +321,16 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
-static int flakey_end_io(struct dm_target *ti, struct bio *bio,
-			 int error, union map_info *map_context)
+static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	struct flakey_c *fc = ti->private;
-	unsigned bio_submitted_while_down = map_context->ll;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 
 	/*
 	 * Corrupt successful READs while in down state.
 	 * If flags were specified, only corrupt those that match.
 	 */
-	if (fc->corrupt_bio_byte && !error && bio_submitted_while_down &&
+	if (fc->corrupt_bio_byte && !error && pb->bio_submitted &&
 	    (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
 	    all_corrupt_bio_flags_match(bio, fc))
 		corrupt_bio_data(bio, fc);
@@ -332,8 +338,8 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
 	return error;
 }
 
-static int flakey_status(struct dm_target *ti, status_type_t type,
-			 unsigned status_flags, char *result, unsigned maxlen)
+static void flakey_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct flakey_c *fc = ti->private;
@@ -363,7 +369,6 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
 
 		break;
 	}
-	return 0;
 }
 
 static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
@@ -406,7 +411,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 
 static struct target_type flakey_target = {
 	.name   = "flakey",
-	.version = {1, 2, 0},
+	.version = {1, 3, 1},
 	.module = THIS_MODULE,
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 1c46f97d666..db404a0f7e2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -10,6 +10,7 @@
 #include <linux/device-mapper.h>
 
 #include <linux/bio.h>
+#include <linux/completion.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/sched.h>
@@ -19,8 +20,6 @@
 #define DM_MSG_PREFIX "io"
 
 #define DM_IO_MAX_REGIONS	BITS_PER_LONG
-#define MIN_IOS		16
-#define MIN_BIOS	16
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -34,7 +33,7 @@ struct dm_io_client {
 struct io {
 	unsigned long error_bits;
 	atomic_t count;
-	struct task_struct *sleeper;
+	struct completion *wait;
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
@@ -50,16 +49,17 @@ static struct kmem_cache *_dm_io_cache;
 struct dm_io_client *dm_io_client_create(void)
 {
 	struct dm_io_client *client;
+	unsigned min_ios = dm_get_reserved_bio_based_ios();
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
@@ -122,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
 			invalidate_kernel_vmap_range(io->vma_invalidate_address,
 						     io->vma_invalidate_size);
 
-		if (io->sleeper)
-			wake_up_process(io->sleeper);
+		if (io->wait)
+			complete(io->wait);
 
 		else {
 			unsigned long r = io->error_bits;
@@ -202,26 +202,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
 /*
  * Functions for getting the pages from a bvec.
  */
-static void bvec_get_page(struct dpages *dp,
-		  struct page **p, unsigned long *len, unsigned *offset)
+static void bio_get_page(struct dpages *dp, struct page **p,
+			 unsigned long *len, unsigned *offset)
 {
-	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
+	struct bio_vec *bvec = dp->context_ptr;
 	*p = bvec->bv_page;
-	*len = bvec->bv_len;
-	*offset = bvec->bv_offset;
+	*len = bvec->bv_len - dp->context_u;
+	*offset = bvec->bv_offset + dp->context_u;
 }
 
-static void bvec_next_page(struct dpages *dp)
+static void bio_next_page(struct dpages *dp)
 {
-	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
+	struct bio_vec *bvec = dp->context_ptr;
 	dp->context_ptr = bvec + 1;
+	dp->context_u = 0;
 }
 
-static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
+static void bio_dp_init(struct dpages *dp, struct bio *bio)
 {
-	dp->get_page = bvec_get_page;
-	dp->next_page = bvec_next_page;
-	dp->context_ptr = bvec;
+	dp->get_page = bio_get_page;
+	dp->next_page = bio_next_page;
+	dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	dp->context_u = bio->bi_iter.bi_bvec_done;
 }
 
 /*
@@ -287,7 +289,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 	unsigned num_bvecs;
 	sector_t remaining = where->count;
 	struct request_queue *q = bdev_get_queue(where->bdev);
-	sector_t discard_sectors;
+	unsigned short logical_block_size = queue_logical_block_size(q);
+	sector_t num_sectors;
 
 	/*
 	 * where->count may be zero if rw holds a flush and we need to
@@ -297,22 +300,34 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 		/*
 		 * Allocate a suitably sized-bio.
 		 */
-		if (rw & REQ_DISCARD)
+		if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
 			num_bvecs = 1;
 		else
 			num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
 					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
 
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
-		bio->bi_sector = where->sector + (where->count - remaining);
+		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
 		bio->bi_bdev = where->bdev;
 		bio->bi_end_io = endio;
 		store_io_and_region_in_bio(bio, io, region);
 
 		if (rw & REQ_DISCARD) {
-			discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
-			bio->bi_size = discard_sectors << SECTOR_SHIFT;
-			remaining -= discard_sectors;
+			num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
+			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
+			remaining -= num_sectors;
+		} else if (rw & REQ_WRITE_SAME) {
+			/*
+			 * WRITE SAME only uses a single page.
+			 */
+			dp->get_page(dp, &page, &len, &offset);
+			bio_add_page(bio, page, logical_block_size, offset);
+			num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
+			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
+
+			offset = 0;
+			remaining -= num_sectors;
+			dp->next_page(dp);
 		} else while (remaining) {
 			/*
 			 * Try and add as many pages as possible.
@@ -373,6 +388,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	 */
 	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
 	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
+	DECLARE_COMPLETION_ONSTACK(wait);
 
 	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
@@ -381,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 
 	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
-	io->sleeper = current;
+	io->wait = &wait;
 	io->client = client;
 
 	io->vma_invalidate_address = dp->vma_invalidate_address;
@@ -389,15 +405,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 
 	dispatch_io(rw, num_regions, where, dp, io, 1);
 
-	while (1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-
-		if (!atomic_read(&io->count))
-			break;
-
-		io_schedule();
-	}
-	set_current_state(TASK_RUNNING);
+	wait_for_completion_io(&wait);
 
 	if (error_bits)
 		*error_bits = io->error_bits;
@@ -420,7 +428,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
-	io->sleeper = NULL;
+	io->wait = NULL;
 	io->client = client;
 	io->callback = fn;
 	io->context = context;
@@ -445,8 +453,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
 		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
 		break;
 
-	case DM_IO_BVEC:
-		bvec_dp_init(dp, io_req->mem.ptr.bvec);
+	case DM_IO_BIO:
+		bio_dp_init(dp, io_req->mem.ptr.bio);
 		break;
 
 	case DM_IO_VMA:
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afd95986d09..51521429fb5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,6 +36,14 @@ struct hash_cell {
 	struct dm_table *new_map;
 };
 
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+	int undefined__;
+};
+
 struct vers_iter {
     size_t param_size;
     struct dm_target_versions *vers, *old_vers;
@@ -49,7 +57,7 @@ struct vers_iter {
 static struct list_head _name_buckets[NUM_BUCKETS];
 static struct list_head _uuid_buckets[NUM_BUCKETS];
 
-static void dm_hash_remove_all(int keep_open_devices);
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
 
 /*
  * Guards access to both hash tables.
@@ -78,7 +86,7 @@ static int dm_hash_init(void)
 
 static void dm_hash_exit(void)
 {
-	dm_hash_remove_all(0);
+	dm_hash_remove_all(false, false, false);
 }
 
 /*-----------------------------------------------------------------
@@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
 	return -EBUSY;
 }
 
-static void __hash_remove(struct hash_cell *hc)
+static struct dm_table *__hash_remove(struct hash_cell *hc)
 {
 	struct dm_table *table;
+	int srcu_idx;
 
 	/* remove from the dev hash */
 	list_del(&hc->uuid_list);
@@ -253,23 +262,26 @@ static void __hash_remove(struct hash_cell *hc)
 	dm_set_mdptr(hc->md, NULL);
 	mutex_unlock(&dm_hash_cells_mutex);
 
-	table = dm_get_live_table(hc->md);
-	if (table) {
+	table = dm_get_live_table(hc->md, &srcu_idx);
+	if (table)
 		dm_table_event(table);
-		dm_table_put(table);
-	}
+	dm_put_live_table(hc->md, srcu_idx);
 
+	table = NULL;
 	if (hc->new_map)
-		dm_table_destroy(hc->new_map);
+		table = hc->new_map;
 	dm_put(hc->md);
 	free_cell(hc);
+
+	return table;
 }
 
-static void dm_hash_remove_all(int keep_open_devices)
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
 {
 	int i, dev_skipped;
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	struct dm_table *t;
 
 retry:
 	dev_skipped = 0;
@@ -281,16 +293,21 @@ retry:
 			md = hc->md;
 			dm_get(md);
 
-			if (keep_open_devices && dm_lock_for_deletion(md)) {
+			if (keep_open_devices &&
+			    dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
 				dm_put(md);
 				dev_skipped++;
 				continue;
 			}
 
-			__hash_remove(hc);
+			t = __hash_remove(hc);
 
 			up_write(&_hash_lock);
 
+			if (t) {
+				dm_sync_table(md);
+				dm_table_destroy(t);
+			}
 			dm_put(md);
 			if (likely(keep_open_devices))
 				dm_destroy(md);
@@ -356,6 +373,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	struct dm_table *table;
 	struct mapped_device *md;
 	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+	int srcu_idx;
 
 	/*
 	 * duplicate new.
@@ -418,11 +436,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	/*
 	 * Wake up any dm event waiters.
 	 */
-	table = dm_get_live_table(hc->md);
-	if (table) {
+	table = dm_get_live_table(hc->md, &srcu_idx);
+	if (table)
 		dm_table_event(table);
-		dm_table_put(table);
-	}
+	dm_put_live_table(hc->md, srcu_idx);
 
 	if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -434,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	return md;
 }
 
+void dm_deferred_remove(void)
+{
+	dm_hash_remove_all(true, false, true);
+}
+
 /*-----------------------------------------------------------------
  * Implementation of the ioctl commands
  *---------------------------------------------------------------*/
@@ -445,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
 
 static int remove_all(struct dm_ioctl *param, size_t param_size)
 {
-	dm_hash_remove_all(1);
+	dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
 	param->data_size = 0;
 	return 0;
 }
@@ -620,11 +642,14 @@ static int check_name(const char *name)
  * _hash_lock without first calling dm_table_put, because dm_table_destroy
  * waits for this dm_table_put and could be called under this lock.
  */
-static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
 {
 	struct hash_cell *hc;
 	struct dm_table *table = NULL;
 
+	/* increment rcu count, we don't care about the table pointer */
+	dm_get_live_table(md, srcu_idx);
+
 	down_read(&_hash_lock);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
@@ -633,8 +658,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
 	}
 
 	table = hc->new_map;
-	if (table)
-		dm_table_get(table);
 
 out:
 	up_read(&_hash_lock);
@@ -643,10 +666,11 @@ out:
 }
 
 static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
-						      struct dm_ioctl *param)
+						      struct dm_ioctl *param,
+						      int *srcu_idx)
 {
 	return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
-		dm_get_inactive_table(md) : dm_get_live_table(md);
+		dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
 }
 
 /*
@@ -657,6 +681,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 {
 	struct gendisk *disk = dm_disk(md);
 	struct dm_table *table;
+	int srcu_idx;
 
 	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
 			  DM_ACTIVE_PRESENT_FLAG);
@@ -664,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	if (dm_suspended_md(md))
 		param->flags |= DM_SUSPEND_FLAG;
 
+	if (dm_test_deferred_remove_flag(md))
+		param->flags |= DM_DEFERRED_REMOVE;
+
 	param->dev = huge_encode_dev(disk_devt(disk));
 
 	/*
@@ -676,26 +704,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	param->event_nr = dm_get_event_nr(md);
 	param->target_count = 0;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_table(md, &srcu_idx);
 	if (table) {
 		if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
 			if (get_disk_ro(disk))
 				param->flags |= DM_READONLY_FLAG;
 			param->target_count = dm_table_get_num_targets(table);
 		}
-		dm_table_put(table);
 
 		param->flags |= DM_ACTIVE_PRESENT_FLAG;
 	}
+	dm_put_live_table(md, srcu_idx);
 
 	if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
-		table = dm_get_inactive_table(md);
+		int srcu_idx;
+		table = dm_get_inactive_table(md, &srcu_idx);
 		if (table) {
 			if (!(dm_table_get_mode(table) & FMODE_WRITE))
 				param->flags |= DM_READONLY_FLAG;
 			param->target_count = dm_table_get_num_targets(table);
-			dm_table_put(table);
 		}
+		dm_put_live_table(md, srcu_idx);
 	}
 }
 
@@ -796,6 +825,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 	struct hash_cell *hc;
 	struct mapped_device *md;
 	int r;
+	struct dm_table *t;
 
 	down_write(&_hash_lock);
 	hc = __find_device_hash_cell(param);
@@ -811,17 +841,29 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 	/*
 	 * Ensure the device is not open and nothing further can open it.
 	 */
-	r = dm_lock_for_deletion(md);
+	r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
 	if (r) {
+		if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
+			up_write(&_hash_lock);
+			dm_put(md);
+			return 0;
+		}
 		DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
 		up_write(&_hash_lock);
 		dm_put(md);
 		return r;
 	}
 
-	__hash_remove(hc);
+	t = __hash_remove(hc);
 	up_write(&_hash_lock);
 
+	if (t) {
+		dm_sync_table(md);
+		dm_table_destroy(t);
+	}
+
+	param->flags &= ~DM_DEFERRED_REMOVE;
+
 	if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
 		param->flags |= DM_UEVENT_GENERATED_FLAG;
 
@@ -851,7 +893,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
 	if (new_data < param->data ||
-	    invalid_str(new_data, (void *) param + param_size) ||
+	    invalid_str(new_data, (void *) param + param_size) || !*new_data ||
 	    strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
 		DMWARN("Invalid new mapped device name or uuid string supplied.");
 		return -EINVAL;
@@ -986,6 +1028,7 @@ static int do_resume(struct dm_ioctl *param)
 
 		old_map = dm_swap_table(md, new_map);
 		if (IS_ERR(old_map)) {
+			dm_sync_table(md);
 			dm_table_destroy(new_map);
 			dm_put(md);
 			return PTR_ERR(old_map);
@@ -1003,6 +1046,10 @@ static int do_resume(struct dm_ioctl *param)
 			param->flags |= DM_UEVENT_GENERATED_FLAG;
 	}
 
+	/*
+	 * Since dm_swap_table synchronizes RCU, nobody should be in
+	 * read-side critical section already.
+	 */
 	if (old_map)
 		dm_table_destroy(old_map);
 
@@ -1067,6 +1114,7 @@ static void retrieve_status(struct dm_table *table,
 	num_targets = dm_table_get_num_targets(table);
 	for (i = 0; i < num_targets; i++) {
 		struct dm_target *ti = dm_table_get_target(table, i);
+		size_t l;
 
 		remaining = len - (outptr - outbuf);
 		if (remaining <= sizeof(struct dm_target_spec)) {
@@ -1093,14 +1141,17 @@ static void retrieve_status(struct dm_table *table,
 		if (ti->type->status) {
 			if (param->flags & DM_NOFLUSH_FLAG)
 				status_flags |= DM_STATUS_NOFLUSH_FLAG;
-			if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
-				param->flags |= DM_BUFFER_FULL_FLAG;
-				break;
-			}
+			ti->type->status(ti, type, status_flags, outptr, remaining);
 		} else
 			outptr[0] = '\0';
 
-		outptr += strlen(outptr) + 1;
+		l = strlen(outptr) + 1;
+		if (l == remaining) {
+			param->flags |= DM_BUFFER_FULL_FLAG;
+			break;
+		}
+
+		outptr += l;
 		used = param->data_start + (outptr - outbuf);
 
 		outptr = align_ptr(outptr);
@@ -1121,6 +1172,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	int r = 0;
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1141,11 +1193,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	 */
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_status(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 out:
 	dm_put(md);
@@ -1217,7 +1268,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
 	struct hash_cell *hc;
-	struct dm_table *t;
+	struct dm_table *t, *old_map = NULL;
 	struct mapped_device *md;
 	struct target_type *immutable_target_type;
 
@@ -1227,44 +1278,37 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 
 	r = dm_table_create(&t, get_mode(param), param->target_count, md);
 	if (r)
-		goto out;
+		goto err;
 
+	/* Protect md->type and md->queue against concurrent table loads. */
+	dm_lock_md_type(md);
 	r = populate_table(t, param, param_size);
-	if (r) {
-		dm_table_destroy(t);
-		goto out;
-	}
+	if (r)
+		goto err_unlock_md_type;
 
 	immutable_target_type = dm_get_immutable_target_type(md);
 	if (immutable_target_type &&
 	    (immutable_target_type != dm_table_get_immutable_target_type(t))) {
 		DMWARN("can't replace immutable target type %s",
 		       immutable_target_type->name);
-		dm_table_destroy(t);
 		r = -EINVAL;
-		goto out;
+		goto err_unlock_md_type;
 	}
 
-	/* Protect md->type and md->queue against concurrent table loads. */
-	dm_lock_md_type(md);
 	if (dm_get_md_type(md) == DM_TYPE_NONE)
 		/* Initial table load: acquire type of table. */
 		dm_set_md_type(md, dm_table_get_type(t));
 	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
 		DMWARN("can't change device type after initial table load.");
-		dm_table_destroy(t);
-		dm_unlock_md_type(md);
 		r = -EINVAL;
-		goto out;
+		goto err_unlock_md_type;
 	}
 
 	/* setup md->queue to reflect md's type (may block) */
 	r = dm_setup_md_queue(md);
 	if (r) {
 		DMWARN("unable to set up device queue for new table.");
-		dm_table_destroy(t);
-		dm_unlock_md_type(md);
-		goto out;
+		goto err_unlock_md_type;
 	}
 	dm_unlock_md_type(md);
 
@@ -1273,21 +1317,33 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
 		DMWARN("device has been removed from the dev hash table.");
-		dm_table_destroy(t);
 		up_write(&_hash_lock);
 		r = -ENXIO;
-		goto out;
+		goto err_destroy_table;
 	}
 
 	if (hc->new_map)
-		dm_table_destroy(hc->new_map);
+		old_map = hc->new_map;
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
 	param->flags |= DM_INACTIVE_PRESENT_FLAG;
 	__dev_status(md, param);
 
-out:
+	if (old_map) {
+		dm_sync_table(md);
+		dm_table_destroy(old_map);
+	}
+
+	dm_put(md);
+
+	return 0;
+
+err_unlock_md_type:
+	dm_unlock_md_type(md);
+err_destroy_table:
+	dm_table_destroy(t);
+err:
 	dm_put(md);
 
 	return r;
@@ -1297,6 +1353,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 {
 	struct hash_cell *hc;
 	struct mapped_device *md;
+	struct dm_table *old_map = NULL;
 
 	down_write(&_hash_lock);
 
@@ -1308,7 +1365,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_destroy(hc->new_map);
+		old_map = hc->new_map;
 		hc->new_map = NULL;
 	}
 
@@ -1317,6 +1374,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	__dev_status(hc->md, param);
 	md = hc->md;
 	up_write(&_hash_lock);
+	if (old_map) {
+		dm_sync_table(md);
+		dm_table_destroy(old_map);
+	}
 	dm_put(md);
 
 	return 0;
@@ -1366,6 +1427,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1373,11 +1435,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_deps(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 	dm_put(md);
 
@@ -1392,6 +1453,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1399,11 +1461,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 
 	__dev_status(md, param);
 
-	table = dm_get_live_or_inactive_table(md, param);
-	if (table) {
+	table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+	if (table)
 		retrieve_status(table, param, param_size);
-		dm_table_put(table);
-	}
+	dm_put_live_table(md, srcu_idx);
 
 	dm_put(md);
 
@@ -1411,6 +1472,36 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 }
 
 /*
+ * Process device-mapper dependent messages.  Messages prefixed with '@'
+ * are processed by the DM core.  All others are delivered to the target.
+ * Returns a number <= 1 if message was processed by device mapper.
+ * Returns 2 if message should be delivered to the target.
+ */
+static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
+			  char *result, unsigned maxlen)
+{
+	int r;
+
+	if (**argv != '@')
+		return 2; /* no '@' prefix, deliver to target */
+
+	if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
+		if (argc != 1) {
+			DMERR("Invalid arguments for @cancel_deferred_remove");
+			return -EINVAL;
+		}
+		return dm_cancel_deferred_remove(md);
+	}
+
+	r = dm_stats_message(md, argc, argv, result, maxlen);
+	if (r < 2)
+		return r;
+
+	DMERR("Unsupported message sent to DM core: %s", argv[0]);
+	return -EINVAL;
+}
+
+/*
  * Pass a message to the target that's at the supplied device offset.
  */
 static int target_message(struct dm_ioctl *param, size_t param_size)
@@ -1421,6 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	struct dm_table *table;
 	struct dm_target *ti;
 	struct dm_target_msg *tmsg = (void *) param + param->data_start;
+	size_t maxlen;
+	char *result = get_result_buffer(param, param_size, &maxlen);
+	int srcu_idx;
 
 	md = find_device(param);
 	if (!md)
@@ -1444,10 +1538,14 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 		goto out_argv;
 	}
 
-	table = dm_get_live_table(md);
-	if (!table)
+	r = message_for_md(md, argc, argv, result, maxlen);
+	if (r <= 1)
 		goto out_argv;
 
+	table = dm_get_live_table(md, &srcu_idx);
+	if (!table)
+		goto out_table;
+
 	if (dm_deleting_md(md)) {
 		r = -ENXIO;
 		goto out_table;
@@ -1465,48 +1563,72 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	}
 
  out_table:
-	dm_table_put(table);
+	dm_put_live_table(md, srcu_idx);
  out_argv:
 	kfree(argv);
  out:
-	param->data_size = 0;
+	if (r >= 0)
+		__dev_status(md, param);
+
+	if (r == 1) {
+		param->flags |= DM_DATA_OUT_FLAG;
+		if (dm_message_test_buffer_overflow(result, maxlen))
+			param->flags |= DM_BUFFER_FULL_FLAG;
+		else
+			param->data_size = param->data_start + strlen(result) + 1;
+		r = 0;
+	}
+
 	dm_put(md);
 	return r;
 }
 
+/*
+ * The ioctl parameter block consists of two parts, a dm_ioctl struct
+ * followed by a data buffer.  This flag is set if the second part,
+ * which has a variable size, is not used by the function processing
+ * the ioctl.
+ */
+#define IOCTL_FLAGS_NO_PARAMS	1
+
 /*-----------------------------------------------------------------
  * Implementation of open/close/ioctl on the special char
  * device.
  *---------------------------------------------------------------*/
-static ioctl_fn lookup_ioctl(unsigned int cmd)
+static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
 {
 	static struct {
 		int cmd;
+		int flags;
 		ioctl_fn fn;
 	} _ioctls[] = {
-		{DM_VERSION_CMD, NULL},	/* version is dealt with elsewhere */
-		{DM_REMOVE_ALL_CMD, remove_all},
-		{DM_LIST_DEVICES_CMD, list_devices},
-
-		{DM_DEV_CREATE_CMD, dev_create},
-		{DM_DEV_REMOVE_CMD, dev_remove},
-		{DM_DEV_RENAME_CMD, dev_rename},
-		{DM_DEV_SUSPEND_CMD, dev_suspend},
-		{DM_DEV_STATUS_CMD, dev_status},
-		{DM_DEV_WAIT_CMD, dev_wait},
-
-		{DM_TABLE_LOAD_CMD, table_load},
-		{DM_TABLE_CLEAR_CMD, table_clear},
-		{DM_TABLE_DEPS_CMD, table_deps},
-		{DM_TABLE_STATUS_CMD, table_status},
-
-		{DM_LIST_VERSIONS_CMD, list_versions},
-
-		{DM_TARGET_MSG_CMD, target_message},
-		{DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry}
+		{DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */
+		{DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all},
+		{DM_LIST_DEVICES_CMD, 0, list_devices},
+
+		{DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create},
+		{DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove},
+		{DM_DEV_RENAME_CMD, 0, dev_rename},
+		{DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend},
+		{DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status},
+		{DM_DEV_WAIT_CMD, 0, dev_wait},
+
+		{DM_TABLE_LOAD_CMD, 0, table_load},
+		{DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear},
+		{DM_TABLE_DEPS_CMD, 0, table_deps},
+		{DM_TABLE_STATUS_CMD, 0, table_status},
+
+		{DM_LIST_VERSIONS_CMD, 0, list_versions},
+
+		{DM_TARGET_MSG_CMD, 0, target_message},
+		{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
 	};
 
-	return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
+	if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
+		return NULL;
+
+	*ioctl_flags = _ioctls[cmd].flags;
+	return _ioctls[cmd].fn;
 }
 
 /*
@@ -1543,40 +1665,93 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 	return r;
 }
 
-static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
+#define DM_PARAMS_KMALLOC	0x0001	/* Params alloced with kmalloc */
+#define DM_PARAMS_VMALLOC	0x0002	/* Params alloced with vmalloc */
+#define DM_WIPE_BUFFER		0x0010	/* Wipe input buffer before returning from ioctl */
+
+static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
 {
-	struct dm_ioctl tmp, *dmi;
+	if (param_flags & DM_WIPE_BUFFER)
+		memset(param, 0, param_size);
+
+	if (param_flags & DM_PARAMS_KMALLOC)
+		kfree(param);
+	if (param_flags & DM_PARAMS_VMALLOC)
+		vfree(param);
+}
+
+static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
+		       int ioctl_flags,
+		       struct dm_ioctl **param, int *param_flags)
+{
+	struct dm_ioctl *dmi;
 	int secure_data;
+	const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data);
 
-	if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
+	if (copy_from_user(param_kernel, user, minimum_data_size))
 		return -EFAULT;
 
-	if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
+	if (param_kernel->data_size < minimum_data_size)
 		return -EINVAL;
 
-	secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
+	secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG;
+
+	*param_flags = secure_data ? DM_WIPE_BUFFER : 0;
+
+	if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) {
+		dmi = param_kernel;
+		dmi->data_size = minimum_data_size;
+		goto data_copied;
+	}
+
+	/*
+	 * Try to avoid low memory issues when a device is suspended.
+	 * Use kmalloc() rather than vmalloc() when we can.
+	 */
+	dmi = NULL;
+	if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
+		dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+		if (dmi)
+			*param_flags |= DM_PARAMS_KMALLOC;
+	}
 
-	dmi = vmalloc(tmp.data_size);
 	if (!dmi) {
-		if (secure_data && clear_user(user, tmp.data_size))
+		unsigned noio_flag;
+		noio_flag = memalloc_noio_save();
+		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
+		memalloc_noio_restore(noio_flag);
+		if (dmi)
+			*param_flags |= DM_PARAMS_VMALLOC;
+	}
+
+	if (!dmi) {
+		if (secure_data && clear_user(user, param_kernel->data_size))
 			return -EFAULT;
 		return -ENOMEM;
 	}
 
-	if (copy_from_user(dmi, user, tmp.data_size))
+	if (copy_from_user(dmi, user, param_kernel->data_size))
+		goto bad;
+
+data_copied:
+	/*
+	 * Abort if something changed the ioctl data while it was being copied.
+	 */
+	if (dmi->data_size != param_kernel->data_size) {
+		DMERR("rejecting ioctl: data size modified while processing parameters");
 		goto bad;
+	}
 
 	/* Wipe the user buffer so we do not return it to userspace */
-	if (secure_data && clear_user(user, tmp.data_size))
+	if (secure_data && clear_user(user, param_kernel->data_size))
 		goto bad;
 
 	*param = dmi;
 	return 0;
 
 bad:
-	if (secure_data)
-		memset(dmi, 0, tmp.data_size);
-	vfree(dmi);
+	free_params(dmi, param_kernel->data_size, *param_flags);
+
 	return -EFAULT;
 }
 
@@ -1586,6 +1761,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 	param->flags &= ~DM_BUFFER_FULL_FLAG;
 	param->flags &= ~DM_UEVENT_GENERATED_FLAG;
 	param->flags &= ~DM_SECURE_DATA_FLAG;
+	param->flags &= ~DM_DATA_OUT_FLAG;
 
 	/* Ignores parameters */
 	if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1613,11 +1789,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 {
 	int r = 0;
-	int wipe_buffer;
+	int ioctl_flags;
+	int param_flags;
 	unsigned int cmd;
 	struct dm_ioctl *uninitialized_var(param);
 	ioctl_fn fn = NULL;
 	size_t input_param_size;
+	struct dm_ioctl param_kernel;
 
 	/* only root can play with this */
 	if (!capable(CAP_SYS_ADMIN))
@@ -1642,31 +1820,21 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	if (cmd == DM_VERSION_CMD)
 		return 0;
 
-	fn = lookup_ioctl(cmd);
+	fn = lookup_ioctl(cmd, &ioctl_flags);
 	if (!fn) {
 		DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
 		return -ENOTTY;
 	}
 
 	/*
-	 * Trying to avoid low memory issues when a device is
-	 * suspended.
-	 */
-	current->flags |= PF_MEMALLOC;
-
-	/*
 	 * Copy the parameters into kernel space.
 	 */
-	r = copy_params(user, &param);
-
-	current->flags &= ~PF_MEMALLOC;
+	r = copy_params(user, &param_kernel, ioctl_flags, &param, &param_flags);
 
 	if (r)
 		return r;
 
 	input_param_size = param->data_size;
-	wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
-
 	r = validate_params(cmd, param);
 	if (r)
 		goto out;
@@ -1674,6 +1842,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 	param->data_size = sizeof(*param);
 	r = fn(param, input_param_size);
 
+	if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
+	    unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
+		DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);
+
 	/*
 	 * Copy the results back to userland.
 	 */
@@ -1681,10 +1853,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 		r = -EFAULT;
 
 out:
-	if (wipe_buffer)
-		memset(param, 0, input_param_size);
-
-	vfree(param);
+	free_params(param, input_param_size, param_flags);
 	return r;
 }
 
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index bed444c93d8..3a7cade5e27 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
 
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
 	struct workqueue_struct *kcopyd_wq;
 	struct work_struct kcopyd_work;
 
+	struct dm_kcopyd_throttle *throttle;
+
 /*
  * We maintain three lists of jobs:
  *
@@ -68,6 +71,117 @@ struct dm_kcopyd_client {
 
 static struct page_list zero_page_list;
 
+static DEFINE_SPINLOCK(throttle_spinlock);
+
+/*
+ * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
+ * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
+ * by 2.
+ */
+#define ACCOUNT_INTERVAL_SHIFT		SHIFT_HZ
+
+/*
+ * Sleep this number of milliseconds.
+ *
+ * The value was decided experimentally.
+ * Smaller values seem to cause an increased copy rate above the limit.
+ * The reason for this is unknown but possibly due to jiffies rounding errors
+ * or read/write cache inside the disk.
+ */
+#define SLEEP_MSEC			100
+
+/*
+ * Maximum number of sleep events. There is a theoretical livelock if more
+ * kcopyd clients do work simultaneously which this limit avoids.
+ */
+#define MAX_SLEEPS			10
+
+static void io_job_start(struct dm_kcopyd_throttle *t)
+{
+	unsigned throttle, now, difference;
+	int slept = 0, skew;
+
+	if (unlikely(!t))
+		return;
+
+try_again:
+	spin_lock_irq(&throttle_spinlock);
+
+	throttle = ACCESS_ONCE(t->throttle);
+
+	if (likely(throttle >= 100))
+		goto skip_limit;
+
+	now = jiffies;
+	difference = now - t->last_jiffies;
+	t->last_jiffies = now;
+	if (t->num_io_jobs)
+		t->io_period += difference;
+	t->total_period += difference;
+
+	/*
+	 * Maintain sane values if we got a temporary overflow.
+	 */
+	if (unlikely(t->io_period > t->total_period))
+		t->io_period = t->total_period;
+
+	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
+		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
+		t->total_period >>= shift;
+		t->io_period >>= shift;
+	}
+
+	skew = t->io_period - throttle * t->total_period / 100;
+
+	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
+		slept++;
+		spin_unlock_irq(&throttle_spinlock);
+		msleep(SLEEP_MSEC);
+		goto try_again;
+	}
+
+skip_limit:
+	t->num_io_jobs++;
+
+	spin_unlock_irq(&throttle_spinlock);
+}
+
+static void io_job_finish(struct dm_kcopyd_throttle *t)
+{
+	unsigned long flags;
+
+	if (unlikely(!t))
+		return;
+
+	spin_lock_irqsave(&throttle_spinlock, flags);
+
+	t->num_io_jobs--;
+
+	if (likely(ACCESS_ONCE(t->throttle) >= 100))
+		goto skip_limit;
+
+	if (!t->num_io_jobs) {
+		unsigned now, difference;
+
+		now = jiffies;
+		difference = now - t->last_jiffies;
+		t->last_jiffies = now;
+
+		t->io_period += difference;
+		t->total_period += difference;
+
+		/*
+		 * Maintain sane values if we got a temporary overflow.
+		 */
+		if (unlikely(t->io_period > t->total_period))
+			t->io_period = t->total_period;
+	}
+
+skip_limit:
+	spin_unlock_irqrestore(&throttle_spinlock, flags);
+}
+
+
 static void wake(struct dm_kcopyd_client *kc)
 {
 	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -348,8 +462,10 @@ static void complete_io(unsigned long error, void *context)
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
 	struct dm_kcopyd_client *kc = job->kc;
 
+	io_job_finish(kc->throttle);
+
 	if (error) {
-		if (job->rw == WRITE)
+		if (job->rw & WRITE)
 			job->write_err |= error;
 		else
 			job->read_err = 1;
@@ -361,7 +477,7 @@ static void complete_io(unsigned long error, void *context)
 		}
 	}
 
-	if (job->rw == WRITE)
+	if (job->rw & WRITE)
 		push(&kc->complete_jobs, job);
 
 	else {
@@ -389,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
+	io_job_start(job->kc->throttle);
+
 	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
 	else
@@ -432,7 +550,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
 
 		if (r < 0) {
 			/* error this rogue job */
-			if (job->rw == WRITE)
+			if (job->rw & WRITE)
 				job->write_err = (unsigned long) -1L;
 			else
 				job->read_err = 1;
@@ -585,6 +703,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
 	struct kcopyd_job *job;
+	int i;
 
 	/*
 	 * Allocate an array of jobs consisting of one master job
@@ -611,7 +730,16 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 		memset(&job->source, 0, sizeof job->source);
 		job->source.count = job->dests[0].count;
 		job->pages = &zero_page_list;
-		job->rw = WRITE;
+
+		/*
+		 * Use WRITE SAME to optimize zeroing if all dests support it.
+		 */
+		job->rw = WRITE | REQ_WRITE_SAME;
+		for (i = 0; i < job->num_dests; i++)
+			if (!bdev_write_same(job->dests[i].bdev)) {
+				job->rw = WRITE;
+				break;
+			}
 	}
 
 	job->fn = fn;
@@ -685,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 /*-----------------------------------------------------------------
  * Client setup
  *---------------------------------------------------------------*/
-struct dm_kcopyd_client *dm_kcopyd_client_create(void)
+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
 {
 	int r = -ENOMEM;
 	struct dm_kcopyd_client *kc;
@@ -698,14 +826,14 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void)
 	INIT_LIST_HEAD(&kc->complete_jobs);
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
+	kc->throttle = throttle;
 
 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!kc->job_pool)
 		goto bad_slab;
 
 	INIT_WORK(&kc->kcopyd_work, do_work);
-	kc->kcopyd_wq = alloc_workqueue("kcopyd",
-					WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
 	if (!kc->kcopyd_wq)
 		goto bad_workqueue;
 
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 1bf19a93eef..53e848c1093 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,8 +53,9 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	ti->num_flush_requests = 1;
-	ti->num_discard_requests = 1;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->num_write_same_bios = 1;
 	ti->private = lc;
 	return 0;
 
@@ -84,19 +85,19 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
 
 	bio->bi_bdev = lc->dev->bdev;
 	if (bio_sectors(bio))
-		bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
+		bio->bi_iter.bi_sector =
+			linear_map_sector(ti, bio->bi_iter.bi_sector);
 }
 
-static int linear_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int linear_map(struct dm_target *ti, struct bio *bio)
 {
 	linear_map_bio(ti, bio);
 
 	return DM_MAPIO_REMAPPED;
 }
 
-static int linear_status(struct dm_target *ti, status_type_t type,
-			 unsigned status_flags, char *result, unsigned maxlen)
+static void linear_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 
@@ -110,7 +111,6 @@ static int linear_status(struct dm_target *ti, status_type_t type,
 				(unsigned long long)lc->start);
 		break;
 	}
-	return 0;
 }
 
 static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
@@ -155,7 +155,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 
 static struct target_type linear_target = {
 	.name   = "linear",
-	.version = {1, 1, 0},
+	.version = {1, 2, 1},
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 9429159d9ee..b953db6cc22 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -10,10 +10,11 @@
 #include <linux/device-mapper.h>
 #include <linux/dm-log-userspace.h>
 #include <linux/module.h>
+#include <linux/workqueue.h>
 
 #include "dm-log-userspace-transfer.h"
 
-#define DM_LOG_USERSPACE_VSN "1.1.0"
+#define DM_LOG_USERSPACE_VSN "1.3.0"
 
 struct flush_entry {
 	int type;
@@ -58,6 +59,18 @@ struct log_c {
 	spinlock_t flush_lock;
 	struct list_head mark_list;
 	struct list_head clear_list;
+
+	/*
+	 * Workqueue for flush of clear region requests.
+	 */
+	struct workqueue_struct *dmlog_wq;
+	struct delayed_work flush_log_work;
+	atomic_t sched_flush;
+
+	/*
+	 * Combine userspace flush and mark requests for efficiency.
+	 */
+	uint32_t integrated_flush;
 };
 
 static mempool_t *flush_entry_pool;
@@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti,
 
 	*ctr_str = NULL;
 
+	/*
+	 * Determine overall size of the string.
+	 */
 	for (i = 0, str_size = 0; i < argc; i++)
 		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
 
@@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti,
 	return str_size;
 }
 
+static void do_flush(struct work_struct *work)
+{
+	int r;
+	struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);
+
+	atomic_set(&lc->sched_flush, 0);
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL);
+
+	if (r)
+		dm_table_event(lc->ti->table);
+}
+
 /*
  * userspace_ctr
  *
  * argv contains:
- *	<UUID> <other args>
- * Where 'other args' is the userspace implementation specific log
- * arguments.  An example might be:
- *	<UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync]
+ *	<UUID> [integrated_flush] <other args>
+ * Where 'other args' are the userspace implementation-specific log
+ * arguments.
+ *
+ * Example:
+ *	<UUID> [integrated_flush] clustered-disk <arg count> <log dev>
+ *	<region_size> [[no]sync]
+ *
+ * This module strips off the <UUID> and uses it for identification
+ * purposes when communicating with userspace about a log.
  *
- * So, this module will strip off the <UUID> for identification purposes
- * when communicating with userspace about a log; but will pass on everything
- * else.
+ * If integrated_flush is defined, the kernel combines flush
+ * and mark requests.
+ *
+ * The rest of the line, beginning with 'clustered-disk', is passed
+ * to the userspace ctr function.
  */
 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 			 unsigned argc, char **argv)
@@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		return -EINVAL;
 	}
 
+	lc->usr_argc = argc;
+
 	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+	argc--;
+	argv++;
 	spin_lock_init(&lc->flush_lock);
 	INIT_LIST_HEAD(&lc->mark_list);
 	INIT_LIST_HEAD(&lc->clear_list);
 
-	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+	if (!strcasecmp(argv[0], "integrated_flush")) {
+		lc->integrated_flush = 1;
+		argc--;
+		argv++;
+	}
+
+	str_size = build_constructor_string(ti, argc, argv, &ctr_str);
 	if (str_size < 0) {
 		kfree(lc);
 		return str_size;
@@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 			DMERR("Failed to register %s with device-mapper",
 			      devices_rdata);
 	}
+
+	if (lc->integrated_flush) {
+		lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0);
+		if (!lc->dmlog_wq) {
+			DMERR("couldn't start dmlogd");
+			r = -ENOMEM;
+			goto out;
+		}
+
+		INIT_DELAYED_WORK(&lc->flush_log_work, do_flush);
+		atomic_set(&lc->sched_flush, 0);
+	}
+
 out:
 	kfree(devices_rdata);
 	if (r) {
@@ -253,7 +313,6 @@ out:
 		kfree(ctr_str);
 	} else {
 		lc->usr_argv_str = ctr_str;
-		lc->usr_argc = argc;
 		log->context = lc;
 	}
 
@@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log)
 {
 	struct log_c *lc = log->context;
 
+	if (lc->integrated_flush) {
+		/* flush workqueue */
+		if (atomic_read(&lc->sched_flush))
+			flush_delayed_work(&lc->flush_log_work);
+
+		destroy_workqueue(lc->dmlog_wq);
+	}
+
 	(void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
-				 NULL, 0,
-				 NULL, NULL);
+				    NULL, 0, NULL, NULL);
 
 	if (lc->log_dev)
 		dm_put_device(lc->ti, lc->log_dev);
@@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
 	struct log_c *lc = log->context;
 
 	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
-				 NULL, 0,
-				 NULL, NULL);
+				 NULL, 0, NULL, NULL);
 
 	return r;
 }
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
+	/*
+	 * Run planned flush earlier.
+	 */
+	if (lc->integrated_flush && atomic_read(&lc->sched_flush))
+		flush_delayed_work(&lc->flush_log_work);
+
 	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
-				 NULL, 0,
-				 NULL, NULL);
+				 NULL, 0, NULL, NULL);
 
 	return r;
 }
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log)
 
 	lc->in_sync_hint = 0;
 	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
-				 NULL, 0,
-				 NULL, NULL);
+				 NULL, 0, NULL, NULL);
 
 	return r;
 }
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
 	return r;
 }
 
-static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
+			  int flush_with_payload)
 {
 	int r = 0;
 	int count;
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
 				break;
 		}
 
-		r = userspace_do_request(lc, lc->uuid, type,
-					 (char *)(group),
-					 count * sizeof(uint64_t),
-					 NULL, NULL);
-		if (r) {
-			/* Group send failed.  Attempt one-by-one. */
-			list_splice_init(&tmp_list, flush_list);
-			r = flush_one_by_one(lc, flush_list);
-			break;
+		if (flush_with_payload) {
+			r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+						 (char *)(group),
+						 count * sizeof(uint64_t),
+						 NULL, NULL);
+			/*
+			 * Integrated flush failed.
+			 */
+			if (r)
+				break;
+		} else {
+			r = userspace_do_request(lc, lc->uuid, type,
+						 (char *)(group),
+						 count * sizeof(uint64_t),
+						 NULL, NULL);
+			if (r) {
+				/*
+				 * Group send failed.  Attempt one-by-one.
+				 */
+				list_splice_init(&tmp_list, flush_list);
+				r = flush_one_by_one(lc, flush_list);
+				break;
+			}
 		}
 	}
 
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log)
 	struct log_c *lc = log->context;
 	LIST_HEAD(mark_list);
 	LIST_HEAD(clear_list);
+	int mark_list_is_empty;
+	int clear_list_is_empty;
 	struct flush_entry *fe, *tmp_fe;
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log)
 	list_splice_init(&lc->clear_list, &clear_list);
 	spin_unlock_irqrestore(&lc->flush_lock, flags);
 
-	if (list_empty(&mark_list) && list_empty(&clear_list))
+	mark_list_is_empty = list_empty(&mark_list);
+	clear_list_is_empty = list_empty(&clear_list);
+
+	if (mark_list_is_empty && clear_list_is_empty)
 		return 0;
 
-	r = flush_by_group(lc, &mark_list);
+	r = flush_by_group(lc, &clear_list, 0);
 	if (r)
-		goto fail;
+		goto out;
 
-	r = flush_by_group(lc, &clear_list);
+	if (!lc->integrated_flush) {
+		r = flush_by_group(lc, &mark_list, 0);
+		if (r)
+			goto out;
+		r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+					 NULL, 0, NULL, NULL);
+		goto out;
+	}
+
+	/*
+	 * Send integrated flush request with mark_list as payload.
+	 */
+	r = flush_by_group(lc, &mark_list, 1);
 	if (r)
-		goto fail;
+		goto out;
 
-	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
-				 NULL, 0, NULL, NULL);
+	if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
+		/*
+		 * When there are only clear region requests,
+		 * we schedule a flush in the future.
+		 */
+		queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
+		atomic_set(&lc->sched_flush, 1);
+	} else {
+		/*
+		 * Cancel pending flush because we
+		 * have already flushed in mark_region.
+		 */
+		cancel_delayed_work(&lc->flush_log_work);
+		atomic_set(&lc->sched_flush, 0);
+	}
 
-fail:
+out:
 	/*
-	 * We can safely remove these entries, even if failure.
+	 * We can safely remove these entries, even after failure.
 	 * Calling code will receive an error and will know that
 	 * the log facility has failed.
 	 */
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
 
 	rdata_size = sizeof(pkg);
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
-				 NULL, 0,
-				 (char *)&pkg, &rdata_size);
+				 NULL, 0, (char *)&pkg, &rdata_size);
 
 	*region = pkg.r;
 	return (r) ? r : (int)pkg.i;
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
 	pkg.i = (int64_t)in_sync;
 
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
-				 (char *)&pkg, sizeof(pkg),
-				 NULL, NULL);
+				 (char *)&pkg, sizeof(pkg), NULL, NULL);
 
 	/*
 	 * It would be nice to be able to report failures.
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log)
 
 	rdata_size = sizeof(sync_count);
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
-				 NULL, 0,
-				 (char *)&sync_count, &rdata_size);
+				 NULL, 0, (char *)&sync_count, &rdata_size);
 
 	if (r)
 		return 0;
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 	switch (status_type) {
 	case STATUSTYPE_INFO:
 		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
-					 NULL, 0,
-					 result, &sz);
+					 NULL, 0, result, &sz);
 
 		if (r) {
 			sz = 0;
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 		BUG_ON(!table_args); /* There will always be a ' ' */
 		table_args++;
 
-		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
-		       lc->uuid, table_args);
+		DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
+		if (lc->integrated_flush)
+			DMEMIT("integrated_flush ");
+		DMEMIT("%s ", table_args);
 		break;
 	}
 	return (r) ? 0 : (int)sz;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259..b428c0ae63d 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -66,7 +66,7 @@ static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
 	msg->seq = tfr->seq;
 	msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
 
-	r = cn_netlink_send(msg, 0, gfp_any());
+	r = cn_netlink_send(msg, 0, 0, gfp_any());
 
 	return r;
 }
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 573bd04591b..f4167b013d9 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -7,6 +7,7 @@
 
 #include <linux/device-mapper.h>
 
+#include "dm.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -86,15 +87,12 @@ struct multipath {
 	unsigned queue_if_no_path:1;	/* Queue I/O if last path fails? */
 	unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
 	unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+	unsigned pg_init_disabled:1;	/* pg_init is not currently allowed */
 
 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
 	unsigned pg_init_count;		/* Number of times pg_init called */
 	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */
 
-	unsigned queue_size;
-	struct work_struct process_queued_ios;
-	struct list_head queued_ios;
-
 	struct work_struct trigger_event;
 
 	/*
@@ -116,14 +114,12 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-#define MIN_IOS 256	/* Mempool size */
-
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
-static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
+static int __pgpath_busy(struct pgpath *pgpath);
 
 
 /*-----------------------------------------------
@@ -190,19 +186,18 @@ static void free_priority_group(struct priority_group *pg,
 static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
+	unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
 		INIT_LIST_HEAD(&m->priority_groups);
-		INIT_LIST_HEAD(&m->queued_ios);
 		spin_lock_init(&m->lock);
 		m->queue_io = 1;
 		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
-		INIT_WORK(&m->process_queued_ios, process_queued_ios);
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
@@ -255,13 +250,21 @@ static void clear_mapinfo(struct multipath *m, union map_info *info)
  * Path selection
  *-----------------------------------------------*/
 
-static void __pg_init_all_paths(struct multipath *m)
+static int __pg_init_all_paths(struct multipath *m)
 {
 	struct pgpath *pgpath;
 	unsigned long pg_init_delay = 0;
 
+	if (m->pg_init_in_progress || m->pg_init_disabled)
+		return 0;
+
 	m->pg_init_count++;
 	m->pg_init_required = 0;
+
+	/* Check here to reset pg_init_required */
+	if (!m->current_pg)
+		return 0;
+
 	if (m->pg_init_delay_retry)
 		pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
 						 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
@@ -273,6 +276,7 @@ static void __pg_init_all_paths(struct multipath *m)
 				       pg_init_delay))
 			m->pg_init_in_progress++;
 	}
+	return m->pg_init_in_progress;
 }
 
 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
@@ -364,19 +368,26 @@ failed:
  */
 static int __must_push_back(struct multipath *m)
 {
-	return (m->queue_if_no_path != m->saved_queue_if_no_path &&
-		dm_noflush_suspending(m->ti));
+	return (m->queue_if_no_path ||
+		(m->queue_if_no_path != m->saved_queue_if_no_path &&
+		 dm_noflush_suspending(m->ti)));
 }
 
-static int map_io(struct multipath *m, struct request *clone,
-		  union map_info *map_context, unsigned was_queued)
+#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
+
+/*
+ * Map cloned requests
+ */
+static int multipath_map(struct dm_target *ti, struct request *clone,
+			 union map_info *map_context)
 {
-	int r = DM_MAPIO_REMAPPED;
+	struct multipath *m = (struct multipath *) ti->private;
+	int r = DM_MAPIO_REQUEUE;
 	size_t nr_bytes = blk_rq_bytes(clone);
 	unsigned long flags;
 	struct pgpath *pgpath;
 	struct block_device *bdev;
-	struct dm_mpath_io *mpio = map_context->ptr;
+	struct dm_mpath_io *mpio;
 
 	spin_lock_irqsave(&m->lock, flags);
 
@@ -387,35 +398,33 @@ static int map_io(struct multipath *m, struct request *clone,
 
 	pgpath = m->current_pgpath;
 
-	if (was_queued)
-		m->queue_size--;
-
-	if ((pgpath && m->queue_io) ||
-	    (!pgpath && m->queue_if_no_path)) {
-		/* Queue for the daemon to resubmit */
-		list_add_tail(&clone->queuelist, &m->queued_ios);
-		m->queue_size++;
-		if ((m->pg_init_required && !m->pg_init_in_progress) ||
-		    !m->queue_io)
-			queue_work(kmultipathd, &m->process_queued_ios);
-		pgpath = NULL;
-		r = DM_MAPIO_SUBMITTED;
-	} else if (pgpath) {
-		bdev = pgpath->path.dev->bdev;
-		clone->q = bdev_get_queue(bdev);
-		clone->rq_disk = bdev->bd_disk;
-	} else if (__must_push_back(m))
-		r = DM_MAPIO_REQUEUE;
-	else
-		r = -EIO;	/* Failed */
+	if (!pgpath) {
+		if (!__must_push_back(m))
+			r = -EIO;	/* Failed */
+		goto out_unlock;
+	}
+	if (!pg_ready(m)) {
+		__pg_init_all_paths(m);
+		goto out_unlock;
+	}
+	if (set_mapinfo(m, map_context) < 0)
+		/* ENOMEM, requeue */
+		goto out_unlock;
 
+	bdev = pgpath->path.dev->bdev;
+	clone->q = bdev_get_queue(bdev);
+	clone->rq_disk = bdev->bd_disk;
+	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	mpio = map_context->ptr;
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
-
-	if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
-		pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
+	if (pgpath->pg->ps.type->start_io)
+		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
+					      &pgpath->path,
 					      nr_bytes);
+	r = DM_MAPIO_REMAPPED;
 
+out_unlock:
 	spin_unlock_irqrestore(&m->lock, flags);
 
 	return r;
@@ -436,73 +445,12 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
 	else
 		m->saved_queue_if_no_path = queue_if_no_path;
 	m->queue_if_no_path = queue_if_no_path;
-	if (!m->queue_if_no_path && m->queue_size)
-		queue_work(kmultipathd, &m->process_queued_ios);
-
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	return 0;
-}
-
-/*-----------------------------------------------------------------
- * The multipath daemon is responsible for resubmitting queued ios.
- *---------------------------------------------------------------*/
-
-static void dispatch_queued_ios(struct multipath *m)
-{
-	int r;
-	unsigned long flags;
-	union map_info *info;
-	struct request *clone, *n;
-	LIST_HEAD(cl);
-
-	spin_lock_irqsave(&m->lock, flags);
-	list_splice_init(&m->queued_ios, &cl);
-	spin_unlock_irqrestore(&m->lock, flags);
-
-	list_for_each_entry_safe(clone, n, &cl, queuelist) {
-		list_del_init(&clone->queuelist);
-
-		info = dm_get_rq_mapinfo(clone);
-
-		r = map_io(m, clone, info, 1);
-		if (r < 0) {
-			clear_mapinfo(m, info);
-			dm_kill_unmapped_request(clone, r);
-		} else if (r == DM_MAPIO_REMAPPED)
-			dm_dispatch_request(clone);
-		else if (r == DM_MAPIO_REQUEUE) {
-			clear_mapinfo(m, info);
-			dm_requeue_unmapped_request(clone);
-		}
-	}
-}
-
-static void process_queued_ios(struct work_struct *work)
-{
-	struct multipath *m =
-		container_of(work, struct multipath, process_queued_ios);
-	struct pgpath *pgpath = NULL;
-	unsigned must_queue = 1;
-	unsigned long flags;
-
-	spin_lock_irqsave(&m->lock, flags);
-
-	if (!m->current_pgpath)
-		__choose_pgpath(m, 0);
-
-	pgpath = m->current_pgpath;
-
-	if ((pgpath && !m->queue_io) ||
-	    (!pgpath && !m->queue_if_no_path))
-		must_queue = 0;
+	if (!queue_if_no_path)
+		dm_table_run_md_queue_async(m->ti->table);
 
-	if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
-		__pg_init_all_paths(m);
-
-	spin_unlock_irqrestore(&m->lock, flags);
-	if (!must_queue)
-		dispatch_queued_ios(m);
+	return 0;
 }
 
 /*
@@ -905,8 +853,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 		goto bad;
 	}
 
-	ti->num_flush_requests = 1;
-	ti->num_discard_requests = 1;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->num_write_same_bios = 1;
 
 	return 0;
 
@@ -941,10 +890,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
 
 static void flush_multipath_work(struct multipath *m)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->lock, flags);
+	m->pg_init_disabled = 1;
+	spin_unlock_irqrestore(&m->lock, flags);
+
 	flush_workqueue(kmpath_handlerd);
 	multipath_wait_for_pg_init_completion(m);
 	flush_workqueue(kmultipathd);
 	flush_work(&m->trigger_event);
+
+	spin_lock_irqsave(&m->lock, flags);
+	m->pg_init_disabled = 0;
+	spin_unlock_irqrestore(&m->lock, flags);
 }
 
 static void multipath_dtr(struct dm_target *ti)
@@ -956,27 +915,6 @@ static void multipath_dtr(struct dm_target *ti)
 }
 
 /*
- * Map cloned requests
- */
-static int multipath_map(struct dm_target *ti, struct request *clone,
-			 union map_info *map_context)
-{
-	int r;
-	struct multipath *m = (struct multipath *) ti->private;
-
-	if (set_mapinfo(m, map_context) < 0)
-		/* ENOMEM, requeue */
-		return DM_MAPIO_REQUEUE;
-
-	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	r = map_io(m, clone, map_context, 0);
-	if (r < 0 || r == DM_MAPIO_REQUEUE)
-		clear_mapinfo(m, map_context);
-
-	return r;
-}
-
-/*
  * Take a path out of use.
  */
 static int fail_path(struct pgpath *pgpath)
@@ -1016,7 +954,7 @@ out:
  */
 static int reinstate_path(struct pgpath *pgpath)
 {
-	int r = 0;
+	int r = 0, run_queue = 0;
 	unsigned long flags;
 	struct multipath *m = pgpath->pg->m;
 
@@ -1038,9 +976,9 @@ static int reinstate_path(struct pgpath *pgpath)
 
 	pgpath->is_active = 1;
 
-	if (!m->nr_valid_paths++ && m->queue_size) {
+	if (!m->nr_valid_paths++) {
 		m->current_pgpath = NULL;
-		queue_work(kmultipathd, &m->process_queued_ios);
+		run_queue = 1;
 	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
 		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
 			m->pg_init_in_progress++;
@@ -1053,6 +991,8 @@ static int reinstate_path(struct pgpath *pgpath)
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
+	if (run_queue)
+		dm_table_run_md_queue_async(m->ti->table);
 
 	return r;
 }
@@ -1163,7 +1103,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
 
 	spin_lock_irqsave(&m->lock, flags);
 
-	if (m->pg_init_count <= m->pg_init_retries)
+	if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
 		m->pg_init_required = 1;
 	else
 		limit_reached = 1;
@@ -1236,11 +1176,12 @@ static void pg_init_done(void *data, int errors)
 		/* Activations of other paths are still on going */
 		goto out;
 
-	if (!m->pg_init_required)
-		m->queue_io = 0;
-
-	m->pg_init_delay_retry = delay_retry;
-	queue_work(kmultipathd, &m->process_queued_ios);
+	if (m->pg_init_required) {
+		m->pg_init_delay_retry = delay_retry;
+		if (__pg_init_all_paths(m))
+			goto out;
+	}
+	m->queue_io = 0;
 
 	/*
 	 * Wake up any thread waiting to suspend.
@@ -1256,8 +1197,26 @@ static void activate_path(struct work_struct *work)
 	struct pgpath *pgpath =
 		container_of(work, struct pgpath, activate_path.work);
 
-	scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
-				pg_init_done, pgpath);
+	if (pgpath->is_active)
+		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
+				 pg_init_done, pgpath);
+	else
+		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
+}
+
+static int noretry_error(int error)
+{
+	switch (error) {
+	case -EOPNOTSUPP:
+	case -EREMOTEIO:
+	case -EILSEQ:
+	case -ENODATA:
+	case -ENOSPC:
+		return 1;
+	}
+
+	/* Anything else could be a path failure, so should be retried */
+	return 0;
 }
 
 /*
@@ -1283,7 +1242,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
 
-	if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ)
+	if (noretry_error(error))
 		return error;
 
 	if (mpio->pgpath)
@@ -1378,8 +1337,8 @@ static void multipath_resume(struct dm_target *ti)
  *     [priority selector-name num_ps_args [ps_args]*
  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
  */
-static int multipath_status(struct dm_target *ti, status_type_t type,
-			    unsigned status_flags, char *result, unsigned maxlen)
+static void multipath_status(struct dm_target *ti, status_type_t type,
+			     unsigned status_flags, char *result, unsigned maxlen)
 {
 	int sz = 0;
 	unsigned long flags;
@@ -1393,7 +1352,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 
 	/* Features */
 	if (type == STATUSTYPE_INFO)
-		DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
+		DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
 	else {
 		DMEMIT("%u ", m->queue_if_no_path +
 			      (m->pg_init_retries > 0) * 2 +
@@ -1485,8 +1444,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 	}
 
 	spin_unlock_irqrestore(&m->lock, flags);
-
-	return 0;
 }
 
 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
@@ -1514,7 +1471,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (argc != 2) {
-		DMWARN("Unrecognised multipath message received.");
+		DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
 		goto out;
 	}
 
@@ -1532,7 +1489,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 	else if (!strcasecmp(argv[0], "fail_path"))
 		action = fail_path;
 	else {
-		DMWARN("Unrecognised multipath message received.");
+		DMWARN("Unrecognised multipath message received: %s", argv[0]);
 		goto out;
 	}
 
@@ -1562,7 +1519,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	unsigned long flags;
 	int r;
 
-again:
 	bdev = NULL;
 	mode = 0;
 	r = 0;
@@ -1580,7 +1536,7 @@ again:
 	}
 
 	if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
-		r = -EAGAIN;
+		r = -ENOTCONN;
 	else if (!bdev)
 		r = -EIO;
 
@@ -1589,13 +1545,22 @@ again:
 	/*
 	 * Only pass ioctls through if the device sizes match exactly.
 	 */
-	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
-		r = scsi_verify_blk_ioctl(NULL, cmd);
+	if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
+		int err = scsi_verify_blk_ioctl(NULL, cmd);
+		if (err)
+			r = err;
+	}
 
-	if (r == -EAGAIN && !fatal_signal_pending(current)) {
-		queue_work(kmultipathd, &m->process_queued_ios);
-		msleep(10);
-		goto again;
+	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+		spin_lock_irqsave(&m->lock, flags);
+		if (!m->current_pg) {
+			/* Path status changed, redo selection */
+			__choose_pgpath(m, 0);
+		}
+		if (m->pg_init_required)
+			__pg_init_all_paths(m);
+		spin_unlock_irqrestore(&m->lock, flags);
+		dm_table_run_md_queue_async(m->ti->table);
 	}
 
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
@@ -1646,6 +1611,12 @@ static int multipath_busy(struct dm_target *ti)
 
 	spin_lock_irqsave(&m->lock, flags);
 
+	/* pg_init in progress or no paths available */
+	if (m->pg_init_in_progress ||
+	    (!m->nr_valid_paths && m->queue_if_no_path)) {
+		busy = 1;
+		goto out;
+	}
 	/* Guess which priority_group will be used at next mapping time */
 	if (unlikely(!m->current_pgpath && m->next_pg))
 		pg = m->next_pg;
@@ -1695,7 +1666,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 5, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 45d94a7e7f6..4880b69e2e9 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
+static char *raid10_md_layout_to_format(int layout)
+{
+	/*
+	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Refer to MD's raid10.c for details
+	 */
+	if ((layout & 0x10000) && (layout & 0x20000))
+		return "offset";
+
+	if ((layout & 0xFF) > 1)
+		return "near";
+
+	return "far";
+}
+
 static unsigned raid10_md_layout_to_copies(int layout)
 {
-	return layout & 0xFF;
+	if ((layout & 0xFF) > 1)
+		return layout & 0xFF;
+	return (layout >> 8) & 0xFF;
 }
 
 static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
-	/* 1 "far" copy, and 'copies' "near" copies */
-	return (1 << 8) | (copies & 0xFF);
+	unsigned n = 1, f = 1;
+
+	if (!strcmp("near", format))
+		n = copies;
+	else
+		f = copies;
+
+	if (!strcmp("offset", format))
+		return 0x30000 | (f << 8) | n;
+
+	if (!strcmp("far", format))
+		return 0x20000 | (f << 8) | n;
+
+	return (f << 8) | n;
 }
 
 static struct raid_type *get_raid_type(char *name)
@@ -295,9 +324,11 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 		 * Choose a reasonable default.  All figures in sectors.
 		 */
 		if (min_region_size > (1 << 13)) {
+			/* If not a power of 2, make it the next power of 2 */
+			if (min_region_size & (min_region_size - 1))
+				region_size = 1 << fls(region_size);
 			DMINFO("Choosing default region size of %lu sectors",
 			       region_size);
-			region_size = min_region_size;
 		} else {
 			DMINFO("Choosing default region size of 4MiB");
 			region_size = 1 << 13; /* sectors */
@@ -338,24 +369,23 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 }
 
 /*
- * validate_rebuild_devices
+ * validate_raid_redundancy
  * @rs
  *
- * Determine if the devices specified for rebuild can result in a valid
- * usable array that is capable of rebuilding the given devices.
+ * Determine if there are enough devices in the array that haven't
+ * failed (or are being rebuilt) to form a usable array.
  *
  * Returns: 0 on success, -EINVAL on failure.
  */
-static int validate_rebuild_devices(struct raid_set *rs)
+static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group, copies, d;
-
-	if (!(rs->print_flags & DMPF_REBUILD))
-		return 0;
+	unsigned rebuilds_per_group = 0, copies, d;
+	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
-		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
+		    !rs->dev[i].rdev.sb_page)
 			rebuild_cnt++;
 
 	switch (rs->raid_type->level) {
@@ -379,9 +409,6 @@ static int validate_rebuild_devices(struct raid_set *rs)
 		 * as long as the failed devices occur in different mirror
 		 * groups (i.e. different stripes).
 		 *
-		 * Right now, we only allow for "near" copies.  When other
-		 * formats are added, we will have to check those too.
-		 *
 		 * When checking "near" format, make sure no adjacent devices
 		 * have failed beyond what can be handled.  In addition to the
 		 * simple case where the number of devices is a multiple of the
@@ -391,27 +418,51 @@ static int validate_rebuild_devices(struct raid_set *rs)
 		 *          A    A    B    B    C
 		 *          C    D    D    E    E
 		 */
-		rebuilds_per_group = 0;
-		for (i = 0; i < rs->md.raid_disks * copies; i++) {
-			d = i % rs->md.raid_disks;
-			if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
-			    (++rebuilds_per_group >= copies))
-				goto too_many;
-			if (!((i + 1) % copies))
+		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+				if (!(i % copies))
+					rebuilds_per_group = 0;
+				d = i % rs->md.raid_disks;
+				if ((!rs->dev[d].rdev.sb_page ||
+				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				    (++rebuilds_per_group >= copies))
+					goto too_many;
+			}
+			break;
+		}
+
+		/*
+		 * When checking "far" and "offset" formats, we need to ensure
+		 * that the device that holds its copy is not also dead or
+		 * being rebuilt.  (Note that "far" and "offset" formats only
+		 * support two copies right now.  These formats also only ever
+		 * use the 'use_far_sets' variant.)
+		 *
+		 * This check is somewhat complicated by the need to account
+		 * for arrays that are not a multiple of (far) copies.  This
+		 * results in the need to treat the last (potentially larger)
+		 * set differently.
+		 */
+		group_size = (rs->md.raid_disks / copies);
+		last_group_start = (rs->md.raid_disks / group_size) - 1;
+		last_group_start *= group_size;
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (!(i % copies) && !(i > last_group_start))
 				rebuilds_per_group = 0;
+			if ((!rs->dev[i].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
+			    (++rebuilds_per_group >= copies))
+					goto too_many;
 		}
 		break;
 	default:
-		DMERR("The rebuild parameter is not supported for %s",
-		      rs->raid_type->name);
-		rs->ti->error = "Rebuild not supported for this RAID type";
-		return -EINVAL;
+		if (rebuild_cnt)
+			return -EINVAL;
 	}
 
 	return 0;
 
 too_many:
-	rs->ti->error = "Too many rebuild devices specified";
 	return -EINVAL;
 }
 
@@ -436,7 +487,7 @@ too_many:
  *
  * RAID10-only options:
  *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
- *    [raid10_format <near>]            Layout algorithm.  (Default: near)
+ *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
@@ -453,7 +504,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 */
-	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
+	if ((kstrtoul(argv[0], 10, &value) < 0)) {
 		rs->ti->error = "Bad chunk size";
 		return -EINVAL;
 	} else if (rs->raid_type->level == 1) {
@@ -523,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 			}
-			if (strcmp("near", argv[i])) {
+			if (strcmp("near", argv[i]) &&
+			    strcmp("far", argv[i]) &&
+			    strcmp("offset", argv[i])) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				return -EINVAL;
 			}
@@ -532,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			continue;
 		}
 
-		if (strict_strtoul(argv[i], 10, &value) < 0) {
+		if (kstrtoul(argv[i], 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
@@ -647,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
+		/*
+		 * If the format is not "near", we only support
+		 * two copies at the moment.
+		 */
+		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+			rs->ti->error = "Too many copies for given RAID10 format.";
+			return -EINVAL;
+		}
+
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
 		sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -662,9 +724,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	}
 	rs->md.dev_sectors = sectors_per_dev;
 
-	if (validate_rebuild_devices(rs))
-		return -EINVAL;
-
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -860,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	/*
 	 * Reshaping is not currently allowed
 	 */
-	if ((le32_to_cpu(sb->level) != mddev->level) ||
-	    (le32_to_cpu(sb->layout) != mddev->layout) ||
-	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
-		DMERR("Reshaping arrays not yet supported.");
+	if (le32_to_cpu(sb->level) != mddev->level) {
+		DMERR("Reshaping arrays not yet supported. (RAID level change)");
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->layout) != mddev->layout) {
+		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+		DMERR("  Old layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+		DMERR("  New layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(mddev->layout),
+		      raid10_md_layout_to_copies(mddev->layout));
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
 		return -EINVAL;
 	}
 
 	/* We can only change the number of devices in RAID1 right now */
 	if ((rs->raid_type->level != 1) &&
 	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-		DMERR("Reshaping arrays not yet supported.");
+		DMERR("Reshaping arrays not yet supported. (device count change)");
 		return -EINVAL;
 	}
 
@@ -993,28 +1065,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 	int ret;
-	unsigned redundancy = 0;
 	struct raid_dev *dev;
 	struct md_rdev *rdev, *tmp, *freshest;
 	struct mddev *mddev = &rs->md;
 
-	switch (rs->raid_type->level) {
-	case 1:
-		redundancy = rs->md.raid_disks - 1;
-		break;
-	case 4:
-	case 5:
-	case 6:
-		redundancy = rs->raid_type->parity_devs;
-		break;
-	case 10:
-		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
-		break;
-	default:
-		ti->error = "Unknown RAID type";
-		return -EINVAL;
-	}
-
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		/*
@@ -1043,44 +1097,43 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 			break;
 		default:
 			dev = container_of(rdev, struct raid_dev, rdev);
-			if (redundancy--) {
-				if (dev->meta_dev)
-					dm_put_device(ti, dev->meta_dev);
-
-				dev->meta_dev = NULL;
-				rdev->meta_bdev = NULL;
+			if (dev->meta_dev)
+				dm_put_device(ti, dev->meta_dev);
 
-				if (rdev->sb_page)
-					put_page(rdev->sb_page);
+			dev->meta_dev = NULL;
+			rdev->meta_bdev = NULL;
 
-				rdev->sb_page = NULL;
+			if (rdev->sb_page)
+				put_page(rdev->sb_page);
 
-				rdev->sb_loaded = 0;
+			rdev->sb_page = NULL;
 
-				/*
-				 * We might be able to salvage the data device
-				 * even though the meta device has failed.  For
-				 * now, we behave as though '- -' had been
-				 * set for this device in the table.
-				 */
-				if (dev->data_dev)
-					dm_put_device(ti, dev->data_dev);
+			rdev->sb_loaded = 0;
 
-				dev->data_dev = NULL;
-				rdev->bdev = NULL;
+			/*
+			 * We might be able to salvage the data device
+			 * even though the meta device has failed.  For
+			 * now, we behave as though '- -' had been
+			 * set for this device in the table.
+			 */
+			if (dev->data_dev)
+				dm_put_device(ti, dev->data_dev);
 
-				list_del(&rdev->same_set);
+			dev->data_dev = NULL;
+			rdev->bdev = NULL;
 
-				continue;
-			}
-			ti->error = "Failed to load superblock";
-			return ret;
+			list_del(&rdev->same_set);
 		}
 	}
 
 	if (!freshest)
 		return 0;
 
+	if (validate_raid_redundancy(rs)) {
+		rs->ti->error = "Insufficient redundancy to activate array";
+		return -EINVAL;
+	}
+
 	/*
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
@@ -1128,7 +1181,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	argv++;
 
 	/* number of RAID parameters */
-	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+	if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
 		ti->error = "Cannot understand number of RAID parameters";
 		return -EINVAL;
 	}
@@ -1141,7 +1194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 	}
 
-	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
 	    (num_raid_devs >= INT_MAX)) {
 		ti->error = "Cannot understand number of raid devices";
 		return -EINVAL;
@@ -1176,7 +1229,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
-	ti->num_flush_requests = 1;
+	ti->num_flush_bios = 1;
 
 	mutex_lock(&rs->md.reconfig_mutex);
 	ret = md_run(&rs->md);
@@ -1216,7 +1269,7 @@ static void raid_dtr(struct dm_target *ti)
 	context_free(rs);
 }
 
-static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+static int raid_map(struct dm_target *ti, struct bio *bio)
 {
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
@@ -1226,8 +1279,33 @@ static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_c
 	return DM_MAPIO_SUBMITTED;
 }
 
-static int raid_status(struct dm_target *ti, status_type_t type,
-		       unsigned status_flags, char *result, unsigned maxlen)
+static const char *decipher_sync_action(struct mddev *mddev)
+{
+	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+		return "frozen";
+
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+			return "reshape";
+
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+				return "resync";
+			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+				return "check";
+			return "repair";
+		}
+
+		if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+			return "recover";
+	}
+
+	return "idle";
+}
+
+static void raid_status(struct dm_target *ti, status_type_t type,
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct raid_set *rs = ti->private;
 	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
@@ -1245,8 +1323,18 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			sync = rs->md.recovery_cp;
 
 		if (sync >= rs->md.resync_max_sectors) {
+			/*
+			 * Sync complete.
+			 */
 			array_in_sync = 1;
 			sync = rs->md.resync_max_sectors;
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+			/*
+			 * If "check" or "repair" is occurring, the array has
+			 * undergone and initial sync and the health characters
+			 * should not be 'a' anymore.
+			 */
+			array_in_sync = 1;
 		} else {
 			/*
 			 * The array may be doing an initial sync, or it may
@@ -1258,6 +1346,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
 					array_in_sync = 1;
 		}
+
 		/*
 		 * Status characters:
 		 *  'D' = Dead/Failed device
@@ -1286,6 +1375,22 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 		       (unsigned long long) sync,
 		       (unsigned long long) rs->md.resync_max_sectors);
 
+		/*
+		 * Sync action:
+		 *   See Documentation/device-mapper/dm-raid.c for
+		 *   information on each of these states.
+		 */
+		DMEMIT(" %s", decipher_sync_action(&rs->md));
+
+		/*
+		 * resync_mismatches/mismatch_cnt
+		 *   This field shows the number of discrepancies found when
+		 *   performing a "check" of the array.
+		 */
+		DMEMIT(" %llu",
+		       (strcmp(rs->md.last_sync_action, "check")) ? 0 :
+		       (unsigned long long)
+		       atomic64_read(&rs->md.resync_mismatches));
 		break;
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
@@ -1354,7 +1459,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			       raid10_md_layout_to_copies(rs->md.layout));
 
 		if (rs->print_flags & DMPF_RAID10_FORMAT)
-			DMEMIT(" raid10_format near");
+			DMEMIT(" raid10_format %s",
+			       raid10_md_layout_to_format(rs->md.layout));
 
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1369,11 +1475,64 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 				DMEMIT(" -");
 		}
 	}
+}
+
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
+
+	if (!strcasecmp(argv[0], "reshape")) {
+		DMERR("Reshape not supported.");
+		return -EINVAL;
+	}
+
+	if (!mddev->pers || !mddev->pers->sync_request)
+		return -EINVAL;
+
+	if (!strcasecmp(argv[0], "frozen"))
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	else
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+	if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
+		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_reap_sync_thread(mddev);
+		}
+	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+		return -EBUSY;
+	else if (!strcasecmp(argv[0], "resync"))
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else if (!strcasecmp(argv[0], "recover")) {
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	} else {
+		if (!strcasecmp(argv[0], "check"))
+			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		else if (!!strcasecmp(argv[0], "repair"))
+			return -EINVAL;
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	}
+	if (mddev->ro == 2) {
+		/* A write to sync_action is enough to justify
+		 * canceling read-auto mode
+		 */
+		mddev->ro = 0;
+		if (!mddev->suspended)
+			md_wakeup_thread(mddev->sync_thread);
+	}
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	if (!mddev->suspended)
+		md_wakeup_thread(mddev->thread);
 
 	return 0;
 }
 
-static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+static int raid_iterate_devices(struct dm_target *ti,
+				iterate_devices_callout_fn fn, void *data)
 {
 	struct raid_set *rs = ti->private;
 	unsigned i;
@@ -1414,6 +1573,62 @@ static void raid_postsuspend(struct dm_target *ti)
 	mddev_suspend(&rs->md);
 }
 
+static void attempt_restore_of_faulty_devices(struct raid_set *rs)
+{
+	int i;
+	uint64_t failed_devices, cleared_failed_devices = 0;
+	unsigned long flags;
+	struct dm_raid_superblock *sb;
+	struct md_rdev *r;
+
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		r = &rs->dev[i].rdev;
+		if (test_bit(Faulty, &r->flags) && r->sb_page &&
+		    sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) {
+			DMINFO("Faulty %s device #%d has readable super block."
+			       "  Attempting to revive it.",
+			       rs->raid_type->name, i);
+
+			/*
+			 * Faulty bit may be set, but sometimes the array can
+			 * be suspended before the personalities can respond
+			 * by removing the device from the array (i.e. calling
+			 * 'hot_remove_disk').  If they haven't yet removed
+			 * the failed device, its 'raid_disk' number will be
+			 * '>= 0' - meaning we must call this function
+			 * ourselves.
+			 */
+			if ((r->raid_disk >= 0) &&
+			    (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+				/* Failed to revive this device, try next */
+				continue;
+
+			r->raid_disk = i;
+			r->saved_raid_disk = i;
+			flags = r->flags;
+			clear_bit(Faulty, &r->flags);
+			clear_bit(WriteErrorSeen, &r->flags);
+			clear_bit(In_sync, &r->flags);
+			if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+				r->raid_disk = -1;
+				r->saved_raid_disk = -1;
+				r->flags = flags;
+			} else {
+				r->recovery_offset = 0;
+				cleared_failed_devices |= 1 << i;
+			}
+		}
+	}
+	if (cleared_failed_devices) {
+		rdev_for_each(r, &rs->md) {
+			sb = page_address(r->sb_page);
+			failed_devices = le64_to_cpu(sb->failed_devices);
+			failed_devices &= ~cleared_failed_devices;
+			sb->failed_devices = cpu_to_le64(failed_devices);
+		}
+	}
+}
+
 static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
@@ -1422,6 +1637,13 @@ static void raid_resume(struct dm_target *ti)
 	if (!rs->bitmap_loaded) {
 		bitmap_load(&rs->md);
 		rs->bitmap_loaded = 1;
+	} else {
+		/*
+		 * A secondary resume while the device is active.
+		 * Take this opportunity to check whether any failed
+		 * devices are reachable again.
+		 */
+		attempt_restore_of_faulty_devices(rs);
 	}
 
 	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
@@ -1430,12 +1652,13 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 3, 1},
+	.version = {1, 5, 2},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
 	.map = raid_map,
 	.status = raid_status,
+	.message = raid_message,
 	.iterate_devices = raid_iterate_devices,
 	.io_hints = raid_io_hints,
 	.presuspend = raid_presuspend,
@@ -1445,6 +1668,10 @@ static struct target_type raid_target = {
 
 static int __init dm_raid_init(void)
 {
+	DMINFO("Loading target version %u.%u.%u",
+	       raid_target.version[0],
+	       raid_target.version[1],
+	       raid_target.version[2]);
 	return dm_register_target(&raid_target);
 }
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fd61f98ee1f..7dfdb5c746d 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -61,7 +61,6 @@ struct mirror_set {
 	struct dm_region_hash *rh;
 	struct dm_kcopyd_client *kcopyd_client;
 	struct dm_io_client *io_client;
-	mempool_t *read_record_pool;
 
 	/* recovery */
 	region_t nr_regions;
@@ -83,6 +82,9 @@ struct mirror_set {
 	struct mirror mirror[0];
 };
 
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
+		"A percentage of time allocated for raid resynchronization");
+
 static void wakeup_mirrord(void *context)
 {
 	struct mirror_set *ms = context;
@@ -139,14 +141,13 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
 		queue_bio(ms, bio, WRITE);
 }
 
-#define MIN_READ_RECORDS 20
-struct dm_raid1_read_record {
+struct dm_raid1_bio_record {
 	struct mirror *m;
+	/* if details->bi_bdev == NULL, details were not saved */
 	struct dm_bio_details details;
+	region_t write_region;
 };
 
-static struct kmem_cache *_dm_raid1_read_record_cache;
-
 /*
  * Every mirror should look like this one.
  */
@@ -431,7 +432,7 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(ms->rh, bio);
 
 	if (log->type->in_sync(log, region, 0))
-		return choose_mirror(ms,  bio->bi_sector) ? 1 : 0;
+		return choose_mirror(ms,  bio->bi_iter.bi_sector) ? 1 : 0;
 
 	return 0;
 }
@@ -441,15 +442,15 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
  */
 static sector_t map_sector(struct mirror *m, struct bio *bio)
 {
-	if (unlikely(!bio->bi_size))
+	if (unlikely(!bio->bi_iter.bi_size))
 		return 0;
-	return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector);
+	return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector);
 }
 
 static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = map_sector(m, bio);
+	bio->bi_iter.bi_sector = map_sector(m, bio);
 }
 
 static void map_region(struct dm_io_region *io, struct mirror *m,
@@ -457,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 {
 	io->bdev = m->dev->bdev;
 	io->sector = map_sector(m, bio);
-	io->count = bio->bi_size >> 9;
+	io->count = bio_sectors(bio);
 }
 
 static void hold_bio(struct mirror_set *ms, struct bio *bio)
@@ -525,8 +526,8 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
 	struct dm_io_region io;
 	struct dm_io_request io_req = {
 		.bi_rw = READ,
-		.mem.type = DM_IO_BVEC,
-		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.mem.type = DM_IO_BIO,
+		.mem.ptr.bio = bio,
 		.notify.fn = read_callback,
 		.notify.context = bio,
 		.client = m->ms->io_client,
@@ -558,7 +559,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 		 * We can only read balance if the region is in sync.
 		 */
 		if (likely(region_in_sync(ms, region, 1)))
-			m = choose_mirror(ms, bio->bi_sector);
+			m = choose_mirror(ms, bio->bi_iter.bi_sector);
 		else if (m && atomic_read(&m->error_count))
 			m = NULL;
 
@@ -628,8 +629,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
-		.mem.type = DM_IO_BVEC,
-		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.mem.type = DM_IO_BIO,
+		.mem.ptr.bio = bio,
 		.notify.fn = write_callback,
 		.notify.context = bio,
 		.client = ms->io_client,
@@ -876,19 +877,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
-	ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
-						_dm_raid1_read_record_cache);
-
-	if (!ms->read_record_pool) {
-		ti->error = "Error creating mirror read_record_pool";
-		kfree(ms);
-		return NULL;
-	}
-
 	ms->io_client = dm_io_client_create();
 	if (IS_ERR(ms->io_client)) {
 		ti->error = "Error creating dm_io client";
-		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
  		return NULL;
 	}
@@ -900,7 +891,6 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	if (IS_ERR(ms->rh)) {
 		ti->error = "Error creating dirty region hash";
 		dm_io_client_destroy(ms->io_client);
-		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
@@ -916,7 +906,6 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 
 	dm_io_client_destroy(ms->io_client);
 	dm_region_hash_destroy(ms->rh);
-	mempool_destroy(ms->read_record_pool);
 	kfree(ms);
 }
 
@@ -1086,12 +1075,12 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (r)
 		goto err_free_context;
 
-	ti->num_flush_requests = 1;
-	ti->num_discard_requests = 1;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
 	ti->discard_zeroes_data_unsupported = true;
 
-	ms->kmirrord_wq = alloc_workqueue("kmirrord",
-					  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
 	if (!ms->kmirrord_wq) {
 		DMERR("couldn't start kmirrord");
 		r = -ENOMEM;
@@ -1124,7 +1113,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto err_destroy_wq;
 	}
 
-	ms->kcopyd_client = dm_kcopyd_client_create();
+	ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(ms->kcopyd_client)) {
 		r = PTR_ERR(ms->kcopyd_client);
 		goto err_destroy_wq;
@@ -1155,18 +1144,20 @@ static void mirror_dtr(struct dm_target *ti)
 /*
  * Mirror mapping function
  */
-static int mirror_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int mirror_map(struct dm_target *ti, struct bio *bio)
 {
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-	struct dm_raid1_read_record *read_record = NULL;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+	struct dm_raid1_bio_record *bio_record =
+	  dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+
+	bio_record->details.bi_bdev = NULL;
 
 	if (rw == WRITE) {
 		/* Save region for mirror_end_io() handler */
-		map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
+		bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -1190,37 +1181,33 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	 * The region is in-sync and we can perform reads directly.
 	 * Store enough information so we can retry if it fails.
 	 */
-	m = choose_mirror(ms, bio->bi_sector);
+	m = choose_mirror(ms, bio->bi_iter.bi_sector);
 	if (unlikely(!m))
 		return -EIO;
 
-	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
-	if (likely(read_record)) {
-		dm_bio_record(&read_record->details, bio);
-		map_context->ptr = read_record;
-		read_record->m = m;
-	}
+	dm_bio_record(&bio_record->details, bio);
+	bio_record->m = m;
 
 	map_bio(m, bio);
 
 	return DM_MAPIO_REMAPPED;
 }
 
-static int mirror_end_io(struct dm_target *ti, struct bio *bio,
-			 int error, union map_info *map_context)
+static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct mirror *m = NULL;
 	struct dm_bio_details *bd = NULL;
-	struct dm_raid1_read_record *read_record = map_context->ptr;
+	struct dm_raid1_bio_record *bio_record =
+	  dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
 		if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
-			dm_rh_dec(ms->rh, map_context->ll);
+			dm_rh_dec(ms->rh, bio_record->write_region);
 		return error;
 	}
 
@@ -1231,7 +1218,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 		goto out;
 
 	if (unlikely(error)) {
-		if (!read_record) {
+		if (!bio_record->details.bi_bdev) {
 			/*
 			 * There wasn't enough memory to record necessary
 			 * information for a retry or there was no other
@@ -1241,7 +1228,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 			return -EIO;
 		}
 
-		m = read_record->m;
+		m = bio_record->m;
 
 		DMERR("Mirror read failed from %s. Trying alternative device.",
 		      m->dev->name);
@@ -1253,22 +1240,21 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 		 * mirror.
 		 */
 		if (default_ok(m) || mirror_available(ms, bio)) {
-			bd = &read_record->details;
+			bd = &bio_record->details;
 
 			dm_bio_restore(bd, bio);
-			mempool_free(read_record, ms->read_record_pool);
-			map_context->ptr = NULL;
+			bio_record->details.bi_bdev = NULL;
+
+			atomic_inc(&bio->bi_remaining);
+
 			queue_bio(ms, bio, rw);
-			return 1;
+			return DM_ENDIO_INCOMPLETE;
 		}
 		DMERR("All replicated volumes dead, failing I/O");
 	}
 
 out:
-	if (read_record) {
-		mempool_free(read_record, ms->read_record_pool);
-		map_context->ptr = NULL;
-	}
+	bio_record->details.bi_bdev = NULL;
 
 	return error;
 }
@@ -1366,8 +1352,8 @@ static char device_status_char(struct mirror *m)
 }
 
 
-static int mirror_status(struct dm_target *ti, status_type_t type,
-			 unsigned status_flags, char *result, unsigned maxlen)
+static void mirror_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned int m, sz = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1402,8 +1388,6 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 		if (ms->features & DM_RAID1_HANDLE_ERRORS)
 			DMEMIT(" 1 handle_errors");
 	}
-
-	return 0;
 }
 
 static int mirror_iterate_devices(struct dm_target *ti,
@@ -1422,7 +1406,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 12, 1},
+	.version = {1, 13, 2},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,
@@ -1439,13 +1423,6 @@ static int __init dm_mirror_init(void)
 {
 	int r;
 
-	_dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
-	if (!_dm_raid1_read_record_cache) {
-		DMERR("Can't allocate dm_raid1_read_record cache");
-		r = -ENOMEM;
-		goto bad_cache;
-	}
-
 	r = dm_register_target(&mirror_target);
 	if (r < 0) {
 		DMERR("Failed to register mirror target");
@@ -1455,15 +1432,12 @@ static int __init dm_mirror_init(void)
 	return 0;
 
 bad_target:
-	kmem_cache_destroy(_dm_raid1_read_record_cache);
-bad_cache:
 	return r;
 }
 
 static void __exit dm_mirror_exit(void)
 {
 	dm_unregister_target(&mirror_target);
-	kmem_cache_destroy(_dm_raid1_read_record_cache);
 }
 
 /* Module hooks */
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 69732e03eb3..b929fd5f498 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -126,7 +126,8 @@ EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
 
 region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
 {
-	return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
+	return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector -
+				      rh->target_begin);
 }
 EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
 
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3ac415675b6..d6e88178d22 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -13,10 +13,13 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/dm-io.h>
+#include "dm-bufio.h"
 
 #define DM_MSG_PREFIX "persistent snapshot"
 #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
 
+#define DM_PREFETCH_CHUNKS		12
+
 /*-----------------------------------------------------------------
  * Persistent snapshots, by persistent we mean that the snapshot
  * will survive a reboot.
@@ -256,7 +259,8 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
 	 */
 	INIT_WORK_ONSTACK(&req.work, do_metadata);
 	queue_work(ps->metadata_wq, &req.work);
-	flush_work(&req.work);
+	flush_workqueue(ps->metadata_wq);
+	destroy_work_on_stack(&req.work);
 
 	return req.result;
 }
@@ -269,6 +273,14 @@ static chunk_t area_location(struct pstore *ps, chunk_t area)
 	return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area);
 }
 
+static void skip_metadata(struct pstore *ps)
+{
+	uint32_t stride = ps->exceptions_per_area + 1;
+	chunk_t next_free = ps->next_free;
+	if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS)
+		ps->next_free++;
+}
+
 /*
  * Read or write a metadata area.  Remembering to skip the first
  * chunk which holds the header.
@@ -393,17 +405,18 @@ static int write_header(struct pstore *ps)
 /*
  * Access functions for the disk exceptions, these do the endian conversions.
  */
-static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+static struct disk_exception *get_exception(struct pstore *ps, void *ps_area,
+					    uint32_t index)
 {
 	BUG_ON(index >= ps->exceptions_per_area);
 
-	return ((struct disk_exception *) ps->area) + index;
+	return ((struct disk_exception *) ps_area) + index;
 }
 
-static void read_exception(struct pstore *ps,
+static void read_exception(struct pstore *ps, void *ps_area,
 			   uint32_t index, struct core_exception *result)
 {
-	struct disk_exception *de = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, ps_area, index);
 
 	/* copy it */
 	result->old_chunk = le64_to_cpu(de->old_chunk);
@@ -413,7 +426,7 @@ static void read_exception(struct pstore *ps,
 static void write_exception(struct pstore *ps,
 			    uint32_t index, struct core_exception *e)
 {
-	struct disk_exception *de = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, ps->area, index);
 
 	/* copy it */
 	de->old_chunk = cpu_to_le64(e->old_chunk);
@@ -422,7 +435,7 @@ static void write_exception(struct pstore *ps,
 
 static void clear_exception(struct pstore *ps, uint32_t index)
 {
-	struct disk_exception *de = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, ps->area, index);
 
 	/* clear it */
 	de->old_chunk = 0;
@@ -434,7 +447,7 @@ static void clear_exception(struct pstore *ps, uint32_t index)
  * 'full' is filled in to indicate if the area has been
  * filled.
  */
-static int insert_exceptions(struct pstore *ps,
+static int insert_exceptions(struct pstore *ps, void *ps_area,
 			     int (*callback)(void *callback_context,
 					     chunk_t old, chunk_t new),
 			     void *callback_context,
@@ -448,7 +461,7 @@ static int insert_exceptions(struct pstore *ps,
 	*full = 1;
 
 	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &e);
+		read_exception(ps, ps_area, i, &e);
 
 		/*
 		 * If the new_chunk is pointing at the start of
@@ -485,24 +498,75 @@ static int read_exceptions(struct pstore *ps,
 			   void *callback_context)
 {
 	int r, full = 1;
+	struct dm_bufio_client *client;
+	chunk_t prefetch_area = 0;
+
+	client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
+					ps->store->chunk_size << SECTOR_SHIFT,
+					1, 0, NULL, NULL);
+
+	if (IS_ERR(client))
+		return PTR_ERR(client);
+
+	/*
+	 * Setup for one current buffer + desired readahead buffers.
+	 */
+	dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS);
 
 	/*
 	 * Keeping reading chunks and inserting exceptions until
 	 * we find a partially full area.
 	 */
 	for (ps->current_area = 0; full; ps->current_area++) {
-		r = area_io(ps, READ);
-		if (r)
-			return r;
+		struct dm_buffer *bp;
+		void *area;
+		chunk_t chunk;
+
+		if (unlikely(prefetch_area < ps->current_area))
+			prefetch_area = ps->current_area;
+
+		if (DM_PREFETCH_CHUNKS) do {
+			chunk_t pf_chunk = area_location(ps, prefetch_area);
+			if (unlikely(pf_chunk >= dm_bufio_get_device_size(client)))
+				break;
+			dm_bufio_prefetch(client, pf_chunk, 1);
+			prefetch_area++;
+			if (unlikely(!prefetch_area))
+				break;
+		} while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS);
+
+		chunk = area_location(ps, ps->current_area);
+
+		area = dm_bufio_read(client, chunk, &bp);
+		if (unlikely(IS_ERR(area))) {
+			r = PTR_ERR(area);
+			goto ret_destroy_bufio;
+		}
 
-		r = insert_exceptions(ps, callback, callback_context, &full);
-		if (r)
-			return r;
+		r = insert_exceptions(ps, area, callback, callback_context,
+				      &full);
+
+		if (!full)
+			memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
+
+		dm_bufio_release(bp);
+
+		dm_bufio_forget(client, chunk);
+
+		if (unlikely(r))
+			goto ret_destroy_bufio;
 	}
 
 	ps->current_area--;
 
-	return 0;
+	skip_metadata(ps);
+
+	r = 0;
+
+ret_destroy_bufio:
+	dm_bufio_client_destroy(client);
+
+	return r;
 }
 
 static struct pstore *get_info(struct dm_exception_store *store)
@@ -616,8 +680,6 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
 					struct dm_exception *e)
 {
 	struct pstore *ps = get_info(store);
-	uint32_t stride;
-	chunk_t next_free;
 	sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
 
 	/* Is there enough room ? */
@@ -630,10 +692,8 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
 	 * Move onto the next free pending, making sure to take
 	 * into account the location of the metadata chunks.
 	 */
-	stride = (ps->exceptions_per_area + 1);
-	next_free = ++ps->next_free;
-	if (sector_div(next_free, stride) == 1)
-		ps->next_free++;
+	ps->next_free++;
+	skip_metadata(ps);
 
 	atomic_inc(&ps->pending_count);
 	return 0;
@@ -727,7 +787,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
 		ps->current_committed = ps->exceptions_per_area;
 	}
 
-	read_exception(ps, ps->current_committed - 1, &ce);
+	read_exception(ps, ps->area, ps->current_committed - 1, &ce);
 	*last_old_chunk = ce.old_chunk;
 	*last_new_chunk = ce.new_chunk;
 
@@ -737,8 +797,8 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
 	 */
 	for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
 	     nr_consecutive++) {
-		read_exception(ps, ps->current_committed - 1 - nr_consecutive,
-			       &ce);
+		read_exception(ps, ps->area,
+			       ps->current_committed - 1 - nr_consecutive, &ce);
 		if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
 		    ce.new_chunk != *last_new_chunk - nr_consecutive)
 			break;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a143921feaf..5bd2290cfb1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
+	/* Protected by "lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct list_head out_of_order_list;
+
 	mempool_t *pending_pool;
 
 	struct dm_exception_table pending;
@@ -79,7 +91,6 @@ struct dm_snapshot {
 
 	/* Chunks with outstanding reads */
 	spinlock_t tracked_chunk_lock;
-	mempool_t *tracked_chunk_pool;
 	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
 
 	/* The on disk metadata handler */
@@ -125,6 +136,9 @@ struct dm_snapshot {
 #define RUNNING_MERGE          0
 #define SHUTDOWN_MERGE         1
 
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
+		"A percentage of time allocated for copy on write");
+
 struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
 {
 	return s->origin;
@@ -171,6 +185,14 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct list_head out_of_order_entry;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
@@ -191,46 +213,48 @@ struct dm_snap_tracked_chunk {
 	chunk_t chunk;
 };
 
-static struct kmem_cache *tracked_chunk_cache;
+static void init_tracked_chunk(struct bio *bio)
+{
+	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+	INIT_HLIST_NODE(&c->node);
+}
 
-static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
-						 chunk_t chunk)
+static bool is_bio_tracked(struct bio *bio)
 {
-	struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
-							GFP_NOIO);
-	unsigned long flags;
+	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+	return !hlist_unhashed(&c->node);
+}
+
+static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
+{
+	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
 
 	c->chunk = chunk;
 
-	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
+	spin_lock_irq(&s->tracked_chunk_lock);
 	hlist_add_head(&c->node,
 		       &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
-	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
-
-	return c;
+	spin_unlock_irq(&s->tracked_chunk_lock);
 }
 
-static void stop_tracking_chunk(struct dm_snapshot *s,
-				struct dm_snap_tracked_chunk *c)
+static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
 {
+	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
 	unsigned long flags;
 
 	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
 	hlist_del(&c->node);
 	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
-
-	mempool_free(c, s->tracked_chunk_pool);
 }
 
 static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
 {
 	struct dm_snap_tracked_chunk *c;
-	struct hlist_node *hn;
 	int found = 0;
 
 	spin_lock_irq(&s->tracked_chunk_lock);
 
-	hlist_for_each_entry(c, hn,
+	hlist_for_each_entry(c,
 	    &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
 		if (c->chunk == chunk) {
 			found = 1;
@@ -586,12 +610,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
 	return NULL;
 }
 
-static struct dm_exception *alloc_completed_exception(void)
+static struct dm_exception *alloc_completed_exception(gfp_t gfp)
 {
 	struct dm_exception *e;
 
-	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
-	if (!e)
+	e = kmem_cache_alloc(exception_cache, gfp);
+	if (!e && gfp == GFP_NOIO)
 		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
 
 	return e;
@@ -618,7 +642,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 	struct dm_snapshot *s = pe->snap;
 
 	mempool_free(pe, s->pending_pool);
-	smp_mb__before_atomic_dec();
+	smp_mb__before_atomic();
 	atomic_dec(&s->pending_exceptions_count);
 }
 
@@ -673,7 +697,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	struct dm_snapshot *s = context;
 	struct dm_exception *e;
 
-	e = alloc_completed_exception();
+	e = alloc_completed_exception(GFP_KERNEL);
 	if (!e)
 		return -ENOMEM;
 
@@ -721,17 +745,16 @@ static int calc_max_buckets(void)
  */
 static int init_hash_tables(struct dm_snapshot *s)
 {
-	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+	sector_t hash_size, cow_dev_size, max_buckets;
 
 	/*
 	 * Calculate based on the size of the original volume or
 	 * the COW volume...
 	 */
 	cow_dev_size = get_dev_size(s->cow->bdev);
-	origin_dev_size = get_dev_size(s->origin->bdev);
 	max_buckets = calc_max_buckets();
 
-	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
+	hash_size = cow_dev_size >> s->store->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
 	if (hash_size < 64)
@@ -760,7 +783,7 @@ static int init_hash_tables(struct dm_snapshot *s)
 static void merge_shutdown(struct dm_snapshot *s)
 {
 	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&s->state_bits, RUNNING_MERGE);
 }
 
@@ -1036,7 +1059,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	int i;
 	int r = -EINVAL;
 	char *origin_path, *cow_path;
-	unsigned args_used, num_flush_requests = 1;
+	unsigned args_used, num_flush_bios = 1;
 	fmode_t origin_mode = FMODE_READ;
 
 	if (argc != 4) {
@@ -1046,7 +1069,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	if (dm_target_is_snapshot_merge(ti)) {
-		num_flush_requests = 2;
+		num_flush_bios = 2;
 		origin_mode = FMODE_WRITE;
 	}
 
@@ -1091,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	INIT_LIST_HEAD(&s->out_of_order_list);
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
@@ -1107,7 +1133,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_hash_tables;
 	}
 
-	s->kcopyd_client = dm_kcopyd_client_create();
+	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(s->kcopyd_client)) {
 		r = PTR_ERR(s->kcopyd_client);
 		ti->error = "Could not create kcopyd client";
@@ -1117,24 +1143,18 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
 	if (!s->pending_pool) {
 		ti->error = "Could not allocate mempool for pending exceptions";
+		r = -ENOMEM;
 		goto bad_pending_pool;
 	}
 
-	s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
-							 tracked_chunk_cache);
-	if (!s->tracked_chunk_pool) {
-		ti->error = "Could not allocate tracked_chunk mempool for "
-			    "tracking reads";
-		goto bad_tracked_chunk_pool;
-	}
-
 	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
 
 	spin_lock_init(&s->tracked_chunk_lock);
 
 	ti->private = s;
-	ti->num_flush_requests = num_flush_requests;
+	ti->num_flush_bios = num_flush_bios;
+	ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
 
 	/* Add snapshot to the list of snapshots for this origin */
 	/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -1183,9 +1203,6 @@ bad_read_metadata:
 	unregister_snapshot(s);
 
 bad_load_and_register:
-	mempool_destroy(s->tracked_chunk_pool);
-
-bad_tracked_chunk_pool:
 	mempool_destroy(s->pending_pool);
 
 bad_pending_pool:
@@ -1290,8 +1307,6 @@ static void snapshot_dtr(struct dm_target *ti)
 		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
 #endif
 
-	mempool_destroy(s->tracked_chunk_pool);
-
 	__free_exceptions(s);
 
 	mempool_destroy(s->pending_pool);
@@ -1390,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 		goto out;
 	}
 
-	e = alloc_completed_exception();
+	e = alloc_completed_exception(GFP_NOIO);
 	if (!e) {
 		down_write(&s->lock);
 		__invalidate_snapshot(s, -ENOMEM);
@@ -1423,6 +1438,7 @@ out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
+		atomic_inc(&full_bio->bi_remaining);
 	}
 	free_pending_exception(pe);
 
@@ -1451,6 +1467,19 @@ static void commit_callback(void *context, int success)
 	pending_complete(pe, success);
 }
 
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+
+	if (unlikely(pe->copy_error))
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
+}
+
 /*
  * Called when the copy I/O has finished.  kcopyd actually runs
  * this code so don't block.
@@ -1460,13 +1489,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		while (!list_empty(&s->out_of_order_list)) {
+			pe = list_entry(s->out_of_order_list.next,
+					struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			s->exception_complete_sequence++;
+			list_del(&pe->out_of_order_entry);
+			complete_exception(pe);
+		}
+	} else {
+		struct list_head *lh;
+		struct dm_snap_pending_exception *pe2;
+
+		list_for_each_prev(lh, &s->out_of_order_list) {
+			pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe2->exception_sequence < pe->exception_sequence)
+				break;
+		}
+		list_add(&pe->out_of_order_entry, lh);
+	}
 }
 
 /*
@@ -1561,6 +1609,8 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -1570,15 +1620,13 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
 			    struct bio *bio, chunk_t chunk)
 {
 	bio->bi_bdev = s->cow->bdev;
-	bio->bi_sector = chunk_to_sector(s->store,
-					 dm_chunk_number(e->new_chunk) +
-					 (chunk - e->old_chunk)) +
-					 (bio->bi_sector &
-					  s->store->chunk_mask);
+	bio->bi_iter.bi_sector =
+		chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
+				(chunk - e->old_chunk)) +
+		(bio->bi_iter.bi_sector & s->store->chunk_mask);
 }
 
-static int snapshot_map(struct dm_target *ti, struct bio *bio,
-			union map_info *map_context)
+static int snapshot_map(struct dm_target *ti, struct bio *bio)
 {
 	struct dm_exception *e;
 	struct dm_snapshot *s = ti->private;
@@ -1586,12 +1634,14 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
 
+	init_tracked_chunk(bio);
+
 	if (bio->bi_rw & REQ_FLUSH) {
 		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
 
-	chunk = sector_to_chunk(s->store, bio->bi_sector);
+	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
 
 	/* Full snapshots are not usable */
 	/* To get here the table must be live so s->active is always set. */
@@ -1652,7 +1702,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 		r = DM_MAPIO_SUBMITTED;
 
 		if (!pe->started &&
-		    bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+		    bio->bi_iter.bi_size ==
+		    (s->store->chunk_size << SECTOR_SHIFT)) {
 			pe->started = 1;
 			up_write(&s->lock);
 			start_full_bio(pe, bio);
@@ -1670,7 +1721,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 		}
 	} else {
 		bio->bi_bdev = s->origin->bdev;
-		map_context->ptr = track_chunk(s, chunk);
+		track_chunk(s, bio, chunk);
 	}
 
 out_unlock:
@@ -1691,24 +1742,24 @@ out:
  * If merging is currently taking place on the chunk in question, the
  * I/O is deferred by adding it to s->bios_queued_during_merge.
  */
-static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
-			      union map_info *map_context)
+static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 {
 	struct dm_exception *e;
 	struct dm_snapshot *s = ti->private;
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 
+	init_tracked_chunk(bio);
+
 	if (bio->bi_rw & REQ_FLUSH) {
-		if (!map_context->target_request_nr)
+		if (!dm_bio_get_target_bio_nr(bio))
 			bio->bi_bdev = s->origin->bdev;
 		else
 			bio->bi_bdev = s->cow->bdev;
-		map_context->ptr = NULL;
 		return DM_MAPIO_REMAPPED;
 	}
 
-	chunk = sector_to_chunk(s->store, bio->bi_sector);
+	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
 
 	down_write(&s->lock);
 
@@ -1733,7 +1784,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 		remap_exception(s, e, bio, chunk);
 
 		if (bio_rw(bio) == WRITE)
-			map_context->ptr = track_chunk(s, chunk);
+			track_chunk(s, bio, chunk);
 		goto out_unlock;
 	}
 
@@ -1751,14 +1802,12 @@ out_unlock:
 	return r;
 }
 
-static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
-			   int error, union map_info *map_context)
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	struct dm_snapshot *s = ti->private;
-	struct dm_snap_tracked_chunk *c = map_context->ptr;
 
-	if (c)
-		stop_tracking_chunk(s, c);
+	if (is_bio_tracked(bio))
+		stop_tracking_chunk(s, bio);
 
 	return 0;
 }
@@ -1848,8 +1897,8 @@ static void snapshot_merge_resume(struct dm_target *ti)
 	start_merge(s);
 }
 
-static int snapshot_status(struct dm_target *ti, status_type_t type,
-			   unsigned status_flags, char *result, unsigned maxlen)
+static void snapshot_status(struct dm_target *ti, status_type_t type,
+			    unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct dm_snapshot *snap = ti->private;
@@ -1895,8 +1944,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 					  maxlen - sz);
 		break;
 	}
-
-	return 0;
 }
 
 static int snapshot_iterate_devices(struct dm_target *ti,
@@ -2049,7 +2096,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
 	down_read(&_origins_lock);
 	o = __lookup_origin(origin->bdev);
 	if (o)
-		r = __origin_write(&o->snapshots, bio->bi_sector, bio);
+		r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
 	up_read(&_origins_lock);
 
 	return r;
@@ -2094,6 +2141,11 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
  * Origin: maps a linear range of a device, with hooks for snapshotting.
  */
 
+struct dm_origin {
+	struct dm_dev *dev;
+	unsigned split_boundary;
+};
+
 /*
  * Construct an origin mapping: <dev_path>
  * The context for an origin is merely a 'struct dm_dev *'
@@ -2102,42 +2154,65 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
-	struct dm_dev *dev;
+	struct dm_origin *o;
 
 	if (argc != 1) {
 		ti->error = "origin: incorrect number of arguments";
 		return -EINVAL;
 	}
 
-	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
+	o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
+	if (!o) {
+		ti->error = "Cannot allocate private origin structure";
+		r = -ENOMEM;
+		goto bad_alloc;
+	}
+
+	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
 	if (r) {
 		ti->error = "Cannot get target device";
-		return r;
+		goto bad_open;
 	}
 
-	ti->private = dev;
-	ti->num_flush_requests = 1;
+	ti->private = o;
+	ti->num_flush_bios = 1;
 
 	return 0;
+
+bad_open:
+	kfree(o);
+bad_alloc:
+	return r;
 }
 
 static void origin_dtr(struct dm_target *ti)
 {
-	struct dm_dev *dev = ti->private;
-	dm_put_device(ti, dev);
+	struct dm_origin *o = ti->private;
+	dm_put_device(ti, o->dev);
+	kfree(o);
 }
 
-static int origin_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int origin_map(struct dm_target *ti, struct bio *bio)
 {
-	struct dm_dev *dev = ti->private;
-	bio->bi_bdev = dev->bdev;
+	struct dm_origin *o = ti->private;
+	unsigned available_sectors;
+
+	bio->bi_bdev = o->dev->bdev;
+
+	if (unlikely(bio->bi_rw & REQ_FLUSH))
+		return DM_MAPIO_REMAPPED;
 
-	if (bio->bi_rw & REQ_FLUSH)
+	if (bio_rw(bio) != WRITE)
 		return DM_MAPIO_REMAPPED;
 
+	available_sectors = o->split_boundary -
+		((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
+
+	if (bio_sectors(bio) > available_sectors)
+		dm_accept_partial_bio(bio, available_sectors);
+
 	/* Only tell snapshots if this is a write */
-	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
+	return do_origin(o->dev, bio);
 }
 
 /*
@@ -2146,15 +2221,15 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
  */
 static void origin_resume(struct dm_target *ti)
 {
-	struct dm_dev *dev = ti->private;
+	struct dm_origin *o = ti->private;
 
-	ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
+	o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
 }
 
-static int origin_status(struct dm_target *ti, status_type_t type,
-			 unsigned status_flags, char *result, unsigned maxlen)
+static void origin_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
 {
-	struct dm_dev *dev = ti->private;
+	struct dm_origin *o = ti->private;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -2162,23 +2237,21 @@ static int origin_status(struct dm_target *ti, status_type_t type,
 		break;
 
 	case STATUSTYPE_TABLE:
-		snprintf(result, maxlen, "%s", dev->name);
+		snprintf(result, maxlen, "%s", o->dev->name);
 		break;
 	}
-
-	return 0;
 }
 
 static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 			struct bio_vec *biovec, int max_size)
 {
-	struct dm_dev *dev = ti->private;
-	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct dm_origin *o = ti->private;
+	struct request_queue *q = bdev_get_queue(o->dev->bdev);
 
 	if (!q->merge_bvec_fn)
 		return max_size;
 
-	bvm->bi_bdev = dev->bdev;
+	bvm->bi_bdev = o->dev->bdev;
 
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
@@ -2186,14 +2259,14 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 static int origin_iterate_devices(struct dm_target *ti,
 				  iterate_devices_callout_fn fn, void *data)
 {
-	struct dm_dev *dev = ti->private;
+	struct dm_origin *o = ti->private;
 
-	return fn(ti, dev, 0, ti->len, data);
+	return fn(ti, o->dev, 0, ti->len, data);
 }
 
 static struct target_type origin_target = {
 	.name    = "snapshot-origin",
-	.version = {1, 7, 1},
+	.version = {1, 8, 1},
 	.module  = THIS_MODULE,
 	.ctr     = origin_ctr,
 	.dtr     = origin_dtr,
@@ -2206,7 +2279,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 10, 0},
+	.version = {1, 12, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
@@ -2220,7 +2293,7 @@ static struct target_type snapshot_target = {
 
 static struct target_type merge_target = {
 	.name    = dm_snapshot_merge_target_name,
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
@@ -2281,17 +2354,8 @@ static int __init dm_snapshot_init(void)
 		goto bad_pending_cache;
 	}
 
-	tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
-	if (!tracked_chunk_cache) {
-		DMERR("Couldn't create cache to track chunks in use.");
-		r = -ENOMEM;
-		goto bad_tracked_chunk_cache;
-	}
-
 	return 0;
 
-bad_tracked_chunk_cache:
-	kmem_cache_destroy(pending_cache);
 bad_pending_cache:
 	kmem_cache_destroy(exception_cache);
 bad_exception_cache:
@@ -2317,7 +2381,6 @@ static void __exit dm_snapshot_exit(void)
 	exit_origin_hash();
 	kmem_cache_destroy(pending_cache);
 	kmem_cache_destroy(exception_cache);
-	kmem_cache_destroy(tracked_chunk_cache);
 
 	dm_exception_store_exit();
 }
@@ -2329,3 +2392,5 @@ module_exit(dm_snapshot_exit);
 MODULE_DESCRIPTION(DM_NAME " snapshot target");
 MODULE_AUTHOR("Joe Thornber");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-snapshot-origin");
+MODULE_ALIAS("dm-snapshot-merge");
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
new file mode 100644
index 00000000000..28a90122a5a
--- /dev/null
+++ b/drivers/md/dm-stats.c
@@ -0,0 +1,981 @@
+#include <linux/errno.h>
+#include <linux/numa.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/threads.h>
+#include <linux/preempt.h>
+#include <linux/irqflags.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+
+#include "dm.h"
+#include "dm-stats.h"
+
+#define DM_MSG_PREFIX "stats"
+
+static int dm_stat_need_rcu_barrier;
+
+/*
+ * Using 64-bit values to avoid overflow (which is a
+ * problem that block/genhd.c's IO accounting has).
+ */
+struct dm_stat_percpu {
+	unsigned long long sectors[2];
+	unsigned long long ios[2];
+	unsigned long long merges[2];
+	unsigned long long ticks[2];
+	unsigned long long io_ticks[2];
+	unsigned long long io_ticks_total;
+	unsigned long long time_in_queue;
+};
+
+struct dm_stat_shared {
+	atomic_t in_flight[2];
+	unsigned long stamp;
+	struct dm_stat_percpu tmp;
+};
+
+struct dm_stat {
+	struct list_head list_entry;
+	int id;
+	size_t n_entries;
+	sector_t start;
+	sector_t end;
+	sector_t step;
+	const char *program_id;
+	const char *aux_data;
+	struct rcu_head rcu_head;
+	size_t shared_alloc_size;
+	size_t percpu_alloc_size;
+	struct dm_stat_percpu *stat_percpu[NR_CPUS];
+	struct dm_stat_shared stat_shared[0];
+};
+
+struct dm_stats_last_position {
+	sector_t last_sector;
+	unsigned last_rw;
+};
+
+/*
+ * A typo on the command line could possibly make the kernel run out of memory
+ * and crash. To prevent the crash we account all used memory. We fail if we
+ * exhaust 1/4 of all memory or 1/2 of vmalloc space.
+ */
+#define DM_STATS_MEMORY_FACTOR		4
+#define DM_STATS_VMALLOC_FACTOR		2
+
+static DEFINE_SPINLOCK(shared_memory_lock);
+
+static unsigned long shared_memory_amount;
+
+static bool __check_shared_memory(size_t alloc_size)
+{
+	size_t a;
+
+	a = shared_memory_amount + alloc_size;
+	if (a < shared_memory_amount)
+		return false;
+	if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
+		return false;
+#ifdef CONFIG_MMU
+	if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
+		return false;
+#endif
+	return true;
+}
+
+static bool check_shared_memory(size_t alloc_size)
+{
+	bool ret;
+
+	spin_lock_irq(&shared_memory_lock);
+
+	ret = __check_shared_memory(alloc_size);
+
+	spin_unlock_irq(&shared_memory_lock);
+
+	return ret;
+}
+
+static bool claim_shared_memory(size_t alloc_size)
+{
+	spin_lock_irq(&shared_memory_lock);
+
+	if (!__check_shared_memory(alloc_size)) {
+		spin_unlock_irq(&shared_memory_lock);
+		return false;
+	}
+
+	shared_memory_amount += alloc_size;
+
+	spin_unlock_irq(&shared_memory_lock);
+
+	return true;
+}
+
+static void free_shared_memory(size_t alloc_size)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&shared_memory_lock, flags);
+
+	if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
+		spin_unlock_irqrestore(&shared_memory_lock, flags);
+		DMCRIT("Memory usage accounting bug.");
+		return;
+	}
+
+	shared_memory_amount -= alloc_size;
+
+	spin_unlock_irqrestore(&shared_memory_lock, flags);
+}
+
+static void *dm_kvzalloc(size_t alloc_size, int node)
+{
+	void *p;
+
+	if (!claim_shared_memory(alloc_size))
+		return NULL;
+
+	if (alloc_size <= KMALLOC_MAX_SIZE) {
+		p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
+		if (p)
+			return p;
+	}
+	p = vzalloc_node(alloc_size, node);
+	if (p)
+		return p;
+
+	free_shared_memory(alloc_size);
+
+	return NULL;
+}
+
+static void dm_kvfree(void *ptr, size_t alloc_size)
+{
+	if (!ptr)
+		return;
+
+	free_shared_memory(alloc_size);
+
+	if (is_vmalloc_addr(ptr))
+		vfree(ptr);
+	else
+		kfree(ptr);
+}
+
+static void dm_stat_free(struct rcu_head *head)
+{
+	int cpu;
+	struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+
+	kfree(s->program_id);
+	kfree(s->aux_data);
+	for_each_possible_cpu(cpu)
+		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+	dm_kvfree(s, s->shared_alloc_size);
+}
+
+static int dm_stat_in_flight(struct dm_stat_shared *shared)
+{
+	return atomic_read(&shared->in_flight[READ]) +
+	       atomic_read(&shared->in_flight[WRITE]);
+}
+
+void dm_stats_init(struct dm_stats *stats)
+{
+	int cpu;
+	struct dm_stats_last_position *last;
+
+	mutex_init(&stats->mutex);
+	INIT_LIST_HEAD(&stats->list);
+	stats->last = alloc_percpu(struct dm_stats_last_position);
+	for_each_possible_cpu(cpu) {
+		last = per_cpu_ptr(stats->last, cpu);
+		last->last_sector = (sector_t)ULLONG_MAX;
+		last->last_rw = UINT_MAX;
+	}
+}
+
+void dm_stats_cleanup(struct dm_stats *stats)
+{
+	size_t ni;
+	struct dm_stat *s;
+	struct dm_stat_shared *shared;
+
+	while (!list_empty(&stats->list)) {
+		s = container_of(stats->list.next, struct dm_stat, list_entry);
+		list_del(&s->list_entry);
+		for (ni = 0; ni < s->n_entries; ni++) {
+			shared = &s->stat_shared[ni];
+			if (WARN_ON(dm_stat_in_flight(shared))) {
+				DMCRIT("leaked in-flight counter at index %lu "
+				       "(start %llu, end %llu, step %llu): reads %d, writes %d",
+				       (unsigned long)ni,
+				       (unsigned long long)s->start,
+				       (unsigned long long)s->end,
+				       (unsigned long long)s->step,
+				       atomic_read(&shared->in_flight[READ]),
+				       atomic_read(&shared->in_flight[WRITE]));
+			}
+		}
+		dm_stat_free(&s->rcu_head);
+	}
+	free_percpu(stats->last);
+}
+
+static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
+			   sector_t step, const char *program_id, const char *aux_data,
+			   void (*suspend_callback)(struct mapped_device *),
+			   void (*resume_callback)(struct mapped_device *),
+			   struct mapped_device *md)
+{
+	struct list_head *l;
+	struct dm_stat *s, *tmp_s;
+	sector_t n_entries;
+	size_t ni;
+	size_t shared_alloc_size;
+	size_t percpu_alloc_size;
+	struct dm_stat_percpu *p;
+	int cpu;
+	int ret_id;
+	int r;
+
+	if (end < start || !step)
+		return -EINVAL;
+
+	n_entries = end - start;
+	if (dm_sector_div64(n_entries, step))
+		n_entries++;
+
+	if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
+		return -EOVERFLOW;
+
+	shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
+	if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
+		return -EOVERFLOW;
+
+	percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
+	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
+		return -EOVERFLOW;
+
+	if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+		return -ENOMEM;
+
+	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
+	if (!s)
+		return -ENOMEM;
+
+	s->n_entries = n_entries;
+	s->start = start;
+	s->end = end;
+	s->step = step;
+	s->shared_alloc_size = shared_alloc_size;
+	s->percpu_alloc_size = percpu_alloc_size;
+
+	s->program_id = kstrdup(program_id, GFP_KERNEL);
+	if (!s->program_id) {
+		r = -ENOMEM;
+		goto out;
+	}
+	s->aux_data = kstrdup(aux_data, GFP_KERNEL);
+	if (!s->aux_data) {
+		r = -ENOMEM;
+		goto out;
+	}
+
+	for (ni = 0; ni < n_entries; ni++) {
+		atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
+		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
+	}
+
+	for_each_possible_cpu(cpu) {
+		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
+		if (!p) {
+			r = -ENOMEM;
+			goto out;
+		}
+		s->stat_percpu[cpu] = p;
+	}
+
+	/*
+	 * Suspend/resume to make sure there is no i/o in flight,
+	 * so that newly created statistics will be exact.
+	 *
+	 * (note: we couldn't suspend earlier because we must not
+	 * allocate memory while suspended)
+	 */
+	suspend_callback(md);
+
+	mutex_lock(&stats->mutex);
+	s->id = 0;
+	list_for_each(l, &stats->list) {
+		tmp_s = container_of(l, struct dm_stat, list_entry);
+		if (WARN_ON(tmp_s->id < s->id)) {
+			r = -EINVAL;
+			goto out_unlock_resume;
+		}
+		if (tmp_s->id > s->id)
+			break;
+		if (unlikely(s->id == INT_MAX)) {
+			r = -ENFILE;
+			goto out_unlock_resume;
+		}
+		s->id++;
+	}
+	ret_id = s->id;
+	list_add_tail_rcu(&s->list_entry, l);
+	mutex_unlock(&stats->mutex);
+
+	resume_callback(md);
+
+	return ret_id;
+
+out_unlock_resume:
+	mutex_unlock(&stats->mutex);
+	resume_callback(md);
+out:
+	dm_stat_free(&s->rcu_head);
+	return r;
+}
+
+static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
+{
+	struct dm_stat *s;
+
+	list_for_each_entry(s, &stats->list, list_entry) {
+		if (s->id > id)
+			break;
+		if (s->id == id)
+			return s;
+	}
+
+	return NULL;
+}
+
+static int dm_stats_delete(struct dm_stats *stats, int id)
+{
+	struct dm_stat *s;
+	int cpu;
+
+	mutex_lock(&stats->mutex);
+
+	s = __dm_stats_find(stats, id);
+	if (!s) {
+		mutex_unlock(&stats->mutex);
+		return -ENOENT;
+	}
+
+	list_del_rcu(&s->list_entry);
+	mutex_unlock(&stats->mutex);
+
+	/*
+	 * vfree can't be called from RCU callback
+	 */
+	for_each_possible_cpu(cpu)
+		if (is_vmalloc_addr(s->stat_percpu))
+			goto do_sync_free;
+	if (is_vmalloc_addr(s)) {
+do_sync_free:
+		synchronize_rcu_expedited();
+		dm_stat_free(&s->rcu_head);
+	} else {
+		ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
+		call_rcu(&s->rcu_head, dm_stat_free);
+	}
+	return 0;
+}
+
+static int dm_stats_list(struct dm_stats *stats, const char *program,
+			 char *result, unsigned maxlen)
+{
+	struct dm_stat *s;
+	sector_t len;
+	unsigned sz = 0;
+
+	/*
+	 * Output format:
+	 *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
+	 */
+
+	mutex_lock(&stats->mutex);
+	list_for_each_entry(s, &stats->list, list_entry) {
+		if (!program || !strcmp(program, s->program_id)) {
+			len = s->end - s->start;
+			DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
+				(unsigned long long)s->start,
+				(unsigned long long)len,
+				(unsigned long long)s->step,
+				s->program_id,
+				s->aux_data);
+		}
+	}
+	mutex_unlock(&stats->mutex);
+
+	return 1;
+}
+
+static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+{
+	/*
+	 * This is racy, but so is part_round_stats_single.
+	 */
+	unsigned long now = jiffies;
+	unsigned in_flight_read;
+	unsigned in_flight_write;
+	unsigned long difference = now - shared->stamp;
+
+	if (!difference)
+		return;
+	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
+	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
+	if (in_flight_read)
+		p->io_ticks[READ] += difference;
+	if (in_flight_write)
+		p->io_ticks[WRITE] += difference;
+	if (in_flight_read + in_flight_write) {
+		p->io_ticks_total += difference;
+		p->time_in_queue += (in_flight_read + in_flight_write) * difference;
+	}
+	shared->stamp = now;
+}
+
+static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
+			      unsigned long bi_rw, sector_t len, bool merged,
+			      bool end, unsigned long duration)
+{
+	unsigned long idx = bi_rw & REQ_WRITE;
+	struct dm_stat_shared *shared = &s->stat_shared[entry];
+	struct dm_stat_percpu *p;
+
+	/*
+	 * For strict correctness we should use local_irq_save/restore
+	 * instead of preempt_disable/enable.
+	 *
+	 * preempt_disable/enable is racy if the driver finishes bios
+	 * from non-interrupt context as well as from interrupt context
+	 * or from more different interrupts.
+	 *
+	 * On 64-bit architectures the race only results in not counting some
+	 * events, so it is acceptable.  On 32-bit architectures the race could
+	 * cause the counter going off by 2^32, so we need to do proper locking
+	 * there.
+	 *
+	 * part_stat_lock()/part_stat_unlock() have this race too.
+	 */
+#if BITS_PER_LONG == 32
+	unsigned long flags;
+	local_irq_save(flags);
+#else
+	preempt_disable();
+#endif
+	p = &s->stat_percpu[smp_processor_id()][entry];
+
+	if (!end) {
+		dm_stat_round(shared, p);
+		atomic_inc(&shared->in_flight[idx]);
+	} else {
+		dm_stat_round(shared, p);
+		atomic_dec(&shared->in_flight[idx]);
+		p->sectors[idx] += len;
+		p->ios[idx] += 1;
+		p->merges[idx] += merged;
+		p->ticks[idx] += duration;
+	}
+
+#if BITS_PER_LONG == 32
+	local_irq_restore(flags);
+#else
+	preempt_enable();
+#endif
+}
+
+static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
+			  sector_t bi_sector, sector_t end_sector,
+			  bool end, unsigned long duration,
+			  struct dm_stats_aux *stats_aux)
+{
+	sector_t rel_sector, offset, todo, fragment_len;
+	size_t entry;
+
+	if (end_sector <= s->start || bi_sector >= s->end)
+		return;
+	if (unlikely(bi_sector < s->start)) {
+		rel_sector = 0;
+		todo = end_sector - s->start;
+	} else {
+		rel_sector = bi_sector - s->start;
+		todo = end_sector - bi_sector;
+	}
+	if (unlikely(end_sector > s->end))
+		todo -= (end_sector - s->end);
+
+	offset = dm_sector_div64(rel_sector, s->step);
+	entry = rel_sector;
+	do {
+		if (WARN_ON_ONCE(entry >= s->n_entries)) {
+			DMCRIT("Invalid area access in region id %d", s->id);
+			return;
+		}
+		fragment_len = todo;
+		if (fragment_len > s->step - offset)
+			fragment_len = s->step - offset;
+		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
+				  stats_aux->merged, end, duration);
+		todo -= fragment_len;
+		entry++;
+		offset = 0;
+	} while (unlikely(todo != 0));
+}
+
+void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+			 sector_t bi_sector, unsigned bi_sectors, bool end,
+			 unsigned long duration, struct dm_stats_aux *stats_aux)
+{
+	struct dm_stat *s;
+	sector_t end_sector;
+	struct dm_stats_last_position *last;
+
+	if (unlikely(!bi_sectors))
+		return;
+
+	end_sector = bi_sector + bi_sectors;
+
+	if (!end) {
+		/*
+		 * A race condition can at worst result in the merged flag being
+		 * misrepresented, so we don't have to disable preemption here.
+		 */
+		last = __this_cpu_ptr(stats->last);
+		stats_aux->merged =
+			(bi_sector == (ACCESS_ONCE(last->last_sector) &&
+				       ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
+					(ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD)))
+				       ));
+		ACCESS_ONCE(last->last_sector) = end_sector;
+		ACCESS_ONCE(last->last_rw) = bi_rw;
+	}
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(s, &stats->list, list_entry)
+		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+
+	rcu_read_unlock();
+}
+
+static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
+						   struct dm_stat *s, size_t x)
+{
+	int cpu;
+	struct dm_stat_percpu *p;
+
+	local_irq_disable();
+	p = &s->stat_percpu[smp_processor_id()][x];
+	dm_stat_round(shared, p);
+	local_irq_enable();
+
+	memset(&shared->tmp, 0, sizeof(shared->tmp));
+	for_each_possible_cpu(cpu) {
+		p = &s->stat_percpu[cpu][x];
+		shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
+		shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
+		shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
+		shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
+		shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
+		shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
+		shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
+		shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
+		shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
+		shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
+		shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
+		shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+	}
+}
+
+static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
+			    bool init_tmp_percpu_totals)
+{
+	size_t x;
+	struct dm_stat_shared *shared;
+	struct dm_stat_percpu *p;
+
+	for (x = idx_start; x < idx_end; x++) {
+		shared = &s->stat_shared[x];
+		if (init_tmp_percpu_totals)
+			__dm_stat_init_temporary_percpu_totals(shared, s, x);
+		local_irq_disable();
+		p = &s->stat_percpu[smp_processor_id()][x];
+		p->sectors[READ] -= shared->tmp.sectors[READ];
+		p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
+		p->ios[READ] -= shared->tmp.ios[READ];
+		p->ios[WRITE] -= shared->tmp.ios[WRITE];
+		p->merges[READ] -= shared->tmp.merges[READ];
+		p->merges[WRITE] -= shared->tmp.merges[WRITE];
+		p->ticks[READ] -= shared->tmp.ticks[READ];
+		p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
+		p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
+		p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
+		p->io_ticks_total -= shared->tmp.io_ticks_total;
+		p->time_in_queue -= shared->tmp.time_in_queue;
+		local_irq_enable();
+	}
+}
+
+static int dm_stats_clear(struct dm_stats *stats, int id)
+{
+	struct dm_stat *s;
+
+	mutex_lock(&stats->mutex);
+
+	s = __dm_stats_find(stats, id);
+	if (!s) {
+		mutex_unlock(&stats->mutex);
+		return -ENOENT;
+	}
+
+	__dm_stat_clear(s, 0, s->n_entries, true);
+
+	mutex_unlock(&stats->mutex);
+
+	return 1;
+}
+
+/*
+ * This is like jiffies_to_msec, but works for 64-bit values.
+ */
+static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+{
+	unsigned long long result = 0;
+	unsigned mult;
+
+	if (j)
+		result = jiffies_to_msecs(j & 0x3fffff);
+	if (j >= 1 << 22) {
+		mult = jiffies_to_msecs(1 << 22);
+		result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
+	}
+	if (j >= 1ULL << 44)
+		result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
+
+	return result;
+}
+
+static int dm_stats_print(struct dm_stats *stats, int id,
+			  size_t idx_start, size_t idx_len,
+			  bool clear, char *result, unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct dm_stat *s;
+	size_t x;
+	sector_t start, end, step;
+	size_t idx_end;
+	struct dm_stat_shared *shared;
+
+	/*
+	 * Output format:
+	 *   <start_sector>+<length> counters
+	 */
+
+	mutex_lock(&stats->mutex);
+
+	s = __dm_stats_find(stats, id);
+	if (!s) {
+		mutex_unlock(&stats->mutex);
+		return -ENOENT;
+	}
+
+	idx_end = idx_start + idx_len;
+	if (idx_end < idx_start ||
+	    idx_end > s->n_entries)
+		idx_end = s->n_entries;
+
+	if (idx_start > idx_end)
+		idx_start = idx_end;
+
+	step = s->step;
+	start = s->start + (step * idx_start);
+
+	for (x = idx_start; x < idx_end; x++, start = end) {
+		shared = &s->stat_shared[x];
+		end = start + step;
+		if (unlikely(end > s->end))
+			end = s->end;
+
+		__dm_stat_init_temporary_percpu_totals(shared, s, x);
+
+		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+		       (unsigned long long)start,
+		       (unsigned long long)step,
+		       shared->tmp.ios[READ],
+		       shared->tmp.merges[READ],
+		       shared->tmp.sectors[READ],
+		       dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+		       shared->tmp.ios[WRITE],
+		       shared->tmp.merges[WRITE],
+		       shared->tmp.sectors[WRITE],
+		       dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+		       dm_stat_in_flight(shared),
+		       dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
+		       dm_jiffies_to_msec64(shared->tmp.time_in_queue),
+		       dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
+		       dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+
+		if (unlikely(sz + 1 >= maxlen))
+			goto buffer_overflow;
+	}
+
+	if (clear)
+		__dm_stat_clear(s, idx_start, idx_end, false);
+
+buffer_overflow:
+	mutex_unlock(&stats->mutex);
+
+	return 1;
+}
+
+static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
+{
+	struct dm_stat *s;
+	const char *new_aux_data;
+
+	mutex_lock(&stats->mutex);
+
+	s = __dm_stats_find(stats, id);
+	if (!s) {
+		mutex_unlock(&stats->mutex);
+		return -ENOENT;
+	}
+
+	new_aux_data = kstrdup(aux_data, GFP_KERNEL);
+	if (!new_aux_data) {
+		mutex_unlock(&stats->mutex);
+		return -ENOMEM;
+	}
+
+	kfree(s->aux_data);
+	s->aux_data = new_aux_data;
+
+	mutex_unlock(&stats->mutex);
+
+	return 0;
+}
+
+static int message_stats_create(struct mapped_device *md,
+				unsigned argc, char **argv,
+				char *result, unsigned maxlen)
+{
+	int id;
+	char dummy;
+	unsigned long long start, end, len, step;
+	unsigned divisor;
+	const char *program_id, *aux_data;
+
+	/*
+	 * Input format:
+	 *   <range> <step> [<program_id> [<aux_data>]]
+	 */
+
+	if (argc < 3 || argc > 5)
+		return -EINVAL;
+
+	if (!strcmp(argv[1], "-")) {
+		start = 0;
+		len = dm_get_size(md);
+		if (!len)
+			len = 1;
+	} else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+		   start != (sector_t)start || len != (sector_t)len)
+		return -EINVAL;
+
+	end = start + len;
+	if (start >= end)
+		return -EINVAL;
+
+	if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+		step = end - start;
+		if (do_div(step, divisor))
+			step++;
+		if (!step)
+			step = 1;
+	} else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+		   step != (sector_t)step || !step)
+		return -EINVAL;
+
+	program_id = "-";
+	aux_data = "-";
+
+	if (argc > 3)
+		program_id = argv[3];
+
+	if (argc > 4)
+		aux_data = argv[4];
+
+	/*
+	 * If a buffer overflow happens after we created the region,
+	 * it's too late (the userspace would retry with a larger
+	 * buffer, but the region id that caused the overflow is already
+	 * leaked).  So we must detect buffer overflow in advance.
+	 */
+	snprintf(result, maxlen, "%d", INT_MAX);
+	if (dm_message_test_buffer_overflow(result, maxlen))
+		return 1;
+
+	id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+			     dm_internal_suspend, dm_internal_resume, md);
+	if (id < 0)
+		return id;
+
+	snprintf(result, maxlen, "%d", id);
+
+	return 1;
+}
+
+static int message_stats_delete(struct mapped_device *md,
+				unsigned argc, char **argv)
+{
+	int id;
+	char dummy;
+
+	if (argc != 2)
+		return -EINVAL;
+
+	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+		return -EINVAL;
+
+	return dm_stats_delete(dm_get_stats(md), id);
+}
+
+static int message_stats_clear(struct mapped_device *md,
+			       unsigned argc, char **argv)
+{
+	int id;
+	char dummy;
+
+	if (argc != 2)
+		return -EINVAL;
+
+	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+		return -EINVAL;
+
+	return dm_stats_clear(dm_get_stats(md), id);
+}
+
+static int message_stats_list(struct mapped_device *md,
+			      unsigned argc, char **argv,
+			      char *result, unsigned maxlen)
+{
+	int r;
+	const char *program = NULL;
+
+	if (argc < 1 || argc > 2)
+		return -EINVAL;
+
+	if (argc > 1) {
+		program = kstrdup(argv[1], GFP_KERNEL);
+		if (!program)
+			return -ENOMEM;
+	}
+
+	r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
+
+	kfree(program);
+
+	return r;
+}
+
+static int message_stats_print(struct mapped_device *md,
+			       unsigned argc, char **argv, bool clear,
+			       char *result, unsigned maxlen)
+{
+	int id;
+	char dummy;
+	unsigned long idx_start = 0, idx_len = ULONG_MAX;
+
+	if (argc != 2 && argc != 4)
+		return -EINVAL;
+
+	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+		return -EINVAL;
+
+	if (argc > 3) {
+		if (strcmp(argv[2], "-") &&
+		    sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
+			return -EINVAL;
+		if (strcmp(argv[3], "-") &&
+		    sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
+			return -EINVAL;
+	}
+
+	return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
+			      result, maxlen);
+}
+
+static int message_stats_set_aux(struct mapped_device *md,
+				 unsigned argc, char **argv)
+{
+	int id;
+	char dummy;
+
+	if (argc != 3)
+		return -EINVAL;
+
+	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+		return -EINVAL;
+
+	return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
+}
+
+int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
+		     char *result, unsigned maxlen)
+{
+	int r;
+
+	if (dm_request_based(md)) {
+		DMWARN("Statistics are only supported for bio-based devices");
+		return -EOPNOTSUPP;
+	}
+
+	/* All messages here must start with '@' */
+	if (!strcasecmp(argv[0], "@stats_create"))
+		r = message_stats_create(md, argc, argv, result, maxlen);
+	else if (!strcasecmp(argv[0], "@stats_delete"))
+		r = message_stats_delete(md, argc, argv);
+	else if (!strcasecmp(argv[0], "@stats_clear"))
+		r = message_stats_clear(md, argc, argv);
+	else if (!strcasecmp(argv[0], "@stats_list"))
+		r = message_stats_list(md, argc, argv, result, maxlen);
+	else if (!strcasecmp(argv[0], "@stats_print"))
+		r = message_stats_print(md, argc, argv, false, result, maxlen);
+	else if (!strcasecmp(argv[0], "@stats_print_clear"))
+		r = message_stats_print(md, argc, argv, true, result, maxlen);
+	else if (!strcasecmp(argv[0], "@stats_set_aux"))
+		r = message_stats_set_aux(md, argc, argv);
+	else
+		return 2; /* this wasn't a stats message */
+
+	if (r == -EINVAL)
+		DMWARN("Invalid parameters for message %s", argv[0]);
+
+	return r;
+}
+
+int __init dm_statistics_init(void)
+{
+	shared_memory_amount = 0;
+	dm_stat_need_rcu_barrier = 0;
+	return 0;
+}
+
+void dm_statistics_exit(void)
+{
+	if (dm_stat_need_rcu_barrier)
+		rcu_barrier();
+	if (WARN_ON(shared_memory_amount))
+		DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
+}
+
+module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
+MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
new file mode 100644
index 00000000000..e7c4984bf23
--- /dev/null
+++ b/drivers/md/dm-stats.h
@@ -0,0 +1,40 @@
+#ifndef DM_STATS_H
+#define DM_STATS_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+
+int dm_statistics_init(void);
+void dm_statistics_exit(void);
+
+struct dm_stats {
+	struct mutex mutex;
+	struct list_head list;	/* list of struct dm_stat */
+	struct dm_stats_last_position __percpu *last;
+	sector_t last_sector;
+	unsigned last_rw;
+};
+
+struct dm_stats_aux {
+	bool merged;
+};
+
+void dm_stats_init(struct dm_stats *st);
+void dm_stats_cleanup(struct dm_stats *st);
+
+struct mapped_device;
+
+int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
+		     char *result, unsigned maxlen);
+
+void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+			 sector_t bi_sector, unsigned bi_sectors, bool end,
+			 unsigned long duration, struct dm_stats_aux *aux);
+
+static inline bool dm_stats_used(struct dm_stats *st)
+{
+	return !list_empty(&st->list);
+}
+
+#endif
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e2f87653974..d1600d2aa2e 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -4,6 +4,7 @@
  * This file is released under the GPL.
  */
 
+#include "dm.h"
 #include <linux/device-mapper.h>
 
 #include <linux/module.h>
@@ -94,7 +95,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
 static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct stripe_c *sc;
-	sector_t width;
+	sector_t width, tmp_len;
 	uint32_t stripes;
 	uint32_t chunk_size;
 	int r;
@@ -116,15 +117,16 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	width = ti->len;
-	if (sector_div(width, chunk_size)) {
+	if (sector_div(width, stripes)) {
 		ti->error = "Target length not divisible by "
-		    "chunk size";
+		    "number of stripes";
 		return -EINVAL;
 	}
 
-	if (sector_div(width, stripes)) {
+	tmp_len = width;
+	if (sector_div(tmp_len, chunk_size)) {
 		ti->error = "Target length not divisible by "
-		    "number of stripes";
+		    "chunk size";
 		return -EINVAL;
 	}
 
@@ -160,8 +162,9 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (r)
 		return r;
 
-	ti->num_flush_requests = stripes;
-	ti->num_discard_requests = stripes;
+	ti->num_flush_bios = stripes;
+	ti->num_discard_bios = stripes;
+	ti->num_write_same_bios = stripes;
 
 	sc->chunk_size = chunk_size;
 	if (chunk_size & (chunk_size - 1))
@@ -251,18 +254,20 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
 		*result += sc->chunk_size;		/* next chunk */
 }
 
-static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
-			      uint32_t target_stripe)
+static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
+			    uint32_t target_stripe)
 {
 	sector_t begin, end;
 
-	stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
-	stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio),
+	stripe_map_range_sector(sc, bio->bi_iter.bi_sector,
+				target_stripe, &begin);
+	stripe_map_range_sector(sc, bio_end_sector(bio),
 				target_stripe, &end);
 	if (begin < end) {
 		bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
-		bio->bi_sector = begin + sc->stripe[target_stripe].physical_start;
-		bio->bi_size = to_bytes(end - begin);
+		bio->bi_iter.bi_sector = begin +
+			sc->stripe[target_stripe].physical_start;
+		bio->bi_iter.bi_size = to_bytes(end - begin);
 		return DM_MAPIO_REMAPPED;
 	} else {
 		/* The range doesn't map to the target stripe */
@@ -271,28 +276,29 @@ static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
 	}
 }
 
-static int stripe_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int stripe_map(struct dm_target *ti, struct bio *bio)
 {
 	struct stripe_c *sc = ti->private;
 	uint32_t stripe;
-	unsigned target_request_nr;
+	unsigned target_bio_nr;
 
 	if (bio->bi_rw & REQ_FLUSH) {
-		target_request_nr = map_context->target_request_nr;
-		BUG_ON(target_request_nr >= sc->stripes);
-		bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
+		target_bio_nr = dm_bio_get_target_bio_nr(bio);
+		BUG_ON(target_bio_nr >= sc->stripes);
+		bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
-	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
-		target_request_nr = map_context->target_request_nr;
-		BUG_ON(target_request_nr >= sc->stripes);
-		return stripe_map_discard(sc, bio, target_request_nr);
+	if (unlikely(bio->bi_rw & REQ_DISCARD) ||
+	    unlikely(bio->bi_rw & REQ_WRITE_SAME)) {
+		target_bio_nr = dm_bio_get_target_bio_nr(bio);
+		BUG_ON(target_bio_nr >= sc->stripes);
+		return stripe_map_range(sc, bio, target_bio_nr);
 	}
 
-	stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
+	stripe_map_sector(sc, bio->bi_iter.bi_sector,
+			  &stripe, &bio->bi_iter.bi_sector);
 
-	bio->bi_sector += sc->stripe[stripe].physical_start;
+	bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start;
 	bio->bi_bdev = sc->stripe[stripe].dev->bdev;
 
 	return DM_MAPIO_REMAPPED;
@@ -311,8 +317,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
  *
  */
 
-static int stripe_status(struct dm_target *ti, status_type_t type,
-			 unsigned status_flags, char *result, unsigned maxlen)
+static void stripe_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct stripe_c *sc = (struct stripe_c *) ti->private;
 	char buffer[sc->stripes + 1];
@@ -339,11 +345,9 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
 			    (unsigned long long)sc->stripe[i].physical_start);
 		break;
 	}
-	return 0;
 }
 
-static int stripe_end_io(struct dm_target *ti, struct bio *bio,
-			 int error, union map_info *map_context)
+static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	unsigned i;
 	char major_minor[16];
@@ -428,7 +432,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 5, 0},
+	.version = {1, 5, 1},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
new file mode 100644
index 00000000000..09a688b3d48
--- /dev/null
+++ b/drivers/md/dm-switch.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
+ * Copyright (C) 2011-2013 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * dm-switch is a device-mapper target that maps IO to underlying block
+ * devices efficiently when there are a large number of fixed-sized
+ * address regions but there is no simple pattern to allow for a compact
+ * mapping representation such as dm-stripe.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One region_table_slot_t holds <region_entries_per_slot> region table
+ * entries each of which is <region_table_entry_bits> in size.
+ */
+typedef unsigned long region_table_slot_t;
+
+/*
+ * A device with the offset to its start sector.
+ */
+struct switch_path {
+	struct dm_dev *dmdev;
+	sector_t start;
+};
+
+/*
+ * Context block for a dm switch device.
+ */
+struct switch_ctx {
+	struct dm_target *ti;
+
+	unsigned nr_paths;		/* Number of paths in path_list. */
+
+	unsigned region_size;		/* Region size in 512-byte sectors */
+	unsigned long nr_regions;	/* Number of regions making up the device */
+	signed char region_size_bits;	/* log2 of region_size or -1 */
+
+	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
+	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
+	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */
+
+	region_table_slot_t *region_table;	/* Region table */
+
+	/*
+	 * Array of dm devices to switch between.
+	 */
+	struct switch_path path_list[0];
+};
+
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+					   unsigned region_size)
+{
+	struct switch_ctx *sctx;
+
+	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
+		       GFP_KERNEL);
+	if (!sctx)
+		return NULL;
+
+	sctx->ti = ti;
+	sctx->region_size = region_size;
+
+	ti->private = sctx;
+
+	return sctx;
+}
+
+static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
+{
+	struct switch_ctx *sctx = ti->private;
+	sector_t nr_regions = ti->len;
+	sector_t nr_slots;
+
+	if (!(sctx->region_size & (sctx->region_size - 1)))
+		sctx->region_size_bits = __ffs(sctx->region_size);
+	else
+		sctx->region_size_bits = -1;
+
+	sctx->region_table_entry_bits = 1;
+	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
+	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
+		sctx->region_table_entry_bits++;
+
+	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
+	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
+		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
+	else
+		sctx->region_entries_per_slot_bits = -1;
+
+	if (sector_div(nr_regions, sctx->region_size))
+		nr_regions++;
+
+	sctx->nr_regions = nr_regions;
+	if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+		ti->error = "Region table too large";
+		return -EINVAL;
+	}
+
+	nr_slots = nr_regions;
+	if (sector_div(nr_slots, sctx->region_entries_per_slot))
+		nr_slots++;
+
+	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
+		ti->error = "Region table too large";
+		return -EINVAL;
+	}
+
+	sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
+	if (!sctx->region_table) {
+		ti->error = "Cannot allocate region table";
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
+				unsigned long *region_index, unsigned *bit)
+{
+	if (sctx->region_entries_per_slot_bits >= 0) {
+		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
+		*bit = region_nr & (sctx->region_entries_per_slot - 1);
+	} else {
+		*region_index = region_nr / sctx->region_entries_per_slot;
+		*bit = region_nr % sctx->region_entries_per_slot;
+	}
+
+	*bit *= sctx->region_table_entry_bits;
+}
+
+/*
+ * Find which path to use at given offset.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+	unsigned long region_index;
+	unsigned bit, path_nr;
+	sector_t p;
+
+	p = offset;
+	if (sctx->region_size_bits >= 0)
+		p >>= sctx->region_size_bits;
+	else
+		sector_div(p, sctx->region_size);
+
+	switch_get_position(sctx, p, &region_index, &bit);
+	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+	       ((1 << sctx->region_table_entry_bits) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(path_nr >= sctx->nr_paths))
+		path_nr = 0;
+
+	return path_nr;
+}
+
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
+				      unsigned value)
+{
+	unsigned long region_index;
+	unsigned bit;
+	region_table_slot_t pte;
+
+	switch_get_position(sctx, region_nr, &region_index, &bit);
+
+	pte = sctx->region_table[region_index];
+	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
+	pte |= (region_table_slot_t)value << bit;
+	sctx->region_table[region_index] = pte;
+}
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ */
+static void initialise_region_table(struct switch_ctx *sctx)
+{
+	unsigned path_nr = 0;
+	unsigned long region_nr;
+
+	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+		switch_region_table_write(sctx, region_nr, path_nr);
+		if (++path_nr >= sctx->nr_paths)
+			path_nr = 0;
+	}
+}
+
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned long long start;
+	int r;
+
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+			  &sctx->path_list[sctx->nr_paths].dmdev);
+	if (r) {
+		ti->error = "Device lookup failed";
+		return r;
+	}
+
+	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+		ti->error = "Invalid device starting offset";
+		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+		return -EINVAL;
+	}
+
+	sctx->path_list[sctx->nr_paths].start = start;
+
+	sctx->nr_paths++;
+
+	return 0;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+
+	while (sctx->nr_paths--)
+		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+
+	vfree(sctx->region_table);
+	kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
+ *   [<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static struct dm_arg _args[] = {
+		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
+		{1, UINT_MAX, "Invalid region size"},
+		{0, 0, "Invalid number of optional args"},
+	};
+
+	struct switch_ctx *sctx;
+	struct dm_arg_set as;
+	unsigned nr_paths, region_size, nr_optional_args;
+	int r;
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+	if (r)
+		return r;
+
+	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+	if (r)
+		return r;
+	/* parse optional arguments here, if we add any */
+
+	if (as.argc != nr_paths * 2) {
+		ti->error = "Incorrect number of path arguments";
+		return -EINVAL;
+	}
+
+	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+	if (!sctx) {
+		ti->error = "Cannot allocate redirection context";
+		return -ENOMEM;
+	}
+
+	r = dm_set_target_max_io_len(ti, region_size);
+	if (r)
+		goto error;
+
+	while (as.argc) {
+		r = parse_path(&as, ti);
+		if (r)
+			goto error;
+	}
+
+	r = alloc_region_table(ti, nr_paths);
+	if (r)
+		goto error;
+
+	initialise_region_table(sctx);
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_bios = 1;
+
+	return 0;
+
+error:
+	switch_dtr(ti);
+
+	return r;
+}
+
+static int switch_map(struct dm_target *ti, struct bio *bio)
+{
+	struct switch_ctx *sctx = ti->private;
+	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
+	unsigned path_nr = switch_get_path_nr(sctx, offset);
+
+	bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
+	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers in the message as quickly as possible.
+ *
+ * This table-based hex parser improves performance.
+ * It improves a time to load 1000000 entries compared to the condition-based
+ * parser.
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ */
+static const unsigned char hex_table[256] = {
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static __always_inline unsigned long parse_hex(const char **string)
+{
+	unsigned char d;
+	unsigned long r = 0;
+
+	while ((d = hex_table[(unsigned char)**string]) < 16) {
+		r = (r << 4) | d;
+		(*string)++;
+	}
+
+	return r;
+}
+
+static int process_set_region_mappings(struct switch_ctx *sctx,
+			     unsigned argc, char **argv)
+{
+	unsigned i;
+	unsigned long region_index = 0;
+
+	for (i = 1; i < argc; i++) {
+		unsigned long path_nr;
+		const char *string = argv[i];
+
+		if (*string == ':')
+			region_index++;
+		else {
+			region_index = parse_hex(&string);
+			if (unlikely(*string != ':')) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+		}
+
+		string++;
+		if (unlikely(!*string)) {
+			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+			return -EINVAL;
+		}
+
+		path_nr = parse_hex(&string);
+		if (unlikely(*string)) {
+			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+			return -EINVAL;
+		}
+		if (unlikely(region_index >= sctx->nr_regions)) {
+			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
+			return -EINVAL;
+		}
+		if (unlikely(path_nr >= sctx->nr_paths)) {
+			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
+			return -EINVAL;
+		}
+
+		switch_region_table_write(sctx, region_index, path_nr);
+	}
+
+	return 0;
+}
+
+/*
+ * Messages are processed one-at-a-time.
+ *
+ * Only set_region_mappings is supported.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+
+	struct switch_ctx *sctx = ti->private;
+	int r = -EINVAL;
+
+	mutex_lock(&message_mutex);
+
+	if (!strcasecmp(argv[0], "set_region_mappings"))
+		r = process_set_region_mappings(sctx, argc, argv);
+	else
+		DMWARN("Unrecognised message received.");
+
+	mutex_unlock(&message_mutex);
+
+	return r;
+}
+
+static void switch_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned sz = 0;
+	int path_nr;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
+		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
+			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
+			       (unsigned long long)sctx->path_list[path_nr].start);
+		break;
+	}
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *sctx = ti->private;
+	struct block_device *bdev;
+	fmode_t mode;
+	unsigned path_nr;
+	int r = 0;
+
+	path_nr = switch_get_path_nr(sctx, 0);
+
+	bdev = sctx->path_list[path_nr].dmdev->bdev;
+	mode = sctx->path_list[path_nr].dmdev->mode;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *sctx = ti->private;
+	int path_nr;
+	int r;
+
+	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+		r = fn(ti, sctx->path_list[path_nr].dmdev,
+			 sctx->path_list[path_nr].start, ti->len, data);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,
+	.dtr = switch_dtr,
+	.map = switch_map,
+	.message = switch_message,
+	.status = switch_status,
+	.ioctl = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r < 0)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 84d2b91e4ef..c62c5ab6aed 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = {
 static struct kobj_type dm_ktype = {
 	.sysfs_ops	= &dm_sysfs_ops,
 	.default_attrs	= dm_attrs,
+	.release	= dm_kobject_release,
 };
 
 /*
@@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md)
  */
 void dm_sysfs_exit(struct mapped_device *md)
 {
-	kobject_put(dm_kobject(md));
+	struct kobject *kobj = dm_kobject(md);
+	kobject_put(kobj);
+	wait_for_completion(dm_get_completion_from_kobject(kobj));
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 100368eb799..5f59f1e3e5b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -26,22 +26,8 @@
 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
 
-/*
- * The table has always exactly one reference from either mapped_device->map
- * or hash_cell->new_map. This reference is not counted in table->holders.
- * A pair of dm_create_table/dm_destroy_table functions is used for table
- * creation/destruction.
- *
- * Temporary references from the other code increase table->holders. A pair
- * of dm_table_get/dm_table_put functions is used to manipulate it.
- *
- * When the table is about to be destroyed, we wait for table->holders to
- * drop to zero.
- */
-
 struct dm_table {
 	struct mapped_device *md;
-	atomic_t holders;
 	unsigned type;
 
 	/* btree table */
@@ -169,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
 {
 	sector_t *n_highs;
 	struct dm_target *n_targets;
-	int n = t->num_targets;
 
 	/*
 	 * Allocate both the target array and offset array at once.
@@ -183,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
 
 	n_targets = (struct dm_target *) (n_highs + num);
 
-	if (n) {
-		memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
-		memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
-	}
-
-	memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
+	memset(n_highs, -1, sizeof(*n_highs) * num);
 	vfree(t->highs);
 
 	t->num_allocated = num;
@@ -208,16 +188,19 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	INIT_LIST_HEAD(&t->devices);
 	INIT_LIST_HEAD(&t->target_callbacks);
-	atomic_set(&t->holders, 0);
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
 
 	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 
+	if (!num_targets) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
 	if (alloc_targets(t, num_targets)) {
 		kfree(t);
-		t = NULL;
 		return -ENOMEM;
 	}
 
@@ -247,10 +230,6 @@ void dm_table_destroy(struct dm_table *t)
 	if (!t)
 		return;
 
-	while (atomic_read(&t->holders))
-		msleep(1);
-	smp_mb();
-
 	/* free the indexes */
 	if (t->depth >= 2)
 		vfree(t->index[t->depth - 2]);
@@ -275,33 +254,6 @@ void dm_table_destroy(struct dm_table *t)
 	kfree(t);
 }
 
-void dm_table_get(struct dm_table *t)
-{
-	atomic_inc(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_get);
-
-void dm_table_put(struct dm_table *t)
-{
-	if (!t)
-		return;
-
-	smp_mb__before_atomic_dec();
-	atomic_dec(&t->holders);
-}
-EXPORT_SYMBOL(dm_table_put);
-
-/*
- * Checks to see if we need to extend highs or targets.
- */
-static inline int check_space(struct dm_table *t)
-{
-	if (t->num_targets >= t->num_allocated)
-		return alloc_targets(t, t->num_allocated * 2);
-
-	return 0;
-}
-
 /*
  * See if we've already got a device in the list.
  */
@@ -513,8 +465,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 }
 EXPORT_SYMBOL(dm_get_device);
 
-int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
-			 sector_t start, sector_t len, void *data)
+static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+				sector_t start, sector_t len, void *data)
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
@@ -547,7 +499,6 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 					  (unsigned int) (PAGE_SIZE >> 9));
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
 /*
  * Decrement a device's use count and remove it if necessary.
@@ -581,14 +532,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
 
 /*
  * Used to dynamically allocate the arg array.
+ *
+ * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
+ * process messages even if some device is suspended. These messages have a
+ * small fixed number of arguments.
+ *
+ * On the other hand, dm-switch needs to process bulk data using messages and
+ * excessive use of GFP_NOIO could cause trouble.
  */
 static char **realloc_argv(unsigned *array_size, char **old_argv)
 {
 	char **argv;
 	unsigned new_size;
+	gfp_t gfp;
 
-	new_size = *array_size ? *array_size * 2 : 64;
-	argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
+	if (*array_size) {
+		new_size = *array_size * 2;
+		gfp = GFP_KERNEL;
+	} else {
+		new_size = 8;
+		gfp = GFP_NOIO;
+	}
+	argv = kmalloc(new_size * sizeof(*argv), gfp);
 	if (argv) {
 		memcpy(argv, old_argv, *array_size * sizeof(*argv));
 		*array_size = new_size;
@@ -748,8 +713,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 		return -EINVAL;
 	}
 
-	if ((r = check_space(t)))
-		return r;
+	BUG_ON(t->num_targets >= t->num_allocated);
 
 	tgt = t->targets + t->num_targets;
 	memset(tgt, 0, sizeof(*tgt));
@@ -823,8 +787,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	if (!tgt->num_discard_requests && tgt->discards_supported)
-		DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
+	if (!tgt->num_discard_bios && tgt->discards_supported)
+		DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
 		       dm_device_name(t->md), type);
 
 	return 0;
@@ -896,14 +860,17 @@ EXPORT_SYMBOL(dm_consume_args);
 static int dm_table_set_type(struct dm_table *t)
 {
 	unsigned i;
-	unsigned bio_based = 0, request_based = 0;
+	unsigned bio_based = 0, request_based = 0, hybrid = 0;
 	struct dm_target *tgt;
 	struct dm_dev_internal *dd;
 	struct list_head *devices;
+	unsigned live_md_type;
 
 	for (i = 0; i < t->num_targets; i++) {
 		tgt = t->targets + i;
-		if (dm_target_request_based(tgt))
+		if (dm_target_hybrid(tgt))
+			hybrid = 1;
+		else if (dm_target_request_based(tgt))
 			request_based = 1;
 		else
 			bio_based = 1;
@@ -915,6 +882,19 @@ static int dm_table_set_type(struct dm_table *t)
 		}
 	}
 
+	if (hybrid && !bio_based && !request_based) {
+		/*
+		 * The targets can work either way.
+		 * Determine the type from the live device.
+		 * Default to bio-based if device is new.
+		 */
+		live_md_type = dm_get_md_type(t->md);
+		if (live_md_type == DM_TYPE_REQUEST_BASED)
+			request_based = 1;
+		else
+			bio_based = 1;
+	}
+
 	if (bio_based) {
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
@@ -964,16 +944,25 @@ bool dm_table_request_based(struct dm_table *t)
 	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
 }
 
-int dm_table_alloc_md_mempools(struct dm_table *t)
+static int dm_table_alloc_md_mempools(struct dm_table *t)
 {
 	unsigned type = dm_table_get_type(t);
+	unsigned per_bio_data_size = 0;
+	struct dm_target *tgt;
+	unsigned i;
 
 	if (unlikely(type == DM_TYPE_NONE)) {
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
-	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
+	if (type == DM_TYPE_BIO_BASED)
+		for (i = 0; i < t->num_targets; i++) {
+			tgt = t->targets + i;
+			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+		}
+
+	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
@@ -1351,7 +1340,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
-		if (!ti->num_flush_requests)
+		if (!ti->num_flush_bios)
 			continue;
 
 		if (ti->flush_supported)
@@ -1414,6 +1403,33 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
 	return 1;
 }
 
+static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
+					 sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && !q->limits.max_write_same_sectors;
+}
+
+static bool dm_table_supports_write_same(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->num_write_same_bios)
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
+			return false;
+	}
+
+	return true;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 			       struct queue_limits *limits)
 {
@@ -1445,6 +1461,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
 
+	if (!dm_table_supports_write_same(t))
+		q->limits.max_write_same_sectors = 0;
+
 	dm_table_set_integrity(t);
 
 	/*
@@ -1529,8 +1548,11 @@ int dm_table_resume_targets(struct dm_table *t)
 			continue;
 
 		r = ti->type->preresume(ti);
-		if (r)
+		if (r) {
+			DMERR("%s: %s: preresume failed, error = %d",
+			      dm_device_name(t->md), ti->type->name, r);
 			return r;
+		}
 	}
 
 	for (i = 0; i < t->num_targets; i++) {
@@ -1595,6 +1617,25 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 }
 EXPORT_SYMBOL(dm_table_get_md);
 
+void dm_table_run_md_queue_async(struct dm_table *t)
+{
+	struct mapped_device *md;
+	struct request_queue *queue;
+	unsigned long flags;
+
+	if (!dm_table_request_based(t))
+		return;
+
+	md = dm_table_get_md(t);
+	queue = dm_get_md_queue(md);
+	if (queue) {
+		spin_lock_irqsave(queue->queue_lock, flags);
+		blk_run_queue_async(queue);
+		spin_unlock_irqrestore(queue->queue_lock, flags);
+	}
+}
+EXPORT_SYMBOL(dm_table_run_md_queue_async);
+
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
 {
@@ -1618,7 +1659,7 @@ bool dm_table_supports_discards(struct dm_table *t)
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
-		if (!ti->num_discard_requests)
+		if (!ti->num_discard_bios)
 			continue;
 
 		if (ti->discards_supported)
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 8da366cf381..242e3cec397 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -116,7 +116,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
 	/*
 	 * Return error for discards instead of -EOPNOTSUPP
 	 */
-	tt->num_discard_requests = 1;
+	tt->num_discard_bios = 1;
 
 	return 0;
 }
@@ -126,18 +126,24 @@ static void io_err_dtr(struct dm_target *tt)
 	/* empty */
 }
 
-static int io_err_map(struct dm_target *tt, struct bio *bio,
-		      union map_info *map_context)
+static int io_err_map(struct dm_target *tt, struct bio *bio)
+{
+	return -EIO;
+}
+
+static int io_err_map_rq(struct dm_target *ti, struct request *clone,
+			 union map_info *map_context)
 {
 	return -EIO;
 }
 
 static struct target_type error_target = {
 	.name = "error",
-	.version = {1, 0, 1},
+	.version = {1, 2, 0},
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
 	.map  = io_err_map,
+	.map_rq = io_err_map_rq,
 };
 
 int __init dm_target_init(void)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 693e149e972..e9d33ad59df 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
 
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
@@ -192,6 +192,13 @@ struct dm_pool_metadata {
 	 * operation possible in this state is the closing of the device.
 	 */
 	bool fail_io:1;
+
+	/*
+	 * Reading the space map roots can fail, so we read it into these
+	 * buffers before the superblock is locked and updated.
+	 */
+	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
 };
 
 struct dm_thin_device {
@@ -280,7 +287,7 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
 	*t = v & ((1 << 24) - 1);
 }
 
-static void data_block_inc(void *context, void *value_le)
+static void data_block_inc(void *context, const void *value_le)
 {
 	struct dm_space_map *sm = context;
 	__le64 v_le;
@@ -292,7 +299,7 @@ static void data_block_inc(void *context, void *value_le)
 	dm_sm_inc_block(sm, b);
 }
 
-static void data_block_dec(void *context, void *value_le)
+static void data_block_dec(void *context, const void *value_le)
 {
 	struct dm_space_map *sm = context;
 	__le64 v_le;
@@ -304,7 +311,7 @@ static void data_block_dec(void *context, void *value_le)
 	dm_sm_dec_block(sm, b);
 }
 
-static int data_block_equal(void *context, void *value1_le, void *value2_le)
+static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
 {
 	__le64 v1_le, v2_le;
 	uint64_t b1, b2;
@@ -318,7 +325,7 @@ static int data_block_equal(void *context, void *value1_le, void *value2_le)
 	return b1 == b2;
 }
 
-static void subtree_inc(void *context, void *value)
+static void subtree_inc(void *context, const void *value)
 {
 	struct dm_btree_info *info = context;
 	__le64 root_le;
@@ -329,7 +336,7 @@ static void subtree_inc(void *context, void *value)
 	dm_tm_inc(info->tm, root);
 }
 
-static void subtree_dec(void *context, void *value)
+static void subtree_dec(void *context, const void *value)
 {
 	struct dm_btree_info *info = context;
 	__le64 root_le;
@@ -341,7 +348,7 @@ static void subtree_dec(void *context, void *value)
 		DMERR("btree delete failed\n");
 }
 
-static int subtree_equal(void *context, void *value1_le, void *value2_le)
+static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
 {
 	__le64 v1_le, v2_le;
 	memcpy(&v1_le, value1_le, sizeof(v1_le));
@@ -408,7 +415,7 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd)
 
 	pmd->tl_info.tm = pmd->tm;
 	pmd->tl_info.levels = 1;
-	pmd->tl_info.value_type.context = &pmd->info;
+	pmd->tl_info.value_type.context = &pmd->bl_info;
 	pmd->tl_info.value_type.size = sizeof(__le64);
 	pmd->tl_info.value_type.inc = subtree_inc;
 	pmd->tl_info.value_type.dec = subtree_dec;
@@ -431,26 +438,53 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd)
 	pmd->details_info.value_type.equal = NULL;
 }
 
+static int save_sm_roots(struct dm_pool_metadata *pmd)
+{
+	int r;
+	size_t len;
+
+	r = dm_sm_root_size(pmd->metadata_sm, &len);
+	if (r < 0)
+		return r;
+
+	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
+	if (r < 0)
+		return r;
+
+	r = dm_sm_root_size(pmd->data_sm, &len);
+	if (r < 0)
+		return r;
+
+	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
+}
+
+static void copy_sm_roots(struct dm_pool_metadata *pmd,
+			  struct thin_disk_superblock *disk)
+{
+	memcpy(&disk->metadata_space_map_root,
+	       &pmd->metadata_space_map_root,
+	       sizeof(pmd->metadata_space_map_root));
+
+	memcpy(&disk->data_space_map_root,
+	       &pmd->data_space_map_root,
+	       sizeof(pmd->data_space_map_root));
+}
+
 static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 {
 	int r;
 	struct dm_block *sblock;
-	size_t metadata_len, data_len;
 	struct thin_disk_superblock *disk_super;
 	sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
 
 	if (bdev_size > THIN_METADATA_MAX_SECTORS)
 		bdev_size = THIN_METADATA_MAX_SECTORS;
 
-	r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
-	if (r < 0)
-		return r;
-
-	r = dm_sm_root_size(pmd->data_sm, &data_len);
+	r = dm_sm_commit(pmd->data_sm);
 	if (r < 0)
 		return r;
 
-	r = dm_sm_commit(pmd->data_sm);
+	r = save_sm_roots(pmd);
 	if (r < 0)
 		return r;
 
@@ -471,27 +505,15 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 	disk_super->trans_id = 0;
 	disk_super->held_root = 0;
 
-	r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
-			    metadata_len);
-	if (r < 0)
-		goto bad_locked;
-
-	r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
-			    data_len);
-	if (r < 0)
-		goto bad_locked;
+	copy_sm_roots(pmd, disk_super);
 
 	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
-	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
 	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
 	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
 
 	return dm_tm_commit(pmd->tm, sblock);
-
-bad_locked:
-	dm_bm_unlock(sblock);
-	return r;
 }
 
 static int __format_metadata(struct dm_pool_metadata *pmd)
@@ -591,6 +613,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
 
 	disk_super = dm_block_data(sblock);
 
+	/* Verify the data block size hasn't changed */
+	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
+		DMERR("changing the data block size (from %u to %llu) is not supported",
+		      le32_to_cpu(disk_super->data_block_size),
+		      (unsigned long long)pmd->data_block_size);
+		r = -EINVAL;
+		goto bad_unlock_sblock;
+	}
+
 	r = __check_incompat_features(disk_super, pmd);
 	if (r < 0)
 		goto bad_unlock_sblock;
@@ -651,7 +682,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
 {
 	int r;
 
-	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
+	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
 					  THIN_METADATA_CACHE_SIZE,
 					  THIN_MAX_CONCURRENT_LOCKS);
 	if (IS_ERR(pmd->bm)) {
@@ -769,6 +800,10 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	if (r < 0)
 		return r;
 
+	r = save_sm_roots(pmd);
+	if (r < 0)
+		return r;
+
 	r = superblock_lock(pmd, &sblock);
 	if (r)
 		return r;
@@ -780,21 +815,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
 	disk_super->flags = cpu_to_le32(pmd->flags);
 
-	r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
-			    metadata_len);
-	if (r < 0)
-		goto out_locked;
-
-	r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
-			    data_len);
-	if (r < 0)
-		goto out_locked;
+	copy_sm_roots(pmd, disk_super);
 
 	return dm_tm_commit(pmd->tm, sblock);
-
-out_locked:
-	dm_bm_unlock(sblock);
-	return r;
 }
 
 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
@@ -1349,6 +1372,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
 	return td->id;
 }
 
+/*
+ * Check whether @time (of block creation) is older than @td's last snapshot.
+ * If so then the associated block is shared with the last snapshot device.
+ * Any block on a device created *after* the device last got snapshotted is
+ * necessarily not shared.
+ */
 static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
 {
 	return td->snapshotted_time > time;
@@ -1458,6 +1487,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 	return r;
 }
 
+int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
+{
+	int r;
+	uint32_t ref_count;
+
+	down_read(&pmd->root_lock);
+	r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
+	if (!r)
+		*result = (ref_count != 0);
+	up_read(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 {
 	int r;
@@ -1469,6 +1512,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 	return r;
 }
 
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
+{
+	bool r = false;
+	struct dm_thin_device *td, *tmp;
+
+	down_read(&pmd->root_lock);
+	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+		if (td->changed) {
+			r = td->changed;
+			break;
+		}
+	}
+	up_read(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_aborted_changes(struct dm_thin_device *td)
 {
 	bool r;
@@ -1645,12 +1705,12 @@ int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
 	return r;
 }
 
-static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
+static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
 {
 	int r;
 	dm_block_t old_count;
 
-	r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count);
+	r = dm_sm_get_nr_blocks(sm, &old_count);
 	if (r)
 		return r;
 
@@ -1658,11 +1718,11 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
 		return 0;
 
 	if (new_count < old_count) {
-		DMERR("cannot reduce size of data device");
+		DMERR("cannot reduce size of space map");
 		return -EINVAL;
 	}
 
-	return dm_sm_extend(pmd->data_sm, new_count - old_count);
+	return dm_sm_extend(sm, new_count - old_count);
 }
 
 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
@@ -1671,7 +1731,19 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
 
 	down_write(&pmd->root_lock);
 	if (!pmd->fail_io)
-		r = __resize_data_dev(pmd, new_count);
+		r = __resize_space_map(pmd->data_sm, new_count);
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
+{
+	int r = -EINVAL;
+
+	down_write(&pmd->root_lock);
+	if (!pmd->fail_io)
+		r = __resize_space_map(pmd->metadata_sm, new_count);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1684,3 +1756,60 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 	dm_bm_set_read_only(pmd->bm);
 	up_write(&pmd->root_lock);
 }
+
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = false;
+	dm_bm_set_read_write(pmd->bm);
+	up_write(&pmd->root_lock);
+}
+
+int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+					dm_block_t threshold,
+					dm_sm_threshold_fn fn,
+					void *context)
+{
+	int r;
+
+	down_write(&pmd->root_lock);
+	r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct thin_disk_superblock *disk_super;
+
+	down_write(&pmd->root_lock);
+	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+	r = superblock_lock(pmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(pmd->flags);
+
+	dm_bm_unlock(sblock);
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+	bool needs_check;
+
+	down_read(&pmd->root_lock);
+	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+	up_read(&pmd->root_lock);
+
+	return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 0cecc370288..e3c857db195 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -8,16 +8,15 @@
 #define DM_THIN_METADATA_H
 
 #include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-metadata.h"
 
-#define THIN_METADATA_BLOCK_SIZE 4096
+#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 
 /*
  * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
  */
-#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 
 /*
  * A metadata device larger than 16GB triggers a warning.
@@ -26,6 +25,11 @@
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
 struct dm_pool_metadata;
 struct dm_thin_device;
 
@@ -130,7 +134,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
 
 struct dm_thin_lookup_result {
 	dm_block_t block;
-	unsigned shared:1;
+	bool shared:1;
 };
 
 /*
@@ -160,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
  */
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
 
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
+
 bool dm_thin_aborted_changes(struct dm_thin_device *td);
 
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -180,17 +186,32 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
 
 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
 
+int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
+
 /*
  * Returns -ENOSPC if the new size is too small and already allocated
  * blocks would be lost.
  */
 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
+int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
 
 /*
  * Flicks the underlying block manager into read only mode, so you know
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
+
+int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+					dm_block_t threshold,
+					dm_sm_threshold_fn fn,
+					void *context);
+
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
 
 /*----------------------------------------------------------------*/
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 058acf3a5ba..fc9c848a60c 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -12,9 +12,11 @@
 #include <linux/dm-io.h>
 #include <linux/dm-kcopyd.h>
 #include <linux/list.h>
+#include <linux/rculist.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 
 #define	DM_MSG_PREFIX	"thin"
 
@@ -25,6 +27,12 @@
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
 #define COMMIT_PERIOD HZ
+#define NO_SPACE_TIMEOUT_SECS 60
+
+static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
+		"A percentage of time allocated for copy on write");
 
 /*
  * The block size of the device holding pool data must be
@@ -127,10 +135,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
 	PM_WRITE,		/* metadata may be changed */
+	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
 	PM_READ_ONLY,		/* metadata may not be changed */
 	PM_FAIL,		/* all I/O fails */
 };
@@ -141,6 +150,7 @@ struct pool_features {
 	bool zero_new_blocks:1;
 	bool discard_enabled:1;
 	bool discard_passdown:1;
+	bool error_if_no_space:1;
 };
 
 struct thin_c;
@@ -160,8 +170,7 @@ struct pool {
 	int sectors_per_block_shift;
 
 	struct pool_features pf;
-	unsigned low_water_triggered:1;	/* A dm event has been sent */
-	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */
+	bool low_water_triggered:1;	/* A dm event has been sent */
 
 	struct dm_bio_prison *prison;
 	struct dm_kcopyd_client *copier;
@@ -169,24 +178,22 @@ struct pool {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
 	struct delayed_work waker;
+	struct delayed_work no_space_timeout;
 
 	unsigned long last_commit_jiffies;
 	unsigned ref_count;
 
 	spinlock_t lock;
-	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
 	struct list_head prepared_mappings;
 	struct list_head prepared_discards;
-
-	struct bio_list retry_on_resume_list;
+	struct list_head active_thins;
 
 	struct dm_deferred_set *shared_read_ds;
 	struct dm_deferred_set *all_io_ds;
 
 	struct dm_thin_new_mapping *next_mapping;
 	mempool_t *mapping_pool;
-	mempool_t *endio_hook_pool;
 
 	process_bio_fn process_bio;
 	process_bio_fn process_discard;
@@ -196,7 +203,7 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void set_pool_mode(struct pool *pool, enum pool_mode mode);
+static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
  * Target context for a pool.
@@ -217,17 +224,107 @@ struct pool_c {
  * Target context for a thin.
  */
 struct thin_c {
+	struct list_head list;
 	struct dm_dev *pool_dev;
 	struct dm_dev *origin_dev;
 	dm_thin_id dev_id;
 
 	struct pool *pool;
 	struct dm_thin_device *td;
+	bool requeue_mode:1;
+	spinlock_t lock;
+	struct bio_list deferred_bio_list;
+	struct bio_list retry_on_resume_list;
+	struct rb_root sort_bio_list; /* sorted list of deferred bios */
+
+	/*
+	 * Ensures the thin is not destroyed until the worker has finished
+	 * iterating the active_thins list.
+	 */
+	atomic_t refcount;
+	struct completion can_destroy;
 };
 
 /*----------------------------------------------------------------*/
 
 /*
+ * wake_worker() is used when new work is queued and when pool_resume is
+ * ready to continue deferred IO processing.
+ */
+static void wake_worker(struct pool *pool)
+{
+	queue_work(pool->wq, &pool->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	struct dm_bio_prison_cell *cell_prealloc;
+
+	/*
+	 * Allocate a cell from the prison's mempool.
+	 * This might block but it can't fail.
+	 */
+	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
+
+	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
+	if (r)
+		/*
+		 * We reused an old cell; we can get rid of
+		 * the new one.
+		 */
+		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
+
+	return r;
+}
+
+static void cell_release(struct pool *pool,
+			 struct dm_bio_prison_cell *cell,
+			 struct bio_list *bios)
+{
+	dm_cell_release(pool->prison, cell, bios);
+	dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_release_no_holder(struct pool *pool,
+				   struct dm_bio_prison_cell *cell,
+				   struct bio_list *bios)
+{
+	dm_cell_release_no_holder(pool->prison, cell, bios);
+	dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_defer_no_holder_no_free(struct thin_c *tc,
+					 struct dm_bio_prison_cell *cell)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tc->lock, flags);
+	dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
+	spin_unlock_irqrestore(&tc->lock, flags);
+
+	wake_worker(pool);
+}
+
+static void cell_error_with_code(struct pool *pool,
+				 struct dm_bio_prison_cell *cell, int error_code)
+{
+	dm_cell_error(pool->prison, cell, error_code);
+	dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+	cell_error_with_code(pool, cell, -EIO);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
  * A global list of pools that uses a struct mapped_device as a key.
  */
 static struct dm_thin_pool_table {
@@ -292,36 +389,57 @@ struct dm_thin_endio_hook {
 	struct dm_deferred_entry *shared_read_entry;
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_thin_new_mapping *overwrite_mapping;
+	struct rb_node rb_node;
 };
 
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
 	struct bio *bio;
 	struct bio_list bios;
+	unsigned long flags;
 
 	bio_list_init(&bios);
+
+	spin_lock_irqsave(&tc->lock, flags);
 	bio_list_merge(&bios, master);
 	bio_list_init(master);
+	spin_unlock_irqrestore(&tc->lock, flags);
 
-	while ((bio = bio_list_pop(&bios))) {
-		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
-
-		if (h->tc == tc)
-			bio_endio(bio, DM_ENDIO_REQUEUE);
-		else
-			bio_list_add(master, bio);
-	}
+	while ((bio = bio_list_pop(&bios)))
+		bio_endio(bio, DM_ENDIO_REQUEUE);
 }
 
 static void requeue_io(struct thin_c *tc)
 {
-	struct pool *pool = tc->pool;
+	requeue_bio_list(tc, &tc->deferred_bio_list);
+	requeue_bio_list(tc, &tc->retry_on_resume_list);
+}
+
+static void error_thin_retry_list(struct thin_c *tc)
+{
+	struct bio *bio;
 	unsigned long flags;
+	struct bio_list bios;
 
-	spin_lock_irqsave(&pool->lock, flags);
-	__requeue_bio_list(tc, &pool->deferred_bios);
-	__requeue_bio_list(tc, &pool->retry_on_resume_list);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&tc->lock, flags);
+	bio_list_merge(&bios, &tc->retry_on_resume_list);
+	bio_list_init(&tc->retry_on_resume_list);
+	spin_unlock_irqrestore(&tc->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+	struct thin_c *tc;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tc, &pool->active_thins, list)
+		error_thin_retry_list(tc);
+	rcu_read_unlock();
 }
 
 /*
@@ -331,14 +449,20 @@ static void requeue_io(struct thin_c *tc)
  * target.
  */
 
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+	return pool->sectors_per_block_shift >= 0;
+}
+
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
-	sector_t block_nr = bio->bi_sector;
+	struct pool *pool = tc->pool;
+	sector_t block_nr = bio->bi_iter.bi_sector;
 
-	if (tc->pool->sectors_per_block_shift < 0)
-		(void) sector_div(block_nr, tc->pool->sectors_per_block);
+	if (block_size_is_power_of_two(pool))
+		block_nr >>= pool->sectors_per_block_shift;
 	else
-		block_nr >>= tc->pool->sectors_per_block_shift;
+		(void) sector_div(block_nr, pool->sectors_per_block);
 
 	return block_nr;
 }
@@ -346,15 +470,16 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
 	struct pool *pool = tc->pool;
-	sector_t bi_sector = bio->bi_sector;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	bio->bi_bdev = tc->pool_dev->bdev;
-	if (tc->pool->sectors_per_block_shift < 0)
-		bio->bi_sector = (block * pool->sectors_per_block) +
-				 sector_div(bi_sector, pool->sectors_per_block);
+	if (block_size_is_power_of_two(pool))
+		bio->bi_iter.bi_sector =
+			(block << pool->sectors_per_block_shift) |
+			(bi_sector & (pool->sectors_per_block - 1));
 	else
-		bio->bi_sector = (block << pool->sectors_per_block_shift) |
-				(bi_sector & (pool->sectors_per_block - 1));
+		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
+				 sector_div(bi_sector, pool->sectors_per_block);
 }
 
 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -368,6 +493,17 @@ static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 		dm_thin_changed_this_transaction(tc->td);
 }
 
+static void inc_all_io_entry(struct pool *pool, struct bio *bio)
+{
+	struct dm_thin_endio_hook *h;
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return;
+
+	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
+}
+
 static void issue(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
@@ -410,15 +546,6 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 	issue(tc, bio);
 }
 
-/*
- * wake_worker() is used when new work is queued and when pool_resume is
- * ready to continue deferred IO processing.
- */
-static void wake_worker(struct pool *pool)
-{
-	queue_work(pool->wq, &pool->worker);
-}
-
 /*----------------------------------------------------------------*/
 
 /*
@@ -427,15 +554,16 @@ static void wake_worker(struct pool *pool)
 struct dm_thin_new_mapping {
 	struct list_head list;
 
-	unsigned quiesced:1;
-	unsigned prepared:1;
-	unsigned pass_discard:1;
+	bool quiesced:1;
+	bool prepared:1;
+	bool pass_discard:1;
+	bool definitely_not_shared:1;
 
+	int err;
 	struct thin_c *tc;
 	dm_block_t virt_block;
 	dm_block_t data_block;
 	struct dm_bio_prison_cell *cell, *cell2;
-	int err;
 
 	/*
 	 * If the bio covers the whole area of a block then we can avoid
@@ -452,7 +580,7 @@ static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
 	struct pool *pool = m->tc->pool;
 
 	if (m->quiesced && m->prepared) {
-		list_add(&m->list, &pool->prepared_mappings);
+		list_add_tail(&m->list, &pool->prepared_mappings);
 		wake_worker(pool);
 	}
 }
@@ -466,7 +594,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 	m->err = read_err || write_err ? -EIO : 0;
 
 	spin_lock_irqsave(&pool->lock, flags);
-	m->prepared = 1;
+	m->prepared = true;
 	__maybe_add_mapping(m);
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
@@ -474,14 +602,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 static void overwrite_endio(struct bio *bio, int err)
 {
 	unsigned long flags;
-	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 	struct pool *pool = m->tc->pool;
 
 	m->err = err;
 
 	spin_lock_irqsave(&pool->lock, flags);
-	m->prepared = 1;
+	m->prepared = true;
 	__maybe_add_mapping(m);
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
@@ -499,58 +627,59 @@ static void overwrite_endio(struct bio *bio, int err)
 /*
  * This sends the bios in the cell back to the deferred_bios list.
  */
-static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
-		       dm_block_t data_block)
+static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 {
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->lock, flags);
-	dm_cell_release(cell, &pool->deferred_bios);
-	spin_unlock_irqrestore(&tc->pool->lock, flags);
+	spin_lock_irqsave(&tc->lock, flags);
+	cell_release(pool, cell, &tc->deferred_bio_list);
+	spin_unlock_irqrestore(&tc->lock, flags);
 
 	wake_worker(pool);
 }
 
 /*
- * Same as cell_defer above, except it omits one particular detainee,
- * a write bio that covers the block and has already been processed.
+ * Same as cell_defer above, except it omits the original holder of the cell.
  */
-static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 {
-	struct bio_list bios;
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 
-	bio_list_init(&bios);
-
-	spin_lock_irqsave(&pool->lock, flags);
-	dm_cell_release_no_holder(cell, &pool->deferred_bios);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	spin_lock_irqsave(&tc->lock, flags);
+	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
+	spin_unlock_irqrestore(&tc->lock, flags);
 
 	wake_worker(pool);
 }
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio)
+	if (m->bio) {
 		m->bio->bi_end_io = m->saved_bi_end_io;
-	dm_cell_error(m->cell);
+		atomic_inc(&m->bio->bi_remaining);
+	}
+	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
 }
+
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
 	struct bio *bio;
 	int r;
 
 	bio = m->bio;
-	if (bio)
+	if (bio) {
 		bio->bi_end_io = m->saved_bi_end_io;
+		atomic_inc(&bio->bi_remaining);
+	}
 
 	if (m->err) {
-		dm_cell_error(m->cell);
+		cell_error(pool, m->cell);
 		goto out;
 	}
 
@@ -561,8 +690,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 */
 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 	if (r) {
-		DMERR("dm_thin_insert_block() failed");
-		dm_cell_error(m->cell);
+		metadata_operation_failed(pool, "dm_thin_insert_block", r);
+		cell_error(pool, m->cell);
 		goto out;
 	}
 
@@ -573,14 +702,14 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 * the bios in the cell.
 	 */
 	if (bio) {
-		cell_defer_except(tc, m->cell);
+		cell_defer_no_holder(tc, m->cell);
 		bio_endio(bio, 0);
 	} else
-		cell_defer(tc, m->cell, m->data_block);
+		cell_defer(tc, m->cell);
 
 out:
 	list_del(&m->list);
-	mempool_free(m, tc->pool->mapping_pool);
+	mempool_free(m, pool->mapping_pool);
 }
 
 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
@@ -588,8 +717,8 @@ static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 	struct thin_c *tc = m->tc;
 
 	bio_io_error(m->bio);
-	cell_defer_except(tc, m->cell);
-	cell_defer_except(tc, m->cell2);
+	cell_defer_no_holder(tc, m->cell);
+	cell_defer_no_holder(tc, m->cell2);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
@@ -597,13 +726,23 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
 
+	inc_all_io_entry(tc->pool, m->bio);
+	cell_defer_no_holder(tc, m->cell);
+	cell_defer_no_holder(tc, m->cell2);
+
 	if (m->pass_discard)
-		remap_and_issue(tc, m->bio, m->data_block);
+		if (m->definitely_not_shared)
+			remap_and_issue(tc, m->bio, m->data_block);
+		else {
+			bool used = false;
+			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
+				bio_endio(m->bio, 0);
+			else
+				remap_and_issue(tc, m->bio, m->data_block);
+		}
 	else
 		bio_endio(m->bio, 0);
 
-	cell_defer_except(tc, m->cell);
-	cell_defer_except(tc, m->cell2);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
@@ -614,7 +753,7 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m)
 
 	r = dm_thin_remove_block(tc->td, m->virt_block);
 	if (r)
-		DMERR("dm_thin_remove_block() failed");
+		DMERR_LIMIT("dm_thin_remove_block() failed");
 
 	process_prepared_discard_passdown(m);
 }
@@ -640,7 +779,8 @@ static void process_prepared(struct pool *pool, struct list_head *head,
  */
 static int io_overlaps_block(struct pool *pool, struct bio *bio)
 {
-	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
+	return bio->bi_iter.bi_size ==
+		(pool->sectors_per_block << SECTOR_SHIFT);
 }
 
 static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -668,13 +808,17 @@ static int ensure_next_mapping(struct pool *pool)
 
 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 {
-	struct dm_thin_new_mapping *r = pool->next_mapping;
+	struct dm_thin_new_mapping *m = pool->next_mapping;
 
 	BUG_ON(!pool->next_mapping);
 
+	memset(m, 0, sizeof(struct dm_thin_new_mapping));
+	INIT_LIST_HEAD(&m->list);
+	m->bio = NULL;
+
 	pool->next_mapping = NULL;
 
-	return r;
+	return m;
 }
 
 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -686,18 +830,13 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	struct pool *pool = tc->pool;
 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-	INIT_LIST_HEAD(&m->list);
-	m->quiesced = 0;
-	m->prepared = 0;
 	m->tc = tc;
 	m->virt_block = virt_block;
 	m->data_block = data_dest;
 	m->cell = cell;
-	m->err = 0;
-	m->bio = NULL;
 
 	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
-		m->quiesced = 1;
+		m->quiesced = true;
 
 	/*
 	 * IO to pool_dev remaps to the pool target's data_dev.
@@ -706,11 +845,12 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
 	 */
 	if (io_overwrites_block(pool, bio)) {
-		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 
 		h->overwrite_mapping = m;
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+		inc_all_io_entry(pool, bio);
 		remap_and_issue(tc, bio, data_dest);
 	} else {
 		struct dm_io_region from, to;
@@ -727,8 +867,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 				   0, copy_complete, m);
 		if (r < 0) {
 			mempool_free(m, pool->mapping_pool);
-			DMERR("dm_kcopyd_copy() failed");
-			dm_cell_error(cell);
+			DMERR_LIMIT("dm_kcopyd_copy() failed");
+			cell_error(pool, cell);
 		}
 	}
 }
@@ -756,15 +896,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	struct pool *pool = tc->pool;
 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-	INIT_LIST_HEAD(&m->list);
-	m->quiesced = 1;
-	m->prepared = 0;
+	m->quiesced = true;
+	m->prepared = false;
 	m->tc = tc;
 	m->virt_block = virt_block;
 	m->data_block = data_block;
 	m->cell = cell;
-	m->err = 0;
-	m->bio = NULL;
 
 	/*
 	 * If the whole block of data is being overwritten or we are not
@@ -775,11 +912,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 		process_prepared_mapping(m);
 
 	else if (io_overwrites_block(pool, bio)) {
-		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 
 		h->overwrite_mapping = m;
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+		inc_all_io_entry(pool, bio);
 		remap_and_issue(tc, bio, data_block);
 	} else {
 		int r;
@@ -792,93 +930,89 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
 		if (r < 0) {
 			mempool_free(m, pool->mapping_pool);
-			DMERR("dm_kcopyd_zero() failed");
-			dm_cell_error(cell);
+			DMERR_LIMIT("dm_kcopyd_zero() failed");
+			cell_error(pool, cell);
 		}
 	}
 }
 
-static int commit(struct pool *pool)
-{
-	int r;
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r)
-		DMERR("commit failed, error = %d", r);
-
-	return r;
-}
-
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
  */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
 	int r;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (get_pool_mode(pool) >= PM_READ_ONLY)
 		return -EINVAL;
 
-	r = commit(pool);
+	r = dm_pool_commit_metadata(pool->pmd);
 	if (r)
-		set_pool_mode(pool, PM_READ_ONLY);
+		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
 
 	return r;
 }
 
-static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 {
-	int r;
-	dm_block_t free_blocks;
 	unsigned long flags;
-	struct pool *pool = tc->pool;
-
-	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
-	if (r)
-		return r;
 
 	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
-		DMWARN("%s: reached low water mark, sending event.",
+		DMWARN("%s: reached low water mark for data device: sending event.",
 		       dm_device_name(pool->pool_md));
 		spin_lock_irqsave(&pool->lock, flags);
-		pool->low_water_triggered = 1;
+		pool->low_water_triggered = true;
 		spin_unlock_irqrestore(&pool->lock, flags);
 		dm_table_event(pool->ti->table);
 	}
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+{
+	int r;
+	dm_block_t free_blocks;
+	struct pool *pool = tc->pool;
+
+	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
+		return -EINVAL;
+
+	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
+		return r;
+	}
+
+	check_low_water_mark(pool, free_blocks);
 
 	if (!free_blocks) {
-		if (pool->no_free_space)
-			return -ENOSPC;
-		else {
-			/*
-			 * Try to commit to see if that will free up some
-			 * more space.
-			 */
-			(void) commit_or_fallback(pool);
+		/*
+		 * Try to commit to see if that will free up some
+		 * more space.
+		 */
+		r = commit(pool);
+		if (r)
+			return r;
 
-			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
-			if (r)
-				return r;
+		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+		if (r) {
+			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
+			return r;
+		}
 
-			/*
-			 * If we still have no space we set a flag to avoid
-			 * doing all this checking and return -ENOSPC.
-			 */
-			if (!free_blocks) {
-				DMWARN("%s: no free space available.",
-				       dm_device_name(pool->pool_md));
-				spin_lock_irqsave(&pool->lock, flags);
-				pool->no_free_space = 1;
-				spin_unlock_irqrestore(&pool->lock, flags);
-				return -ENOSPC;
-			}
+		if (!free_blocks) {
+			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
+			return -ENOSPC;
 		}
 	}
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
-	if (r)
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
 		return r;
+	}
 
 	return 0;
 }
@@ -889,26 +1023,70 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
  */
 static void retry_on_resume(struct bio *bio)
 {
-	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct thin_c *tc = h->tc;
-	struct pool *pool = tc->pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->lock, flags);
-	bio_list_add(&pool->retry_on_resume_list, bio);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	spin_lock_irqsave(&tc->lock, flags);
+	bio_list_add(&tc->retry_on_resume_list, bio);
+	spin_unlock_irqrestore(&tc->lock, flags);
 }
 
-static void no_space(struct dm_bio_prison_cell *cell)
+static int should_error_unserviceable_bio(struct pool *pool)
+{
+	enum pool_mode m = get_pool_mode(pool);
+
+	switch (m) {
+	case PM_WRITE:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+		return -EIO;
+
+	case PM_OUT_OF_DATA_SPACE:
+		return pool->pf.error_if_no_space ? -ENOSPC : 0;
+
+	case PM_READ_ONLY:
+	case PM_FAIL:
+		return -EIO;
+	default:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+		return -EIO;
+	}
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+	int error = should_error_unserviceable_bio(pool);
+
+	if (error)
+		bio_endio(bio, error);
+	else
+		retry_on_resume(bio);
+}
+
+static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
 {
 	struct bio *bio;
 	struct bio_list bios;
+	int error;
+
+	error = should_error_unserviceable_bio(pool);
+	if (error) {
+		cell_error_with_code(pool, cell, error);
+		return;
+	}
 
 	bio_list_init(&bios);
-	dm_cell_release(cell, &bios);
+	cell_release(pool, cell, &bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		retry_on_resume(bio);
+	error = should_error_unserviceable_bio(pool);
+	if (error)
+		while ((bio = bio_list_pop(&bios)))
+			bio_endio(bio, error);
+	else
+		while ((bio = bio_list_pop(&bios)))
+			retry_on_resume(bio);
 }
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -923,7 +1101,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 	struct dm_thin_new_mapping *m;
 
 	build_virtual_key(tc->td, block, &key);
-	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
+	if (bio_detain(tc->pool, &key, bio, &cell))
 		return;
 
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -935,8 +1113,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 		 * on this block.
 		 */
 		build_data_key(tc->td, lookup_result.block, &key2);
-		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
-			dm_cell_release_singleton(cell, bio);
+		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
+			cell_defer_no_holder(tc, cell);
 			break;
 		}
 
@@ -947,28 +1125,30 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			 */
 			m = get_next_mapping(pool);
 			m->tc = tc;
-			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
+			m->pass_discard = pool->pf.discard_passdown;
+			m->definitely_not_shared = !lookup_result.shared;
 			m->virt_block = block;
 			m->data_block = lookup_result.block;
 			m->cell = cell;
 			m->cell2 = cell2;
-			m->err = 0;
 			m->bio = bio;
 
 			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
 				spin_lock_irqsave(&pool->lock, flags);
-				list_add(&m->list, &pool->prepared_discards);
+				list_add_tail(&m->list, &pool->prepared_discards);
 				spin_unlock_irqrestore(&pool->lock, flags);
 				wake_worker(pool);
 			}
 		} else {
+			inc_all_io_entry(pool, bio);
+			cell_defer_no_holder(tc, cell);
+			cell_defer_no_holder(tc, cell2);
+
 			/*
 			 * The DM core makes sure that the discard doesn't span
 			 * a block boundary.  So we submit the discard of a
 			 * partial block appropriately.
 			 */
-			dm_cell_release_singleton(cell, bio);
-			dm_cell_release_singleton(cell2, bio);
 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
 				remap_and_issue(tc, bio, lookup_result.block);
 			else
@@ -980,13 +1160,14 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 		/*
 		 * It isn't provisioned, just forget it.
 		 */
-		dm_cell_release_singleton(cell, bio);
+		cell_defer_no_holder(tc, cell);
 		bio_endio(bio, 0);
 		break;
 
 	default:
-		DMERR("discard: find block unexpectedly returned %d", r);
-		dm_cell_release_singleton(cell, bio);
+		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+			    __func__, r);
+		cell_defer_no_holder(tc, cell);
 		bio_io_error(bio);
 		break;
 	}
@@ -999,6 +1180,7 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 {
 	int r;
 	dm_block_t data_block;
+	struct pool *pool = tc->pool;
 
 	r = alloc_data_block(tc, &data_block);
 	switch (r) {
@@ -1008,12 +1190,13 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 		break;
 
 	case -ENOSPC:
-		no_space(cell);
+		retry_bios_on_resume(pool, cell);
 		break;
 
 	default:
-		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
-		dm_cell_error(cell);
+		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
+			    __func__, r);
+		cell_error(pool, cell);
 		break;
 	}
 }
@@ -1031,17 +1214,18 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 	 * of being broken so we have nothing further to do here.
 	 */
 	build_data_key(tc->td, lookup_result->block, &key);
-	if (dm_bio_detain(pool->prison, &key, bio, &cell))
+	if (bio_detain(pool, &key, bio, &cell))
 		return;
 
-	if (bio_data_dir(bio) == WRITE && bio->bi_size)
+	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
 		break_sharing(tc, bio, block, &key, lookup_result, cell);
 	else {
-		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 
 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
+		inc_all_io_entry(pool, bio);
+		cell_defer_no_holder(tc, cell);
 
-		dm_cell_release_singleton(cell, bio);
 		remap_and_issue(tc, bio, lookup_result->block);
 	}
 }
@@ -1051,12 +1235,15 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 {
 	int r;
 	dm_block_t data_block;
+	struct pool *pool = tc->pool;
 
 	/*
 	 * Remap empty bios (flushes) immediately, without provisioning.
 	 */
-	if (!bio->bi_size) {
-		dm_cell_release_singleton(cell, bio);
+	if (!bio->bi_iter.bi_size) {
+		inc_all_io_entry(pool, bio);
+		cell_defer_no_holder(tc, cell);
+
 		remap_and_issue(tc, bio, 0);
 		return;
 	}
@@ -1066,7 +1253,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	 */
 	if (bio_data_dir(bio) == READ) {
 		zero_fill_bio(bio);
-		dm_cell_release_singleton(cell, bio);
+		cell_defer_no_holder(tc, cell);
 		bio_endio(bio, 0);
 		return;
 	}
@@ -1081,13 +1268,13 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 		break;
 
 	case -ENOSPC:
-		no_space(cell);
+		retry_bios_on_resume(pool, cell);
 		break;
 
 	default:
-		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
-		set_pool_mode(tc->pool, PM_READ_ONLY);
-		dm_cell_error(cell);
+		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
+			    __func__, r);
+		cell_error(pool, cell);
 		break;
 	}
 }
@@ -1095,6 +1282,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 static void process_bio(struct thin_c *tc, struct bio *bio)
 {
 	int r;
+	struct pool *pool = tc->pool;
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_bio_prison_cell *cell;
 	struct dm_cell_key key;
@@ -1105,40 +1293,37 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	 * being provisioned so we have nothing further to do here.
 	 */
 	build_virtual_key(tc->td, block, &key);
-	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
+	if (bio_detain(pool, &key, bio, &cell))
 		return;
 
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
 	case 0:
-		/*
-		 * We can release this cell now.  This thread is the only
-		 * one that puts bios into a cell, and we know there were
-		 * no preceding bios.
-		 */
-		/*
-		 * TODO: this will probably have to change when discard goes
-		 * back in.
-		 */
-		dm_cell_release_singleton(cell, bio);
-
-		if (lookup_result.shared)
+		if (lookup_result.shared) {
 			process_shared_bio(tc, bio, block, &lookup_result);
-		else
+			cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
+		} else {
+			inc_all_io_entry(pool, bio);
+			cell_defer_no_holder(tc, cell);
+
 			remap_and_issue(tc, bio, lookup_result.block);
+		}
 		break;
 
 	case -ENODATA:
 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
-			dm_cell_release_singleton(cell, bio);
+			inc_all_io_entry(pool, bio);
+			cell_defer_no_holder(tc, cell);
+
 			remap_to_origin_and_issue(tc, bio);
 		} else
 			provision_block(tc, bio, block, cell);
 		break;
 
 	default:
-		DMERR("dm_thin_find_block() failed, error = %d", r);
-		dm_cell_release_singleton(cell, bio);
+		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+			    __func__, r);
+		cell_defer_no_holder(tc, cell);
 		bio_io_error(bio);
 		break;
 	}
@@ -1154,19 +1339,22 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
 	case 0:
-		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
-			bio_io_error(bio);
-		else
+		if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
+			handle_unserviceable_bio(tc->pool, bio);
+		else {
+			inc_all_io_entry(tc->pool, bio);
 			remap_and_issue(tc, bio, lookup_result.block);
+		}
 		break;
 
 	case -ENODATA:
 		if (rw != READ) {
-			bio_io_error(bio);
+			handle_unserviceable_bio(tc->pool, bio);
 			break;
 		}
 
 		if (tc->origin_dev) {
+			inc_all_io_entry(tc->pool, bio);
 			remap_to_origin_and_issue(tc, bio);
 			break;
 		}
@@ -1176,50 +1364,138 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 		break;
 
 	default:
-		DMERR("dm_thin_find_block() failed, error = %d", r);
+		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+			    __func__, r);
 		bio_io_error(bio);
 		break;
 	}
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+	bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
 	bio_io_error(bio);
 }
 
+/*
+ * FIXME: should we also commit due to size of transaction, measured in
+ * metadata blocks?
+ */
 static int need_commit_due_to_time(struct pool *pool)
 {
 	return jiffies < pool->last_commit_jiffies ||
 	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
 }
 
-static void process_deferred_bios(struct pool *pool)
+#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
+#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
+
+static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
+{
+	struct rb_node **rbp, *parent;
+	struct dm_thin_endio_hook *pbd;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
+
+	rbp = &tc->sort_bio_list.rb_node;
+	parent = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		pbd = thin_pbd(parent);
+
+		if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
+			rbp = &(*rbp)->rb_left;
+		else
+			rbp = &(*rbp)->rb_right;
+	}
+
+	pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+	rb_link_node(&pbd->rb_node, parent, rbp);
+	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
+}
+
+static void __extract_sorted_bios(struct thin_c *tc)
+{
+	struct rb_node *node;
+	struct dm_thin_endio_hook *pbd;
+	struct bio *bio;
+
+	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
+		pbd = thin_pbd(node);
+		bio = thin_bio(pbd);
+
+		bio_list_add(&tc->deferred_bio_list, bio);
+		rb_erase(&pbd->rb_node, &tc->sort_bio_list);
+	}
+
+	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
+}
+
+static void __sort_thin_deferred_bios(struct thin_c *tc)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	bio_list_merge(&bios, &tc->deferred_bio_list);
+	bio_list_init(&tc->deferred_bio_list);
+
+	/* Sort deferred_bio_list using rb-tree */
+	while ((bio = bio_list_pop(&bios)))
+		__thin_bio_rb_add(tc, bio);
+
+	/*
+	 * Transfer the sorted bios in sort_bio_list back to
+	 * deferred_bio_list to allow lockless submission of
+	 * all bios.
+	 */
+	__extract_sorted_bios(tc);
+}
+
+static void process_thin_deferred_bios(struct thin_c *tc)
 {
+	struct pool *pool = tc->pool;
 	unsigned long flags;
 	struct bio *bio;
 	struct bio_list bios;
+	struct blk_plug plug;
+
+	if (tc->requeue_mode) {
+		requeue_bio_list(tc, &tc->deferred_bio_list);
+		return;
+	}
 
 	bio_list_init(&bios);
 
-	spin_lock_irqsave(&pool->lock, flags);
-	bio_list_merge(&bios, &pool->deferred_bios);
-	bio_list_init(&pool->deferred_bios);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	spin_lock_irqsave(&tc->lock, flags);
 
-	while ((bio = bio_list_pop(&bios))) {
-		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
-		struct thin_c *tc = h->tc;
+	if (bio_list_empty(&tc->deferred_bio_list)) {
+		spin_unlock_irqrestore(&tc->lock, flags);
+		return;
+	}
 
+	__sort_thin_deferred_bios(tc);
+
+	bio_list_merge(&bios, &tc->deferred_bio_list);
+	bio_list_init(&tc->deferred_bio_list);
+
+	spin_unlock_irqrestore(&tc->lock, flags);
+
+	blk_start_plug(&plug);
+	while ((bio = bio_list_pop(&bios))) {
 		/*
 		 * If we've got no free new_mapping structs, and processing
 		 * this bio might require one, we pause until there are some
 		 * prepared mappings to process.
 		 */
 		if (ensure_next_mapping(pool)) {
-			spin_lock_irqsave(&pool->lock, flags);
-			bio_list_merge(&pool->deferred_bios, &bios);
-			spin_unlock_irqrestore(&pool->lock, flags);
-
+			spin_lock_irqsave(&tc->lock, flags);
+			bio_list_add(&tc->deferred_bio_list, bio);
+			bio_list_merge(&tc->deferred_bio_list, &bios);
+			spin_unlock_irqrestore(&tc->lock, flags);
 			break;
 		}
 
@@ -1228,6 +1504,60 @@ static void process_deferred_bios(struct pool *pool)
 		else
 			pool->process_bio(tc, bio);
 	}
+	blk_finish_plug(&plug);
+}
+
+static void thin_get(struct thin_c *tc);
+static void thin_put(struct thin_c *tc);
+
+/*
+ * We can't hold rcu_read_lock() around code that can block.  So we
+ * find a thin with the rcu lock held; bump a refcount; then drop
+ * the lock.
+ */
+static struct thin_c *get_first_thin(struct pool *pool)
+{
+	struct thin_c *tc = NULL;
+
+	rcu_read_lock();
+	if (!list_empty(&pool->active_thins)) {
+		tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+		thin_get(tc);
+	}
+	rcu_read_unlock();
+
+	return tc;
+}
+
+static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
+{
+	struct thin_c *old_tc = tc;
+
+	rcu_read_lock();
+	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
+		thin_get(tc);
+		thin_put(old_tc);
+		rcu_read_unlock();
+		return tc;
+	}
+	thin_put(old_tc);
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static void process_deferred_bios(struct pool *pool)
+{
+	unsigned long flags;
+	struct bio *bio;
+	struct bio_list bios;
+	struct thin_c *tc;
+
+	tc = get_first_thin(pool);
+	while (tc) {
+		process_thin_deferred_bios(tc);
+		tc = get_next_thin(pool, tc);
+	}
 
 	/*
 	 * If there are any deferred flush bios, we must commit
@@ -1239,10 +1569,11 @@ static void process_deferred_bios(struct pool *pool)
 	bio_list_init(&pool->deferred_flush_bios);
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
+	if (bio_list_empty(&bios) &&
+	    !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
 		return;
 
-	if (commit_or_fallback(pool)) {
+	if (commit(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1273,6 +1604,81 @@ static void do_waker(struct work_struct *ws)
 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
 }
 
+/*
+ * We're holding onto IO to allow userland time to react.  After the
+ * timeout either the pool will have been resized (and thus back in
+ * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ */
+static void do_no_space_timeout(struct work_struct *ws)
+{
+	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
+					 no_space_timeout);
+
+	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
+		set_pool_mode(pool, PM_READ_ONLY);
+}
+
+/*----------------------------------------------------------------*/
+
+struct pool_work {
+	struct work_struct worker;
+	struct completion complete;
+};
+
+static struct pool_work *to_pool_work(struct work_struct *ws)
+{
+	return container_of(ws, struct pool_work, worker);
+}
+
+static void pool_work_complete(struct pool_work *pw)
+{
+	complete(&pw->complete);
+}
+
+static void pool_work_wait(struct pool_work *pw, struct pool *pool,
+			   void (*fn)(struct work_struct *))
+{
+	INIT_WORK_ONSTACK(&pw->worker, fn);
+	init_completion(&pw->complete);
+	queue_work(pool->wq, &pw->worker);
+	wait_for_completion(&pw->complete);
+}
+
+/*----------------------------------------------------------------*/
+
+struct noflush_work {
+	struct pool_work pw;
+	struct thin_c *tc;
+};
+
+static struct noflush_work *to_noflush(struct work_struct *ws)
+{
+	return container_of(to_pool_work(ws), struct noflush_work, pw);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+	struct noflush_work *w = to_noflush(ws);
+	w->tc->requeue_mode = true;
+	requeue_io(w->tc);
+	pool_work_complete(&w->pw);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+	struct noflush_work *w = to_noflush(ws);
+	w->tc->requeue_mode = false;
+	pool_work_complete(&w->pw);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+	struct noflush_work w;
+
+	w.tc = tc;
+	pool_work_wait(&w.pw, tc->pool, fn);
+}
+
 /*----------------------------------------------------------------*/
 
 static enum pool_mode get_pool_mode(struct pool *pool)
@@ -1280,43 +1686,127 @@ static enum pool_mode get_pool_mode(struct pool *pool)
 	return pool->pf.mode;
 }
 
-static void set_pool_mode(struct pool *pool, enum pool_mode mode)
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
 {
-	int r;
+	dm_table_event(pool->ti->table);
+	DMINFO("%s: switching pool to %s mode",
+	       dm_device_name(pool->pool_md), new_mode);
+}
 
-	pool->pf.mode = mode;
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
+{
+	struct pool_c *pt = pool->ti->private;
+	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+	enum pool_mode old_mode = get_pool_mode(pool);
+	unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
 
-	switch (mode) {
+	/*
+	 * Never allow the pool to transition to PM_WRITE mode if user
+	 * intervention is required to verify metadata and data consistency.
+	 */
+	if (new_mode == PM_WRITE && needs_check) {
+		DMERR("%s: unable to switch pool to write mode until repaired.",
+		      dm_device_name(pool->pool_md));
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = PM_READ_ONLY;
+	}
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.	So we never let the
+	 * pool move out of the old mode.
+	 */
+	if (old_mode == PM_FAIL)
+		new_mode = old_mode;
+
+	switch (new_mode) {
 	case PM_FAIL:
-		DMERR("switching pool to failure mode");
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "failure");
+		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
+
+		error_retry_list(pool);
 		break;
 
 	case PM_READ_ONLY:
-		DMERR("switching pool to read-only mode");
-		r = dm_pool_abort_metadata(pool->pmd);
-		if (r) {
-			DMERR("aborting transaction failed");
-			set_pool_mode(pool, PM_FAIL);
-		} else {
-			dm_pool_metadata_read_only(pool->pmd);
-			pool->process_bio = process_bio_read_only;
-			pool->process_discard = process_discard;
-			pool->process_prepared_mapping = process_prepared_mapping_fail;
-			pool->process_prepared_discard = process_prepared_discard_passdown;
-		}
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "read-only");
+		dm_pool_metadata_read_only(pool->pmd);
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_bio_success;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		error_retry_list(pool);
+		break;
+
+	case PM_OUT_OF_DATA_SPACE:
+		/*
+		 * Ideally we'd never hit this state; the low water mark
+		 * would trigger userland to extend the pool before we
+		 * completely run out of data space.  However, many small
+		 * IOs to unprovisioned space can consume data space at an
+		 * alarming rate.  Adjust your low water mark if you're
+		 * frequently seeing this mode.
+		 */
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "out-of-data-space");
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		if (!pool->pf.error_if_no_space && no_space_timeout)
+			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
 		break;
 
 	case PM_WRITE:
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "write");
+		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard;
 		break;
 	}
+
+	pool->pf.mode = new_mode;
+	/*
+	 * The pool mode may have changed, sync it so bind_control_target()
+	 * doesn't cause an unexpected mode transition on resume.
+	 */
+	pt->adjusted_pf.mode = new_mode;
+}
+
+static void abort_transaction(struct pool *pool)
+{
+	const char *dev_name = dm_device_name(pool->pool_md);
+
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+	if (dm_pool_abort_metadata(pool->pmd)) {
+		DMERR("%s: failed to abort metadata transaction", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
+
+	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
+}
+
+static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+{
+	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+		    dm_device_name(pool->pool_md), op, r);
+
+	abort_transaction(pool);
+	set_pool_mode(pool, PM_READ_ONLY);
 }
 
 /*----------------------------------------------------------------*/
@@ -1333,39 +1823,43 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
 	unsigned long flags;
 	struct pool *pool = tc->pool;
 
-	spin_lock_irqsave(&pool->lock, flags);
-	bio_list_add(&pool->deferred_bios, bio);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	spin_lock_irqsave(&tc->lock, flags);
+	bio_list_add(&tc->deferred_bio_list, bio);
+	spin_unlock_irqrestore(&tc->lock, flags);
 
 	wake_worker(pool);
 }
 
-static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 {
-	struct pool *pool = tc->pool;
-	struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 
 	h->tc = tc;
 	h->shared_read_entry = NULL;
-	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
+	h->all_io_entry = NULL;
 	h->overwrite_mapping = NULL;
-
-	return h;
 }
 
 /*
  * Non-blocking function called from the thin target's map function.
  */
-static int thin_bio_map(struct dm_target *ti, struct bio *bio,
-			union map_info *map_context)
+static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 {
 	int r;
 	struct thin_c *tc = ti->private;
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_device *td = tc->td;
 	struct dm_thin_lookup_result result;
+	struct dm_bio_prison_cell cell1, cell2;
+	struct dm_bio_prison_cell *cell_result;
+	struct dm_cell_key key;
 
-	map_context->ptr = thin_hook_bio(tc, bio);
+	thin_hook_bio(tc, bio);
+
+	if (tc->requeue_mode) {
+		bio_endio(bio, DM_ENDIO_REQUEUE);
+		return DM_MAPIO_SUBMITTED;
+	}
 
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		bio_io_error(bio);
@@ -1400,22 +1894,34 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 			 * shared flag will be set in their case.
 			 */
 			thin_defer_bio(tc, bio);
-			r = DM_MAPIO_SUBMITTED;
-		} else {
-			remap(tc, bio, result.block);
-			r = DM_MAPIO_REMAPPED;
+			return DM_MAPIO_SUBMITTED;
 		}
-		break;
+
+		build_virtual_key(tc->td, block, &key);
+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
+			return DM_MAPIO_SUBMITTED;
+
+		build_data_key(tc->td, result.block, &key);
+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
+			cell_defer_no_holder_no_free(tc, &cell1);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		inc_all_io_entry(tc->pool, bio);
+		cell_defer_no_holder_no_free(tc, &cell2);
+		cell_defer_no_holder_no_free(tc, &cell1);
+
+		remap(tc, bio, result.block);
+		return DM_MAPIO_REMAPPED;
 
 	case -ENODATA:
 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
 			/*
 			 * This block isn't provisioned, and we have no way
-			 * of doing so.  Just error it.
+			 * of doing so.
 			 */
-			bio_io_error(bio);
-			r = DM_MAPIO_SUBMITTED;
-			break;
+			handle_unserviceable_bio(tc->pool, bio);
+			return DM_MAPIO_SUBMITTED;
 		}
 		/* fall through */
 
@@ -1425,8 +1931,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 		 * provide the hint to load the metadata into cache.
 		 */
 		thin_defer_bio(tc, bio);
-		r = DM_MAPIO_SUBMITTED;
-		break;
+		return DM_MAPIO_SUBMITTED;
 
 	default:
 		/*
@@ -1435,35 +1940,35 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 		 * pool is switched to fail-io mode.
 		 */
 		bio_io_error(bio);
-		r = DM_MAPIO_SUBMITTED;
-		break;
+		return DM_MAPIO_SUBMITTED;
 	}
-
-	return r;
 }
 
 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
 {
-	int r;
-	unsigned long flags;
 	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
+	struct request_queue *q;
 
-	spin_lock_irqsave(&pt->pool->lock, flags);
-	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
-	spin_unlock_irqrestore(&pt->pool->lock, flags);
+	if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
+		return 1;
 
-	if (!r) {
-		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-		r = bdi_congested(&q->backing_dev_info, bdi_bits);
-	}
-
-	return r;
+	q = bdev_get_queue(pt->data_dev->bdev);
+	return bdi_congested(&q->backing_dev_info, bdi_bits);
 }
 
-static void __requeue_bios(struct pool *pool)
+static void requeue_bios(struct pool *pool)
 {
-	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
-	bio_list_init(&pool->retry_on_resume_list);
+	unsigned long flags;
+	struct thin_c *tc;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tc, &pool->active_thins, list) {
+		spin_lock_irqsave(&tc->lock, flags);
+		bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
+		bio_list_init(&tc->retry_on_resume_list);
+		spin_unlock_irqrestore(&tc->lock, flags);
+	}
+	rcu_read_unlock();
 }
 
 /*----------------------------------------------------------------
@@ -1476,6 +1981,11 @@ static bool data_dev_supports_discard(struct pool_c *pt)
 	return q && blk_queue_discard(q);
 }
 
+static bool is_factor(sector_t block_size, uint32_t n)
+{
+	return !sector_div(block_size, n);
+}
+
 /*
  * If discard_passdown was enabled verify that the data device
  * supports discards.  Disable discard_passdown if not.
@@ -1501,7 +2011,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	else if (data_limits->discard_granularity > block_size)
 		reason = "discard granularity larger than a block";
 
-	else if (block_size & (data_limits->discard_granularity - 1))
+	else if (!is_factor(block_size, data_limits->discard_granularity))
 		reason = "discard granularity not a factor of block size";
 
 	if (reason) {
@@ -1515,17 +2025,21 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	struct pool_c *pt = ti->private;
 
 	/*
-	 * We want to make sure that degraded pools are never upgraded.
+	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
-	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode old_mode = get_pool_mode(pool);
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
-	if (old_mode > new_mode)
-		new_mode = old_mode;
+	/*
+	 * Don't change the pool's mode until set_pool_mode() below.
+	 * Otherwise the pool's process_* function pointers may
+	 * not match the desired pool mode.
+	 */
+	pt->adjusted_pf.mode = old_mode;
 
 	pool->ti = ti;
-	pool->low_water_blocks = pt->low_water_blocks;
 	pool->pf = pt->adjusted_pf;
+	pool->low_water_blocks = pt->low_water_blocks;
 
 	set_pool_mode(pool, new_mode);
 
@@ -1548,6 +2062,7 @@ static void pool_features_init(struct pool_features *pf)
 	pf->zero_new_blocks = true;
 	pf->discard_enabled = true;
 	pf->discard_passdown = true;
+	pf->error_if_no_space = false;
 }
 
 static void __pool_destroy(struct pool *pool)
@@ -1566,14 +2081,12 @@ static void __pool_destroy(struct pool *pool)
 	if (pool->next_mapping)
 		mempool_free(pool->next_mapping, pool->mapping_pool);
 	mempool_destroy(pool->mapping_pool);
-	mempool_destroy(pool->endio_hook_pool);
 	dm_deferred_set_destroy(pool->shared_read_ds);
 	dm_deferred_set_destroy(pool->all_io_ds);
 	kfree(pool);
 }
 
 static struct kmem_cache *_new_mapping_cache;
-static struct kmem_cache *_endio_hook_cache;
 
 static struct pool *pool_create(struct mapped_device *pool_md,
 				struct block_device *metadata_dev,
@@ -1614,7 +2127,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 		goto bad_prison;
 	}
 
-	pool->copier = dm_kcopyd_client_create();
+	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(pool->copier)) {
 		r = PTR_ERR(pool->copier);
 		*error = "Error creating pool's kcopyd client";
@@ -1635,14 +2148,13 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
 	INIT_WORK(&pool->worker, do_worker);
 	INIT_DELAYED_WORK(&pool->waker, do_waker);
+	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
 	spin_lock_init(&pool->lock);
-	bio_list_init(&pool->deferred_bios);
 	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
 	INIT_LIST_HEAD(&pool->prepared_discards);
-	pool->low_water_triggered = 0;
-	pool->no_free_space = 0;
-	bio_list_init(&pool->retry_on_resume_list);
+	INIT_LIST_HEAD(&pool->active_thins);
+	pool->low_water_triggered = false;
 
 	pool->shared_read_ds = dm_deferred_set_create();
 	if (!pool->shared_read_ds) {
@@ -1667,13 +2179,6 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 		goto bad_mapping_pool;
 	}
 
-	pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
-							 _endio_hook_cache);
-	if (!pool->endio_hook_pool) {
-		*error = "Error creating pool's endio_hook mempool";
-		err_p = ERR_PTR(-ENOMEM);
-		goto bad_endio_hook_pool;
-	}
 	pool->ref_count = 1;
 	pool->last_commit_jiffies = jiffies;
 	pool->pool_md = pool_md;
@@ -1682,8 +2187,6 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
 	return pool;
 
-bad_endio_hook_pool:
-	mempool_destroy(pool->mapping_pool);
 bad_mapping_pool:
 	dm_deferred_set_destroy(pool->all_io_ds);
 bad_all_io_ds:
@@ -1775,7 +2278,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 3, "Invalid number of pool feature arguments"},
+		{0, 4, "Invalid number of pool feature arguments"},
 	};
 
 	/*
@@ -1804,6 +2307,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 		else if (!strcasecmp(arg_name, "read_only"))
 			pf->mode = PM_READ_ONLY;
 
+		else if (!strcasecmp(arg_name, "error_if_no_space"))
+			pf->error_if_no_space = true;
+
 		else {
 			ti->error = "Unrecognised pool feature requested";
 			r = -EINVAL;
@@ -1814,6 +2320,67 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 	return r;
 }
 
+static void metadata_low_callback(void *context)
+{
+	struct pool *pool = context;
+
+	DMWARN("%s: reached low water mark for metadata device: sending event.",
+	       dm_device_name(pool->pool_md));
+
+	dm_table_event(pool->ti->table);
+}
+
+static sector_t get_dev_size(struct block_device *bdev)
+{
+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static void warn_if_metadata_device_too_big(struct block_device *bdev)
+{
+	sector_t metadata_dev_size = get_dev_size(bdev);
+	char buffer[BDEVNAME_SIZE];
+
+	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
+		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+		       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
+}
+
+static sector_t get_metadata_dev_size(struct block_device *bdev)
+{
+	sector_t metadata_dev_size = get_dev_size(bdev);
+
+	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
+		metadata_dev_size = THIN_METADATA_MAX_SECTORS;
+
+	return metadata_dev_size;
+}
+
+static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
+{
+	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
+
+	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
+
+	return metadata_dev_size;
+}
+
+/*
+ * When a metadata threshold is crossed a dm event is triggered, and
+ * userland should respond by growing the metadata device.  We could let
+ * userland set the threshold, like we do with the data threshold, but I'm
+ * not sure they know enough to do this well.
+ */
+static dm_block_t calc_metadata_threshold(struct pool_c *pt)
+{
+	/*
+	 * 4M is ample for all ops with the possible exception of thin
+	 * device deletion which is harmless if it fails (just retry the
+	 * delete after you've grown the device).
+	 */
+	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
+	return min((dm_block_t)1024ULL /* 4M */, quarter);
+}
+
 /*
  * thin-pool <metadata dev> <data dev>
  *	     <data block size (sectors)>
@@ -1824,6 +2391,8 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
  *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
  *	     ignore_discard: disable discard
  *	     no_discard_passdown: don't pass discards down to the data device
+ *	     read_only: Don't allow any changes to be made to the pool metadata.
+ *	     error_if_no_space: error IOs, instead of queueing, if no space.
  */
 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
@@ -1836,8 +2405,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	unsigned long block_size;
 	dm_block_t low_water_blocks;
 	struct dm_dev *metadata_dev;
-	sector_t metadata_dev_size;
-	char b[BDEVNAME_SIZE];
+	fmode_t metadata_mode;
 
 	/*
 	 * FIXME Remove validation from scope of lock.
@@ -1849,19 +2417,27 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		r = -EINVAL;
 		goto out_unlock;
 	}
+
 	as.argc = argc;
 	as.argv = argv;
 
-	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
+	/*
+	 * Set default pool features.
+	 */
+	pool_features_init(&pf);
+
+	dm_consume_args(&as, 4);
+	r = parse_pool_features(&as, &pf, ti);
+	if (r)
+		goto out_unlock;
+
+	metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
+	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
 	if (r) {
 		ti->error = "Error opening metadata block device";
 		goto out_unlock;
 	}
-
-	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
-	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
-		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
-		       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
+	warn_if_metadata_device_too_big(metadata_dev->bdev);
 
 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
 	if (r) {
@@ -1884,16 +2460,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto out;
 	}
 
-	/*
-	 * Set default pool features.
-	 */
-	pool_features_init(&pf);
-
-	dm_consume_args(&as, 4);
-	r = parse_pool_features(&as, &pf, ti);
-	if (r)
-		goto out;
-
 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
 	if (!pt) {
 		r = -ENOMEM;
@@ -1925,15 +2491,16 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	pt->data_dev = data_dev;
 	pt->low_water_blocks = low_water_blocks;
 	pt->adjusted_pf = pt->requested_pf = pf;
-	ti->num_flush_requests = 1;
+	ti->num_flush_bios = 1;
 
 	/*
 	 * Only need to enable discards if the pool should pass
 	 * them down to the data device.  The thin device's discard
 	 * processing will cause mappings to be removed from the btree.
 	 */
+	ti->discard_zeroes_data_unsupported = true;
 	if (pf.discard_enabled && pf.discard_passdown) {
-		ti->num_discard_requests = 1;
+		ti->num_discard_bios = 1;
 
 		/*
 		 * Setting 'discards_supported' circumvents the normal
@@ -1941,10 +2508,16 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		 * thin devices' discard limits consistent).
 		 */
 		ti->discards_supported = true;
-		ti->discard_zeroes_data_unsupported = true;
 	}
 	ti->private = pt;
 
+	r = dm_pool_register_metadata_threshold(pt->pool->pmd,
+						calc_metadata_threshold(pt),
+						metadata_low_callback,
+						pool);
+	if (r)
+		goto out_free_pt;
+
 	pt->callbacks.congested_fn = pool_is_congested;
 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
 
@@ -1966,8 +2539,7 @@ out_unlock:
 	return r;
 }
 
-static int pool_map(struct dm_target *ti, struct bio *bio,
-		    union map_info *map_context)
+static int pool_map(struct dm_target *ti, struct bio *bio)
 {
 	int r;
 	struct pool_c *pt = ti->private;
@@ -1985,6 +2557,101 @@ static int pool_map(struct dm_target *ti, struct bio *bio,
 	return r;
 }
 
+static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
+{
+	int r;
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	sector_t data_size = ti->len;
+	dm_block_t sb_data_size;
+
+	*need_commit = false;
+
+	(void) sector_div(data_size, pool->sectors_per_block);
+
+	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
+	if (r) {
+		DMERR("%s: failed to retrieve data device size",
+		      dm_device_name(pool->pool_md));
+		return r;
+	}
+
+	if (data_size < sb_data_size) {
+		DMERR("%s: pool target (%llu blocks) too small: expected %llu",
+		      dm_device_name(pool->pool_md),
+		      (unsigned long long)data_size, sb_data_size);
+		return -EINVAL;
+
+	} else if (data_size > sb_data_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the data device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
+		if (sb_data_size)
+			DMINFO("%s: growing the data device from %llu to %llu blocks",
+			       dm_device_name(pool->pool_md),
+			       sb_data_size, (unsigned long long)data_size);
+		r = dm_pool_resize_data_dev(pool->pmd, data_size);
+		if (r) {
+			metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
+			return r;
+		}
+
+		*need_commit = true;
+	}
+
+	return 0;
+}
+
+static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
+{
+	int r;
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	dm_block_t metadata_dev_size, sb_metadata_dev_size;
+
+	*need_commit = false;
+
+	metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
+
+	r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
+	if (r) {
+		DMERR("%s: failed to retrieve metadata device size",
+		      dm_device_name(pool->pool_md));
+		return r;
+	}
+
+	if (metadata_dev_size < sb_metadata_dev_size) {
+		DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
+		      dm_device_name(pool->pool_md),
+		      metadata_dev_size, sb_metadata_dev_size);
+		return -EINVAL;
+
+	} else if (metadata_dev_size > sb_metadata_dev_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the metadata device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
+		warn_if_metadata_device_too_big(pool->md_dev);
+		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
+		       dm_device_name(pool->pool_md),
+		       sb_metadata_dev_size, metadata_dev_size);
+		r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
+		if (r) {
+			metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
+			return r;
+		}
+
+		*need_commit = true;
+	}
+
+	return 0;
+}
+
 /*
  * Retrieves the number of blocks of the data device from
  * the superblock and compares it to the actual device size,
@@ -1999,10 +2666,9 @@ static int pool_map(struct dm_target *ti, struct bio *bio,
 static int pool_preresume(struct dm_target *ti)
 {
 	int r;
+	bool need_commit1, need_commit2;
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
-	sector_t data_size = ti->len;
-	dm_block_t sb_data_size;
 
 	/*
 	 * Take control of the pool object.
@@ -2011,30 +2677,16 @@ static int pool_preresume(struct dm_target *ti)
 	if (r)
 		return r;
 
-	(void) sector_div(data_size, pool->sectors_per_block);
-
-	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
-	if (r) {
-		DMERR("failed to retrieve data device size");
+	r = maybe_resize_data_dev(ti, &need_commit1);
+	if (r)
 		return r;
-	}
 
-	if (data_size < sb_data_size) {
-		DMERR("pool target too small, is %llu blocks (expected %llu)",
-		      (unsigned long long)data_size, sb_data_size);
-		return -EINVAL;
-
-	} else if (data_size > sb_data_size) {
-		r = dm_pool_resize_data_dev(pool->pmd, data_size);
-		if (r) {
-			DMERR("failed to resize data device");
-			/* FIXME Stricter than necessary: Rollback transaction instead here */
-			set_pool_mode(pool, PM_READ_ONLY);
-			return r;
-		}
+	r = maybe_resize_metadata_dev(ti, &need_commit2);
+	if (r)
+		return r;
 
-		(void) commit_or_fallback(pool);
-	}
+	if (need_commit1 || need_commit2)
+		(void) commit(pool);
 
 	return 0;
 }
@@ -2046,10 +2698,9 @@ static void pool_resume(struct dm_target *ti)
 	unsigned long flags;
 
 	spin_lock_irqsave(&pool->lock, flags);
-	pool->low_water_triggered = 0;
-	pool->no_free_space = 0;
-	__requeue_bios(pool);
+	pool->low_water_triggered = false;
 	spin_unlock_irqrestore(&pool->lock, flags);
+	requeue_bios(pool);
 
 	do_waker(&pool->waker.work);
 }
@@ -2060,8 +2711,9 @@ static void pool_postsuspend(struct dm_target *ti)
 	struct pool *pool = pt->pool;
 
 	cancel_delayed_work(&pool->waker);
+	cancel_delayed_work(&pool->no_space_timeout);
 	flush_workqueue(pool->wq);
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2195,7 +2847,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2257,7 +2909,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
 	if (!r)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return r;
 }
@@ -2266,7 +2918,8 @@ static void emit_flags(struct pool_features *pf, char *result,
 		       unsigned sz, unsigned maxlen)
 {
 	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
-		!pf->discard_passdown + (pf->mode == PM_READ_ONLY);
+		!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
+		pf->error_if_no_space;
 	DMEMIT("%u ", count);
 
 	if (!pf->zero_new_blocks)
@@ -2280,6 +2933,9 @@ static void emit_flags(struct pool_features *pf, char *result,
 
 	if (pf->mode == PM_READ_ONLY)
 		DMEMIT("read_only ");
+
+	if (pf->error_if_no_space)
+		DMEMIT("error_if_no_space ");
 }
 
 /*
@@ -2287,8 +2943,8 @@ static void emit_flags(struct pool_features *pf, char *result,
  *    <transaction id> <used metadata sectors>/<total metadata sectors>
  *    <used data sectors>/<total data sectors> <held metadata root>
  */
-static int pool_status(struct dm_target *ti, status_type_t type,
-		       unsigned status_flags, char *result, unsigned maxlen)
+static void pool_status(struct dm_target *ti, status_type_t type,
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	int r;
 	unsigned sz = 0;
@@ -2312,34 +2968,49 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 
 		/* Commit to ensure statistics aren't out-of-date */
 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-			(void) commit_or_fallback(pool);
+			(void) commit(pool);
 
-		r = dm_pool_get_metadata_transaction_id(pool->pmd,
-							&transaction_id);
-		if (r)
-			return r;
+		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
+		if (r) {
+			DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
+			      dm_device_name(pool->pool_md), r);
+			goto err;
+		}
 
-		r = dm_pool_get_free_metadata_block_count(pool->pmd,
-							  &nr_free_blocks_metadata);
-		if (r)
-			return r;
+		r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
+		if (r) {
+			DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
+			      dm_device_name(pool->pool_md), r);
+			goto err;
+		}
 
 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
-		if (r)
-			return r;
+		if (r) {
+			DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
+			      dm_device_name(pool->pool_md), r);
+			goto err;
+		}
 
-		r = dm_pool_get_free_block_count(pool->pmd,
-						 &nr_free_blocks_data);
-		if (r)
-			return r;
+		r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
+		if (r) {
+			DMERR("%s: dm_pool_get_free_block_count returned %d",
+			      dm_device_name(pool->pool_md), r);
+			goto err;
+		}
 
 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
-		if (r)
-			return r;
+		if (r) {
+			DMERR("%s: dm_pool_get_data_dev_size returned %d",
+			      dm_device_name(pool->pool_md), r);
+			goto err;
+		}
 
 		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
-		if (r)
-			return r;
+		if (r) {
+			DMERR("%s: dm_pool_get_metadata_snap returned %d",
+			      dm_device_name(pool->pool_md), r);
+			goto err;
+		}
 
 		DMEMIT("%llu %llu/%llu %llu/%llu ",
 		       (unsigned long long)transaction_id,
@@ -2353,15 +3024,24 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("- ");
 
-		if (pool->pf.mode == PM_READ_ONLY)
+		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+			DMEMIT("out_of_data_space ");
+		else if (pool->pf.mode == PM_READ_ONLY)
 			DMEMIT("ro ");
 		else
 			DMEMIT("rw ");
 
-		if (pool->pf.discard_enabled && pool->pf.discard_passdown)
-			DMEMIT("discard_passdown");
+		if (!pool->pf.discard_enabled)
+			DMEMIT("ignore_discard ");
+		else if (pool->pf.discard_passdown)
+			DMEMIT("discard_passdown ");
+		else
+			DMEMIT("no_discard_passdown ");
+
+		if (pool->pf.error_if_no_space)
+			DMEMIT("error_if_no_space ");
 		else
-			DMEMIT("no_discard_passdown");
+			DMEMIT("queue_if_no_space ");
 
 		break;
 
@@ -2374,8 +3054,10 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		emit_flags(&pt->requested_pf, result, sz, maxlen);
 		break;
 	}
+	return;
 
-	return 0;
+err:
+	DMEMIT("Error");
 }
 
 static int pool_iterate_devices(struct dm_target *ti,
@@ -2400,11 +3082,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static bool block_size_is_power_of_two(struct pool *pool)
-{
-	return pool->sectors_per_block_shift >= 0;
-}
-
 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 {
 	struct pool *pool = pt->pool;
@@ -2417,33 +3094,43 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 	 */
 	if (pt->adjusted_pf.discard_passdown) {
 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
-		limits->discard_granularity = data_limits->discard_granularity;
-	} else if (block_size_is_power_of_two(pool))
+		limits->discard_granularity = max(data_limits->discard_granularity,
+						  pool->sectors_per_block << SECTOR_SHIFT);
+	} else
 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-	else
-		/*
-		 * Use largest power of 2 that is a factor of sectors_per_block
-		 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
-		 */
-		limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
-						  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
 }
 
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
+	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
 
-	blk_limits_io_min(limits, 0);
-	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+	/*
+	 * If the system-determined stacked limits are compatible with the
+	 * pool's blocksize (io_opt is a factor) do not override them.
+	 */
+	if (io_opt_sectors < pool->sectors_per_block ||
+	    do_div(io_opt_sectors, pool->sectors_per_block)) {
+		blk_limits_io_min(limits, 0);
+		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+	}
 
 	/*
 	 * pt->adjusted_pf is a staging area for the actual features to use.
 	 * They get transferred to the live pool in bind_control_target()
 	 * called from pool_preresume().
 	 */
-	if (!pt->adjusted_pf.discard_enabled)
+	if (!pt->adjusted_pf.discard_enabled) {
+		/*
+		 * Must explicitly disallow stacking discard limits otherwise the
+		 * block layer will stack them if pool's data device has support.
+		 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
+		 * user to see that, so make sure to set all discard limits to 0.
+		 */
+		limits->discard_granularity = 0;
 		return;
+	}
 
 	disable_passdown_if_not_supported(pt);
 
@@ -2454,7 +3141,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 5, 0},
+	.version = {1, 12, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2472,9 +3159,29 @@ static struct target_type pool_target = {
 /*----------------------------------------------------------------
  * Thin target methods
  *--------------------------------------------------------------*/
+static void thin_get(struct thin_c *tc)
+{
+	atomic_inc(&tc->refcount);
+}
+
+static void thin_put(struct thin_c *tc)
+{
+	if (atomic_dec_and_test(&tc->refcount))
+		complete(&tc->can_destroy);
+}
+
 static void thin_dtr(struct dm_target *ti)
 {
 	struct thin_c *tc = ti->private;
+	unsigned long flags;
+
+	thin_put(tc);
+	wait_for_completion(&tc->can_destroy);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
+	list_del_rcu(&tc->list);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
+	synchronize_rcu();
 
 	mutex_lock(&dm_thin_pool_table.mutex);
 
@@ -2506,6 +3213,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct thin_c *tc;
 	struct dm_dev *pool_dev, *origin_dev;
 	struct mapped_device *pool_md;
+	unsigned long flags;
 
 	mutex_lock(&dm_thin_pool_table.mutex);
 
@@ -2521,6 +3229,10 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		r = -ENOMEM;
 		goto out_unlock;
 	}
+	spin_lock_init(&tc->lock);
+	bio_list_init(&tc->deferred_bio_list);
+	bio_list_init(&tc->retry_on_resume_list);
+	tc->sort_bio_list = RB_ROOT;
 
 	if (argc == 3) {
 		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
@@ -2561,6 +3273,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		ti->error = "Couldn't open thin device, Pool is in fail mode";
+		r = -EINVAL;
 		goto bad_thin_open;
 	}
 
@@ -2572,26 +3285,43 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
 	if (r)
-		goto bad_thin_open;
+		goto bad_target_max_io_len;
 
-	ti->num_flush_requests = 1;
+	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
+	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
 
 	/* In case the pool supports discards, pass them on. */
+	ti->discard_zeroes_data_unsupported = true;
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
-		ti->num_discard_requests = 1;
-		ti->discard_zeroes_data_unsupported = true;
-		/* Discard requests must be split on a block boundary */
-		ti->split_discard_requests = true;
+		ti->num_discard_bios = 1;
+		/* Discard bios must be split on a block boundary */
+		ti->split_discard_bios = true;
 	}
 
 	dm_put(pool_md);
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
 
+	atomic_set(&tc->refcount, 1);
+	init_completion(&tc->can_destroy);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
+	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
+	/*
+	 * This synchronize_rcu() call is needed here otherwise we risk a
+	 * wake_worker() call finding no bios to process (because the newly
+	 * added tc isn't yet visible).  So this reduces latency since we
+	 * aren't then dependent on the periodic commit to wake_worker().
+	 */
+	synchronize_rcu();
+
 	return 0;
 
+bad_target_max_io_len:
+	dm_pool_close_thin_device(tc->td);
 bad_thin_open:
 	__pool_dec(tc->pool);
 bad_pool_lookup:
@@ -2609,20 +3339,17 @@ out_unlock:
 	return r;
 }
 
-static int thin_map(struct dm_target *ti, struct bio *bio,
-		    union map_info *map_context)
+static int thin_map(struct dm_target *ti, struct bio *bio)
 {
-	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
+	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
 
-	return thin_bio_map(ti, bio, map_context);
+	return thin_bio_map(ti, bio);
 }
 
-static int thin_endio(struct dm_target *ti,
-		      struct bio *bio, int err,
-		      union map_info *map_context)
+static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 {
 	unsigned long flags;
-	struct dm_thin_endio_hook *h = map_context->ptr;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct list_head work;
 	struct dm_thin_new_mapping *m, *tmp;
 	struct pool *pool = h->tc->pool;
@@ -2634,7 +3361,7 @@ static int thin_endio(struct dm_target *ti,
 		spin_lock_irqsave(&pool->lock, flags);
 		list_for_each_entry_safe(m, tmp, &work, list) {
 			list_del(&m->list);
-			m->quiesced = 1;
+			m->quiesced = true;
 			__maybe_add_mapping(m);
 		}
 		spin_unlock_irqrestore(&pool->lock, flags);
@@ -2643,28 +3370,42 @@ static int thin_endio(struct dm_target *ti,
 	if (h->all_io_entry) {
 		INIT_LIST_HEAD(&work);
 		dm_deferred_entry_dec(h->all_io_entry, &work);
-		spin_lock_irqsave(&pool->lock, flags);
-		list_for_each_entry_safe(m, tmp, &work, list)
-			list_add(&m->list, &pool->prepared_discards);
-		spin_unlock_irqrestore(&pool->lock, flags);
+		if (!list_empty(&work)) {
+			spin_lock_irqsave(&pool->lock, flags);
+			list_for_each_entry_safe(m, tmp, &work, list)
+				list_add_tail(&m->list, &pool->prepared_discards);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			wake_worker(pool);
+		}
 	}
 
-	mempool_free(h, pool->endio_hook_pool);
-
 	return 0;
 }
 
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+	struct thin_c *tc = ti->private;
+
 	if (dm_noflush_suspending(ti))
-		requeue_io((struct thin_c *)ti->private);
+		noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	/*
+	 * The dm_noflush_suspending flag has been cleared by now, so
+	 * unfortunately we must always run this.
+	 */
+	noflush_work(tc, do_noflush_stop);
 }
 
 /*
  * <nr mapped sectors> <highest mapped sector>
  */
-static int thin_status(struct dm_target *ti, status_type_t type,
-		       unsigned status_flags, char *result, unsigned maxlen)
+static void thin_status(struct dm_target *ti, status_type_t type,
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	int r;
 	ssize_t sz = 0;
@@ -2674,7 +3415,7 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		DMEMIT("Fail");
-		return 0;
+		return;
 	}
 
 	if (!tc->td)
@@ -2683,12 +3424,16 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 		switch (type) {
 		case STATUSTYPE_INFO:
 			r = dm_thin_get_mapped_count(tc->td, &mapped);
-			if (r)
-				return r;
+			if (r) {
+				DMERR("dm_thin_get_mapped_count returned %d", r);
+				goto err;
+			}
 
 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
-			if (r < 0)
-				return r;
+			if (r < 0) {
+				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
+				goto err;
+			}
 
 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
 			if (r)
@@ -2708,7 +3453,10 @@ static int thin_status(struct dm_target *ti, status_type_t type,
 		}
 	}
 
-	return 0;
+	return;
+
+err:
+	DMEMIT("Error");
 }
 
 static int thin_iterate_devices(struct dm_target *ti,
@@ -2733,28 +3481,18 @@ static int thin_iterate_devices(struct dm_target *ti,
 	return 0;
 }
 
-/*
- * A thin device always inherits its queue limits from its pool.
- */
-static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
-{
-	struct thin_c *tc = ti->private;
-
-	*limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
-}
-
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 5, 0},
+	.version = {1, 12, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
 	.end_io = thin_endio,
+	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
-	.io_hints = thin_io_hints,
 };
 
 /*----------------------------------------------------------------*/
@@ -2779,14 +3517,8 @@ static int __init dm_thin_init(void)
 	if (!_new_mapping_cache)
 		goto bad_new_mapping_cache;
 
-	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
-	if (!_endio_hook_cache)
-		goto bad_endio_hook_cache;
-
 	return 0;
 
-bad_endio_hook_cache:
-	kmem_cache_destroy(_new_mapping_cache);
 bad_new_mapping_cache:
 	dm_unregister_target(&pool_target);
 bad_pool_target:
@@ -2801,12 +3533,14 @@ static void dm_thin_exit(void)
 	dm_unregister_target(&pool_target);
 
 	kmem_cache_destroy(_new_mapping_cache);
-	kmem_cache_destroy(_endio_hook_cache);
 }
 
 module_init(dm_thin_init);
 module_exit(dm_thin_exit);
 
+module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
+
 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 9e7328bb403..7a7bab8947a 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -55,7 +55,6 @@ struct dm_verity {
 	unsigned shash_descsize;/* the size of temporary space for crypto */
 	int hash_failed;	/* set to 1 if hash of any block failed */
 
-	mempool_t *io_mempool;	/* mempool of struct dm_verity_io */
 	mempool_t *vec_mempool;	/* mempool of bio vector */
 
 	struct workqueue_struct *verify_wq;
@@ -66,7 +65,6 @@ struct dm_verity {
 
 struct dm_verity_io {
 	struct dm_verity *v;
-	struct bio *bio;
 
 	/* original values of bio->bi_end_io and bio->bi_private */
 	bio_end_io_t *orig_bi_end_io;
@@ -75,15 +73,10 @@ struct dm_verity_io {
 	sector_t block;
 	unsigned n_blocks;
 
-	/* saved bio vector */
-	struct bio_vec *io_vec;
-	unsigned io_vec_size;
+	struct bvec_iter iter;
 
 	struct work_struct work;
 
-	/* A space for short vectors; longer vectors are allocated separately. */
-	struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
-
 	/*
 	 * Three variably-size fields follow this struct:
 	 *
@@ -95,6 +88,13 @@ struct dm_verity_io {
 	 */
 };
 
+struct dm_verity_prefetch_work {
+	struct work_struct work;
+	struct dm_verity *v;
+	sector_t block;
+	unsigned n_blocks;
+};
+
 static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
 {
 	return (struct shash_desc *)(io + 1);
@@ -279,9 +279,10 @@ release_ret_r:
 static int verity_verify_io(struct dm_verity_io *io)
 {
 	struct dm_verity *v = io->v;
+	struct bio *bio = dm_bio_from_per_bio_data(io,
+						   v->ti->per_bio_data_size);
 	unsigned b;
 	int i;
-	unsigned vector = 0, offset = 0;
 
 	for (b = 0; b < io->n_blocks; b++) {
 		struct shash_desc *desc;
@@ -329,31 +330,25 @@ test_block_hash:
 				return r;
 			}
 		}
-
 		todo = 1 << v->data_dev_block_bits;
 		do {
-			struct bio_vec *bv;
 			u8 *page;
 			unsigned len;
+			struct bio_vec bv = bio_iter_iovec(bio, io->iter);
 
-			BUG_ON(vector >= io->io_vec_size);
-			bv = &io->io_vec[vector];
-			page = kmap_atomic(bv->bv_page);
-			len = bv->bv_len - offset;
+			page = kmap_atomic(bv.bv_page);
+			len = bv.bv_len;
 			if (likely(len >= todo))
 				len = todo;
-			r = crypto_shash_update(desc,
-					page + bv->bv_offset + offset, len);
+			r = crypto_shash_update(desc, page + bv.bv_offset, len);
 			kunmap_atomic(page);
+
 			if (r < 0) {
 				DMERR("crypto_shash_update failed: %d", r);
 				return r;
 			}
-			offset += len;
-			if (likely(offset == bv->bv_len)) {
-				offset = 0;
-				vector++;
-			}
+
+			bio_advance_iter(bio, &io->iter, len);
 			todo -= len;
 		} while (todo);
 
@@ -378,8 +373,6 @@ test_block_hash:
 			return -EIO;
 		}
 	}
-	BUG_ON(vector != io->io_vec_size);
-	BUG_ON(offset);
 
 	return 0;
 }
@@ -389,18 +382,13 @@ test_block_hash:
  */
 static void verity_finish_io(struct dm_verity_io *io, int error)
 {
-	struct bio *bio = io->bio;
 	struct dm_verity *v = io->v;
+	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
 
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_private = io->orig_bi_private;
 
-	if (io->io_vec != io->io_vec_inline)
-		mempool_free(io->io_vec, v->vec_mempool);
-
-	mempool_free(io, v->io_mempool);
-
-	bio_endio(bio, error);
+	bio_endio_nodec(bio, error);
 }
 
 static void verity_work(struct work_struct *w)
@@ -428,15 +416,18 @@ static void verity_end_io(struct bio *bio, int error)
  * The root buffer is not prefetched, it is assumed that it will be cached
  * all the time.
  */
-static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+static void verity_prefetch_io(struct work_struct *work)
 {
+	struct dm_verity_prefetch_work *pw =
+		container_of(work, struct dm_verity_prefetch_work, work);
+	struct dm_verity *v = pw->v;
 	int i;
 
 	for (i = v->levels - 2; i >= 0; i--) {
 		sector_t hash_block_start;
 		sector_t hash_block_end;
-		verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
-		verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
+		verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
+		verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
 		if (!i) {
 			unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
 
@@ -445,7 +436,7 @@ static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
 				goto no_prefetch_cluster;
 
 			if (unlikely(cluster & (cluster - 1)))
-				cluster = 1 << (fls(cluster) - 1);
+				cluster = 1 << __fls(cluster);
 
 			hash_block_start &= ~(sector_t)(cluster - 1);
 			hash_block_end |= cluster - 1;
@@ -456,28 +447,46 @@ no_prefetch_cluster:
 		dm_bufio_prefetch(v->bufio, hash_block_start,
 				  hash_block_end - hash_block_start + 1);
 	}
+
+	kfree(pw);
+}
+
+static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
+{
+	struct dm_verity_prefetch_work *pw;
+
+	pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
+		GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+	if (!pw)
+		return;
+
+	INIT_WORK(&pw->work, verity_prefetch_io);
+	pw->v = v;
+	pw->block = io->block;
+	pw->n_blocks = io->n_blocks;
+	queue_work(v->verify_wq, &pw->work);
 }
 
 /*
  * Bio map function. It allocates dm_verity_io structure and bio vector and
  * fills them. Then it issues prefetches and the I/O.
  */
-static int verity_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int verity_map(struct dm_target *ti, struct bio *bio)
 {
 	struct dm_verity *v = ti->private;
 	struct dm_verity_io *io;
 
 	bio->bi_bdev = v->data_dev->bdev;
-	bio->bi_sector = verity_map_sector(v, bio->bi_sector);
+	bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector);
 
-	if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
+	if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
 	    ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
 		DMERR_LIMIT("unaligned io");
 		return -EIO;
 	}
 
-	if ((bio->bi_sector + bio_sectors(bio)) >>
+	if (bio_end_sector(bio) >>
 	    (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
 		DMERR_LIMIT("io out of range");
 		return -EIO;
@@ -486,25 +495,18 @@ static int verity_map(struct dm_target *ti, struct bio *bio,
 	if (bio_data_dir(bio) == WRITE)
 		return -EIO;
 
-	io = mempool_alloc(v->io_mempool, GFP_NOIO);
+	io = dm_per_bio_data(bio, ti->per_bio_data_size);
 	io->v = v;
-	io->bio = bio;
 	io->orig_bi_end_io = bio->bi_end_io;
 	io->orig_bi_private = bio->bi_private;
-	io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
-	io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
+	io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
+	io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits;
 
 	bio->bi_end_io = verity_end_io;
 	bio->bi_private = io;
-	io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
-	if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
-		io->io_vec = io->io_vec_inline;
-	else
-		io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
-	memcpy(io->io_vec, bio_iovec(bio),
-	       io->io_vec_size * sizeof(struct bio_vec));
+	io->iter = bio->bi_iter;
 
-	verity_prefetch_io(v, io);
+	verity_submit_prefetch(v, io);
 
 	generic_make_request(bio);
 
@@ -514,8 +516,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio,
 /*
  * Status: V (valid) or C (corruption found)
  */
-static int verity_status(struct dm_target *ti, status_type_t type,
-			 unsigned status_flags, char *result, unsigned maxlen)
+static void verity_status(struct dm_target *ti, status_type_t type,
+			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct dm_verity *v = ti->private;
 	unsigned sz = 0;
@@ -546,8 +548,6 @@ static int verity_status(struct dm_target *ti, status_type_t type,
 				DMEMIT("%02x", v->salt[x]);
 		break;
 	}
-
-	return 0;
 }
 
 static int verity_ioctl(struct dm_target *ti, unsigned cmd,
@@ -610,9 +610,6 @@ static void verity_dtr(struct dm_target *ti)
 	if (v->vec_mempool)
 		mempool_destroy(v->vec_mempool);
 
-	if (v->io_mempool)
-		mempool_destroy(v->io_mempool);
-
 	if (v->bufio)
 		dm_bufio_client_destroy(v->bufio);
 
@@ -677,8 +674,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
-	    num < 0 || num > 1) {
+	if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
+	    num > 1) {
 		ti->error = "Invalid version";
 		r = -EINVAL;
 		goto bad;
@@ -705,7 +702,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		r = -EINVAL;
 		goto bad;
 	}
-	v->data_dev_block_bits = ffs(num) - 1;
+	v->data_dev_block_bits = __ffs(num);
 
 	if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
 	    !num || (num & (num - 1)) ||
@@ -715,7 +712,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		r = -EINVAL;
 		goto bad;
 	}
-	v->hash_dev_block_bits = ffs(num) - 1;
+	v->hash_dev_block_bits = __ffs(num);
 
 	if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
 	    (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
@@ -794,7 +791,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	v->hash_per_block_bits =
-		fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
+		__fls((1 << v->hash_dev_block_bits) / v->digest_size);
 
 	v->levels = 0;
 	if (v->data_blocks)
@@ -813,9 +810,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	for (i = v->levels - 1; i >= 0; i--) {
 		sector_t s;
 		v->hash_level_block[i] = hash_position;
-		s = verity_position_at_level(v, v->data_blocks, i);
-		s = (s >> v->hash_per_block_bits) +
-		    !!(s & ((1 << v->hash_per_block_bits) - 1));
+		s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1)
+					>> ((i + 1) * v->hash_per_block_bits);
 		if (hash_position + s < hash_position) {
 			ti->error = "Hash device offset overflow";
 			r = -E2BIG;
@@ -841,13 +837,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
-	  sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
-	if (!v->io_mempool) {
-		ti->error = "Cannot allocate io mempool";
-		r = -ENOMEM;
-		goto bad;
-	}
+	ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
 
 	v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
 					BIO_MAX_PAGES * sizeof(struct bio_vec));
@@ -875,7 +865,7 @@ bad:
 
 static struct target_type verity_target = {
 	.name		= "verity",
-	.version	= {1, 0, 0},
+	.version	= {1, 2, 0},
 	.module		= THIS_MODULE,
 	.ctr		= verity_ctr,
 	.dtr		= verity_dtr,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index cc2b3cb8194..b9a64bbce30 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ * Copyright (C) 2003 Jana Saout <jana@saout.de>
  *
  * This file is released under the GPL.
  */
@@ -25,7 +25,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	/*
 	 * Silently drop discards, avoiding -EOPNOTSUPP.
 	 */
-	ti->num_discard_requests = 1;
+	ti->num_discard_bios = 1;
 
 	return 0;
 }
@@ -33,8 +33,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 /*
  * Return zeros only on reads
  */
-static int zero_map(struct dm_target *ti, struct bio *bio,
-		      union map_info *map_context)
+static int zero_map(struct dm_target *ti, struct bio *bio)
 {
 	switch(bio_rw(bio)) {
 	case READ:
@@ -56,7 +55,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio,
 
 static struct target_type zero_target = {
 	.name   = "zero",
-	.version = {1, 0, 0},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr    = zero_ctr,
 	.map    = zero_map,
@@ -80,6 +79,6 @@ static void __exit dm_zero_exit(void)
 module_init(dm_zero_init)
 module_exit(dm_zero_exit)
 
-MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
+MODULE_AUTHOR("Jana Saout <jana@saout.de>");
 MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 02db9183ca0..32b958dbc49 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -49,6 +49,13 @@ static unsigned int _major = 0;
 static DEFINE_IDR(_minor_idr);
 
 static DEFINE_SPINLOCK(_minor_lock);
+
+static void do_deferred_remove(struct work_struct *w);
+
+static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+
+static struct workqueue_struct *deferred_remove_workqueue;
+
 /*
  * For bio-based dm.
  * One of these is allocated per bio.
@@ -60,18 +67,7 @@ struct dm_io {
 	struct bio *bio;
 	unsigned long start_time;
 	spinlock_t endio_lock;
-};
-
-/*
- * For bio-based dm.
- * One of these is allocated per target within a bio.  Hopefully
- * this will be simplified out one day.
- */
-struct dm_target_io {
-	struct dm_io *io;
-	struct dm_target *ti;
-	union map_info info;
-	struct bio clone;
+	struct dm_stats_aux stats_aux;
 };
 
 /*
@@ -100,13 +96,6 @@ struct dm_rq_clone_bio_info {
 	struct bio clone;
 };
 
-union map_info *dm_get_mapinfo(struct bio *bio)
-{
-	if (bio && bio->bi_private)
-		return &((struct dm_target_io *)bio->bi_private)->info;
-	return NULL;
-}
-
 union map_info *dm_get_rq_mapinfo(struct request *rq)
 {
 	if (rq && rq->end_io_data)
@@ -127,17 +116,32 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
 #define DMF_MERGE_IS_OPTIONAL 6
+#define DMF_DEFERRED_REMOVE 7
+
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+	int undefined__;
+};
 
 /*
  * Work processed by per-device workqueue.
  */
 struct mapped_device {
-	struct rw_semaphore io_lock;
+	struct srcu_struct io_barrier;
 	struct mutex suspend_lock;
-	rwlock_t map_lock;
 	atomic_t holders;
 	atomic_t open_count;
 
+	/*
+	 * The current mapping.
+	 * Use dm_get_live_table{_fast} or take suspend_lock for
+	 * dereference.
+	 */
+	struct dm_table *map;
+
 	unsigned long flags;
 
 	struct request_queue *queue;
@@ -167,15 +171,9 @@ struct mapped_device {
 	struct workqueue_struct *wq;
 
 	/*
-	 * The current mapping.
-	 */
-	struct dm_table *map;
-
-	/*
 	 * io objects are allocated from here.
 	 */
 	mempool_t *io_pool;
-	mempool_t *tio_pool;
 
 	struct bio_set *bs;
 
@@ -197,11 +195,13 @@ struct mapped_device {
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
-	/* sysfs handle */
-	struct kobject kobj;
+	/* kobject and completion */
+	struct dm_kobject_holder kobj_holder;
 
 	/* zero-length flush that will be cloned and submitted to targets */
 	struct bio flush_bio;
+
+	struct dm_stats stats;
 };
 
 /*
@@ -209,19 +209,57 @@ struct mapped_device {
  */
 struct dm_md_mempools {
 	mempool_t *io_pool;
-	mempool_t *tio_pool;
 	struct bio_set *bs;
 };
 
-#define MIN_IOS 256
+#define RESERVED_BIO_BASED_IOS		16
+#define RESERVED_REQUEST_BASED_IOS	256
+#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
 /*
- * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
- * still used for _io_cache, I'm leaving this for a later cleanup
+ * Bio-based DM's mempools' reserved IOs set by the user.
  */
-static struct kmem_cache *_rq_bio_info_cache;
+static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
+
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+				      unsigned def, unsigned max)
+{
+	unsigned ios = ACCESS_ONCE(*reserved_ios);
+	unsigned modified_ios = 0;
+
+	if (!ios)
+		modified_ios = def;
+	else if (ios > max)
+		modified_ios = max;
+
+	if (modified_ios) {
+		(void)cmpxchg(reserved_ios, ios, modified_ios);
+		ios = modified_ios;
+	}
+
+	return ios;
+}
+
+unsigned dm_get_reserved_bio_based_ios(void)
+{
+	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 
 static int __init local_init(void)
 {
@@ -236,28 +274,30 @@ static int __init local_init(void)
 	if (!_rq_tio_cache)
 		goto out_free_io_cache;
 
-	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
-	if (!_rq_bio_info_cache)
-		goto out_free_rq_tio_cache;
-
 	r = dm_uevent_init();
 	if (r)
-		goto out_free_rq_bio_info_cache;
+		goto out_free_rq_tio_cache;
+
+	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
+	if (!deferred_remove_workqueue) {
+		r = -ENOMEM;
+		goto out_uevent_exit;
+	}
 
 	_major = major;
 	r = register_blkdev(_major, _name);
 	if (r < 0)
-		goto out_uevent_exit;
+		goto out_free_workqueue;
 
 	if (!_major)
 		_major = r;
 
 	return 0;
 
+out_free_workqueue:
+	destroy_workqueue(deferred_remove_workqueue);
 out_uevent_exit:
 	dm_uevent_exit();
-out_free_rq_bio_info_cache:
-	kmem_cache_destroy(_rq_bio_info_cache);
 out_free_rq_tio_cache:
 	kmem_cache_destroy(_rq_tio_cache);
 out_free_io_cache:
@@ -268,7 +308,9 @@ out_free_io_cache:
 
 static void local_exit(void)
 {
-	kmem_cache_destroy(_rq_bio_info_cache);
+	flush_scheduled_work();
+	destroy_workqueue(deferred_remove_workqueue);
+
 	kmem_cache_destroy(_rq_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
@@ -287,6 +329,7 @@ static int (*_inits[])(void) __initdata = {
 	dm_io_init,
 	dm_kcopyd_init,
 	dm_interface_init,
+	dm_statistics_init,
 };
 
 static void (*_exits[])(void) = {
@@ -297,6 +340,7 @@ static void (*_exits[])(void) = {
 	dm_io_exit,
 	dm_kcopyd_exit,
 	dm_interface_exit,
+	dm_statistics_exit,
 };
 
 static int __init dm_init(void)
@@ -330,7 +374,6 @@ static void __exit dm_exit(void)
 	/*
 	 * Should be empty by this point.
 	 */
-	idr_remove_all(&_minor_idr);
 	idr_destroy(&_minor_idr);
 }
 
@@ -367,18 +410,19 @@ out:
 	return md ? 0 : -ENXIO;
 }
 
-static int dm_blk_close(struct gendisk *disk, fmode_t mode)
+static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 {
 	struct mapped_device *md = disk->private_data;
 
 	spin_lock(&_minor_lock);
 
-	atomic_dec(&md->open_count);
+	if (atomic_dec_and_test(&md->open_count) &&
+	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+		queue_work(deferred_remove_workqueue, &deferred_remove_work);
+
 	dm_put(md);
 
 	spin_unlock(&_minor_lock);
-
-	return 0;
 }
 
 int dm_open_count(struct mapped_device *md)
@@ -389,14 +433,18 @@ int dm_open_count(struct mapped_device *md)
 /*
  * Guarantees nothing is using the device before it's deleted.
  */
-int dm_lock_for_deletion(struct mapped_device *md)
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 {
 	int r = 0;
 
 	spin_lock(&_minor_lock);
 
-	if (dm_open_count(md))
+	if (dm_open_count(md)) {
 		r = -EBUSY;
+		if (mark_deferred)
+			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
+	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
+		r = -EEXIST;
 	else
 		set_bit(DMF_DELETING, &md->flags);
 
@@ -405,6 +453,42 @@ int dm_lock_for_deletion(struct mapped_device *md)
 	return r;
 }
 
+int dm_cancel_deferred_remove(struct mapped_device *md)
+{
+	int r = 0;
+
+	spin_lock(&_minor_lock);
+
+	if (test_bit(DMF_DELETING, &md->flags))
+		r = -EBUSY;
+	else
+		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
+
+	spin_unlock(&_minor_lock);
+
+	return r;
+}
+
+static void do_deferred_remove(struct work_struct *w)
+{
+	dm_deferred_remove();
+}
+
+sector_t dm_get_size(struct mapped_device *md)
+{
+	return get_capacity(md->disk);
+}
+
+struct request_queue *dm_get_md_queue(struct mapped_device *md)
+{
+	return md->queue;
+}
+
+struct dm_stats *dm_get_stats(struct mapped_device *md)
+{
+	return &md->stats;
+}
+
 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
@@ -416,10 +500,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
-	struct dm_table *map = dm_get_live_table(md);
+	int srcu_idx;
+	struct dm_table *map;
 	struct dm_target *tgt;
 	int r = -ENOTTY;
 
+retry:
+	map = dm_get_live_table(md, &srcu_idx);
+
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -438,7 +526,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 		r = tgt->type->ioctl(tgt, cmd, arg);
 
 out:
-	dm_table_put(map);
+	dm_put_live_table(md, srcu_idx);
+
+	if (r == -ENOTCONN) {
+		msleep(10);
+		goto retry;
+	}
 
 	return r;
 }
@@ -461,12 +554,12 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
 					    gfp_t gfp_mask)
 {
-	return mempool_alloc(md->tio_pool, gfp_mask);
+	return mempool_alloc(md->io_pool, gfp_mask);
 }
 
 static void free_rq_tio(struct dm_rq_target_io *tio)
 {
-	mempool_free(tio, tio->md->tio_pool);
+	mempool_free(tio, tio->md->io_pool);
 }
 
 static int md_in_flight(struct mapped_device *md)
@@ -478,8 +571,9 @@ static int md_in_flight(struct mapped_device *md)
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
+	struct bio *bio = io->bio;
 	int cpu;
-	int rw = bio_data_dir(io->bio);
+	int rw = bio_data_dir(bio);
 
 	io->start_time = jiffies;
 
@@ -488,6 +582,10 @@ static void start_io_acct(struct dm_io *io)
 	part_stat_unlock();
 	atomic_set(&dm_disk(md)->part0.in_flight[rw],
 		atomic_inc_return(&md->pending[rw]));
+
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
+				    bio_sectors(bio), false, 0, &io->stats_aux);
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -503,6 +601,10 @@ static void end_io_acct(struct dm_io *io)
 	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 	part_stat_unlock();
 
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
+				    bio_sectors(bio), true, duration, &io->stats_aux);
+
 	/*
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
@@ -532,20 +634,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
 /*
  * Everyone (including functions in this file), should use this
  * function to access the md->map field, and make sure they call
- * dm_table_put() when finished.
+ * dm_put_live_table() when finished.
  */
-struct dm_table *dm_get_live_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 {
-	struct dm_table *t;
-	unsigned long flags;
+	*srcu_idx = srcu_read_lock(&md->io_barrier);
+
+	return srcu_dereference(md->map, &md->io_barrier);
+}
+
+void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+{
+	srcu_read_unlock(&md->io_barrier, srcu_idx);
+}
+
+void dm_sync_table(struct mapped_device *md)
+{
+	synchronize_srcu(&md->io_barrier);
+	synchronize_rcu_expedited();
+}
 
-	read_lock_irqsave(&md->map_lock, flags);
-	t = md->map;
-	if (t)
-		dm_table_get(t);
-	read_unlock_irqrestore(&md->map_lock, flags);
+/*
+ * A fast alternative to dm_get_live_table/dm_put_live_table.
+ * The caller must not block between these two functions.
+ */
+static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(md->map);
+}
 
-	return t;
+static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
+{
+	rcu_read_unlock();
 }
 
 /*
@@ -630,7 +751,7 @@ static void dec_pending(struct dm_io *io, int error)
 		if (io_error == DM_ENDIO_REQUEUE)
 			return;
 
-		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
+		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
 			/*
 			 * Preflush done for flush with data, reissue
 			 * without REQ_FLUSH.
@@ -645,10 +766,18 @@ static void dec_pending(struct dm_io *io, int error)
 	}
 }
 
+static void disable_write_same(struct mapped_device *md)
+{
+	struct queue_limits *limits = dm_get_queue_limits(md);
+
+	/* device doesn't really support WRITE SAME, disable it */
+	limits->max_write_same_sectors = 0;
+}
+
 static void clone_endio(struct bio *bio, int error)
 {
 	int r = 0;
-	struct dm_target_io *tio = bio->bi_private;
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
@@ -657,7 +786,7 @@ static void clone_endio(struct bio *bio, int error)
 		error = -EIO;
 
 	if (endio) {
-		r = endio(tio->ti, bio, error, &tio->info);
+		r = endio(tio->ti, bio, error);
 		if (r < 0 || r == DM_ENDIO_REQUEUE)
 			/*
 			 * error and requeue request are handled
@@ -673,6 +802,10 @@ static void clone_endio(struct bio *bio, int error)
 		}
 	}
 
+	if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
+		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
+		disable_write_same(md);
+
 	free_tio(md, tio);
 	dec_pending(io, error);
 }
@@ -682,10 +815,11 @@ static void clone_endio(struct bio *bio, int error)
  */
 static void end_clone_bio(struct bio *clone, int error)
 {
-	struct dm_rq_clone_bio_info *info = clone->bi_private;
+	struct dm_rq_clone_bio_info *info =
+		container_of(clone, struct dm_rq_clone_bio_info, clone);
 	struct dm_rq_target_io *tio = info->tio;
 	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_size;
+	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
 
 	bio_put(clone);
 
@@ -740,8 +874,14 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue)
 	if (!md_in_flight(md))
 		wake_up(&md->wait);
 
+	/*
+	 * Run this off this callpath, as drivers could invoke end_io while
+	 * inside their request_fn (and holding the queue lock). Calling
+	 * back into ->request_fn() could deadlock attempting to grab the
+	 * queue lock again.
+	 */
 	if (run_queue)
-		blk_run_queue(md->queue);
+		blk_run_queue_async(md->queue);
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
@@ -860,6 +1000,10 @@ static void dm_done(struct request *clone, int error, bool mapped)
 			r = rq_end_io(tio->ti, clone, error, &tio->info);
 	}
 
+	if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
+		     !clone->q->limits.max_write_same_sectors))
+		disable_write_same(tio->md);
+
 	if (r <= 0)
 		/* The target wants to complete the I/O */
 		dm_end_request(clone, r);
@@ -993,15 +1137,55 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 }
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
-static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
+/*
+ * A target may call dm_accept_partial_bio only from the map routine.  It is
+ * allowed for all bio types except REQ_FLUSH.
+ *
+ * dm_accept_partial_bio informs the dm that the target only wants to process
+ * additional n_sectors sectors of the bio and the rest of the data should be
+ * sent in a next bio.
+ *
+ * A diagram that explains the arithmetics:
+ * +--------------------+---------------+-------+
+ * |         1          |       2       |   3   |
+ * +--------------------+---------------+-------+
+ *
+ * <-------------- *tio->len_ptr --------------->
+ *                      <------- bi_size ------->
+ *                      <-- n_sectors -->
+ *
+ * Region 1 was already iterated over with bio_advance or similar function.
+ *	(it may be empty if the target doesn't use bio_advance)
+ * Region 2 is the remaining bio size that the target wants to process.
+ *	(it may be empty if region 1 is non-empty, although there is no reason
+ *	 to make it empty)
+ * The target requires that region 3 is to be sent in the next bio.
+ *
+ * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
+ * the partially processed part (the sum of regions 1+2) must be the same for all
+ * copies of the bio.
+ */
+void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
+{
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+	BUG_ON(bio->bi_rw & REQ_FLUSH);
+	BUG_ON(bi_size > *tio->len_ptr);
+	BUG_ON(n_sectors > bi_size);
+	*tio->len_ptr -= bi_size - n_sectors;
+	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+
+static void __map_bio(struct dm_target_io *tio)
 {
 	int r;
 	sector_t sector;
 	struct mapped_device *md;
 	struct bio *clone = &tio->clone;
+	struct dm_target *ti = tio->ti;
 
 	clone->bi_end_io = clone_endio;
-	clone->bi_private = tio;
 
 	/*
 	 * Map the clone.  If r == 0 we don't need to do
@@ -1009,8 +1193,8 @@ static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
-	sector = clone->bi_sector;
-	r = ti->type->map(ti, clone, &tio->info);
+	sector = clone->bi_iter.bi_sector;
+	r = ti->type->map(ti, clone);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
@@ -1035,66 +1219,38 @@ struct clone_info {
 	struct bio *bio;
 	struct dm_io *io;
 	sector_t sector;
-	sector_t sector_count;
-	unsigned short idx;
+	unsigned sector_count;
 };
 
-/*
- * Creates a little bio that just does part of a bvec.
- */
-static void split_bvec(struct dm_target_io *tio, struct bio *bio,
-		       sector_t sector, unsigned short idx, unsigned int offset,
-		       unsigned int len, struct bio_set *bs)
+static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
 {
-	struct bio *clone = &tio->clone;
-	struct bio_vec *bv = bio->bi_io_vec + idx;
-
-	*clone->bi_io_vec = *bv;
-
-	clone->bi_sector = sector;
-	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw;
-	clone->bi_vcnt = 1;
-	clone->bi_size = to_bytes(len);
-	clone->bi_io_vec->bv_offset = offset;
-	clone->bi_io_vec->bv_len = clone->bi_size;
-	clone->bi_flags |= 1 << BIO_CLONED;
-
-	if (bio_integrity(bio)) {
-		bio_integrity_clone(clone, bio, GFP_NOIO);
-		bio_integrity_trim(clone,
-				   bio_sector_offset(bio, idx, offset), len);
-	}
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_iter.bi_size = to_bytes(len);
 }
 
 /*
  * Creates a bio that consists of range of complete bvecs.
  */
 static void clone_bio(struct dm_target_io *tio, struct bio *bio,
-		      sector_t sector, unsigned short idx,
-		      unsigned short bv_count, unsigned int len,
-		      struct bio_set *bs)
+		      sector_t sector, unsigned len)
 {
 	struct bio *clone = &tio->clone;
 
-	__bio_clone(clone, bio);
-	clone->bi_sector = sector;
-	clone->bi_idx = idx;
-	clone->bi_vcnt = idx + bv_count;
-	clone->bi_size = to_bytes(len);
-	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+	__bio_clone_fast(clone, bio);
 
-	if (bio_integrity(bio)) {
+	if (bio_integrity(bio))
 		bio_integrity_clone(clone, bio, GFP_NOIO);
 
-		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
-			bio_integrity_trim(clone,
-					   bio_sector_offset(bio, idx, 0), len);
-	}
+	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+	clone->bi_iter.bi_size = to_bytes(len);
+
+	if (bio_integrity(bio))
+		bio_integrity_trim(clone, 0, len);
 }
 
 static struct dm_target_io *alloc_tio(struct clone_info *ci,
-				      struct dm_target *ti, int nr_iovecs)
+				      struct dm_target *ti, int nr_iovecs,
+				      unsigned target_bio_nr)
 {
 	struct dm_target_io *tio;
 	struct bio *clone;
@@ -1104,74 +1260,101 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
 
 	tio->io = ci->io;
 	tio->ti = ti;
-	memset(&tio->info, 0, sizeof(tio->info));
+	tio->target_bio_nr = target_bio_nr;
 
 	return tio;
 }
 
-static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
-				   unsigned request_nr, sector_t len)
+static void __clone_and_map_simple_bio(struct clone_info *ci,
+				       struct dm_target *ti,
+				       unsigned target_bio_nr, unsigned *len)
 {
-	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs);
+	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
 	struct bio *clone = &tio->clone;
 
-	tio->info.target_request_nr = request_nr;
+	tio->len_ptr = len;
 
 	/*
 	 * Discard requests require the bio's inline iovecs be initialized.
 	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
 	 * and discard, so no need for concern about wasted bvec allocations.
 	 */
+	 __bio_clone_fast(clone, ci->bio);
+	if (len)
+		bio_setup_sector(clone, ci->sector, *len);
 
-	 __bio_clone(clone, ci->bio);
-	if (len) {
-		clone->bi_sector = ci->sector;
-		clone->bi_size = to_bytes(len);
-	}
-
-	__map_bio(ti, tio);
+	__map_bio(tio);
 }
 
-static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
-				    unsigned num_requests, sector_t len)
+static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
+				  unsigned num_bios, unsigned *len)
 {
-	unsigned request_nr;
+	unsigned target_bio_nr;
 
-	for (request_nr = 0; request_nr < num_requests; request_nr++)
-		__issue_target_request(ci, ti, request_nr, len);
+	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
+		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
 }
 
-static int __clone_and_map_empty_flush(struct clone_info *ci)
+static int __send_empty_flush(struct clone_info *ci)
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
 
 	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
-		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
+		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
 
 	return 0;
 }
 
-/*
- * Perform all io with a single clone.
- */
-static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
+static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+				     sector_t sector, unsigned *len)
 {
 	struct bio *bio = ci->bio;
 	struct dm_target_io *tio;
+	unsigned target_bio_nr;
+	unsigned num_target_bios = 1;
+
+	/*
+	 * Does the target want to receive duplicate copies of the bio?
+	 */
+	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
+		num_target_bios = ti->num_write_bios(ti, bio);
+
+	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
+		tio = alloc_tio(ci, ti, 0, target_bio_nr);
+		tio->len_ptr = len;
+		clone_bio(tio, bio, sector, *len);
+		__map_bio(tio);
+	}
+}
 
-	tio = alloc_tio(ci, ti, bio->bi_max_vecs);
-	clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx,
-		  ci->sector_count, ci->md->bs);
-	__map_bio(ti, tio);
-	ci->sector_count = 0;
+typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
+
+static unsigned get_num_discard_bios(struct dm_target *ti)
+{
+	return ti->num_discard_bios;
+}
+
+static unsigned get_num_write_same_bios(struct dm_target *ti)
+{
+	return ti->num_write_same_bios;
 }
 
-static int __clone_and_map_discard(struct clone_info *ci)
+typedef bool (*is_split_required_fn)(struct dm_target *ti);
+
+static bool is_split_required_for_discard(struct dm_target *ti)
+{
+	return ti->split_discard_bios;
+}
+
+static int __send_changing_extent_only(struct clone_info *ci,
+				       get_num_bios_fn get_num_bios,
+				       is_split_required_fn is_split_required)
 {
 	struct dm_target *ti;
-	sector_t len;
+	unsigned len;
+	unsigned num_bios;
 
 	do {
 		ti = dm_table_find_target(ci->map, ci->sector);
@@ -1179,20 +1362,21 @@ static int __clone_and_map_discard(struct clone_info *ci)
 			return -EIO;
 
 		/*
-		 * Even though the device advertised discard support,
-		 * that does not mean every target supports it, and
+		 * Even though the device advertised support for this type of
+		 * request, that does not mean every target supports it, and
 		 * reconfiguration might also have changed that since the
 		 * check was performed.
 		 */
-		if (!ti->num_discard_requests)
+		num_bios = get_num_bios ? get_num_bios(ti) : 0;
+		if (!num_bios)
 			return -EOPNOTSUPP;
 
-		if (!ti->split_discard_requests)
-			len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+		if (is_split_required && !is_split_required(ti))
+			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
 		else
-			len = min(ci->sector_count, max_io_len(ci->sector, ti));
+			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
 
-		__issue_target_requests(ci, ti, ti->num_discard_requests, len);
+		__send_duplicate_bios(ci, ti, num_bios, &len);
 
 		ci->sector += len;
 	} while (ci->sector_count -= len);
@@ -1200,107 +1384,60 @@ static int __clone_and_map_discard(struct clone_info *ci)
 	return 0;
 }
 
-static int __clone_and_map(struct clone_info *ci)
+static int __send_discard(struct clone_info *ci)
+{
+	return __send_changing_extent_only(ci, get_num_discard_bios,
+					   is_split_required_for_discard);
+}
+
+static int __send_write_same(struct clone_info *ci)
+{
+	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
+}
+
+/*
+ * Select the correct strategy for processing a non-flush bio.
+ */
+static int __split_and_process_non_flush(struct clone_info *ci)
 {
 	struct bio *bio = ci->bio;
 	struct dm_target *ti;
-	sector_t len = 0, max;
-	struct dm_target_io *tio;
+	unsigned len;
 
 	if (unlikely(bio->bi_rw & REQ_DISCARD))
-		return __clone_and_map_discard(ci);
+		return __send_discard(ci);
+	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
+		return __send_write_same(ci);
 
 	ti = dm_table_find_target(ci->map, ci->sector);
 	if (!dm_target_is_valid(ti))
 		return -EIO;
 
-	max = max_io_len(ci->sector, ti);
-
-	if (ci->sector_count <= max) {
-		/*
-		 * Optimise for the simple case where we can do all of
-		 * the remaining io with a single clone.
-		 */
-		__clone_and_map_simple(ci, ti);
-
-	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
-		/*
-		 * There are some bvecs that don't span targets.
-		 * Do as many of these as possible.
-		 */
-		int i;
-		sector_t remaining = max;
-		sector_t bv_len;
-
-		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
-			bv_len = to_sector(bio->bi_io_vec[i].bv_len);
-
-			if (bv_len > remaining)
-				break;
-
-			remaining -= bv_len;
-			len += bv_len;
-		}
-
-		tio = alloc_tio(ci, ti, bio->bi_max_vecs);
-		clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len,
-			  ci->md->bs);
-		__map_bio(ti, tio);
-
-		ci->sector += len;
-		ci->sector_count -= len;
-		ci->idx = i;
-
-	} else {
-		/*
-		 * Handle a bvec that must be split between two or more targets.
-		 */
-		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
-		sector_t remaining = to_sector(bv->bv_len);
-		unsigned int offset = 0;
-
-		do {
-			if (offset) {
-				ti = dm_table_find_target(ci->map, ci->sector);
-				if (!dm_target_is_valid(ti))
-					return -EIO;
-
-				max = max_io_len(ci->sector, ti);
-			}
+	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
 
-			len = min(remaining, max);
+	__clone_and_map_data_bio(ci, ti, ci->sector, &len);
 
-			tio = alloc_tio(ci, ti, 1);
-			split_bvec(tio, bio, ci->sector, ci->idx,
-				   bv->bv_offset + offset, len, ci->md->bs);
-
-			__map_bio(ti, tio);
-
-			ci->sector += len;
-			ci->sector_count -= len;
-			offset += to_bytes(len);
-		} while (remaining -= len);
-
-		ci->idx++;
-	}
+	ci->sector += len;
+	ci->sector_count -= len;
 
 	return 0;
 }
 
 /*
- * Split the bio into several clones and submit it to targets.
+ * Entry point to split a bio into clones and submit them to the targets.
  */
-static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md,
+				    struct dm_table *map, struct bio *bio)
 {
 	struct clone_info ci;
 	int error = 0;
 
-	ci.map = dm_get_live_table(md);
-	if (unlikely(!ci.map)) {
+	if (unlikely(!map)) {
 		bio_io_error(bio);
 		return;
 	}
 
+	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
@@ -1308,25 +1445,24 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->bio = bio;
 	ci.io->md = md;
 	spin_lock_init(&ci.io->endio_lock);
-	ci.sector = bio->bi_sector;
-	ci.idx = bio->bi_idx;
+	ci.sector = bio->bi_iter.bi_sector;
 
 	start_io_acct(ci.io);
+
 	if (bio->bi_rw & REQ_FLUSH) {
 		ci.bio = &ci.md->flush_bio;
 		ci.sector_count = 0;
-		error = __clone_and_map_empty_flush(&ci);
+		error = __send_empty_flush(&ci);
 		/* dec_pending submits any data associated with flush */
 	} else {
 		ci.bio = bio;
 		ci.sector_count = bio_sectors(bio);
 		while (ci.sector_count && !error)
-			error = __clone_and_map(&ci);
+			error = __split_and_process_non_flush(&ci);
 	}
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
-	dm_table_put(ci.map);
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -1337,7 +1473,7 @@ static int dm_merge_bvec(struct request_queue *q,
 			 struct bio_vec *biovec)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	struct dm_table *map = dm_get_live_table_fast(md);
 	struct dm_target *ti;
 	sector_t max_sectors;
 	int max_size = 0;
@@ -1347,7 +1483,7 @@ static int dm_merge_bvec(struct request_queue *q,
 
 	ti = dm_table_find_target(map, bvm->bi_sector);
 	if (!dm_target_is_valid(ti))
-		goto out_table;
+		goto out;
 
 	/*
 	 * Find maximum amount of I/O that won't need splitting
@@ -1373,13 +1509,10 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * just one page.
 	 */
 	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
-
 		max_size = 0;
 
-out_table:
-	dm_table_put(map);
-
 out:
+	dm_put_live_table_fast(md);
 	/*
 	 * Always allow an entire first page
 	 */
@@ -1398,8 +1531,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
 	int cpu;
+	int srcu_idx;
+	struct dm_table *map;
 
-	down_read(&md->io_lock);
+	map = dm_get_live_table(md, &srcu_idx);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
@@ -1408,7 +1543,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 
 	/* if we're suspended, we have to queue this io for later */
 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-		up_read(&md->io_lock);
+		dm_put_live_table(md, srcu_idx);
 
 		if (bio_rw(bio) != READA)
 			queue_io(md, bio);
@@ -1417,12 +1552,12 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	__split_and_process_bio(md, bio);
-	up_read(&md->io_lock);
+	__split_and_process_bio(md, map, bio);
+	dm_put_live_table(md, srcu_idx);
 	return;
 }
 
-static int dm_request_based(struct mapped_device *md)
+int dm_request_based(struct mapped_device *md)
 {
 	return blk_queue_stackable(md->queue);
 }
@@ -1461,7 +1596,6 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 	info->orig = bio_orig;
 	info->tio = tio;
 	bio->bi_end_io = end_clone_bio;
-	bio->bi_private = info;
 
 	return 0;
 }
@@ -1479,7 +1613,6 @@ static int setup_clone(struct request *clone, struct request *rq,
 	clone->cmd = rq->cmd;
 	clone->cmd_len = rq->cmd_len;
 	clone->sense = rq->sense;
-	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -1604,7 +1737,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
 static void dm_request_fn(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	int srcu_idx;
+	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 	struct dm_target *ti;
 	struct request *rq, *clone;
 	sector_t pos;
@@ -1659,7 +1793,7 @@ requeued:
 delay_and_out:
 	blk_delay_queue(q, HZ / 10);
 out:
-	dm_table_put(map);
+	dm_put_live_table(md, srcu_idx);
 }
 
 int dm_underlying_device_busy(struct request_queue *q)
@@ -1672,14 +1806,14 @@ static int dm_lld_busy(struct request_queue *q)
 {
 	int r;
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
+	struct dm_table *map = dm_get_live_table_fast(md);
 
 	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
 		r = 1;
 	else
 		r = dm_table_any_busy_target(map);
 
-	dm_table_put(map);
+	dm_put_live_table_fast(md);
 
 	return r;
 }
@@ -1691,7 +1825,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct dm_table *map;
 
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-		map = dm_get_live_table(md);
+		map = dm_get_live_table_fast(md);
 		if (map) {
 			/*
 			 * Request-based dm cares about only own queue for
@@ -1702,9 +1836,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
-
-			dm_table_put(map);
 		}
+		dm_put_live_table_fast(md);
 	}
 
 	return r;
@@ -1725,62 +1858,38 @@ static void free_minor(int minor)
  */
 static int specific_minor(int minor)
 {
-	int r, m;
+	int r;
 
 	if (minor >= (1 << MINORBITS))
 		return -EINVAL;
 
-	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
-	if (!r)
-		return -ENOMEM;
-
+	idr_preload(GFP_KERNEL);
 	spin_lock(&_minor_lock);
 
-	if (idr_find(&_minor_idr, minor)) {
-		r = -EBUSY;
-		goto out;
-	}
-
-	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
-	if (r)
-		goto out;
-
-	if (m != minor) {
-		idr_remove(&_minor_idr, m);
-		r = -EBUSY;
-		goto out;
-	}
+	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
 
-out:
 	spin_unlock(&_minor_lock);
-	return r;
+	idr_preload_end();
+	if (r < 0)
+		return r == -ENOSPC ? -EBUSY : r;
+	return 0;
 }
 
 static int next_free_minor(int *minor)
 {
-	int r, m;
-
-	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
-	if (!r)
-		return -ENOMEM;
+	int r;
 
+	idr_preload(GFP_KERNEL);
 	spin_lock(&_minor_lock);
 
-	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
-	if (r)
-		goto out;
+	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
 
-	if (m >= (1 << MINORBITS)) {
-		idr_remove(&_minor_idr, m);
-		r = -ENOSPC;
-		goto out;
-	}
-
-	*minor = m;
-
-out:
 	spin_unlock(&_minor_lock);
-	return r;
+	idr_preload_end();
+	if (r < 0)
+		return r;
+	*minor = r;
+	return 0;
 }
 
 static const struct block_device_operations dm_blk_dops;
@@ -1833,12 +1942,14 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_minor;
 
+	r = init_srcu_struct(&md->io_barrier);
+	if (r < 0)
+		goto bad_io_barrier;
+
 	md->type = DM_TYPE_NONE;
-	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
 	atomic_set(&md->event_nr, 0);
@@ -1861,6 +1972,7 @@ static struct mapped_device *alloc_dev(int minor)
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
+	init_completion(&md->kobj_holder.completion);
 
 	md->disk->major = _major;
 	md->disk->first_minor = minor;
@@ -1871,8 +1983,7 @@ static struct mapped_device *alloc_dev(int minor)
 	add_disk(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
 
-	md->wq = alloc_workqueue("kdmflush",
-				 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
 	if (!md->wq)
 		goto bad_thread;
 
@@ -1884,6 +1995,8 @@ static struct mapped_device *alloc_dev(int minor)
 	md->flush_bio.bi_bdev = md->bdev;
 	md->flush_bio.bi_rw = WRITE_FLUSH;
 
+	dm_stats_init(&md->stats);
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -1901,6 +2014,8 @@ bad_thread:
 bad_disk:
 	blk_cleanup_queue(md->queue);
 bad_queue:
+	cleanup_srcu_struct(&md->io_barrier);
+bad_io_barrier:
 	free_minor(minor);
 bad_minor:
 	module_put(THIS_MODULE);
@@ -1918,14 +2033,13 @@ static void free_dev(struct mapped_device *md)
 	unlock_fs(md);
 	bdput(md->bdev);
 	destroy_workqueue(md->wq);
-	if (md->tio_pool)
-		mempool_destroy(md->tio_pool);
 	if (md->io_pool)
 		mempool_destroy(md->io_pool);
 	if (md->bs)
 		bioset_free(md->bs);
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
+	cleanup_srcu_struct(&md->io_barrier);
 	free_minor(minor);
 
 	spin_lock(&_minor_lock);
@@ -1934,25 +2048,42 @@ static void free_dev(struct mapped_device *md)
 
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
+	dm_stats_cleanup(&md->stats);
 	module_put(THIS_MODULE);
 	kfree(md);
 }
 
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
-	struct dm_md_mempools *p;
+	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs)
-		/* the md already has necessary mempools */
+	if (md->io_pool && md->bs) {
+		/* The md already has necessary mempools. */
+		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+			/*
+			 * Reload bioset because front_pad may have changed
+			 * because a different table was loaded.
+			 */
+			bioset_free(md->bs);
+			md->bs = p->bs;
+			p->bs = NULL;
+		} else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
+			/*
+			 * There's no need to reload with request-based dm
+			 * because the size of front_pad doesn't change.
+			 * Note for future: If you are to reload bioset,
+			 * prep-ed requests in the queue may refer
+			 * to bio from the old bioset, so you must walk
+			 * through the queue to unprep.
+			 */
+		}
 		goto out;
+	}
 
-	p = dm_table_get_md_mempools(t);
-	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
+	BUG_ON(!p || md->io_pool || md->bs);
 
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
-	md->tio_pool = p->tio_pool;
-	p->tio_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
 
@@ -2052,7 +2183,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct dm_table *old_map;
 	struct request_queue *q = md->queue;
 	sector_t size;
-	unsigned long flags;
 	int merge_is_optional;
 
 	size = dm_table_get_size(t);
@@ -2060,7 +2190,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	/*
 	 * Wipe any geometry if the size of the table changed.
 	 */
-	if (size != get_capacity(md->disk))
+	if (size != dm_get_size(md))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
 	__set_size(md, size);
@@ -2081,9 +2211,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	merge_is_optional = dm_table_merge_is_optional(t);
 
-	write_lock_irqsave(&md->map_lock, flags);
 	old_map = md->map;
-	md->map = t;
+	rcu_assign_pointer(md->map, t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
 	dm_table_set_restrictions(t, q, limits);
@@ -2091,7 +2220,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	else
 		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
-	write_unlock_irqrestore(&md->map_lock, flags);
+	dm_sync_table(md);
 
 	return old_map;
 }
@@ -2102,15 +2231,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 static struct dm_table *__unbind(struct mapped_device *md)
 {
 	struct dm_table *map = md->map;
-	unsigned long flags;
 
 	if (!map)
 		return NULL;
 
 	dm_table_event_callback(map, NULL, NULL);
-	write_lock_irqsave(&md->map_lock, flags);
-	md->map = NULL;
-	write_unlock_irqrestore(&md->map_lock, flags);
+	RCU_INIT_POINTER(md->map, NULL);
+	dm_sync_table(md);
 
 	return map;
 }
@@ -2148,11 +2275,13 @@ void dm_unlock_md_type(struct mapped_device *md)
 
 void dm_set_md_type(struct mapped_device *md, unsigned type)
 {
+	BUG_ON(!mutex_is_locked(&md->type_lock));
 	md->type = type;
 }
 
 unsigned dm_get_md_type(struct mapped_device *md)
 {
+	BUG_ON(!mutex_is_locked(&md->type_lock));
 	return md->type;
 }
 
@@ -2162,6 +2291,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 }
 
 /*
+ * The queue_limits are only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
+{
+	BUG_ON(!atomic_read(&md->holders));
+	return &md->queue->limits;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue_limits);
+
+/*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
 static int dm_init_request_based_queue(struct mapped_device *md)
@@ -2262,11 +2402,12 @@ EXPORT_SYMBOL_GPL(dm_device_name);
 static void __dm_destroy(struct mapped_device *md, bool wait)
 {
 	struct dm_table *map;
+	int srcu_idx;
 
 	might_sleep();
 
 	spin_lock(&_minor_lock);
-	map = dm_get_live_table(md);
+	map = dm_get_live_table(md, &srcu_idx);
 	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
@@ -2276,6 +2417,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		dm_table_postsuspend_targets(map);
 	}
 
+	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
+	dm_put_live_table(md, srcu_idx);
+
 	/*
 	 * Rare, but there may be I/O requests still going to complete,
 	 * for example.  Wait for all references to disappear.
@@ -2290,7 +2434,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		       dm_device_name(md), atomic_read(&md->holders));
 
 	dm_sysfs_exit(md);
-	dm_table_put(map);
 	dm_table_destroy(__unbind(md));
 	free_dev(md);
 }
@@ -2347,8 +2490,10 @@ static void dm_wq_work(struct work_struct *work)
 	struct mapped_device *md = container_of(work, struct mapped_device,
 						work);
 	struct bio *c;
+	int srcu_idx;
+	struct dm_table *map;
 
-	down_read(&md->io_lock);
+	map = dm_get_live_table(md, &srcu_idx);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
@@ -2358,23 +2503,19 @@ static void dm_wq_work(struct work_struct *work)
 		if (!c)
 			break;
 
-		up_read(&md->io_lock);
-
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else
-			__split_and_process_bio(md, c);
-
-		down_read(&md->io_lock);
+			__split_and_process_bio(md, map, c);
 	}
 
-	up_read(&md->io_lock);
+	dm_put_live_table(md, srcu_idx);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
 {
 	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	queue_work(md->wq, &md->work);
 }
 
@@ -2383,7 +2524,7 @@ static void dm_queue_flush(struct mapped_device *md)
  */
 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
-	struct dm_table *live_map, *map = ERR_PTR(-EINVAL);
+	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
 	struct queue_limits limits;
 	int r;
 
@@ -2400,16 +2541,18 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	 * reappear.
 	 */
 	if (dm_table_has_no_data_devices(table)) {
-		live_map = dm_get_live_table(md);
+		live_map = dm_get_live_table_fast(md);
 		if (live_map)
 			limits = md->queue->limits;
-		dm_table_put(live_map);
+		dm_put_live_table_fast(md);
 	}
 
-	r = dm_calculate_queue_limits(table, &limits);
-	if (r) {
-		map = ERR_PTR(r);
-		goto out;
+	if (!live_map) {
+		r = dm_calculate_queue_limits(table, &limits);
+		if (r) {
+			map = ERR_PTR(r);
+			goto out;
+		}
 	}
 
 	map = __bind(md, table, &limits);
@@ -2481,7 +2624,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
-	map = dm_get_live_table(md);
+	map = md->map;
 
 	/*
 	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2502,7 +2645,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	if (!noflush && do_lockfs) {
 		r = lock_fs(md);
 		if (r)
-			goto out;
+			goto out_unlock;
 	}
 
 	/*
@@ -2517,9 +2660,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
 	 * flush_workqueue(md->wq).
 	 */
-	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/*
 	 * Stop md->queue before flushing md->wq in case request-based
@@ -2537,10 +2679,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 */
 	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
-	down_write(&md->io_lock);
 	if (noflush)
 		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/* were we interrupted ? */
 	if (r < 0) {
@@ -2550,7 +2691,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 			start_queue(md->queue);
 
 		unlock_fs(md);
-		goto out; /* pushback list is already flushed, so skip flush */
+		goto out_unlock; /* pushback list is already flushed, so skip flush */
 	}
 
 	/*
@@ -2563,9 +2704,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	dm_table_postsuspend_targets(map);
 
-out:
-	dm_table_put(map);
-
 out_unlock:
 	mutex_unlock(&md->suspend_lock);
 	return r;
@@ -2580,7 +2718,7 @@ int dm_resume(struct mapped_device *md)
 	if (!dm_suspended_md(md))
 		goto out;
 
-	map = dm_get_live_table(md);
+	map = md->map;
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -2604,12 +2742,43 @@ int dm_resume(struct mapped_device *md)
 
 	r = 0;
 out:
-	dm_table_put(map);
 	mutex_unlock(&md->suspend_lock);
 
 	return r;
 }
 
+/*
+ * Internal suspend/resume works like userspace-driven suspend. It waits
+ * until all bios finish and prevents issuing new bios to the target drivers.
+ * It may be used only from the kernel.
+ *
+ * Internal suspend holds md->suspend_lock, which prevents interaction with
+ * userspace-driven suspend.
+ */
+
+void dm_internal_suspend(struct mapped_device *md)
+{
+	mutex_lock(&md->suspend_lock);
+	if (dm_suspended_md(md))
+		return;
+
+	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+	synchronize_srcu(&md->io_barrier);
+	flush_workqueue(md->wq);
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+}
+
+void dm_internal_resume(struct mapped_device *md)
+{
+	if (dm_suspended_md(md))
+		goto done;
+
+	dm_queue_flush(md);
+
+done:
+	mutex_unlock(&md->suspend_lock);
+}
+
 /*-----------------------------------------------------------------
  * Event notification.
  *---------------------------------------------------------------*/
@@ -2665,20 +2834,14 @@ struct gendisk *dm_disk(struct mapped_device *md)
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
-	return &md->kobj;
+	return &md->kobj_holder.kobj;
 }
 
-/*
- * struct mapped_device should not be exported outside of dm.c
- * so use this check to verify that kobj is part of md structure
- */
 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 {
 	struct mapped_device *md;
 
-	md = container_of(kobj, struct mapped_device, kobj);
-	if (&md->kobj != kobj)
-		return NULL;
+	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
 
 	if (test_bit(DMF_FREEING, &md->flags) ||
 	    dm_deleting_md(md))
@@ -2693,6 +2856,11 @@ int dm_suspended_md(struct mapped_device *md)
 	return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
+int dm_test_deferred_remove_flag(struct mapped_device *md)
+{
+	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
+}
+
 int dm_suspended(struct dm_target *ti)
 {
 	return dm_suspended_md(dm_table_get_md(ti->table));
@@ -2705,52 +2873,44 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
-	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
+	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	struct kmem_cache *cachep;
+	unsigned int pool_size;
+	unsigned int front_pad;
 
 	if (!pools)
 		return NULL;
 
-	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
-			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
-			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
-	if (!pools->io_pool)
-		goto free_pools_and_out;
+	if (type == DM_TYPE_BIO_BASED) {
+		cachep = _io_cache;
+		pool_size = dm_get_reserved_bio_based_ios();
+		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+	} else if (type == DM_TYPE_REQUEST_BASED) {
+		cachep = _rq_tio_cache;
+		pool_size = dm_get_reserved_rq_based_ios();
+		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+		/* per_bio_data_size is not used. See __bind_mempools(). */
+		WARN_ON(per_bio_data_size != 0);
+	} else
+		goto out;
 
-	pools->tio_pool = NULL;
-	if (type == DM_TYPE_REQUEST_BASED) {
-		pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
-		if (!pools->tio_pool)
-			goto free_io_pool_and_out;
-	}
+	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+	if (!pools->io_pool)
+		goto out;
 
-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		bioset_create(pool_size,
-			      offsetof(struct dm_target_io, clone)) :
-		bioset_create(pool_size,
-			      offsetof(struct dm_rq_clone_bio_info, clone));
+	pools->bs = bioset_create(pool_size, front_pad);
 	if (!pools->bs)
-		goto free_tio_pool_and_out;
+		goto out;
 
 	if (integrity && bioset_integrity_create(pools->bs, pool_size))
-		goto free_bioset_and_out;
+		goto out;
 
 	return pools;
 
-free_bioset_and_out:
-	bioset_free(pools->bs);
-
-free_tio_pool_and_out:
-	if (pools->tio_pool)
-		mempool_destroy(pools->tio_pool);
-
-free_io_pool_and_out:
-	mempool_destroy(pools->io_pool);
-
-free_pools_and_out:
-	kfree(pools);
+out:
+	dm_free_md_mempools(pools);
 
 	return NULL;
 }
@@ -2763,9 +2923,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 	if (pools->io_pool)
 		mempool_destroy(pools->io_pool);
 
-	if (pools->tio_pool)
-		mempool_destroy(pools->tio_pool);
-
 	if (pools->bs)
 		bioset_free(pools->bs);
 
@@ -2780,8 +2937,6 @@ static const struct block_device_operations dm_blk_dops = {
 	.owner = THIS_MODULE
 };
 
-EXPORT_SYMBOL(dm_get_mapinfo);
-
 /*
  * module hooks
  */
@@ -2790,6 +2945,13 @@ module_exit(dm_exit);
 
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6a99fefaa74..ed76126aac5 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -15,6 +15,10 @@
 #include <linux/list.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
+#include <linux/completion.h>
+#include <linux/kobject.h>
+
+#include "dm-stats.h"
 
 /*
  * Suspend feature flags
@@ -69,7 +73,6 @@ unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
 bool dm_table_supports_discards(struct dm_table *t);
-int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
@@ -89,10 +92,21 @@ int dm_setup_md_queue(struct mapped_device *md);
 #define dm_target_is_valid(t) ((t)->table)
 
 /*
+ * To check whether the target type is bio-based or not (request-based).
+ */
+#define dm_target_bio_based(t) ((t)->type->map != NULL)
+
+/*
  * To check whether the target type is request-based or not (bio-based).
  */
 #define dm_target_request_based(t) ((t)->type->map_rq != NULL)
 
+/*
+ * To check whether the target type is a hybrid (capable of being
+ * either request-based or bio-based).
+ */
+#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
+
 /*-----------------------------------------------------------------
  * A registry of target types.
  *---------------------------------------------------------------*/
@@ -116,6 +130,16 @@ int dm_deleting_md(struct mapped_device *md);
 int dm_suspended_md(struct mapped_device *md);
 
 /*
+ * Test if the device is scheduled for deferred remove.
+ */
+int dm_test_deferred_remove_flag(struct mapped_device *md);
+
+/*
+ * Try to remove devices marked for deferred removal.
+ */
+void dm_deferred_remove(void);
+
+/*
  * The device-mapper can be driven through one of two interfaces;
  * ioctl or filesystem, depending which patch you have applied.
  */
@@ -125,12 +149,27 @@ void dm_interface_exit(void);
 /*
  * sysfs interface
  */
+struct dm_kobject_holder {
+	struct kobject kobj;
+	struct completion completion;
+};
+
+static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
+{
+	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
+}
+
 int dm_sysfs_init(struct mapped_device *md);
 void dm_sysfs_exit(struct mapped_device *md);
 struct kobject *dm_kobject(struct mapped_device *md);
 struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
 
 /*
+ * The kobject helper
+ */
+void dm_kobject_release(struct kobject *kobj);
+
+/*
  * Targets for linear and striped mappings
  */
 int dm_linear_init(void);
@@ -145,11 +184,19 @@ void dm_stripe_exit(void);
 void dm_destroy(struct mapped_device *md);
 void dm_destroy_immediate(struct mapped_device *md);
 int dm_open_count(struct mapped_device *md);
-int dm_lock_for_deletion(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
+int dm_cancel_deferred_remove(struct mapped_device *md);
+int dm_request_based(struct mapped_device *md);
+sector_t dm_get_size(struct mapped_device *md);
+struct request_queue *dm_get_md_queue(struct mapped_device *md);
+struct dm_stats *dm_get_stats(struct mapped_device *md);
 
 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 		      unsigned cookie);
 
+void dm_internal_suspend(struct mapped_device *md);
+void dm_internal_resume(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -159,7 +206,18 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
+/*
+ * Helpers that are used by DM core
+ */
+unsigned dm_get_reserved_bio_based_ios(void);
+unsigned dm_get_reserved_rq_based_ios(void);
+
+static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
+{
+	return !maxlen || strlen(result) + 1 >= maxlen;
+}
+
 #endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 45135f69509..e8b4574956c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -74,8 +74,8 @@ static void faulty_fail(struct bio *bio, int error)
 {
 	struct bio *b = bio->bi_private;
 
-	b->bi_size = bio->bi_size;
-	b->bi_sector = bio->bi_sector;
+	b->bi_iter.bi_size = bio->bi_iter.bi_size;
+	b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
 
 	bio_put(bio);
 
@@ -185,28 +185,31 @@ static void make_request(struct mddev *mddev, struct bio *bio)
 			return;
 		}
 
-		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
-				 WRITE))
+		if (check_sector(conf, bio->bi_iter.bi_sector,
+				 bio_end_sector(bio), WRITE))
 			failit = 1;
 		if (check_mode(conf, WritePersistent)) {
-			add_sector(conf, bio->bi_sector, WritePersistent);
+			add_sector(conf, bio->bi_iter.bi_sector,
+				   WritePersistent);
 			failit = 1;
 		}
 		if (check_mode(conf, WriteTransient))
 			failit = 1;
 	} else {
 		/* read request */
-		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
-				 READ))
+		if (check_sector(conf, bio->bi_iter.bi_sector,
+				 bio_end_sector(bio), READ))
 			failit = 1;
 		if (check_mode(conf, ReadTransient))
 			failit = 1;
 		if (check_mode(conf, ReadPersistent)) {
-			add_sector(conf, bio->bi_sector, ReadPersistent);
+			add_sector(conf, bio->bi_iter.bi_sector,
+				   ReadPersistent);
 			failit = 1;
 		}
 		if (check_mode(conf, ReadFixable)) {
-			add_sector(conf, bio->bi_sector, ReadFixable);
+			add_sector(conf, bio->bi_iter.bi_sector,
+				   ReadFixable);
 			failit = 1;
 		}
 	}
@@ -315,8 +318,11 @@ static int run(struct mddev *mddev)
 	}
 	conf->nfaults = 0;
 
-	rdev_for_each(rdev, mddev)
+	rdev_for_each(rdev, mddev) {
 		conf->rdev = rdev;
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
+	}
 
 	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 21014836bdb..56f534b4a2d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,66 +288,65 @@ static int linear_stop (struct mddev *mddev)
 
 static void linear_make_request(struct mddev *mddev, struct bio *bio)
 {
+	char b[BDEVNAME_SIZE];
 	struct dev_info *tmp_dev;
-	sector_t start_sector;
+	struct bio *split;
+	sector_t start_sector, end_sector, data_offset;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 
-	rcu_read_lock();
-	tmp_dev = which_dev(mddev, bio->bi_sector);
-	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
-
-
-	if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
-		     || (bio->bi_sector < start_sector))) {
-		char b[BDEVNAME_SIZE];
-
-		printk(KERN_ERR
-		       "md/linear:%s: make_request: Sector %llu out of bounds on "
-		       "dev %s: %llu sectors, offset %llu\n",
-		       mdname(mddev),
-		       (unsigned long long)bio->bi_sector,
-		       bdevname(tmp_dev->rdev->bdev, b),
-		       (unsigned long long)tmp_dev->rdev->sectors,
-		       (unsigned long long)start_sector);
-		rcu_read_unlock();
-		bio_io_error(bio);
-		return;
-	}
-	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
-		     tmp_dev->end_sector)) {
-		/* This bio crosses a device boundary, so we have to
-		 * split it.
-		 */
-		struct bio_pair *bp;
-		sector_t end_sector = tmp_dev->end_sector;
+	do {
+		rcu_read_lock();
 
-		rcu_read_unlock();
-
-		bp = bio_split(bio, end_sector - bio->bi_sector);
+		tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
+		start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+		end_sector = tmp_dev->end_sector;
+		data_offset = tmp_dev->rdev->data_offset;
+		bio->bi_bdev = tmp_dev->rdev->bdev;
 
-		linear_make_request(mddev, &bp->bio1);
-		linear_make_request(mddev, &bp->bio2);
-		bio_pair_release(bp);
-		return;
-	}
-		    
-	bio->bi_bdev = tmp_dev->rdev->bdev;
-	bio->bi_sector = bio->bi_sector - start_sector
-		+ tmp_dev->rdev->data_offset;
-	rcu_read_unlock();
+		rcu_read_unlock();
 
-	if (unlikely((bio->bi_rw & REQ_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio, 0);
-		return;
-	}
+		if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
+			     bio->bi_iter.bi_sector < start_sector))
+			goto out_of_bounds;
+
+		if (unlikely(bio_end_sector(bio) > end_sector)) {
+			/* This bio crosses a device boundary, so we have to
+			 * split it.
+			 */
+			split = bio_split(bio, end_sector -
+					  bio->bi_iter.bi_sector,
+					  GFP_NOIO, fs_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
 
-	generic_make_request(bio);
+		split->bi_iter.bi_sector = split->bi_iter.bi_sector -
+			start_sector + data_offset;
+
+		if (unlikely((split->bi_rw & REQ_DISCARD) &&
+			 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
+			/* Just ignore it */
+			bio_endio(split, 0);
+		} else
+			generic_make_request(split);
+	} while (split != bio);
+	return;
+
+out_of_bounds:
+	printk(KERN_ERR
+	       "md/linear:%s: make_request: Sector %llu out of bounds on "
+	       "dev %s: %llu sectors, offset %llu\n",
+	       mdname(mddev),
+	       (unsigned long long)bio->bi_iter.bi_sector,
+	       bdevname(tmp_dev->rdev->bdev, b),
+	       (unsigned long long)tmp_dev->rdev->sectors,
+	       (unsigned long long)start_sector);
+	bio_io_error(bio);
 }
 
 static void linear_status (struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9ab768acfb6..32fc19c540d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -72,6 +72,9 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
 
+static int remove_and_add_spares(struct mddev *mddev,
+				 struct md_rdev *this);
+
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
 /*
@@ -109,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
 	{
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
@@ -127,7 +130,7 @@ static ctl_table raid_table[] = {
 	{ }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
 	{
 		.procname	= "raid",
 		.maxlen		= 0,
@@ -137,7 +140,7 @@ static ctl_table raid_dir_table[] = {
 	{ }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
 	{
 		.procname	= "dev",
 		.maxlen		= 0,
@@ -180,55 +183,6 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
-void md_trim_bio(struct bio *bio, int offset, int size)
-{
-	/* 'bio' is a cloned bio which we need to trim to match
-	 * the given offset and size.
-	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
-	 */
-	int i;
-	struct bio_vec *bvec;
-	int sofar = 0;
-
-	size <<= 9;
-	if (offset == 0 && size == bio->bi_size)
-		return;
-
-	bio->bi_sector += offset;
-	bio->bi_size = size;
-	offset <<= 9;
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
-
-	while (bio->bi_idx < bio->bi_vcnt &&
-	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-		/* remove this whole bio_vec */
-		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-		bio->bi_idx++;
-	}
-	if (bio->bi_idx < bio->bi_vcnt) {
-		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-	}
-	/* avoid any complications with bi_idx being non-zero*/
-	if (bio->bi_idx) {
-		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
-			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
-		bio->bi_vcnt -= bio->bi_idx;
-		bio->bi_idx = 0;
-	}
-	/* Make sure vcnt and last bv are not too big */
-	bio_for_each_segment(bvec, bio, i) {
-		if (sofar + bvec->bv_len > size)
-			bvec->bv_len = size - sofar;
-		if (bvec->bv_len == 0) {
-			bio->bi_vcnt = i;
-			break;
-		}
-		sofar += bvec->bv_len;
-	}
-}
-EXPORT_SYMBOL_GPL(md_trim_bio);
-
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -307,6 +261,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 		bio_io_error(bio);
 		return;
 	}
+	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+		return;
+	}
 	smp_rmb(); /* Ensure implications of  'active' are visible */
 	rcu_read_lock();
 	if (mddev->suspended) {
@@ -435,7 +393,7 @@ static void md_submit_flush_data(struct work_struct *ws)
 	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct bio *bio = mddev->flush_bio;
 
-	if (bio->bi_size == 0)
+	if (bio->bi_iter.bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
@@ -452,7 +410,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
 			    !mddev->flush_bio,
-			    mddev->write_lock, /*nothing*/);
+			    mddev->write_lock);
 	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
@@ -523,6 +481,7 @@ void mddev_init(struct mddev *mddev)
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
 	mddev->reshape_backwards = 0;
+	mddev->last_sync_action = "none";
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->level = LEVEL_NONE;
@@ -603,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+	mutex_lock(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_is_locked(struct mddev *mddev)
 {
 	return mutex_is_locked(&mddev->reconfig_mutex);
@@ -787,7 +754,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
-	bio->bi_sector = sector;
+	bio->bi_iter.bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
@@ -809,36 +776,24 @@ void md_super_wait(struct mddev *mddev)
 	finish_wait(&mddev->sb_wait, &wq);
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion*)bio->bi_private);
-}
-
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		 struct page *page, int rw, bool metadata_op)
 {
 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
-	struct completion event;
 	int ret;
 
-	rw |= REQ_SYNC;
-
 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
 		rdev->meta_bdev : rdev->bdev;
 	if (metadata_op)
-		bio->bi_sector = sector + rdev->sb_start;
+		bio->bi_iter.bi_sector = sector + rdev->sb_start;
 	else if (rdev->mddev->reshape_position != MaxSector &&
 		 (rdev->mddev->reshape_backwards ==
 		  (sector >= rdev->mddev->reshape_position)))
-		bio->bi_sector = sector + rdev->new_data_offset;
+		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
 	else
-		bio->bi_sector = sector + rdev->data_offset;
+		bio->bi_iter.bi_sector = sector + rdev->data_offset;
 	bio_add_page(bio, page, size, 0);
-	init_completion(&event);
-	bio->bi_private = &event;
-	bio->bi_end_io = bi_complete;
-	submit_bio(rw, bio);
-	wait_for_completion(&event);
+	submit_bio_wait(rw, bio);
 
 	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	bio_put(bio);
@@ -1120,6 +1075,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 	rdev->raid_disk = -1;
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
+	clear_bit(Bitmap_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
@@ -1181,7 +1137,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 			mddev->bitmap_info.offset =
 				mddev->bitmap_info.default_offset;
 			mddev->bitmap_info.space =
-				mddev->bitmap_info.space;
+				mddev->bitmap_info.default_space;
 		}
 
 	} else if (mddev->pers == NULL) {
@@ -1198,6 +1154,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 		 */
 		if (ev1 < mddev->bitmap->events_cleared)
 			return 0;
+		if (ev1 < mddev->events)
+			set_bit(Bitmap_sync, &rdev->flags);
 	} else {
 		if (ev1 < mddev->events)
 			/* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1213,6 +1171,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
+			rdev->saved_raid_disk = desc->raid_disk;
 		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
 			/* active but not in sync implies recovery up to
 			 * reshape position.  We don't know exactly where
@@ -1414,12 +1373,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
 	unsigned long long newcsum;
 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
 	__le32 *isuper = (__le32*)sb;
-	int i;
 
 	disk_csum = sb->sb_csum;
 	sb->sb_csum = 0;
 	newcsum = 0;
-	for (i=0; size>=4; size -= 4 )
+	for (; size >= 4; size -= 4)
 		newcsum += le32_to_cpu(*isuper++);
 
 	if (size == 2)
@@ -1561,8 +1519,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 					     sector, count, 1) == 0)
 				return -EINVAL;
 		}
-	} else if (sb->bblog_offset == 0)
-		rdev->badblocks.shift = -1;
+	} else if (sb->bblog_offset != 0)
+		rdev->badblocks.shift = 0;
 
 	if (!refdev) {
 		ret = 1;
@@ -1607,6 +1565,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 	rdev->raid_disk = -1;
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
+	clear_bit(Bitmap_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
@@ -1689,6 +1648,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		 */
 		if (ev1 < mddev->bitmap->events_cleared)
 			return 0;
+		if (ev1 < mddev->events)
+			set_bit(Bitmap_sync, &rdev->flags);
 	} else {
 		if (ev1 < mddev->events)
 			/* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1709,10 +1670,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
+			rdev->saved_raid_disk = role;
 			if ((le32_to_cpu(sb->feature_map) &
-			     MD_FEATURE_RECOVERY_OFFSET))
+			     MD_FEATURE_RECOVERY_OFFSET)) {
 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
-			else
+				if (!(le32_to_cpu(sb->feature_map) &
+				      MD_FEATURE_RECOVERY_BITMAP))
+					rdev->saved_raid_disk = -1;
+			} else
 				set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
@@ -1774,6 +1739,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
 		sb->recovery_offset =
 			cpu_to_le64(rdev->recovery_offset);
+		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
 	}
 	if (test_bit(Replacement, &rdev->flags))
 		sb->feature_map |=
@@ -1817,10 +1785,10 @@ retry:
 			memset(bbp, 0xff, PAGE_SIZE);
 
 			for (i = 0 ; i < bb->count ; i++) {
-				u64 internal_bb = *p++;
+				u64 internal_bb = p[i];
 				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
 						| BB_LEN(internal_bb));
-				*bbp++ = cpu_to_le64(store_bb);
+				bbp[i] = cpu_to_le64(store_bb);
 			}
 			bb->changed = 0;
 			if (read_seqretry(&bb->lock, seq))
@@ -2408,6 +2376,11 @@ static void md_update_sb(struct mddev * mddev, int force_change)
 	int nospares = 0;
 	int any_badblocks_changed = 0;
 
+	if (mddev->ro) {
+		if (force_change)
+			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		return;
+	}
 repeat:
 	/* First make sure individual recovery_offsets are correct */
 	rdev_for_each(rdev, mddev) {
@@ -2510,8 +2483,7 @@ repeat:
 		if (rdev->sb_loaded != 1)
 			continue; /* no noise on spare devices */
 
-		if (!test_bit(Faulty, &rdev->flags) &&
-		    rdev->saved_raid_disk == -1) {
+		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
 				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
@@ -2527,11 +2499,9 @@ repeat:
 				rdev->badblocks.size = 0;
 			}
 
-		} else if (test_bit(Faulty, &rdev->flags))
+		} else
 			pr_debug("md: %s (skipping faulty)\n",
 				 bdevname(rdev->bdev, b));
-		else
-			pr_debug("(skipping incremental s/r ");
 
 		if (mddev->level == LEVEL_MULTIPATH)
 			/* only need to write one superblock... */
@@ -2647,6 +2617,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	 *  blocked - sets the Blocked flags
 	 *  -blocked - clears the Blocked and possibly simulates an error
 	 *  insync - sets Insync providing device isn't active
+	 *  -insync - clear Insync for a device with a slot assigned,
+	 *            so that it gets rebuilt based on bitmap
 	 *  write_error - sets WriteErrorSeen
 	 *  -write_error - clears WriteErrorSeen
 	 */
@@ -2695,6 +2667,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
 		set_bit(In_sync, &rdev->flags);
 		err = 0;
+	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+		clear_bit(In_sync, &rdev->flags);
+		rdev->saved_raid_disk = rdev->raid_disk;
+		rdev->raid_disk = -1;
+		err = 0;
 	} else if (cmd_match(buf, "write_error")) {
 		set_bit(WriteErrorSeen, &rdev->flags);
 		err = 0;
@@ -2797,12 +2774,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		/* personality does all needed checks */
 		if (rdev->mddev->pers->hot_remove_disk == NULL)
 			return -EINVAL;
-		err = rdev->mddev->pers->
-			hot_remove_disk(rdev->mddev, rdev);
-		if (err)
-			return err;
-		sysfs_unlink_rdev(rdev->mddev, rdev);
-		rdev->raid_disk = -1;
+		clear_bit(Blocked, &rdev->flags);
+		remove_and_add_spares(rdev->mddev, rdev);
+		if (rdev->raid_disk >= 0)
+			return -EBUSY;
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
 	} else if (rdev->mddev->pers) {
@@ -2829,6 +2804,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 		else
 			rdev->saved_raid_disk = -1;
 		clear_bit(In_sync, &rdev->flags);
+		clear_bit(Bitmap_sync, &rdev->flags);
 		err = rdev->mddev->pers->
 			hot_add_disk(rdev->mddev, rdev);
 		if (err) {
@@ -2867,7 +2843,7 @@ static ssize_t
 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	unsigned long long offset;
-	if (strict_strtoull(buf, 10, &offset) < 0)
+	if (kstrtoull(buf, 10, &offset) < 0)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
@@ -2895,7 +2871,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
 	unsigned long long new_offset;
 	struct mddev *mddev = rdev->mddev;
 
-	if (strict_strtoull(buf, 10, &new_offset) < 0)
+	if (kstrtoull(buf, 10, &new_offset) < 0)
 		return -EINVAL;
 
 	if (mddev->sync_thread)
@@ -2961,7 +2937,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
 	unsigned long long blocks;
 	sector_t new;
 
-	if (strict_strtoull(buf, 10, &blocks) < 0)
+	if (kstrtoull(buf, 10, &blocks) < 0)
 		return -EINVAL;
 
 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
@@ -2995,6 +2971,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		} else if (!sectors)
 			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
+		if (!my_mddev->pers->resize)
+			/* Cannot change size for RAID0 or Linear etc */
+			return -EINVAL;
 	}
 	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
@@ -3014,7 +2993,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		for_each_mddev(mddev, tmp) {
 			struct md_rdev *rdev2;
 
-			mddev_lock(mddev);
+			mddev_lock_nointr(mddev);
 			rdev_for_each(rdev2, mddev)
 				if (rdev->bdev == rdev2->bdev &&
 				    rdev != rdev2 &&
@@ -3030,7 +3009,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 				break;
 			}
 		}
-		mddev_lock(my_mddev);
+		mddev_lock_nointr(my_mddev);
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
@@ -3066,7 +3045,7 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
 
 	if (cmd_match(buf, "none"))
 		recovery_start = MaxSector;
-	else if (strict_strtoull(buf, 10, &recovery_start))
+	else if (kstrtoull(buf, 10, &recovery_start))
 		return -EINVAL;
 
 	if (rdev->mddev->pers &&
@@ -3215,7 +3194,7 @@ int md_rdev_init(struct md_rdev *rdev)
 	 * be used - I wonder if that matters
 	 */
 	rdev->badblocks.count = 0;
-	rdev->badblocks.shift = 0;
+	rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
 	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	seqlock_init(&rdev->badblocks.lock);
 	if (rdev->badblocks.page == NULL)
@@ -3287,9 +3266,6 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 			goto abort_free;
 		}
 	}
-	if (super_format == -1)
-		/* hot-add for 0.90, or non-persistent: so no badblocks */
-		rdev->badblocks.shift = -1;
 
 	return rdev;
 
@@ -3428,7 +3404,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
 		mddev->safemode_delay = (msec*HZ)/1000;
 		if (mddev->safemode_delay == 0)
 			mddev->safemode_delay = 1;
-		if (mddev->safemode_delay < old_delay)
+		if (mddev->safemode_delay < old_delay || old_delay == 0)
 			md_safemode_timeout((unsigned long)mddev);
 	}
 	return len;
@@ -3472,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->level = LEVEL_NONE;
 		return rv;
 	}
+	if (mddev->ro)
+		return  -EROFS;
 
 	/* request to change the personality.  Need to ensure:
 	 *  - array is not engaged in resync/recovery/reshape
@@ -3497,7 +3475,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	if (clevel[len-1] == '\n')
 		len--;
 	clevel[len] = 0;
-	if (strict_strtol(clevel, 10, &level))
+	if (kstrtol(clevel, 10, &level))
 		level = LEVEL_NONE;
 
 	if (request_module("md-%s", clevel) != 0)
@@ -3554,7 +3532,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 			printk(KERN_WARNING
 			       "md: cannot register extra attributes for %s\n",
 			       mdname(mddev));
-		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
+		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
 	}		
 	if (mddev->pers->sync_request != NULL &&
 	    pers->sync_request == NULL) {
@@ -3619,9 +3597,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->in_sync = 1;
 		del_timer_sync(&mddev->safemode_timer);
 	}
+	blk_set_stacking_limits(&mddev->queue->limits);
 	pers->run(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	mddev_resume(mddev);
+	if (!mddev->thread)
+		md_update_sb(mddev, 1);
 	sysfs_notify(&mddev->kobj, NULL, "level");
 	md_new_event(mddev);
 	return rv;
@@ -3655,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
 		int err;
 		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
+		if (mddev->ro)
+			return -EROFS;
 		mddev->new_layout = n;
 		err = mddev->pers->check_reshape(mddev);
 		if (err) {
@@ -3744,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
 		int err;
 		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
+		if (mddev->ro)
+			return -EROFS;
 		mddev->new_chunk_sectors = n >> 9;
 		err = mddev->pers->check_reshape(mddev);
 		if (err) {
@@ -4124,7 +4109,7 @@ static struct md_sysfs_entry md_size =
 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
 
 
-/* Metdata version.
+/* Metadata version.
  * This is one of
  *   'none' for arrays with no metadata (good luck...)
  *   'external' for arrays with externally managed metadata,
@@ -4219,8 +4204,6 @@ action_show(struct mddev *mddev, char *page)
 	return sprintf(page, "%s\n", type);
 }
 
-static void reap_sync_thread(struct mddev *mddev);
-
 static ssize_t
 action_store(struct mddev *mddev, const char *page, size_t len)
 {
@@ -4235,7 +4218,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			reap_sync_thread(mddev);
+			md_reap_sync_thread(mddev);
 		}
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -4274,6 +4257,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 	return len;
 }
 
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+static ssize_t
+last_sync_action_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%s\n", mddev->last_sync_action);
+}
+
+static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
+
 static ssize_t
 mismatch_cnt_show(struct mddev *mddev, char *page)
 {
@@ -4282,10 +4276,6 @@ mismatch_cnt_show(struct mddev *mddev, char *page)
 		       atomic64_read(&mddev->resync_mismatches));
 }
 
-static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
-
-
 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
@@ -4358,7 +4348,7 @@ sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	long n;
 
-	if (strict_strtol(buf, 10, &n))
+	if (kstrtol(buf, 10, &n))
 		return -EINVAL;
 
 	if (n != 0 && n != 1)
@@ -4426,7 +4416,7 @@ static ssize_t
 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long long min;
-	if (strict_strtoull(buf, 10, &min))
+	if (kstrtoull(buf, 10, &min))
 		return -EINVAL;
 	if (min > mddev->resync_max)
 		return -EINVAL;
@@ -4463,7 +4453,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->resync_max = MaxSector;
 	else {
 		unsigned long long max;
-		if (strict_strtoull(buf, 10, &max))
+		if (kstrtoull(buf, 10, &max))
 			return -EINVAL;
 		if (max < mddev->resync_min)
 			return -EINVAL;
@@ -4688,6 +4678,7 @@ static struct attribute *md_default_attrs[] = {
 
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
+	&md_last_scan_mode.attr,
 	&md_mismatches.attr,
 	&md_sync_min.attr,
 	&md_sync_max.attr,
@@ -4753,6 +4744,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 	}
 	mddev_get(mddev);
 	spin_unlock(&all_mddevs_lock);
+	if (entry->store == new_dev_store)
+		flush_workqueue(md_misc_wq);
 	rv = mddev_lock(mddev);
 	if (!rv) {
 		rv = entry->store(mddev, page, length);
@@ -5135,7 +5128,7 @@ int md_run(struct mddev *mddev)
 	
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	
-	if (mddev->flags)
+	if (mddev->flags & MD_UPDATE_SB_FLAGS)
 		md_update_sb(mddev, 0);
 
 	md_new_event(mddev);
@@ -5194,32 +5187,6 @@ static int restart_array(struct mddev *mddev)
 	return 0;
 }
 
-/* similar to deny_write_access, but accounts for our holding a reference
- * to the file ourselves */
-static int deny_bitmap_write_access(struct file * file)
-{
-	struct inode *inode = file->f_mapping->host;
-
-	spin_lock(&inode->i_lock);
-	if (atomic_read(&inode->i_writecount) > 1) {
-		spin_unlock(&inode->i_lock);
-		return -ETXTBSY;
-	}
-	atomic_set(&inode->i_writecount, -1);
-	spin_unlock(&inode->i_lock);
-
-	return 0;
-}
-
-void restore_bitmap_write_access(struct file *file)
-{
-	struct inode *inode = file->f_mapping->host;
-
-	spin_lock(&inode->i_lock);
-	atomic_set(&inode->i_writecount, 1);
-	spin_unlock(&inode->i_lock);
-}
-
 static void md_clean(struct mddev *mddev)
 {
 	mddev->array_sectors = 0;
@@ -5268,10 +5235,10 @@ static void md_clean(struct mddev *mddev)
 
 static void __md_stop_writes(struct mddev *mddev)
 {
+	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	if (mddev->sync_thread) {
-		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-		reap_sync_thread(mddev);
+		md_reap_sync_thread(mddev);
 	}
 
 	del_timer_sync(&mddev->safemode_timer);
@@ -5279,7 +5246,8 @@ static void __md_stop_writes(struct mddev *mddev)
 	bitmap_flush(mddev);
 	md_super_wait(mddev);
 
-	if (!mddev->in_sync || mddev->flags) {
+	if (mddev->ro == 0 &&
+	    (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
 		/* mark array as shutdown cleanly */
 		mddev->in_sync = 1;
 		md_update_sb(mddev, 1);
@@ -5288,13 +5256,13 @@ static void __md_stop_writes(struct mddev *mddev)
 
 void md_stop_writes(struct mddev *mddev)
 {
-	mddev_lock(mddev);
+	mddev_lock_nointr(mddev);
 	__md_stop_writes(mddev);
 	mddev_unlock(mddev);
 }
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
-void md_stop(struct mddev *mddev)
+static void __md_stop(struct mddev *mddev)
 {
 	mddev->ready = 0;
 	mddev->pers->stop(mddev);
@@ -5304,19 +5272,52 @@ void md_stop(struct mddev *mddev)
 	mddev->pers = NULL;
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 }
+
+void md_stop(struct mddev *mddev)
+{
+	/* stop the array and free an attached data structures.
+	 * This is called from dm-raid
+	 */
+	__md_stop(mddev);
+	bitmap_destroy(mddev);
+	if (mddev->bio_set)
+		bioset_free(mddev->bio_set);
+}
+
 EXPORT_SYMBOL_GPL(md_stop);
 
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
 	int err = 0;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
+
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > !!bdev) {
+	if (atomic_read(&mddev->openers) > !!bdev ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev)
-		sync_blockdev(bdev);
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5327,7 +5328,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		set_disk_ro(mddev->gendisk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
-		err = 0;	
+		err = 0;
 	}
 out:
 	mutex_unlock(&mddev->open_mutex);
@@ -5343,28 +5344,42 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
 	struct gendisk *disk = mddev->gendisk;
 	struct md_rdev *rdev;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
 
 	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > !!bdev ||
-	    mddev->sysfs_active) {
+	    mddev->sysfs_active ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		return -EBUSY;
 	}
-	if (bdev)
-		/* It is possible IO was issued on some other
-		 * open file which was closed before we took ->open_mutex.
-		 * As that was not the last close __blkdev_put will not
-		 * have called sync_blockdev, so we must.
-		 */
-		sync_blockdev(bdev);
-
 	if (mddev->pers) {
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 
 		__md_stop_writes(mddev);
-		md_stop(mddev);
+		__md_stop(mddev);
 		mddev->queue->merge_bvec_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
@@ -5392,7 +5407,6 @@ static int do_md_stop(struct mddev * mddev, int mode,
 
 		bitmap_destroy(mddev);
 		if (mddev->bitmap_info.file) {
-			restore_bitmap_write_access(mddev->bitmap_info.file);
 			fput(mddev->bitmap_info.file);
 			mddev->bitmap_info.file = NULL;
 		}
@@ -5585,7 +5599,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
 	if (mddev->in_sync)
 		info.state = (1<<MD_SB_CLEAN);
 	if (mddev->bitmap && mddev->bitmap_info.offset)
-		info.state = (1<<MD_SB_BITMAP_PRESENT);
+		info.state |= (1<<MD_SB_BITMAP_PRESENT);
 	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
@@ -5606,10 +5620,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	if (md_allow_write(mddev))
-		file = kmalloc(sizeof(*file), GFP_NOIO);
-	else
-		file = kmalloc(sizeof(*file), GFP_KERNEL);
+	file = kmalloc(sizeof(*file), GFP_NOIO);
 
 	if (!file)
 		goto out;
@@ -5747,8 +5758,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 			    info->raid_disk < mddev->raid_disks) {
 				rdev->raid_disk = info->raid_disk;
 				set_bit(In_sync, &rdev->flags);
+				clear_bit(Bitmap_sync, &rdev->flags);
 			} else
 				rdev->raid_disk = -1;
+			rdev->saved_raid_disk = rdev->raid_disk;
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
@@ -5761,11 +5774,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 			return -EINVAL;
 		}
 
-		if (test_bit(In_sync, &rdev->flags))
-			rdev->saved_raid_disk = rdev->raid_disk;
-		else
-			rdev->saved_raid_disk = -1;
-
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
@@ -5790,7 +5798,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 		else
 			sysfs_notify_dirent_safe(rdev->sysfs_state);
 
-		md_update_sb(mddev, 1);
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		if (mddev->degraded)
 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5857,6 +5865,9 @@ static int hot_remove_disk(struct mddev * mddev, dev_t dev)
 	if (!rdev)
 		return -ENXIO;
 
+	clear_bit(Blocked, &rdev->flags);
+	remove_and_add_spares(mddev, rdev);
+
 	if (rdev->raid_disk >= 0)
 		goto busy;
 
@@ -5947,7 +5958,7 @@ abort_export:
 
 static int set_bitmap_file(struct mddev *mddev, int fd)
 {
-	int err;
+	int err = 0;
 
 	if (mddev->pers) {
 		if (!mddev->pers->quiesce)
@@ -5959,6 +5970,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 
 
 	if (fd >= 0) {
+		struct inode *inode;
 		if (mddev->bitmap)
 			return -EEXIST; /* cannot add when bitmap is present */
 		mddev->bitmap_info.file = fget(fd);
@@ -5969,10 +5981,21 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 			return -EBADF;
 		}
 
-		err = deny_bitmap_write_access(mddev->bitmap_info.file);
-		if (err) {
+		inode = mddev->bitmap_info.file->f_mapping->host;
+		if (!S_ISREG(inode->i_mode)) {
+			printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
+			       mdname(mddev));
+			err = -EBADF;
+		} else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
+			printk(KERN_ERR "%s: error: bitmap file must open for write\n",
+			       mdname(mddev));
+			err = -EBADF;
+		} else if (atomic_read(&inode->i_writecount) != 1) {
 			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
 			       mdname(mddev));
+			err = -EBUSY;
+		}
+		if (err) {
 			fput(mddev->bitmap_info.file);
 			mddev->bitmap_info.file = NULL;
 			return err;
@@ -5995,10 +6018,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 		mddev->pers->quiesce(mddev, 0);
 	}
 	if (fd < 0) {
-		if (mddev->bitmap_info.file) {
-			restore_bitmap_write_access(mddev->bitmap_info.file);
+		if (mddev->bitmap_info.file)
 			fput(mddev->bitmap_info.file);
-		}
 		mddev->bitmap_info.file = NULL;
 	}
 
@@ -6120,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
+	if (mddev->ro)
+		return -EROFS;
 
 	rdev_for_each(rdev, mddev) {
 		sector_t avail = rdev->sectors;
@@ -6142,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 	/* change the number of raid disks */
 	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
+	if (mddev->ro)
+		return -EROFS;
 	if (raid_disks <= 0 ||
 	    (mddev->max_disks && raid_disks >= mddev->max_disks))
 		return -EINVAL;
@@ -6312,6 +6337,32 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
+static inline bool md_ioctl_valid(unsigned int cmd)
+{
+	switch (cmd) {
+	case ADD_NEW_DISK:
+	case BLKROSET:
+	case GET_ARRAY_INFO:
+	case GET_BITMAP_FILE:
+	case GET_DISK_INFO:
+	case HOT_ADD_DISK:
+	case HOT_REMOVE_DISK:
+	case PRINT_RAID_DEBUG:
+	case RAID_AUTORUN:
+	case RAID_VERSION:
+	case RESTART_ARRAY_RW:
+	case RUN_ARRAY:
+	case SET_ARRAY_INFO:
+	case SET_BITMAP_FILE:
+	case SET_DISK_FAULTY:
+	case STOP_ARRAY:
+	case STOP_ARRAY_RO:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
@@ -6320,6 +6371,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	struct mddev *mddev = NULL;
 	int ro;
 
+	if (!md_ioctl_valid(cmd))
+		return -ENOTTY;
+
 	switch (cmd) {
 	case RAID_VERSION:
 	case GET_ARRAY_INFO:
@@ -6334,24 +6388,23 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	 * Commands dealing with the RAID driver but not any
 	 * particular array:
 	 */
-	switch (cmd)
-	{
-		case RAID_VERSION:
-			err = get_version(argp);
-			goto done;
+	switch (cmd) {
+	case RAID_VERSION:
+		err = get_version(argp);
+		goto done;
 
-		case PRINT_RAID_DEBUG:
-			err = 0;
-			md_print_devices();
-			goto done;
+	case PRINT_RAID_DEBUG:
+		err = 0;
+		md_print_devices();
+		goto done;
 
 #ifndef MODULE
-		case RAID_AUTORUN:
-			err = 0;
-			autostart_arrays(arg);
-			goto done;
+	case RAID_AUTORUN:
+		err = 0;
+		autostart_arrays(arg);
+		goto done;
 #endif
-		default:;
+	default:;
 	}
 
 	/*
@@ -6386,6 +6439,30 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		goto abort;
 	}
 
+	if (cmd == ADD_NEW_DISK)
+		/* need to ensure md_delayed_delete() has completed */
+		flush_workqueue(md_misc_wq);
+
+	if (cmd == HOT_REMOVE_DISK)
+		/* need to ensure recovery thread has run */
+		wait_event_interruptible_timeout(mddev->sb_wait,
+						 !test_bit(MD_RECOVERY_NEEDED,
+							   &mddev->flags),
+						 msecs_to_jiffies(5000));
+	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
+		/* Need to flush page cache, and ensure no-one else opens
+		 * and writes
+		 */
+		mutex_lock(&mddev->open_mutex);
+		if (atomic_read(&mddev->openers) > 1) {
+			mutex_unlock(&mddev->open_mutex);
+			err = -EBUSY;
+			goto abort;
+		}
+		set_bit(MD_STILL_CLOSED, &mddev->flags);
+		mutex_unlock(&mddev->open_mutex);
+		sync_blockdev(bdev);
+	}
 	err = mddev_lock(mddev);
 	if (err) {
 		printk(KERN_INFO 
@@ -6394,50 +6471,44 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		goto abort;
 	}
 
-	switch (cmd)
-	{
-		case SET_ARRAY_INFO:
-			{
-				mdu_array_info_t info;
-				if (!arg)
-					memset(&info, 0, sizeof(info));
-				else if (copy_from_user(&info, argp, sizeof(info))) {
-					err = -EFAULT;
-					goto abort_unlock;
-				}
-				if (mddev->pers) {
-					err = update_array_info(mddev, &info);
-					if (err) {
-						printk(KERN_WARNING "md: couldn't update"
-						       " array info. %d\n", err);
-						goto abort_unlock;
-					}
-					goto done_unlock;
-				}
-				if (!list_empty(&mddev->disks)) {
-					printk(KERN_WARNING
-					       "md: array %s already has disks!\n",
-					       mdname(mddev));
-					err = -EBUSY;
-					goto abort_unlock;
-				}
-				if (mddev->raid_disks) {
-					printk(KERN_WARNING
-					       "md: array %s already initialised!\n",
-					       mdname(mddev));
-					err = -EBUSY;
-					goto abort_unlock;
-				}
-				err = set_array_info(mddev, &info);
-				if (err) {
-					printk(KERN_WARNING "md: couldn't set"
-					       " array info. %d\n", err);
-					goto abort_unlock;
-				}
+	if (cmd == SET_ARRAY_INFO) {
+		mdu_array_info_t info;
+		if (!arg)
+			memset(&info, 0, sizeof(info));
+		else if (copy_from_user(&info, argp, sizeof(info))) {
+			err = -EFAULT;
+			goto abort_unlock;
+		}
+		if (mddev->pers) {
+			err = update_array_info(mddev, &info);
+			if (err) {
+				printk(KERN_WARNING "md: couldn't update"
+				       " array info. %d\n", err);
+				goto abort_unlock;
 			}
 			goto done_unlock;
-
-		default:;
+		}
+		if (!list_empty(&mddev->disks)) {
+			printk(KERN_WARNING
+			       "md: array %s already has disks!\n",
+			       mdname(mddev));
+			err = -EBUSY;
+			goto abort_unlock;
+		}
+		if (mddev->raid_disks) {
+			printk(KERN_WARNING
+			       "md: array %s already initialised!\n",
+			       mdname(mddev));
+			err = -EBUSY;
+			goto abort_unlock;
+		}
+		err = set_array_info(mddev, &info);
+		if (err) {
+			printk(KERN_WARNING "md: couldn't set"
+			       " array info. %d\n", err);
+			goto abort_unlock;
+		}
+		goto done_unlock;
 	}
 
 	/*
@@ -6456,52 +6527,73 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	/*
 	 * Commands even a read-only array can execute:
 	 */
-	switch (cmd)
-	{
-		case GET_BITMAP_FILE:
-			err = get_bitmap_file(mddev, argp);
-			goto done_unlock;
+	switch (cmd) {
+	case GET_BITMAP_FILE:
+		err = get_bitmap_file(mddev, argp);
+		goto done_unlock;
 
-		case RESTART_ARRAY_RW:
-			err = restart_array(mddev);
-			goto done_unlock;
+	case RESTART_ARRAY_RW:
+		err = restart_array(mddev);
+		goto done_unlock;
 
-		case STOP_ARRAY:
-			err = do_md_stop(mddev, 0, bdev);
-			goto done_unlock;
+	case STOP_ARRAY:
+		err = do_md_stop(mddev, 0, bdev);
+		goto done_unlock;
 
-		case STOP_ARRAY_RO:
-			err = md_set_readonly(mddev, bdev);
-			goto done_unlock;
+	case STOP_ARRAY_RO:
+		err = md_set_readonly(mddev, bdev);
+		goto done_unlock;
+
+	case HOT_REMOVE_DISK:
+		err = hot_remove_disk(mddev, new_decode_dev(arg));
+		goto done_unlock;
 
-		case BLKROSET:
-			if (get_user(ro, (int __user *)(arg))) {
+	case ADD_NEW_DISK:
+		/* We can support ADD_NEW_DISK on read-only arrays
+		 * on if we are re-adding a preexisting device.
+		 * So require mddev->pers and MD_DISK_SYNC.
+		 */
+		if (mddev->pers) {
+			mdu_disk_info_t info;
+			if (copy_from_user(&info, argp, sizeof(info)))
 				err = -EFAULT;
-				goto done_unlock;
-			}
-			err = -EINVAL;
+			else if (!(info.state & (1<<MD_DISK_SYNC)))
+				/* Need to clear read-only for this */
+				break;
+			else
+				err = add_new_disk(mddev, &info);
+			goto done_unlock;
+		}
+		break;
 
-			/* if the bdev is going readonly the value of mddev->ro
-			 * does not matter, no writes are coming
-			 */
-			if (ro)
-				goto done_unlock;
+	case BLKROSET:
+		if (get_user(ro, (int __user *)(arg))) {
+			err = -EFAULT;
+			goto done_unlock;
+		}
+		err = -EINVAL;
 
-			/* are we are already prepared for writes? */
-			if (mddev->ro != 1)
-				goto done_unlock;
+		/* if the bdev is going readonly the value of mddev->ro
+		 * does not matter, no writes are coming
+		 */
+		if (ro)
+			goto done_unlock;
 
-			/* transitioning to readauto need only happen for
-			 * arrays that call md_write_start
-			 */
-			if (mddev->pers) {
-				err = restart_array(mddev);
-				if (err == 0) {
-					mddev->ro = 2;
-					set_disk_ro(mddev->gendisk, 0);
-				}
-			}
+		/* are we are already prepared for writes? */
+		if (mddev->ro != 1)
 			goto done_unlock;
+
+		/* transitioning to readauto need only happen for
+		 * arrays that call md_write_start
+		 */
+		if (mddev->pers) {
+			err = restart_array(mddev);
+			if (err == 0) {
+				mddev->ro = 2;
+				set_disk_ro(mddev->gendisk, 0);
+			}
+		}
+		goto done_unlock;
 	}
 
 	/*
@@ -6516,44 +6608,49 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			mddev->ro = 0;
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
+			/* mddev_unlock will wake thread */
+			/* If a device failed while we were read-only, we
+			 * need to make sure the metadata is updated now.
+			 */
+			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+				mddev_unlock(mddev);
+				wait_event(mddev->sb_wait,
+					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+				mddev_lock_nointr(mddev);
+			}
 		} else {
 			err = -EROFS;
 			goto abort_unlock;
 		}
 	}
 
-	switch (cmd)
+	switch (cmd) {
+	case ADD_NEW_DISK:
 	{
-		case ADD_NEW_DISK:
-		{
-			mdu_disk_info_t info;
-			if (copy_from_user(&info, argp, sizeof(info)))
-				err = -EFAULT;
-			else
-				err = add_new_disk(mddev, &info);
-			goto done_unlock;
-		}
-
-		case HOT_REMOVE_DISK:
-			err = hot_remove_disk(mddev, new_decode_dev(arg));
-			goto done_unlock;
+		mdu_disk_info_t info;
+		if (copy_from_user(&info, argp, sizeof(info)))
+			err = -EFAULT;
+		else
+			err = add_new_disk(mddev, &info);
+		goto done_unlock;
+	}
 
-		case HOT_ADD_DISK:
-			err = hot_add_disk(mddev, new_decode_dev(arg));
-			goto done_unlock;
+	case HOT_ADD_DISK:
+		err = hot_add_disk(mddev, new_decode_dev(arg));
+		goto done_unlock;
 
-		case RUN_ARRAY:
-			err = do_md_run(mddev);
-			goto done_unlock;
+	case RUN_ARRAY:
+		err = do_md_run(mddev);
+		goto done_unlock;
 
-		case SET_BITMAP_FILE:
-			err = set_bitmap_file(mddev, (int)arg);
-			goto done_unlock;
+	case SET_BITMAP_FILE:
+		err = set_bitmap_file(mddev, (int)arg);
+		goto done_unlock;
 
-		default:
-			err = -EINVAL;
-			goto abort_unlock;
+	default:
+		err = -EINVAL;
+		goto abort_unlock;
 	}
 
 done_unlock:
@@ -6619,6 +6716,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 
 	err = 0;
 	atomic_inc(&mddev->openers);
+	clear_bit(MD_STILL_CLOSED, &mddev->flags);
 	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
@@ -6626,15 +6724,13 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	return err;
 }
 
-static int md_release(struct gendisk *disk, fmode_t mode)
+static void md_release(struct gendisk *disk, fmode_t mode)
 {
  	struct mddev *mddev = disk->private_data;
 
 	BUG_ON(!mddev);
 	atomic_dec(&mddev->openers);
 	mddev_put(mddev);
-
-	return 0;
 }
 
 static int md_media_changed(struct gendisk *disk)
@@ -7079,11 +7175,14 @@ static int md_seq_open(struct inode *inode, struct file *file)
 	return error;
 }
 
+static int md_unloading;
 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
 {
 	struct seq_file *seq = filp->private_data;
 	int mask;
 
+	if (md_unloading)
+		return POLLIN|POLLRDNORM|POLLERR|POLLPRI;;
 	poll_wait(filp, &md_event_waiters, wait);
 
 	/* always allow read */
@@ -7172,6 +7271,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok)
 	wake_up(&mddev->recovery_wait);
 	if (!ok) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		// stop recovery, signal do_sync ....
 	}
@@ -7269,6 +7369,7 @@ EXPORT_SYMBOL_GPL(md_allow_write);
 
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
+#define UPDATE_FREQUENCY (5*60*HZ)
 void md_do_sync(struct md_thread *thread)
 {
 	struct mddev *mddev = thread->mddev;
@@ -7277,33 +7378,40 @@ void md_do_sync(struct md_thread *thread)
 		 window;
 	sector_t max_sectors,j, io_sectors;
 	unsigned long mark[SYNC_MARKS];
+	unsigned long update_time;
 	sector_t mark_cnt[SYNC_MARKS];
 	int last_mark,m;
 	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
 	struct md_rdev *rdev;
-	char *desc;
+	char *desc, *action = NULL;
 	struct blk_plug plug;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 		return;
-	if (mddev->ro) /* never try to sync a read-only array */
+	if (mddev->ro) {/* never try to sync a read-only array */
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		return;
+	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
 			desc = "data-check";
-		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			action = "check";
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 			desc = "requested-resync";
-		else
+			action = "repair";
+		} else
 			desc = "resync";
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		desc = "reshape";
 	else
 		desc = "recovery";
 
+	mddev->last_sync_action = action ?: desc;
+
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
 	 * 2 == checking that there is no conflict with another sync
@@ -7324,9 +7432,6 @@ void md_do_sync(struct md_thread *thread)
 		mddev->curr_resync = 2;
 
 	try_again:
-		if (kthread_should_stop())
-			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			goto skip;
 		for_each_mddev(mddev2, tmp) {
@@ -7351,7 +7456,7 @@ void md_do_sync(struct md_thread *thread)
 				 * be caught by 'softlockup'
 				 */
 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-				if (!kthread_should_stop() &&
+				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 				    mddev2->curr_resync >= mddev->curr_resync) {
 					printk(KERN_INFO "md: delaying %s of %s"
 					       " until %s has finished (they"
@@ -7396,6 +7501,19 @@ void md_do_sync(struct md_thread *thread)
 			    rdev->recovery_offset < j)
 				j = rdev->recovery_offset;
 		rcu_read_unlock();
+
+		/* If there is a bitmap, we need to make sure all
+		 * writes that started before we added a spare
+		 * complete before we start doing a recovery.
+		 * Otherwise the write might complete and (via
+		 * bitmap_endwrite) set a bit in the bitmap after the
+		 * recovery has checked that bit and skipped that
+		 * region.
+		 */
+		if (mddev->bitmap) {
+			mddev->pers->quiesce(mddev, 1);
+			mddev->pers->quiesce(mddev, 0);
+		}
 	}
 
 	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -7427,7 +7545,7 @@ void md_do_sync(struct md_thread *thread)
 	last_check = 0;
 
 	if (j>2) {
-		printk(KERN_INFO 
+		printk(KERN_INFO
 		       "md: resuming %s of %s from checkpoint.\n",
 		       desc, mdname(mddev));
 		mddev->curr_resync = j;
@@ -7436,6 +7554,7 @@ void md_do_sync(struct md_thread *thread)
 	mddev->curr_resync_completed = j;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	md_new_event(mddev);
+	update_time = jiffies;
 
 	blk_start_plug(&plug);
 	while (j < max_sectors) {
@@ -7447,6 +7566,7 @@ void md_do_sync(struct md_thread *thread)
 		    ((mddev->curr_resync > mddev->curr_resync_completed &&
 		      (mddev->curr_resync - mddev->curr_resync_completed)
 		      > (max_sectors >> 4)) ||
+		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
 		     (j - mddev->curr_resync_completed)*2
 		     >= mddev->resync_max - mddev->curr_resync_completed
 			    )) {
@@ -7454,11 +7574,16 @@ void md_do_sync(struct md_thread *thread)
 			wait_event(mddev->recovery_wait,
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed = j;
+			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+			    j > mddev->recovery_cp)
+				mddev->recovery_cp = j;
+			update_time = jiffies;
 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
 
-		while (j >= mddev->resync_max && !kthread_should_stop()) {
+		while (j >= mddev->resync_max &&
+		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 			/* As this condition is controlled by user-space,
 			 * we can block indefinitely, so use '_interruptible'
 			 * to avoid triggering warnings.
@@ -7466,17 +7591,18 @@ void md_do_sync(struct md_thread *thread)
 			flush_signals(current); /* just in case */
 			wait_event_interruptible(mddev->recovery_wait,
 						 mddev->resync_max > j
-						 || kthread_should_stop());
+						 || test_bit(MD_RECOVERY_INTR,
+							     &mddev->recovery));
 		}
 
-		if (kthread_should_stop())
-			goto interrupted;
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			goto out;
+			break;
 		}
 
 		if (!skipped) { /* actual IO requested */
@@ -7513,10 +7639,8 @@ void md_do_sync(struct md_thread *thread)
 			last_mark = next;
 		}
 
-
-		if (kthread_should_stop())
-			goto interrupted;
-
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		/*
 		 * this loop exits only if either when we are slower than
@@ -7539,11 +7663,12 @@ void md_do_sync(struct md_thread *thread)
 			}
 		}
 	}
-	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+	       ? "interrupted" : "done");
 	/*
 	 * this also signals 'finished resyncing' to md_stop
 	 */
- out:
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -7558,8 +7683,13 @@ void md_do_sync(struct md_thread *thread)
 					printk(KERN_INFO
 					       "md: checkpointing %s of %s.\n",
 					       desc, mdname(mddev));
-					mddev->recovery_cp =
-						mddev->curr_resync_completed;
+					if (test_bit(MD_RECOVERY_ERROR,
+						&mddev->recovery))
+						mddev->recovery_cp =
+							mddev->curr_resync_completed;
+					else
+						mddev->recovery_cp =
+							mddev->curr_resync;
 				}
 			} else
 				mddev->recovery_cp = MaxSector;
@@ -7592,27 +7722,19 @@ void md_do_sync(struct md_thread *thread)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return;
-
- interrupted:
-	/*
-	 * got a signal, exit.
-	 */
-	printk(KERN_INFO
-	       "md: md_do_sync() got signal ... exiting\n");
-	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	goto out;
-
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
-static int remove_and_add_spares(struct mddev *mddev)
+static int remove_and_add_spares(struct mddev *mddev,
+				 struct md_rdev *this)
 {
 	struct md_rdev *rdev;
 	int spares = 0;
 	int removed = 0;
 
 	rdev_for_each(rdev, mddev)
-		if (rdev->raid_disk >= 0 &&
+		if ((this == NULL || rdev == this) &&
+		    rdev->raid_disk >= 0 &&
 		    !test_bit(Blocked, &rdev->flags) &&
 		    (test_bit(Faulty, &rdev->flags) ||
 		     ! test_bit(In_sync, &rdev->flags)) &&
@@ -7624,79 +7746,43 @@ static int remove_and_add_spares(struct mddev *mddev)
 				removed++;
 			}
 		}
-	if (removed)
-		sysfs_notify(&mddev->kobj, NULL,
-			     "degraded");
+	if (removed && mddev->kobj.sd)
+		sysfs_notify(&mddev->kobj, NULL, "degraded");
 
+	if (this)
+		goto no_add;
 
 	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags))
 			spares++;
-		if (rdev->raid_disk < 0
-		    && !test_bit(Faulty, &rdev->flags)) {
+		if (rdev->raid_disk >= 0)
+			continue;
+		if (test_bit(Faulty, &rdev->flags))
+			continue;
+		if (mddev->ro &&
+		    ! (rdev->saved_raid_disk >= 0 &&
+		       !test_bit(Bitmap_sync, &rdev->flags)))
+			continue;
+
+		if (rdev->saved_raid_disk < 0)
 			rdev->recovery_offset = 0;
-			if (mddev->pers->
-			    hot_add_disk(mddev, rdev) == 0) {
-				if (sysfs_link_rdev(mddev, rdev))
-					/* failure here is OK */;
-				spares++;
-				md_new_event(mddev);
-				set_bit(MD_CHANGE_DEVS, &mddev->flags);
-			}
+		if (mddev->pers->
+		    hot_add_disk(mddev, rdev) == 0) {
+			if (sysfs_link_rdev(mddev, rdev))
+				/* failure here is OK */;
+			spares++;
+			md_new_event(mddev);
+			set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		}
 	}
+no_add:
 	if (removed)
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	return spares;
 }
 
-static void reap_sync_thread(struct mddev *mddev)
-{
-	struct md_rdev *rdev;
-
-	/* resync has finished, collect result */
-	md_unregister_thread(&mddev->sync_thread);
-	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-		/* success...*/
-		/* activate any spares */
-		if (mddev->pers->spare_active(mddev)) {
-			sysfs_notify(&mddev->kobj, NULL,
-				     "degraded");
-			set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		}
-	}
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    mddev->pers->finish_reshape)
-		mddev->pers->finish_reshape(mddev);
-
-	/* If array is no-longer degraded, then any saved_raid_disk
-	 * information must be scrapped.  Also if any device is now
-	 * In_sync we must scrape the saved_raid_disk for that device
-	 * do the superblock for an incrementally recovered device
-	 * written out.
-	 */
-	rdev_for_each(rdev, mddev)
-		if (!mddev->degraded ||
-		    test_bit(In_sync, &rdev->flags))
-			rdev->saved_raid_disk = -1;
-
-	md_update_sb(mddev, 1);
-	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
-	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
-	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
-	/* flag recovery needed just to double check */
-	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	sysfs_notify_dirent_safe(mddev->sysfs_action);
-	md_new_event(mddev);
-	if (mddev->event_work.func)
-		queue_work(md_misc_wq, &mddev->event_work);
-}
-
 /*
  * This routine is regularly called by all per-raid-array threads to
  * deal with generic issues like resync and super-block update.
@@ -7739,7 +7825,7 @@ void md_check_recovery(struct mddev *mddev)
 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return;
 	if ( ! (
-		(mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
+		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
@@ -7752,21 +7838,19 @@ void md_check_recovery(struct mddev *mddev)
 		int spares = 0;
 
 		if (mddev->ro) {
-			/* Only thing we do on a ro array is remove
-			 * failed devices.
+			/* On a read-only array we can:
+			 * - remove failed devices
+			 * - add already-in_sync devices if the array itself
+			 *   is in-sync.
+			 * As we only add devices that are already in-sync,
+			 * we can activate the spares immediately.
 			 */
-			struct md_rdev *rdev;
-			rdev_for_each(rdev, mddev)
-				if (rdev->raid_disk >= 0 &&
-				    !test_bit(Blocked, &rdev->flags) &&
-				    test_bit(Faulty, &rdev->flags) &&
-				    atomic_read(&rdev->nr_pending)==0) {
-					if (mddev->pers->hot_remove_disk(
-						    mddev, rdev) == 0) {
-						sysfs_unlink_rdev(mddev, rdev);
-						rdev->raid_disk = -1;
-					}
-				}
+			remove_and_add_spares(mddev, NULL);
+			/* There is no thread, but we need to call
+			 * ->spare_active and clear saved_raid_disk
+			 */
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_reap_sync_thread(mddev);
 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			goto unlock;
 		}
@@ -7789,7 +7873,7 @@ void md_check_recovery(struct mddev *mddev)
 				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
-		if (mddev->flags)
+		if (mddev->flags & MD_UPDATE_SB_FLAGS)
 			md_update_sb(mddev, 0);
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -7799,7 +7883,7 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 		}
 		if (mddev->sync_thread) {
-			reap_sync_thread(mddev);
+			md_reap_sync_thread(mddev);
 			goto unlock;
 		}
 		/* Set RUNNING before clearing NEEDED to avoid
@@ -7830,7 +7914,7 @@ void md_check_recovery(struct mddev *mddev)
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		} else if ((spares = remove_and_add_spares(mddev))) {
+		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@ -7869,6 +7953,8 @@ void md_check_recovery(struct mddev *mddev)
 			md_new_event(mddev);
 		}
 	unlock:
+		wake_up(&mddev->sb_wait);
+
 		if (!mddev->sync_thread) {
 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
@@ -7880,6 +7966,48 @@ void md_check_recovery(struct mddev *mddev)
 	}
 }
 
+void md_reap_sync_thread(struct mddev *mddev)
+{
+	struct md_rdev *rdev;
+
+	/* resync has finished, collect result */
+	md_unregister_thread(&mddev->sync_thread);
+	wake_up(&resync_wait);
+	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+		/* success...*/
+		/* activate any spares */
+		if (mddev->pers->spare_active(mddev)) {
+			sysfs_notify(&mddev->kobj, NULL,
+				     "degraded");
+			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		}
+	}
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    mddev->pers->finish_reshape)
+		mddev->pers->finish_reshape(mddev);
+
+	/* If array is no-longer degraded, then any saved_raid_disk
+	 * information must be scrapped.
+	 */
+	if (!mddev->degraded)
+		rdev_for_each(rdev, mddev)
+			rdev->saved_raid_disk = -1;
+
+	md_update_sb(mddev, 1);
+	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	/* flag recovery needed just to double check */
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	sysfs_notify_dirent_safe(mddev->sysfs_action);
+	md_new_event(mddev);
+	if (mddev->event_work.func)
+		queue_work(md_misc_wq, &mddev->event_work);
+}
+
 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
 	sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -7936,9 +8064,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 		   sector_t *first_bad, int *bad_sectors)
 {
 	int hi;
-	int lo = 0;
+	int lo;
 	u64 *p = bb->page;
-	int rv = 0;
+	int rv;
 	sector_t target = s + sectors;
 	unsigned seq;
 
@@ -7953,7 +8081,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 
 retry:
 	seq = read_seqbegin(&bb->lock);
-
+	lo = 0;
+	rv = 0;
 	hi = bb->count;
 
 	/* Binary search between lo and hi for 'target'
@@ -8017,6 +8146,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 	u64 *p;
 	int lo, hi;
 	int rv = 1;
+	unsigned long flags;
 
 	if (bb->shift < 0)
 		/* badblocks are disabled */
@@ -8031,7 +8161,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 		sectors = next - s;
 	}
 
-	write_seqlock_irq(&bb->lock);
+	write_seqlock_irqsave(&bb->lock, flags);
 
 	p = bb->page;
 	lo = 0;
@@ -8147,7 +8277,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 	bb->changed = 1;
 	if (!acknowledged)
 		bb->unacked_exist = 1;
-	write_sequnlock_irq(&bb->lock);
+	write_sequnlock_irqrestore(&bb->lock, flags);
 
 	return rv;
 }
@@ -8226,7 +8356,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
 			if (a < s) {
 				/* we need to split this range */
 				if (bb->count >= MD_MAX_BADBLOCKS) {
-					rv = 0;
+					rv = -ENOSPC;
 					goto out;
 				}
 				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
@@ -8412,7 +8542,8 @@ static int md_notify_reboot(struct notifier_block *this,
 		if (mddev_trylock(mddev)) {
 			if (mddev->pers)
 				__md_stop_writes(mddev);
-			mddev->safemode = 2;
+			if (mddev->persistent)
+				mddev->safemode = 2;
 			mddev_unlock(mddev);
 		}
 		need_delay = 1;
@@ -8554,6 +8685,7 @@ static __exit void md_exit(void)
 {
 	struct mddev *mddev;
 	struct list_head *tmp;
+	int delay = 1;
 
 	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
@@ -8562,7 +8694,19 @@ static __exit void md_exit(void)
 	unregister_blkdev(mdp_major, "mdp");
 	unregister_reboot_notifier(&md_notifier);
 	unregister_sysctl_table(raid_table_header);
+
+	/* We cannot unload the modules while some process is
+	 * waiting for us in select() or poll() - wake them up
+	 */
+	md_unloading = 1;
+	while (waitqueue_active(&md_event_waiters)) {
+		/* not safe to leave yet */
+		wake_up(&md_event_waiters);
+		msleep(delay);
+		delay += delay;
+	}
 	remove_proc_entry("mdstat", NULL);
+
 	for_each_mddev(mddev, tmp) {
 		export_array(mddev);
 		mddev->hold_active = 0;
@@ -8604,6 +8748,7 @@ EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_check_recovery);
+EXPORT_SYMBOL(md_reap_sync_thread);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("MD RAID framework");
 MODULE_ALIAS("md");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index af443ab868d..a49d991f3fe 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -106,7 +106,7 @@ struct md_rdev {
 					   */
 	struct work_struct del_work;	/* used for delayed sysfs removal */
 
-	struct sysfs_dirent *sysfs_state; /* handle for 'state'
+	struct kernfs_node *sysfs_state; /* handle for 'state'
 					   * sysfs entry */
 
 	struct badblocks {
@@ -129,6 +129,9 @@ struct md_rdev {
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
 	In_sync,		/* device is in_sync with rest of array */
+	Bitmap_sync,		/* ..actually, not quite In_sync.  Need a
+				 * bitmap-based recovery to get fully in sync
+				 */
 	Unmerged,		/* device is being added to array and should
 				 * be considerred for bvec_merge_fn but not
 				 * yet for actual IO
@@ -204,12 +207,16 @@ struct mddev {
 	struct md_personality		*pers;
 	dev_t				unit;
 	int				md_minor;
-	struct list_head 		disks;
+	struct list_head		disks;
 	unsigned long			flags;
 #define MD_CHANGE_DEVS	0	/* Some device status has changed */
 #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
+#define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
+#define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
+				 * md_ioctl checked on it.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
@@ -218,7 +225,7 @@ struct mddev {
 						       * are happening, so run/
 						       * takeover/stop are not safe
 						       */
-	int				ready; /* See when safe to pass 
+	int				ready; /* See when safe to pass
 						* IO requests down */
 	struct gendisk			*gendisk;
 
@@ -268,6 +275,14 @@ struct mddev {
 
 	struct md_thread		*thread;	/* management thread */
 	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
+
+	/* 'last_sync_action' is initialized to "none".  It is set when a
+	 * sync operation (i.e "data-check", "requested-resync", "resync",
+	 * "recovery", or "reshape") is started.  It holds this value even
+	 * when the sync thread is "frozen" (interrupted) or "idle" (stopped
+	 * or finished).  It is overwritten when a new sync operation is begun.
+	 */
+	char				*last_sync_action;
 	sector_t			curr_resync;	/* last block scheduled */
 	/* As resync requests can complete out of order, we cannot easily track
 	 * how much resync has been completed.  So we occasionally pause until
@@ -307,6 +322,7 @@ struct mddev {
 	 * REQUEST:  user-space has requested a sync (used with SYNC)
 	 * CHECK:    user-space request for check-only, no repair
 	 * RESHAPE:  A reshape is happening
+	 * ERROR:    sync-action interrupted because io-error
 	 *
 	 * If neither SYNC or RESHAPE are set, then it is a recovery.
 	 */
@@ -320,6 +336,7 @@ struct mddev {
 #define	MD_RECOVERY_CHECK	7
 #define MD_RECOVERY_RESHAPE	8
 #define	MD_RECOVERY_FROZEN	9
+#define	MD_RECOVERY_ERROR	10
 
 	unsigned long			recovery;
 	/* If a RAID personality determines that recovery (of a particular
@@ -362,10 +379,10 @@ struct mddev {
 	sector_t			resync_max;	/* resync should pause
 							 * when it gets here */
 
-	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
+	struct kernfs_node		*sysfs_state;	/* handle for 'array_state'
 							 * file in sysfs.
 							 */
-	struct sysfs_dirent		*sysfs_action;  /* handle for 'sync_action' */
+	struct kernfs_node		*sysfs_action;  /* handle for 'sync_action' */
 
 	struct work_struct del_work;	/* used for delayed sysfs removal */
 
@@ -484,13 +501,13 @@ struct md_sysfs_entry {
 };
 extern struct attribute_group md_bitmap_group;
 
-static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
+static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
 {
 	if (sd)
-		return sysfs_get_dirent(sd, NULL, name);
+		return sysfs_get_dirent(sd, name);
 	return sd;
 }
-static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
+static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
 {
 	if (sd)
 		sysfs_notify_dirent(sd);
@@ -504,7 +521,7 @@ static inline char * mdname (struct mddev * mddev)
 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	if (!test_bit(Replacement, &rdev->flags)) {
+	if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 	} else
@@ -514,7 +531,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	if (!test_bit(Replacement, &rdev->flags)) {
+	if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		sysfs_remove_link(&mddev->kobj, nm);
 	}
@@ -551,32 +568,6 @@ struct md_thread {
 
 #define THREAD_WAKEUP  0
 
-#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		spin_unlock_irq(&lock);					\
-		cmd;							\
-		schedule();						\
-		spin_lock_irq(&lock);					\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
-do {									\
-	if (condition)	 						\
-		break;							\
-	__wait_event_lock_irq(wq, condition, lock, cmd);		\
-} while (0)
-
 static inline void safe_put_page(struct page *p)
 {
 	if (p) put_page(p);
@@ -591,6 +582,7 @@ extern struct md_thread *md_register_thread(
 extern void md_unregister_thread(struct md_thread **threadp);
 extern void md_wakeup_thread(struct md_thread *thread);
 extern void md_check_recovery(struct mddev *mddev);
+extern void md_reap_sync_thread(struct mddev *mddev);
 extern void md_write_start(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
@@ -613,7 +605,6 @@ extern int md_check_no_bitmap(struct mddev *mddev);
 extern int md_integrity_register(struct mddev *mddev);
 extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
-extern void restore_bitmap_write_access(struct file *file);
 
 extern void mddev_init(struct mddev *mddev);
 extern int md_run(struct mddev *mddev);
@@ -628,7 +619,6 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 				   struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
-extern void md_trim_bio(struct bio *bio, int offset, int size);
 
 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
 static inline int mddev_check_plugged(struct mddev *mddev)
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1642eae75a3..849ad39f547 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -100,7 +100,7 @@ static void multipath_end_request(struct bio *bio, int error)
 		md_error (mp_bh->mddev, rdev);
 		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 
 		       bdevname(rdev->bdev,b), 
-		       (unsigned long long)bio->bi_sector);
+		       (unsigned long long)bio->bi_iter.bi_sector);
 		multipath_reschedule_retry(mp_bh);
 	} else
 		multipath_end_bh_io(mp_bh, error);
@@ -132,7 +132,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	multipath = conf->multipaths + mp_bh->path;
 
 	mp_bh->bio = *bio;
-	mp_bh->bio.bi_sector += multipath->rdev->data_offset;
+	mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
 	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
 	mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
 	mp_bh->bio.bi_end_io = multipath_end_request;
@@ -355,21 +355,22 @@ static void multipathd(struct md_thread *thread)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		bio = &mp_bh->bio;
-		bio->bi_sector = mp_bh->master_bio->bi_sector;
+		bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
 		
 		if ((mp_bh->path = multipath_map (conf))<0) {
 			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
 				" error for block %llu\n",
 				bdevname(bio->bi_bdev,b),
-				(unsigned long long)bio->bi_sector);
+				(unsigned long long)bio->bi_iter.bi_sector);
 			multipath_end_bh_io(mp_bh, -EIO);
 		} else {
 			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
 				" to another IO path\n",
 				bdevname(bio->bi_bdev,b),
-				(unsigned long long)bio->bi_sector);
+				(unsigned long long)bio->bi_iter.bi_sector);
 			*bio = *(mp_bh->master_bio);
-			bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
+			bio->bi_iter.bi_sector +=
+				conf->multipaths[mp_bh->path].rdev->data_offset;
 			bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
 			bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
 			bio->bi_end_io = multipath_end_request;
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index ceb359050a5..0c2dec7aec2 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,8 +1,18 @@
 config DM_PERSISTENT_DATA
        tristate
-       depends on BLK_DEV_DM && EXPERIMENTAL
+       depends on BLK_DEV_DM
        select LIBCRC32C
        select DM_BUFIO
        ---help---
 	 Library providing immutable on-disk data structure support for
 	 device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+       boolean "Keep stack trace of persistent data block lock holders"
+       depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+       select STACKTRACE
+       ---help---
+	 Enable this for messages that may help debug problems with the
+	 block manager locking used by thin provisioning and caching.
+
+	 If unsure, say N.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
index d8e7cb767c1..ff528792c35 100644
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,5 +1,7 @@
 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
 dm-persistent-data-objs := \
+	dm-array.o \
+	dm-bitset.o \
 	dm-block-manager.o \
 	dm-space-map-common.o \
 	dm-space-map-disk.o \
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
new file mode 100644
index 00000000000..1d75b1dc1e2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-array.h"
+#include "dm-space-map.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "array"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The array is implemented as a fully populated btree, which points to
+ * blocks that contain the packed values.  This is more space efficient
+ * than just using a btree since we don't store 1 key per value.
+ */
+struct array_block {
+	__le32 csum;
+	__le32 max_entries;
+	__le32 nr_entries;
+	__le32 value_size;
+	__le64 blocknr; /* Block this node is supposed to live in. */
+} __packed;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Validator methods.  As usual we calculate a checksum, and also write the
+ * block location into the header (paranoia about ssds remapping areas by
+ * mistake).
+ */
+#define CSUM_XOR 595846735
+
+static void array_block_prepare_for_write(struct dm_block_validator *v,
+					  struct dm_block *b,
+					  size_t size_of_block)
+{
+	struct array_block *bh_le = dm_block_data(b);
+
+	bh_le->blocknr = cpu_to_le64(dm_block_location(b));
+	bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+						 size_of_block - sizeof(__le32),
+						 CSUM_XOR));
+}
+
+static int array_block_check(struct dm_block_validator *v,
+			     struct dm_block *b,
+			     size_t size_of_block)
+{
+	struct array_block *bh_le = dm_block_data(b);
+	__le32 csum_disk;
+
+	if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
+		DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
+			    (unsigned long long) le64_to_cpu(bh_le->blocknr),
+			    (unsigned long long) dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+					       size_of_block - sizeof(__le32),
+					       CSUM_XOR));
+	if (csum_disk != bh_le->csum) {
+		DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
+			    (unsigned) le32_to_cpu(csum_disk),
+			    (unsigned) le32_to_cpu(bh_le->csum));
+		return -EILSEQ;
+	}
+
+	return 0;
+}
+
+static struct dm_block_validator array_validator = {
+	.name = "array",
+	.prepare_for_write = array_block_prepare_for_write,
+	.check = array_block_check
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Functions for manipulating the array blocks.
+ */
+
+/*
+ * Returns a pointer to a value within an array block.
+ *
+ * index - The index into _this_ specific block.
+ */
+static void *element_at(struct dm_array_info *info, struct array_block *ab,
+			unsigned index)
+{
+	unsigned char *entry = (unsigned char *) (ab + 1);
+
+	entry += index * info->value_type.size;
+
+	return entry;
+}
+
+/*
+ * Utility function that calls one of the value_type methods on every value
+ * in an array block.
+ */
+static void on_entries(struct dm_array_info *info, struct array_block *ab,
+		       void (*fn)(void *, const void *))
+{
+	unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
+
+	for (i = 0; i < nr_entries; i++)
+		fn(info->value_type.context, element_at(info, ab, i));
+}
+
+/*
+ * Increment every value in an array block.
+ */
+static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	if (vt->inc)
+		on_entries(info, ab, vt->inc);
+}
+
+/*
+ * Decrement every value in an array block.
+ */
+static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	if (vt->dec)
+		on_entries(info, ab, vt->dec);
+}
+
+/*
+ * Each array block can hold this many values.
+ */
+static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
+{
+	return (size_of_block - sizeof(struct array_block)) / value_size;
+}
+
+/*
+ * Allocate a new array block.  The caller will need to unlock block.
+ */
+static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
+			uint32_t max_entries,
+			struct dm_block **block, struct array_block **ab)
+{
+	int r;
+
+	r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
+	if (r)
+		return r;
+
+	(*ab) = dm_block_data(*block);
+	(*ab)->max_entries = cpu_to_le32(max_entries);
+	(*ab)->nr_entries = cpu_to_le32(0);
+	(*ab)->value_size = cpu_to_le32(info->value_type.size);
+
+	return 0;
+}
+
+/*
+ * Pad an array block out with a particular value.  Every instance will
+ * cause an increment of the value_type.  new_nr must always be more than
+ * the current number of entries.
+ */
+static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
+			const void *value, unsigned new_nr)
+{
+	unsigned i;
+	uint32_t nr_entries;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+	BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
+
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = nr_entries; i < new_nr; i++) {
+		if (vt->inc)
+			vt->inc(vt->context, value);
+		memcpy(element_at(info, ab, i), value, vt->size);
+	}
+	ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Remove some entries from the back of an array block.  Every value
+ * removed will be decremented.  new_nr must be <= the current number of
+ * entries.
+ */
+static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
+			unsigned new_nr)
+{
+	unsigned i;
+	uint32_t nr_entries;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+	BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
+
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = nr_entries; i > new_nr; i--)
+		if (vt->dec)
+			vt->dec(vt->context, element_at(info, ab, i - 1));
+	ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Read locks a block, and coerces it to an array block.  The caller must
+ * unlock 'block' when finished.
+ */
+static int get_ablock(struct dm_array_info *info, dm_block_t b,
+		      struct dm_block **block, struct array_block **ab)
+{
+	int r;
+
+	r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
+	if (r)
+		return r;
+
+	*ab = dm_block_data(*block);
+	return 0;
+}
+
+/*
+ * Unlocks an array block.
+ */
+static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
+{
+	return dm_tm_unlock(info->btree_info.tm, block);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Btree manipulation.
+ */
+
+/*
+ * Looks up an array block in the btree, and then read locks it.
+ *
+ * index is the index of the index of the array_block, (ie. the array index
+ * / max_entries).
+ */
+static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
+			 unsigned index, struct dm_block **block,
+			 struct array_block **ab)
+{
+	int r;
+	uint64_t key = index;
+	__le64 block_le;
+
+	r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
+	if (r)
+		return r;
+
+	return get_ablock(info, le64_to_cpu(block_le), block, ab);
+}
+
+/*
+ * Insert an array block into the btree.  The block is _not_ unlocked.
+ */
+static int insert_ablock(struct dm_array_info *info, uint64_t index,
+			 struct dm_block *block, dm_block_t *root)
+{
+	__le64 block_le = cpu_to_le64(dm_block_location(block));
+
+	__dm_bless_for_disk(block_le);
+	return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
+}
+
+/*
+ * Looks up an array block in the btree.  Then shadows it, and updates the
+ * btree to point to this new shadow.  'root' is an input/output parameter
+ * for both the current root block, and the new one.
+ */
+static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
+			 unsigned index, struct dm_block **block,
+			 struct array_block **ab)
+{
+	int r, inc;
+	uint64_t key = index;
+	dm_block_t b;
+	__le64 block_le;
+
+	/*
+	 * lookup
+	 */
+	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
+	if (r)
+		return r;
+	b = le64_to_cpu(block_le);
+
+	/*
+	 * shadow
+	 */
+	r = dm_tm_shadow_block(info->btree_info.tm, b,
+			       &array_validator, block, &inc);
+	if (r)
+		return r;
+
+	*ab = dm_block_data(*block);
+	if (inc)
+		inc_ablock_entries(info, *ab);
+
+	/*
+	 * Reinsert.
+	 *
+	 * The shadow op will often be a noop.  Only insert if it really
+	 * copied data.
+	 */
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
+		r = insert_ablock(info, index, *block, root);
+	}
+
+	return r;
+}
+
+/*
+ * Allocate an new array block, and fill it with some values.
+ */
+static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
+			     uint32_t max_entries,
+			     unsigned block_index, uint32_t nr,
+			     const void *value, dm_block_t *root)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
+	if (r)
+		return r;
+
+	fill_ablock(info, ab, value, nr);
+	r = insert_ablock(info, block_index, block, root);
+	unlock_ablock(info, block);
+
+	return r;
+}
+
+static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
+			       unsigned begin_block, unsigned end_block,
+			       unsigned max_entries, const void *value,
+			       dm_block_t *root)
+{
+	int r = 0;
+
+	for (; !r && begin_block != end_block; begin_block++)
+		r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root);
+
+	return r;
+}
+
+/*
+ * There are a bunch of functions involved with resizing an array.  This
+ * structure holds information that commonly needed by them.  Purely here
+ * to reduce parameter count.
+ */
+struct resize {
+	/*
+	 * Describes the array.
+	 */
+	struct dm_array_info *info;
+
+	/*
+	 * The current root of the array.  This gets updated.
+	 */
+	dm_block_t root;
+
+	/*
+	 * Metadata block size.  Used to calculate the nr entries in an
+	 * array block.
+	 */
+	size_t size_of_block;
+
+	/*
+	 * Maximum nr entries in an array block.
+	 */
+	unsigned max_entries;
+
+	/*
+	 * nr of completely full blocks in the array.
+	 *
+	 * 'old' refers to before the resize, 'new' after.
+	 */
+	unsigned old_nr_full_blocks, new_nr_full_blocks;
+
+	/*
+	 * Number of entries in the final block.  0 iff only full blocks in
+	 * the array.
+	 */
+	unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;
+
+	/*
+	 * The default value used when growing the array.
+	 */
+	const void *value;
+};
+
+/*
+ * Removes a consecutive set of array blocks from the btree.  The values
+ * in block are decremented as a side effect of the btree remove.
+ *
+ * begin_index - the index of the first array block to remove.
+ * end_index - the one-past-the-end value.  ie. this block is not removed.
+ */
+static int drop_blocks(struct resize *resize, unsigned begin_index,
+		       unsigned end_index)
+{
+	int r;
+
+	while (begin_index != end_index) {
+		uint64_t key = begin_index++;
+		r = dm_btree_remove(&resize->info->btree_info, resize->root,
+				    &key, &resize->root);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Calculates how many blocks are needed for the array.
+ */
+static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
+				       unsigned nr_entries_in_last_block)
+{
+	return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
+}
+
+/*
+ * Shrink an array.
+ */
+static int shrink(struct resize *resize)
+{
+	int r;
+	unsigned begin, end;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	/*
+	 * Lose some blocks from the back?
+	 */
+	if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
+		begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
+					       resize->new_nr_entries_in_last_block);
+		end = total_nr_blocks_needed(resize->old_nr_full_blocks,
+					     resize->old_nr_entries_in_last_block);
+
+		r = drop_blocks(resize, begin, end);
+		if (r)
+			return r;
+	}
+
+	/*
+	 * Trim the new tail block
+	 */
+	if (resize->new_nr_entries_in_last_block) {
+		r = shadow_ablock(resize->info, &resize->root,
+				  resize->new_nr_full_blocks, &block, &ab);
+		if (r)
+			return r;
+
+		trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
+		unlock_ablock(resize->info, block);
+	}
+
+	return 0;
+}
+
+/*
+ * Grow an array.
+ */
+static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	r = shadow_ablock(resize->info, &resize->root,
+			  resize->old_nr_full_blocks, &block, &ab);
+	if (r)
+		return r;
+
+	fill_ablock(resize->info, ab, resize->value, new_nr_entries);
+	unlock_ablock(resize->info, block);
+
+	return r;
+}
+
+static int grow_add_tail_block(struct resize *resize)
+{
+	return insert_new_ablock(resize->info, resize->size_of_block,
+				 resize->max_entries,
+				 resize->new_nr_full_blocks,
+				 resize->new_nr_entries_in_last_block,
+				 resize->value, &resize->root);
+}
+
+static int grow_needs_more_blocks(struct resize *resize)
+{
+	int r;
+	unsigned old_nr_blocks = resize->old_nr_full_blocks;
+
+	if (resize->old_nr_entries_in_last_block > 0) {
+		old_nr_blocks++;
+
+		r = grow_extend_tail_block(resize, resize->max_entries);
+		if (r)
+			return r;
+	}
+
+	r = insert_full_ablocks(resize->info, resize->size_of_block,
+				old_nr_blocks,
+				resize->new_nr_full_blocks,
+				resize->max_entries, resize->value,
+				&resize->root);
+	if (r)
+		return r;
+
+	if (resize->new_nr_entries_in_last_block)
+		r = grow_add_tail_block(resize);
+
+	return r;
+}
+
+static int grow(struct resize *resize)
+{
+	if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
+		return grow_needs_more_blocks(resize);
+
+	else if (resize->old_nr_entries_in_last_block)
+		return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);
+
+	else
+		return grow_add_tail_block(resize);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These are the value_type functions for the btree elements, which point
+ * to array blocks.
+ */
+static void block_inc(void *context, const void *value)
+{
+	__le64 block_le;
+	struct dm_array_info *info = context;
+
+	memcpy(&block_le, value, sizeof(block_le));
+	dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
+}
+
+static void block_dec(void *context, const void *value)
+{
+	int r;
+	uint64_t b;
+	__le64 block_le;
+	uint32_t ref_count;
+	struct dm_block *block;
+	struct array_block *ab;
+	struct dm_array_info *info = context;
+
+	memcpy(&block_le, value, sizeof(block_le));
+	b = le64_to_cpu(block_le);
+
+	r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
+	if (r) {
+		DMERR_LIMIT("couldn't get reference count for block %llu",
+			    (unsigned long long) b);
+		return;
+	}
+
+	if (ref_count == 1) {
+		/*
+		 * We're about to drop the last reference to this ablock.
+		 * So we need to decrement the ref count of the contents.
+		 */
+		r = get_ablock(info, b, &block, &ab);
+		if (r) {
+			DMERR_LIMIT("couldn't get array block %llu",
+				    (unsigned long long) b);
+			return;
+		}
+
+		dec_ablock_entries(info, ab);
+		unlock_ablock(info, block);
+	}
+
+	dm_tm_dec(info->btree_info.tm, b);
+}
+
+static int block_equal(void *context, const void *value1, const void *value2)
+{
+	return !memcmp(value1, value2, sizeof(__le64));
+}
+
+/*----------------------------------------------------------------*/
+
+void dm_array_info_init(struct dm_array_info *info,
+			struct dm_transaction_manager *tm,
+			struct dm_btree_value_type *vt)
+{
+	struct dm_btree_value_type *bvt = &info->btree_info.value_type;
+
+	memcpy(&info->value_type, vt, sizeof(info->value_type));
+	info->btree_info.tm = tm;
+	info->btree_info.levels = 1;
+
+	bvt->context = info;
+	bvt->size = sizeof(__le64);
+	bvt->inc = block_inc;
+	bvt->dec = block_dec;
+	bvt->equal = block_equal;
+}
+EXPORT_SYMBOL_GPL(dm_array_info_init);
+
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
+{
+	return dm_btree_empty(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_empty);
+
+static int array_resize(struct dm_array_info *info, dm_block_t root,
+			uint32_t old_size, uint32_t new_size,
+			const void *value, dm_block_t *new_root)
+{
+	int r;
+	struct resize resize;
+
+	if (old_size == new_size)
+		return 0;
+
+	resize.info = info;
+	resize.root = root;
+	resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	resize.max_entries = calc_max_entries(info->value_type.size,
+					      resize.size_of_block);
+
+	resize.old_nr_full_blocks = old_size / resize.max_entries;
+	resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
+	resize.new_nr_full_blocks = new_size / resize.max_entries;
+	resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
+	resize.value = value;
+
+	r = ((new_size > old_size) ? grow : shrink)(&resize);
+	if (r)
+		return r;
+
+	*new_root = resize.root;
+	return 0;
+}
+
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+		    uint32_t old_size, uint32_t new_size,
+		    const void *value, dm_block_t *new_root)
+		    __dm_written_to_disk(value)
+{
+	int r = array_resize(info, root, old_size, new_size, value, new_root);
+	__dm_unbless_for_disk(value);
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_resize);
+
+int dm_array_del(struct dm_array_info *info, dm_block_t root)
+{
+	return dm_btree_del(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_del);
+
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+		       uint32_t index, void *value_le)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+	size_t size_of_block;
+	unsigned entry, max_entries;
+
+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+	r = lookup_ablock(info, root, index / max_entries, &block, &ab);
+	if (r)
+		return r;
+
+	entry = index % max_entries;
+	if (entry >= le32_to_cpu(ab->nr_entries))
+		r = -ENODATA;
+	else
+		memcpy(value_le, element_at(info, ab, entry),
+		       info->value_type.size);
+
+	unlock_ablock(info, block);
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_get_value);
+
+static int array_set_value(struct dm_array_info *info, dm_block_t root,
+			   uint32_t index, const void *value, dm_block_t *new_root)
+{
+	int r;
+	struct dm_block *block;
+	struct array_block *ab;
+	size_t size_of_block;
+	unsigned max_entries;
+	unsigned entry;
+	void *old_value;
+	struct dm_btree_value_type *vt = &info->value_type;
+
+	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+	max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+	r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
+	if (r)
+		return r;
+	*new_root = root;
+
+	entry = index % max_entries;
+	if (entry >= le32_to_cpu(ab->nr_entries)) {
+		r = -ENODATA;
+		goto out;
+	}
+
+	old_value = element_at(info, ab, entry);
+	if (vt->dec &&
+	    (!vt->equal || !vt->equal(vt->context, old_value, value))) {
+		vt->dec(vt->context, old_value);
+		if (vt->inc)
+			vt->inc(vt->context, value);
+	}
+
+	memcpy(old_value, value, info->value_type.size);
+
+out:
+	unlock_ablock(info, block);
+	return r;
+}
+
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+		 uint32_t index, const void *value, dm_block_t *new_root)
+		 __dm_written_to_disk(value)
+{
+	int r;
+
+	r = array_set_value(info, root, index, value, new_root);
+	__dm_unbless_for_disk(value);
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_set_value);
+
+struct walk_info {
+	struct dm_array_info *info;
+	int (*fn)(void *context, uint64_t key, void *leaf);
+	void *context;
+};
+
+static int walk_ablock(void *context, uint64_t *keys, void *leaf)
+{
+	struct walk_info *wi = context;
+
+	int r;
+	unsigned i;
+	__le64 block_le;
+	unsigned nr_entries, max_entries;
+	struct dm_block *block;
+	struct array_block *ab;
+
+	memcpy(&block_le, leaf, sizeof(block_le));
+	r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
+	if (r)
+		return r;
+
+	max_entries = le32_to_cpu(ab->max_entries);
+	nr_entries = le32_to_cpu(ab->nr_entries);
+	for (i = 0; i < nr_entries; i++) {
+		r = wi->fn(wi->context, keys[0] * max_entries + i,
+			   element_at(wi->info, ab, i));
+
+		if (r)
+			break;
+	}
+
+	unlock_ablock(wi->info, block);
+	return r;
+}
+
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+		  int (*fn)(void *, uint64_t key, void *leaf),
+		  void *context)
+{
+	struct walk_info wi;
+
+	wi.info = info;
+	wi.fn = fn;
+	wi.context = context;
+
+	return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
+}
+EXPORT_SYMBOL_GPL(dm_array_walk);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
new file mode 100644
index 00000000000..ea177d6fa58
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_ARRAY_H
+#define _LINUX_DM_ARRAY_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The dm-array is a persistent version of an array.  It packs the data
+ * more efficiently than a btree which will result in less disk space use,
+ * and a performance boost.  The element get and set operations are still
+ * O(ln(n)), but with a much smaller constant.
+ *
+ * The value type structure is reused from the btree type to support proper
+ * reference counting of values.
+ *
+ * The arrays implicitly know their length, and bounds are checked for
+ * lookups and updated.  It doesn't store this in an accessible place
+ * because it would waste a whole metadata block.  Make sure you store the
+ * size along with the array root in your encompassing data.
+ *
+ * Array entries are indexed via an unsigned integer starting from zero.
+ * Arrays are not sparse; if you resize an array to have 'n' entries then
+ * 'n - 1' will be the last valid index.
+ *
+ * Typical use:
+ *
+ * a) initialise a dm_array_info structure.  This describes the array
+ *    values and ties it into a specific transaction manager.  It holds no
+ *    instance data; the same info can be used for many similar arrays if
+ *    you wish.
+ *
+ * b) Get yourself a root.  The root is the index of a block of data on the
+ *    disk that holds a particular instance of an array.  You may have a
+ *    pre existing root in your metadata that you wish to use, or you may
+ *    want to create a brand new, empty array with dm_array_empty().
+ *
+ * Like the other data structures in this library, dm_array objects are
+ * immutable between transactions.  Update functions will return you the
+ * root for a _new_ array.  If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * c) resize an array with dm_array_resize().
+ *
+ * d) Get a value from the array with dm_array_get_value().
+ *
+ * e) Set a value in the array with dm_array_set_value().
+ *
+ * f) Walk an array of values in index order with dm_array_walk().  More
+ *    efficient than making many calls to dm_array_get_value().
+ *
+ * g) Destroy the array with dm_array_del().  This tells the transaction
+ *    manager that you're no longer using this data structure so it can
+ *    recycle it's blocks.  (dm_array_dec() would be a better name for it,
+ *    but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Describes an array.  Don't initialise this structure yourself, use the
+ * init function below.
+ */
+struct dm_array_info {
+	struct dm_transaction_manager *tm;
+	struct dm_btree_value_type value_type;
+	struct dm_btree_info btree_info;
+};
+
+/*
+ * Sets up a dm_array_info structure.  You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * info - the structure being filled in.
+ * tm   - the transaction manager that should supervise this structure.
+ * vt   - describes the leaf values.
+ */
+void dm_array_info_init(struct dm_array_info *info,
+			struct dm_transaction_manager *tm,
+			struct dm_btree_value_type *vt);
+
+/*
+ * Create an empty, zero length array.
+ *
+ * info - describes the array
+ * root - on success this will be filled out with the root block
+ */
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root);
+
+/*
+ * Resizes the array.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * old_size - the caller is responsible for remembering the size of
+ *            the array
+ * new_size - can be bigger or smaller than old_size
+ * value - if we're growing the array the new entries will have this value
+ * new_root - on success, points to the new root block
+ *
+ * If growing the inc function for 'value' will be called the appropriate
+ * number of times.  So if the caller is holding a reference they may want
+ * to drop it.
+ */
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+		    uint32_t old_size, uint32_t new_size,
+		    const void *value, dm_block_t *new_root)
+	__dm_written_to_disk(value);
+
+/*
+ * Frees a whole array.  The value_type's decrement operation will be called
+ * for all values in the array
+ */
+int dm_array_del(struct dm_array_info *info, dm_block_t root);
+
+/*
+ * Lookup a value in the array
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - the value to be read.  Will be in on-disk format of course.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+		       uint32_t index, void *value);
+
+/*
+ * Set an entry in the array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - value to be written to disk.  Make sure you confirm the value is
+ *         in on-disk format with__dm_bless_for_disk() before calling.
+ * new_root - the new root block
+ *
+ * The old value being overwritten will be decremented, the new value
+ * incremented.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+		       uint32_t index, const void *value, dm_block_t *new_root)
+	__dm_written_to_disk(value);
+
+/*
+ * Walk through all the entries in an array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * fn - called back for every element
+ * context - passed to the callback
+ */
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t key, void *leaf),
+		  void *context);
+
+/*----------------------------------------------------------------*/
+
+#endif	/* _LINUX_DM_ARRAY_H */
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
new file mode 100644
index 00000000000..36f7cc2c710
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bitset.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "bitset"
+#define BITS_PER_ARRAY_ENTRY 64
+
+/*----------------------------------------------------------------*/
+
+static struct dm_btree_value_type bitset_bvt = {
+	.context = NULL,
+	.size = sizeof(__le64),
+	.inc = NULL,
+	.dec = NULL,
+	.equal = NULL,
+};
+
+/*----------------------------------------------------------------*/
+
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+			 struct dm_disk_bitset *info)
+{
+	dm_array_info_init(&info->array_info, tm, &bitset_bvt);
+	info->current_index_set = false;
+}
+EXPORT_SYMBOL_GPL(dm_disk_bitset_init);
+
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
+{
+	return dm_array_empty(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_empty);
+
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
+		     uint32_t old_nr_entries, uint32_t new_nr_entries,
+		     bool default_value, dm_block_t *new_root)
+{
+	uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY);
+	uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY);
+	__le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0);
+
+	__dm_bless_for_disk(&value);
+	return dm_array_resize(&info->array_info, root, old_blocks, new_blocks,
+			       &value, new_root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_resize);
+
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root)
+{
+	return dm_array_del(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_del);
+
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+		    dm_block_t *new_root)
+{
+	int r;
+	__le64 value;
+
+	if (!info->current_index_set || !info->dirty)
+		return 0;
+
+	value = cpu_to_le64(info->current_bits);
+
+	__dm_bless_for_disk(&value);
+	r = dm_array_set_value(&info->array_info, root, info->current_index,
+			       &value, new_root);
+	if (r)
+		return r;
+
+	info->current_index_set = false;
+	info->dirty = false;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_flush);
+
+static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
+		     uint32_t array_index)
+{
+	int r;
+	__le64 value;
+
+	r = dm_array_get_value(&info->array_info, root, array_index, &value);
+	if (r)
+		return r;
+
+	info->current_bits = le64_to_cpu(value);
+	info->current_index_set = true;
+	info->current_index = array_index;
+	info->dirty = false;
+
+	return 0;
+}
+
+static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root,
+			   uint32_t index, dm_block_t *new_root)
+{
+	int r;
+	unsigned array_index = index / BITS_PER_ARRAY_ENTRY;
+
+	if (info->current_index_set) {
+		if (info->current_index == array_index)
+			return 0;
+
+		r = dm_bitset_flush(info, root, new_root);
+		if (r)
+			return r;
+	}
+
+	return read_bits(info, root, array_index);
+}
+
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+		      uint32_t index, dm_block_t *new_root)
+{
+	int r;
+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+	r = get_array_entry(info, root, index, new_root);
+	if (r)
+		return r;
+
+	set_bit(b, (unsigned long *) &info->current_bits);
+	info->dirty = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
+
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+			uint32_t index, dm_block_t *new_root)
+{
+	int r;
+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+	r = get_array_entry(info, root, index, new_root);
+	if (r)
+		return r;
+
+	clear_bit(b, (unsigned long *) &info->current_bits);
+	info->dirty = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
+
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+		       uint32_t index, dm_block_t *new_root, bool *result)
+{
+	int r;
+	unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+	r = get_array_entry(info, root, index, new_root);
+	if (r)
+		return r;
+
+	*result = test_bit(b, (unsigned long *) &info->current_bits);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
new file mode 100644
index 00000000000..c2287d672ef
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_BITSET_H
+#define _LINUX_DM_BITSET_H
+
+#include "dm-array.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This bitset type is a thin wrapper round a dm_array of 64bit words.  It
+ * uses a tiny, one word cache to reduce the number of array lookups and so
+ * increase performance.
+ *
+ * Like the dm-array that it's based on, the caller needs to keep track of
+ * the size of the bitset separately.  The underlying dm-array implicitly
+ * knows how many words it's storing and will return -ENODATA if you try
+ * and access an out of bounds word.  However, an out of bounds bit in the
+ * final word will _not_ be detected, you have been warned.
+ *
+ * Bits are indexed from zero.
+
+ * Typical use:
+ *
+ * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init().
+ *    This describes the bitset and includes the cache.  It's not called it
+ *    dm_bitset_info in line with other data structures because it does
+ *    include instance data.
+ *
+ * b) Get yourself a root.  The root is the index of a block of data on the
+ *    disk that holds a particular instance of an bitset.  You may have a
+ *    pre existing root in your metadata that you wish to use, or you may
+ *    want to create a brand new, empty bitset with dm_bitset_empty().
+ *
+ * Like the other data structures in this library, dm_bitset objects are
+ * immutable between transactions.  Update functions will return you the
+ * root for a _new_ array.  If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * Even read operations may trigger the cache to be flushed and as such
+ * return a root for a new, updated bitset.
+ *
+ * c) resize a bitset with dm_bitset_resize().
+ *
+ * d) Set a bit with dm_bitset_set_bit().
+ *
+ * e) Clear a bit with dm_bitset_clear_bit().
+ *
+ * f) Test a bit with dm_bitset_test_bit().
+ *
+ * g) Flush all updates from the cache with dm_bitset_flush().
+ *
+ * h) Destroy the bitset with dm_bitset_del().  This tells the transaction
+ *    manager that you're no longer using this data structure so it can
+ *    recycle it's blocks.  (dm_bitset_dec() would be a better name for it,
+ *    but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Opaque object.  Unlike dm_array_info, you should have one of these per
+ * bitset.  Initialise with dm_disk_bitset_init().
+ */
+struct dm_disk_bitset {
+	struct dm_array_info array_info;
+
+	uint32_t current_index;
+	uint64_t current_bits;
+
+	bool current_index_set:1;
+	bool dirty:1;
+};
+
+/*
+ * Sets up a dm_disk_bitset structure.  You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * tm - the transaction manager that should supervise this structure
+ * info - the structure being initialised
+ */
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+			 struct dm_disk_bitset *info);
+
+/*
+ * Create an empty, zero length bitset.
+ *
+ * info - describes the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
+
+/*
+ * Resize the bitset.
+ *
+ * info - describes the bitset
+ * old_root - the root block of the array on disk
+ * old_nr_entries - the number of bits in the old bitset
+ * new_nr_entries - the number of bits you want in the new bitset
+ * default_value - the value for any new bits
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root,
+		     uint32_t old_nr_entries, uint32_t new_nr_entries,
+		     bool default_value, dm_block_t *new_root);
+
+/*
+ * Frees the bitset.
+ */
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root);
+
+/*
+ * Set a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+		      uint32_t index, dm_block_t *new_root);
+
+/*
+ * Clears a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+			uint32_t index, dm_block_t *new_root);
+
+/*
+ * Tests a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block (cached values may have been written)
+ * result - the bit value you're after
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+		       uint32_t index, dm_block_t *new_root, bool *result);
+
+/*
+ * Flush any cached changes to disk.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+		    dm_block_t *new_root);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 5ba277768d9..087411c95ff 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -25,7 +25,7 @@
  * may be held at once.  This is just an implementation detail.
  *
  * ii) Recursive locking attempts are detected and return EINVAL.  A stack
- * trace is also emitted for the previous lock aquisition.
+ * trace is also emitted for the previous lock acquisition.
  *
  * iii) Priority is given to write locks.
  */
@@ -104,12 +104,12 @@ static int __check_holder(struct block_lock *lock)
 
 	for (i = 0; i < MAX_HOLDERS; i++) {
 		if (lock->holders[i] == current) {
-			DMERR("recursive lock detected in pool metadata");
+			DMERR("recursive lock detected in metadata");
 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 			DMERR("previously held here:");
 			print_stack_trace(lock->traces + i, 4);
 
-			DMERR("subsequent aquisition attempted here:");
+			DMERR("subsequent acquisition attempted here:");
 			t.nr_entries = 0;
 			t.max_entries = MAX_STACK;
 			t.entries = entries;
@@ -428,15 +428,17 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
 		if (!v)
 			return 0;
 		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
-		if (unlikely(r))
+		if (unlikely(r)) {
+			DMERR_LIMIT("%s validator check failed for block %llu", v->name,
+				    (unsigned long long) dm_bufio_get_block_number(buf));
 			return r;
+		}
 		aux->validator = v;
 	} else {
 		if (unlikely(aux->validator != v)) {
-			DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
-				aux->validator->name, v ? v->name : "NULL",
-				(unsigned long long)
-					dm_bufio_get_block_number(buf));
+			DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
+				    aux->validator->name, v ? v->name : "NULL",
+				    (unsigned long long) dm_bufio_get_block_number(buf));
 			return -EINVAL;
 		}
 	}
@@ -593,24 +595,19 @@ int dm_bm_unlock(struct dm_block *b)
 }
 EXPORT_SYMBOL_GPL(dm_bm_unlock);
 
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
-			   struct dm_block *superblock)
+int dm_bm_flush(struct dm_block_manager *bm)
 {
-	int r;
-
 	if (bm->read_only)
 		return -EPERM;
 
-	r = dm_bufio_write_dirty_buffers(bm->bufio);
-	if (unlikely(r)) {
-		dm_bm_unlock(superblock);
-		return r;
-	}
-
-	dm_bm_unlock(superblock);
-
 	return dm_bufio_write_dirty_buffers(bm->bufio);
 }
+EXPORT_SYMBOL_GPL(dm_bm_flush);
+
+void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
+{
+	dm_bufio_prefetch(bm->bufio, b, 1);
+}
 
 void dm_bm_set_read_only(struct dm_block_manager *bm)
 {
@@ -618,6 +615,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 	return crc32c(~(u32) 0, data, len) ^ init_xor;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index be5bff61be2..1b95dfc1778 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -105,8 +105,12 @@ int dm_bm_unlock(struct dm_block *b);
  *
  * This method always blocks.
  */
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
-			   struct dm_block *superblock);
+int dm_bm_flush(struct dm_block_manager *bm);
+
+/*
+ * Request data is prefetched into the cache.
+ */
+void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 
 /*
  * Switches the bm to a read only mode.  Once read-only mode
@@ -120,6 +124,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
  * be returned if you do.
  */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 5709bfeab1e..37d367bb9aa 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -36,13 +36,13 @@ struct node_header {
 	__le32 padding;
 } __packed;
 
-struct node {
+struct btree_node {
 	struct node_header header;
 	__le64 keys[0];
 } __packed;
 
 
-void inc_children(struct dm_transaction_manager *tm, struct node *n,
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
 		  struct dm_btree_value_type *vt);
 
 int new_block(struct dm_btree_info *info, struct dm_block **result);
@@ -64,7 +64,8 @@ struct ro_spine {
 void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
 int exit_ro_spine(struct ro_spine *s);
 int ro_step(struct ro_spine *s, dm_block_t new_child);
-struct node *ro_node(struct ro_spine *s);
+void ro_pop(struct ro_spine *s);
+struct btree_node *ro_node(struct ro_spine *s);
 
 struct shadow_spine {
 	struct dm_btree_info *info;
@@ -98,17 +99,17 @@ int shadow_root(struct shadow_spine *s);
 /*
  * Some inlines.
  */
-static inline __le64 *key_ptr(struct node *n, uint32_t index)
+static inline __le64 *key_ptr(struct btree_node *n, uint32_t index)
 {
 	return n->keys + index;
 }
 
-static inline void *value_base(struct node *n)
+static inline void *value_base(struct btree_node *n)
 {
 	return &n->keys[le32_to_cpu(n->header.max_entries)];
 }
 
-static inline void *value_ptr(struct node *n, uint32_t index)
+static inline void *value_ptr(struct btree_node *n, uint32_t index)
 {
 	uint32_t value_size = le32_to_cpu(n->header.value_size);
 	return value_base(n) + (value_size * index);
@@ -117,7 +118,7 @@ static inline void *value_ptr(struct node *n, uint32_t index)
 /*
  * Assumes the values are suitably-aligned and converts to core format.
  */
-static inline uint64_t value64(struct node *n, uint32_t index)
+static inline uint64_t value64(struct btree_node *n, uint32_t index)
 {
 	__le64 *values_le = value_base(n);
 
@@ -127,7 +128,7 @@ static inline uint64_t value64(struct node *n, uint32_t index)
 /*
  * Searching for a key within a single node.
  */
-int lower_bound(struct node *n, uint64_t key);
+int lower_bound(struct btree_node *n, uint64_t key);
 
 extern struct dm_block_validator btree_node_validator;
 
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index aa71e2359a0..b88757cd0d1 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -53,7 +53,7 @@
 /*
  * Some little utilities for moving node data around.
  */
-static void node_shift(struct node *n, int shift)
+static void node_shift(struct btree_node *n, int shift)
 {
 	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 	uint32_t value_size = le32_to_cpu(n->header.value_size);
@@ -79,7 +79,7 @@ static void node_shift(struct node *n, int shift)
 	}
 }
 
-static void node_copy(struct node *left, struct node *right, int shift)
+static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
 {
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t value_size = le32_to_cpu(left->header.value_size);
@@ -108,7 +108,7 @@ static void node_copy(struct node *left, struct node *right, int shift)
 /*
  * Delete a specific entry from a leaf node.
  */
-static void delete_at(struct node *n, unsigned index)
+static void delete_at(struct btree_node *n, unsigned index)
 {
 	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
 	unsigned nr_to_copy = nr_entries - (index + 1);
@@ -128,7 +128,7 @@ static void delete_at(struct node *n, unsigned index)
 	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
 }
 
-static unsigned merge_threshold(struct node *n)
+static unsigned merge_threshold(struct btree_node *n)
 {
 	return le32_to_cpu(n->header.max_entries) / 3;
 }
@@ -136,18 +136,11 @@ static unsigned merge_threshold(struct node *n)
 struct child {
 	unsigned index;
 	struct dm_block *block;
-	struct node *n;
+	struct btree_node *n;
 };
 
-static struct dm_btree_value_type le64_type = {
-	.context = NULL,
-	.size = sizeof(__le64),
-	.inc = NULL,
-	.dec = NULL,
-	.equal = NULL
-};
-
-static int init_child(struct dm_btree_info *info, struct node *parent,
+static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
+		      struct btree_node *parent,
 		      unsigned index, struct child *result)
 {
 	int r, inc;
@@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
 	result->n = dm_block_data(result->block);
 
 	if (inc)
-		inc_children(info->tm, result->n, &le64_type);
+		inc_children(info->tm, result->n, vt);
 
 	*((__le64 *) value_ptr(parent, index)) =
 		cpu_to_le64(dm_block_location(result->block));
@@ -177,7 +170,7 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
 	return dm_tm_unlock(info->tm, c->block);
 }
 
-static void shift(struct node *left, struct node *right, int count)
+static void shift(struct btree_node *left, struct btree_node *right, int count)
 {
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
@@ -203,11 +196,11 @@ static void shift(struct node *left, struct node *right, int count)
 	right->header.nr_entries = cpu_to_le32(nr_right + count);
 }
 
-static void __rebalance2(struct dm_btree_info *info, struct node *parent,
+static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
 			 struct child *l, struct child *r)
 {
-	struct node *left = l->n;
-	struct node *right = r->n;
+	struct btree_node *left = l->n;
+	struct btree_node *right = r->n;
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 	unsigned threshold = 2 * merge_threshold(left) + 1;
@@ -236,19 +229,19 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
 }
 
 static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
-		      unsigned left_index)
+		      struct dm_btree_value_type *vt, unsigned left_index)
 {
 	int r;
-	struct node *parent;
+	struct btree_node *parent;
 	struct child left, right;
 
 	parent = dm_block_data(shadow_current(s));
 
-	r = init_child(info, parent, left_index, &left);
+	r = init_child(info, vt, parent, left_index, &left);
 	if (r)
 		return r;
 
-	r = init_child(info, parent, left_index + 1, &right);
+	r = init_child(info, vt, parent, left_index + 1, &right);
 	if (r) {
 		exit_child(info, &left);
 		return r;
@@ -270,9 +263,9 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
  * in right, then rebalance2.  This wastes some cpu, but I want something
  * simple atm.
  */
-static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
 			       struct child *l, struct child *c, struct child *r,
-			       struct node *left, struct node *center, struct node *right,
+			       struct btree_node *left, struct btree_node *center, struct btree_node *right,
 			       uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 {
 	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
@@ -301,9 +294,9 @@ static void delete_center_node(struct dm_btree_info *info, struct node *parent,
 /*
  * Redistributes entries among 3 sibling nodes.
  */
-static void redistribute3(struct dm_btree_info *info, struct node *parent,
+static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
 			  struct child *l, struct child *c, struct child *r,
-			  struct node *left, struct node *center, struct node *right,
+			  struct btree_node *left, struct btree_node *center, struct btree_node *right,
 			  uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
 {
 	int s;
@@ -343,12 +336,12 @@ static void redistribute3(struct dm_btree_info *info, struct node *parent,
 	*key_ptr(parent, r->index) = right->keys[0];
 }
 
-static void __rebalance3(struct dm_btree_info *info, struct node *parent,
+static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
 			 struct child *l, struct child *c, struct child *r)
 {
-	struct node *left = l->n;
-	struct node *center = c->n;
-	struct node *right = r->n;
+	struct btree_node *left = l->n;
+	struct btree_node *center = c->n;
+	struct btree_node *right = r->n;
 
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
@@ -368,26 +361,26 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
 }
 
 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
-		      unsigned left_index)
+		      struct dm_btree_value_type *vt, unsigned left_index)
 {
 	int r;
-	struct node *parent = dm_block_data(shadow_current(s));
+	struct btree_node *parent = dm_block_data(shadow_current(s));
 	struct child left, center, right;
 
 	/*
 	 * FIXME: fill out an array?
 	 */
-	r = init_child(info, parent, left_index, &left);
+	r = init_child(info, vt, parent, left_index, &left);
 	if (r)
 		return r;
 
-	r = init_child(info, parent, left_index + 1, &center);
+	r = init_child(info, vt, parent, left_index + 1, &center);
 	if (r) {
 		exit_child(info, &left);
 		return r;
 	}
 
-	r = init_child(info, parent, left_index + 2, &right);
+	r = init_child(info, vt, parent, left_index + 2, &right);
 	if (r) {
 		exit_child(info, &left);
 		exit_child(info, &center);
@@ -421,7 +414,7 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
 {
 	int r;
 	struct dm_block *block;
-	struct node *n;
+	struct btree_node *n;
 
 	r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
 	if (r)
@@ -434,11 +427,12 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
 }
 
 static int rebalance_children(struct shadow_spine *s,
-			      struct dm_btree_info *info, uint64_t key)
+			      struct dm_btree_info *info,
+			      struct dm_btree_value_type *vt, uint64_t key)
 {
 	int i, r, has_left_sibling, has_right_sibling;
 	uint32_t child_entries;
-	struct node *n;
+	struct btree_node *n;
 
 	n = dm_block_data(shadow_current(s));
 
@@ -472,18 +466,18 @@ static int rebalance_children(struct shadow_spine *s,
 	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 
 	if (!has_left_sibling)
-		r = rebalance2(s, info, i);
+		r = rebalance2(s, info, vt, i);
 
 	else if (!has_right_sibling)
-		r = rebalance2(s, info, i - 1);
+		r = rebalance2(s, info, vt, i - 1);
 
 	else
-		r = rebalance3(s, info, i - 1);
+		r = rebalance3(s, info, vt, i - 1);
 
 	return r;
 }
 
-static int do_leaf(struct node *n, uint64_t key, unsigned *index)
+static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
 {
 	int i = lower_bound(n, key);
 
@@ -506,7 +500,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 		      uint64_t key, unsigned *index)
 {
 	int i = *index, r;
-	struct node *n;
+	struct btree_node *n;
 
 	for (;;) {
 		r = shadow_step(s, root, vt);
@@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 			return do_leaf(n, key, index);
 
-		r = rebalance_children(s, info, key);
+		r = rebalance_children(s, info, vt, key);
 		if (r)
 			break;
 
@@ -550,13 +544,21 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 	return r;
 }
 
+static struct dm_btree_value_type le64_type = {
+	.context = NULL,
+	.size = sizeof(__le64),
+	.inc = NULL,
+	.dec = NULL,
+	.equal = NULL
+};
+
 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 		    uint64_t *keys, dm_block_t *new_root)
 {
 	unsigned level, last_level = info->levels - 1;
 	int index = 0, r = 0;
 	struct shadow_spine spine;
-	struct node *n;
+	struct btree_node *n;
 
 	init_shadow_spine(&spine, info);
 	for (level = 0; level < info->levels; level++) {
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index d9a7912ee8e..cf9fd676ae4 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -23,7 +23,7 @@ static void node_prepare_for_write(struct dm_block_validator *v,
 				   struct dm_block *b,
 				   size_t block_size)
 {
-	struct node *n = dm_block_data(b);
+	struct btree_node *n = dm_block_data(b);
 	struct node_header *h = &n->header;
 
 	h->blocknr = cpu_to_le64(dm_block_location(b));
@@ -38,15 +38,15 @@ static int node_check(struct dm_block_validator *v,
 		      struct dm_block *b,
 		      size_t block_size)
 {
-	struct node *n = dm_block_data(b);
+	struct btree_node *n = dm_block_data(b);
 	struct node_header *h = &n->header;
 	size_t value_size;
 	__le32 csum_disk;
 	uint32_t flags;
 
 	if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
-		DMERR("node_check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(h->blocknr), dm_block_location(b));
+		DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu",
+			    le64_to_cpu(h->blocknr), dm_block_location(b));
 		return -ENOTBLK;
 	}
 
@@ -54,8 +54,8 @@ static int node_check(struct dm_block_validator *v,
 					       block_size - sizeof(__le32),
 					       BTREE_CSUM_XOR));
 	if (csum_disk != h->csum) {
-		DMERR("node_check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
+		DMERR_LIMIT("node_check failed: csum %u != wanted %u",
+			    le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
 		return -EILSEQ;
 	}
 
@@ -63,12 +63,12 @@ static int node_check(struct dm_block_validator *v,
 
 	if (sizeof(struct node_header) +
 	    (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
-		DMERR("node_check failed: max_entries too large");
+		DMERR_LIMIT("node_check failed: max_entries too large");
 		return -EILSEQ;
 	}
 
 	if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
-		DMERR("node_check failed, too many entries");
+		DMERR_LIMIT("node_check failed: too many entries");
 		return -EILSEQ;
 	}
 
@@ -77,7 +77,7 @@ static int node_check(struct dm_block_validator *v,
 	 */
 	flags = le32_to_cpu(h->flags);
 	if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
-		DMERR("node_check failed, node is neither INTERNAL or LEAF");
+		DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF");
 		return -EILSEQ;
 	}
 
@@ -164,7 +164,14 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
 	return r;
 }
 
-struct node *ro_node(struct ro_spine *s)
+void ro_pop(struct ro_spine *s)
+{
+	BUG_ON(!s->count);
+	--s->count;
+	unlock_block(s->info, s->nodes[s->count]);
+}
+
+struct btree_node *ro_node(struct ro_spine *s)
 {
 	struct dm_block *block;
 
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index d12b2cc51f1..416060c2570 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -38,7 +38,7 @@ static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
 /*----------------------------------------------------------------*/
 
 /* makes the assumption that no two keys are the same. */
-static int bsearch(struct node *n, uint64_t key, int want_hi)
+static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
 {
 	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
 
@@ -58,12 +58,12 @@ static int bsearch(struct node *n, uint64_t key, int want_hi)
 	return want_hi ? hi : lo;
 }
 
-int lower_bound(struct node *n, uint64_t key)
+int lower_bound(struct btree_node *n, uint64_t key)
 {
 	return bsearch(n, key, 0);
 }
 
-void inc_children(struct dm_transaction_manager *tm, struct node *n,
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
 		  struct dm_btree_value_type *vt)
 {
 	unsigned i;
@@ -77,7 +77,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
 			vt->inc(vt->context, value_ptr(n, i));
 }
 
-static int insert_at(size_t value_size, struct node *node, unsigned index,
+static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
 		      uint64_t key, void *value)
 		      __dm_written_to_disk(value)
 {
@@ -122,7 +122,7 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
 {
 	int r;
 	struct dm_block *b;
-	struct node *n;
+	struct btree_node *n;
 	size_t block_size;
 	uint32_t max_entries;
 
@@ -154,13 +154,14 @@ EXPORT_SYMBOL_GPL(dm_btree_empty);
 #define MAX_SPINE_DEPTH 64
 struct frame {
 	struct dm_block *b;
-	struct node *n;
+	struct btree_node *n;
 	unsigned level;
 	unsigned nr_children;
 	unsigned current_child;
 };
 
 struct del_stack {
+	struct dm_btree_info *info;
 	struct dm_transaction_manager *tm;
 	int top;
 	struct frame spine[MAX_SPINE_DEPTH];
@@ -183,6 +184,20 @@ static int unprocessed_frames(struct del_stack *s)
 	return s->top >= 0;
 }
 
+static void prefetch_children(struct del_stack *s, struct frame *f)
+{
+	unsigned i;
+	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
+
+	for (i = 0; i < f->nr_children; i++)
+		dm_bm_prefetch(bm, value64(f->n, i));
+}
+
+static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
+{
+	return f->level < (info->levels - 1);
+}
+
 static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 {
 	int r;
@@ -205,6 +220,7 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 		dm_tm_dec(s->tm, b);
 
 	else {
+		uint32_t flags;
 		struct frame *f = s->spine + ++s->top;
 
 		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
@@ -217,6 +233,10 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 		f->level = level;
 		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
 		f->current_child = 0;
+
+		flags = le32_to_cpu(f->n->header.flags);
+		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
+			prefetch_children(s, f);
 	}
 
 	return 0;
@@ -238,10 +258,11 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return -ENOMEM;
+	s->info = info;
 	s->tm = info->tm;
 	s->top = -1;
 
-	r = push_frame(s, root, 1);
+	r = push_frame(s, root, 0);
 	if (r)
 		goto out;
 
@@ -267,7 +288,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 			if (r)
 				goto out;
 
-		} else if (f->level != (info->levels - 1)) {
+		} else if (is_internal_level(info, f)) {
 			b = value64(f->n, f->current_child);
 			f->current_child++;
 			r = push_frame(s, b, f->level + 1);
@@ -282,7 +303,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 					info->value_type.dec(info->value_type.context,
 							     value_ptr(f->n, i));
 			}
-			f->current_child = f->nr_children;
+			pop_frame(s);
 		}
 	}
 
@@ -295,7 +316,7 @@ EXPORT_SYMBOL_GPL(dm_btree_del);
 /*----------------------------------------------------------------*/
 
 static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
-			    int (*search_fn)(struct node *, uint64_t),
+			    int (*search_fn)(struct btree_node *, uint64_t),
 			    uint64_t *result_key, void *v, size_t value_size)
 {
 	int i, r;
@@ -406,7 +427,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
 	size_t size;
 	unsigned nr_left, nr_right;
 	struct dm_block *left, *right, *parent;
-	struct node *ln, *rn, *pn;
+	struct btree_node *ln, *rn, *pn;
 	__le64 location;
 
 	left = shadow_current(s);
@@ -491,7 +512,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 	size_t size;
 	unsigned nr_left, nr_right;
 	struct dm_block *left, *right, *new_parent;
-	struct node *pn, *ln, *rn;
+	struct btree_node *pn, *ln, *rn;
 	__le64 val;
 
 	new_parent = shadow_current(s);
@@ -576,7 +597,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
 			    uint64_t key, unsigned *index)
 {
 	int r, i = *index, top = 1;
-	struct node *node;
+	struct btree_node *node;
 
 	for (;;) {
 		r = shadow_step(s, root, vt);
@@ -643,7 +664,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
 	unsigned level, index = -1, last_level = info->levels - 1;
 	dm_block_t block = root;
 	struct shadow_spine spine;
-	struct node *n;
+	struct btree_node *n;
 	struct dm_btree_value_type le64_type;
 
 	le64_type.context = NULL;
@@ -749,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
 
 /*----------------------------------------------------------------*/
 
-static int find_highest_key(struct ro_spine *s, dm_block_t block,
-			    uint64_t *result_key, dm_block_t *next_block)
+static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
+		    uint64_t *result_key, dm_block_t *next_block)
 {
 	int i, r;
 	uint32_t flags;
@@ -767,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
 		else
 			i--;
 
-		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
+		if (find_highest)
+			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
+		else
+			*result_key = le64_to_cpu(ro_node(s)->keys[0]);
+
 		if (next_block || flags & INTERNAL_NODE)
 			block = value64(ro_node(s), i);
 
@@ -778,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
 	return 0;
 }
 
-int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
-			      uint64_t *result_keys)
+static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
+			     bool find_highest, uint64_t *result_keys)
 {
 	int r = 0, count = 0, level;
 	struct ro_spine spine;
 
 	init_ro_spine(&spine, info);
 	for (level = 0; level < info->levels; level++) {
-		r = find_highest_key(&spine, root, result_keys + level,
-				     level == info->levels - 1 ? NULL : &root);
+		r = find_key(&spine, root, find_highest, result_keys + level,
+			     level == info->levels - 1 ? NULL : &root);
 		if (r == -ENODATA) {
 			r = 0;
 			break;
@@ -801,4 +826,71 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 
 	return r ? r : count;
 }
+
+int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
+			      uint64_t *result_keys)
+{
+	return dm_btree_find_key(info, root, true, result_keys);
+}
 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
+
+int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
+			     uint64_t *result_keys)
+{
+	return dm_btree_find_key(info, root, false, result_keys);
+}
+EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * FIXME: We shouldn't use a recursive algorithm when we have limited stack
+ * space.  Also this only works for single level trees.
+ */
+static int walk_node(struct ro_spine *s, dm_block_t block,
+		     int (*fn)(void *context, uint64_t *keys, void *leaf),
+		     void *context)
+{
+	int r;
+	unsigned i, nr;
+	struct btree_node *n;
+	uint64_t keys;
+
+	r = ro_step(s, block);
+	n = ro_node(s);
+
+	nr = le32_to_cpu(n->header.nr_entries);
+	for (i = 0; i < nr; i++) {
+		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
+			r = walk_node(s, value64(n, i), fn, context);
+			if (r)
+				goto out;
+		} else {
+			keys = le64_to_cpu(*key_ptr(n, i));
+			r = fn(context, &keys, value_ptr(n, i));
+			if (r)
+				goto out;
+		}
+	}
+
+out:
+	ro_pop(s);
+	return r;
+}
+
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
+		  void *context)
+{
+	int r;
+	struct ro_spine spine;
+
+	BUG_ON(info->levels > 1);
+
+	init_ro_spine(&spine, info);
+	r = walk_node(&spine, root, fn, context);
+	exit_ro_spine(&spine);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index ae02c84410f..dacfc34180b 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -35,7 +35,7 @@ struct dm_transaction_manager;
  */
 
 /*
- * Infomation about the values stored within the btree.
+ * Information about the values stored within the btree.
  */
 struct dm_btree_value_type {
 	void *context;
@@ -58,21 +58,21 @@ struct dm_btree_value_type {
 	 * somewhere.) This method is _not_ called for insertion of a new
 	 * value: It is assumed the ref count is already 1.
 	 */
-	void (*inc)(void *context, void *value);
+	void (*inc)(void *context, const void *value);
 
 	/*
 	 * This value is being deleted.  The btree takes care of freeing
 	 * the memory pointed to by @value.  Often the del function just
 	 * needs to decrement a reference count somewhere.
 	 */
-	void (*dec)(void *context, void *value);
+	void (*dec)(void *context, const void *value);
 
 	/*
 	 * A test for equality between two values.  When a value is
 	 * overwritten with a new one, the old one has the dec method
 	 * called _unless_ the new and old value are deemed equal.
 	 */
-	int (*equal)(void *context, void *value1, void *value2);
+	int (*equal)(void *context, const void *value1, const void *value2);
 };
 
 /*
@@ -137,9 +137,26 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 /*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have
+ * no lowest key.
+ */
+int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
+			     uint64_t *result_keys);
+
+/*
+ * Returns < 0 on failure.  Otherwise the number of key entries that have
+ * been filled out.  Remember trees can have zero entries, and as such have
  * no highest key.
  */
 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 			      uint64_t *result_keys);
 
+/*
+ * Iterate through the a btree, calling fn() on each entry.
+ * It only works for single level trees and is internally recursive, so
+ * monitor stack usage carefully.
+ */
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
+		  void *context);
+
 #endif	/* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index f3a9af8cdec..aacbe70c2c2 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -39,8 +39,8 @@ static int index_check(struct dm_block_validator *v,
 	__le32 csum_disk;
 
 	if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
-		DMERR("index_check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(mi_le->blocknr), dm_block_location(b));
+		DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu",
+			    le64_to_cpu(mi_le->blocknr), dm_block_location(b));
 		return -ENOTBLK;
 	}
 
@@ -48,8 +48,8 @@ static int index_check(struct dm_block_validator *v,
 					       block_size - sizeof(__le32),
 					       INDEX_CSUM_XOR));
 	if (csum_disk != mi_le->csum) {
-		DMERR("index_check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
+		DMERR_LIMIT("index_check failed: csum %u != wanted %u",
+			    le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
 		return -EILSEQ;
 	}
 
@@ -89,8 +89,8 @@ static int bitmap_check(struct dm_block_validator *v,
 	__le32 csum_disk;
 
 	if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
-		DMERR("bitmap check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(disk_header->blocknr), dm_block_location(b));
+		DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu",
+			    le64_to_cpu(disk_header->blocknr), dm_block_location(b));
 		return -ENOTBLK;
 	}
 
@@ -98,8 +98,8 @@ static int bitmap_check(struct dm_block_validator *v,
 					       block_size - sizeof(__le32),
 					       BITMAP_CSUM_XOR));
 	if (csum_disk != disk_header->csum) {
-		DMERR("bitmap check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
+		DMERR_LIMIT("bitmap check failed: csum %u != wanted %u",
+			    le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
 		return -EILSEQ;
 	}
 
@@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
 		return -EINVAL;
 	}
 
+	/*
+	 * We need to set this before the dm_tm_new_block() call below.
+	 */
+	ll->nr_blocks = nr_blocks;
 	for (i = old_blocks; i < blocks; i++) {
 		struct dm_block *b;
 		struct disk_index_entry idx;
@@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
 		r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
 		if (r < 0)
 			return r;
+
 		idx.blocknr = cpu_to_le64(dm_block_location(b));
 
 		r = dm_tm_unlock(ll->tm, b);
@@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
 			return r;
 	}
 
-	ll->nr_blocks = nr_blocks;
 	return 0;
 }
 
@@ -292,16 +296,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
 	return dm_tm_unlock(ll->tm, blk);
 }
 
-int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b,
+				      uint32_t *result)
 {
 	__le32 le_rc;
-	int r = sm_ll_lookup_bitmap(ll, b, result);
-
-	if (r)
-		return r;
-
-	if (*result != 3)
-		return r;
+	int r;
 
 	r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
 	if (r < 0)
@@ -312,6 +311,19 @@ int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
 	return r;
 }
 
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+{
+	int r = sm_ll_lookup_bitmap(ll, b, result);
+
+	if (r)
+		return r;
+
+	if (*result != 3)
+		return r;
+
+	return sm_ll_lookup_big_ref_count(ll, b, result);
+}
+
 int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 			  dm_block_t end, dm_block_t *result)
 {
@@ -372,11 +384,12 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 	return -ENOSPC;
 }
 
-int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
-		 uint32_t ref_count, enum allocation_event *ev)
+static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
+			void *context, enum allocation_event *ev)
 {
 	int r;
-	uint32_t bit, old;
+	uint32_t bit, old, ref_count;
 	struct dm_block *nb;
 	dm_block_t index = b;
 	struct disk_index_entry ie_disk;
@@ -399,6 +412,20 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	bm_le = dm_bitmap_data(nb);
 	old = sm_lookup_bitmap(bm_le, bit);
 
+	if (old > 2) {
+		r = sm_ll_lookup_big_ref_count(ll, b, &old);
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
+			return r;
+		}
+	}
+
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
+
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
 
@@ -448,31 +475,43 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	int r;
-	uint32_t rc;
-
-	r = sm_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
+	*new = *((uint32_t *) context);
+	return 0;
+}
 
-	return sm_ll_insert(ll, b, rc + 1, ev);
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
+		 uint32_t ref_count, enum allocation_event *ev)
+{
+	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	int r;
-	uint32_t rc;
+	*new = old + 1;
+	return 0;
+}
 
-	r = sm_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
+}
 
-	if (!rc)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
+{
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
 		return -EINVAL;
+	}
 
-	return sm_ll_insert(ll, b, rc - 1, ev);
+	*new = old - 1;
+	return 0;
+}
+
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+	return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev);
 }
 
 int sm_ll_commit(struct ll_disk *ll)
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index f6d29e614ab..cfbf9617e46 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
 
 static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
 {
-	int r;
-	uint32_t old_count;
 	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	r = sm_ll_dec(&smd->ll, b, &ev);
-	if (!r && (ev == SM_FREE)) {
-		/*
-		 * It's only free if it's also free in the last
-		 * transaction.
-		 */
-		r = sm_ll_lookup(&smd->old_ll, b, &old_count);
-		if (r)
-			return r;
-
-		if (!old_count)
-			smd->nr_allocated_this_transaction--;
-	}
-
-	return r;
+	return sm_ll_dec(&smd->ll, b, &ev);
 }
 
 static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
@@ -248,7 +232,8 @@ static struct dm_space_map ops = {
 	.new_block = sm_disk_new_block,
 	.commit = sm_disk_commit,
 	.root_size = sm_disk_root_size,
-	.copy_root = sm_disk_copy_root
+	.copy_root = sm_disk_copy_root,
+	.register_threshold_callback = NULL
 };
 
 struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e89ae5e7a51..786b689bdfc 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -17,6 +17,55 @@
 /*----------------------------------------------------------------*/
 
 /*
+ * An edge triggered threshold.
+ */
+struct threshold {
+	bool threshold_set;
+	bool value_set;
+	dm_block_t threshold;
+	dm_block_t current_value;
+	dm_sm_threshold_fn fn;
+	void *context;
+};
+
+static void threshold_init(struct threshold *t)
+{
+	t->threshold_set = false;
+	t->value_set = false;
+}
+
+static void set_threshold(struct threshold *t, dm_block_t value,
+			  dm_sm_threshold_fn fn, void *context)
+{
+	t->threshold_set = true;
+	t->threshold = value;
+	t->fn = fn;
+	t->context = context;
+}
+
+static bool below_threshold(struct threshold *t, dm_block_t value)
+{
+	return t->threshold_set && value <= t->threshold;
+}
+
+static bool threshold_already_triggered(struct threshold *t)
+{
+	return t->value_set && below_threshold(t, t->current_value);
+}
+
+static void check_threshold(struct threshold *t, dm_block_t value)
+{
+	if (below_threshold(t, value) &&
+	    !threshold_already_triggered(t))
+		t->fn(t->context);
+
+	t->value_set = true;
+	t->current_value = value;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
  * Space map interface.
  *
  * The low level disk format is written using the standard btree and
@@ -42,6 +91,69 @@ struct block_op {
 	dm_block_t block;
 };
 
+struct bop_ring_buffer {
+	unsigned begin;
+	unsigned end;
+	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+	brb->begin = 0;
+	brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+	return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+	unsigned r = old + 1;
+	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+		    enum block_op_type type, dm_block_t b)
+{
+	struct block_op *bop;
+	unsigned next = brb_next(brb, brb->end);
+
+	/*
+	 * We don't allow the last bop to be filled, this way we can
+	 * differentiate between full and empty.
+	 */
+	if (next == brb->begin)
+		return -ENOMEM;
+
+	bop = brb->bops + brb->end;
+	bop->type = type;
+	bop->block = b;
+
+	brb->end = next;
+
+	return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+	struct block_op *bop;
+
+	if (brb_empty(brb))
+		return -ENODATA;
+
+	bop = brb->bops + brb->begin;
+	result->type = bop->type;
+	result->block = bop->block;
+
+	brb->begin = brb_next(brb, brb->begin);
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
 struct sm_metadata {
 	struct dm_space_map sm;
 
@@ -52,23 +164,20 @@ struct sm_metadata {
 
 	unsigned recursion_count;
 	unsigned allocated_this_transaction;
-	unsigned nr_uncommitted;
-	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+	struct bop_ring_buffer uncommitted;
+
+	struct threshold threshold;
 };
 
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-	struct block_op *op;
+	int r = brb_push(&smm->uncommitted, type, b);
 
-	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+	if (r) {
 		DMERR("too many recursive allocations");
 		return -ENOMEM;
 	}
 
-	op = smm->uncommitted + smm->nr_uncommitted++;
-	op->type = type;
-	op->block = b;
-
 	return 0;
 }
 
@@ -107,11 +216,17 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
-		while (smm->nr_uncommitted && !r) {
-			smm->nr_uncommitted--;
-			r = commit_bop(smm, smm->uncommitted +
-				       smm->nr_uncommitted);
+	if (smm->recursion_count == 1) {
+		while (!brb_empty(&smm->uncommitted)) {
+			struct block_op bop;
+
+			r = brb_pop(&smm->uncommitted, &bop);
+			if (r) {
+				DMERR("bug in bop ring buffer");
+				break;
+			}
+
+			r = commit_bop(smm, &bop);
 			if (r)
 				break;
 		}
@@ -144,12 +259,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm)
 	kfree(smm);
 }
 
-static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
-{
-	DMERR("doesn't support extend");
-	return -EINVAL;
-}
-
 static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
 {
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
@@ -172,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 				 uint32_t *result)
 {
-	int r, i;
+	int r;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	unsigned adjustment = 0;
 
@@ -180,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -209,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 					      dm_block_t b, int *result)
 {
-	int r, i, adjustment = 0;
+	int r, adjustment = 0;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	uint32_t rc;
 
@@ -217,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -335,9 +451,23 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
 
 static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 {
+	dm_block_t count;
+	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
 	int r = sm_metadata_new_block_(sm, b);
-	if (r)
-		DMERR("out of metadata space");
+	if (r) {
+		DMERR_LIMIT("unable to allocate new metadata block");
+		return r;
+	}
+
+	r = sm_metadata_get_nr_free(sm, &count);
+	if (r) {
+		DMERR_LIMIT("couldn't get free block count");
+		return r;
+	}
+
+	check_threshold(&smm->threshold, count);
+
 	return r;
 }
 
@@ -357,6 +487,18 @@ static int sm_metadata_commit(struct dm_space_map *sm)
 	return 0;
 }
 
+static int sm_metadata_register_threshold_callback(struct dm_space_map *sm,
+						   dm_block_t threshold,
+						   dm_sm_threshold_fn fn,
+						   void *context)
+{
+	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+	set_threshold(&smm->threshold, threshold, fn, context);
+
+	return 0;
+}
+
 static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
 {
 	*result = sizeof(struct disk_sm_root);
@@ -382,6 +524,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
 	return 0;
 }
 
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
+
 static struct dm_space_map ops = {
 	.destroy = sm_metadata_destroy,
 	.extend = sm_metadata_extend,
@@ -395,7 +539,8 @@ static struct dm_space_map ops = {
 	.new_block = sm_metadata_new_block,
 	.commit = sm_metadata_commit,
 	.root_size = sm_metadata_root_size,
-	.copy_root = sm_metadata_copy_root
+	.copy_root = sm_metadata_copy_root,
+	.register_threshold_callback = sm_metadata_register_threshold_callback
 };
 
 /*----------------------------------------------------------------*/
@@ -410,7 +555,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm)
 
 static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 {
-	DMERR("boostrap doesn't support extend");
+	DMERR("bootstrap doesn't support extend");
 
 	return -EINVAL;
 }
@@ -450,7 +595,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
 static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
 				  uint32_t count)
 {
-	DMERR("boostrap doesn't support set_count");
+	DMERR("bootstrap doesn't support set_count");
 
 	return -EINVAL;
 }
@@ -491,7 +636,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm)
 
 static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
 {
-	DMERR("boostrap doesn't support root_size");
+	DMERR("bootstrap doesn't support root_size");
 
 	return -EINVAL;
 }
@@ -499,7 +644,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
 static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
 				  size_t max)
 {
-	DMERR("boostrap doesn't support copy_root");
+	DMERR("bootstrap doesn't support copy_root");
 
 	return -EINVAL;
 }
@@ -517,11 +662,60 @@ static struct dm_space_map bootstrap_ops = {
 	.new_block = sm_bootstrap_new_block,
 	.commit = sm_bootstrap_commit,
 	.root_size = sm_bootstrap_root_size,
-	.copy_root = sm_bootstrap_copy_root
+	.copy_root = sm_bootstrap_copy_root,
+	.register_threshold_callback = NULL
 };
 
 /*----------------------------------------------------------------*/
 
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+	int r, i;
+	enum allocation_event ev;
+	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+	dm_block_t old_len = smm->ll.nr_blocks;
+
+	/*
+	 * Flick into a mode where all blocks get allocated in the new area.
+	 */
+	smm->begin = old_len;
+	memcpy(sm, &bootstrap_ops, sizeof(*sm));
+
+	/*
+	 * Extend.
+	 */
+	r = sm_ll_extend(&smm->ll, extra_blocks);
+	if (r)
+		goto out;
+
+	/*
+	 * We repeatedly increment then commit until the commit doesn't
+	 * allocate any new blocks.
+	 */
+	do {
+		for (i = old_len; !r && i < smm->begin; i++) {
+			r = sm_ll_inc(&smm->ll, i, &ev);
+			if (r)
+				goto out;
+		}
+		old_len = smm->begin;
+
+		r = sm_ll_commit(&smm->ll);
+		if (r)
+			goto out;
+
+	} while (old_len != smm->begin);
+
+out:
+	/*
+	 * Switch back to normal behaviour.
+	 */
+	memcpy(sm, &ops, sizeof(*sm));
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
 struct dm_space_map *dm_sm_metadata_init(void)
 {
 	struct sm_metadata *smm;
@@ -548,7 +742,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	smm->begin = superblock + 1;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
+	threshold_init(&smm->threshold);
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
 
@@ -556,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+		nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
 	r = sm_ll_extend(&smm->ll, nr_blocks);
 	if (r)
 		return r;
@@ -589,7 +786,8 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
 	smm->begin = 0;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
+	threshold_init(&smm->threshold);
 
 	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
 	return 0;
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf..64df923974d 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
 
 #include "dm-transaction-manager.h"
 
+#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
+
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries.  Each
+ * index entry contains allocation info about ~16k metadata blocks.
+ */
+#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
+#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
+
 /*
  * Unfortunately we have to use two-phase construction due to the cycle
  * between the tm and sm.
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
index 1cbfc6b1638..3e6d1153b7c 100644
--- a/drivers/md/persistent-data/dm-space-map.h
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -9,6 +9,8 @@
 
 #include "dm-block-manager.h"
 
+typedef void (*dm_sm_threshold_fn)(void *context);
+
 /*
  * struct dm_space_map keeps a record of how many times each block in a device
  * is referenced.  It needs to be fixed on disk as part of the transaction.
@@ -59,6 +61,15 @@ struct dm_space_map {
 	 */
 	int (*root_size)(struct dm_space_map *sm, size_t *result);
 	int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
+
+	/*
+	 * You can register one threshold callback which is edge-triggered
+	 * when the free space in the space map drops below the threshold.
+	 */
+	int (*register_threshold_callback)(struct dm_space_map *sm,
+					   dm_block_t threshold,
+					   dm_sm_threshold_fn fn,
+					   void *context);
 };
 
 /*----------------------------------------------------------------*/
@@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le
 	return sm->copy_root(sm, copy_to_here_le, len);
 }
 
+static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm,
+						    dm_block_t threshold,
+						    dm_sm_threshold_fn fn,
+						    void *context)
+{
+	if (sm->register_threshold_callback)
+		return sm->register_threshold_callback(sm, threshold, fn, context);
+
+	return -EINVAL;
+}
+
+
 #endif	/* _LINUX_DM_SPACE_MAP_H */
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index d247a35da3c..3bc30a0ae3d 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -25,8 +25,8 @@ struct shadow_info {
 /*
  * It would be nice if we scaled with the size of transaction.
  */
-#define HASH_SIZE 256
-#define HASH_MASK (HASH_SIZE - 1)
+#define DM_HASH_SIZE 256
+#define DM_HASH_MASK (DM_HASH_SIZE - 1)
 
 struct dm_transaction_manager {
 	int is_clone;
@@ -36,7 +36,7 @@ struct dm_transaction_manager {
 	struct dm_space_map *sm;
 
 	spinlock_t lock;
-	struct hlist_head buckets[HASH_SIZE];
+	struct hlist_head buckets[DM_HASH_SIZE];
 };
 
 /*----------------------------------------------------------------*/
@@ -44,12 +44,11 @@ struct dm_transaction_manager {
 static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 {
 	int r = 0;
-	unsigned bucket = dm_hash_block(b, HASH_MASK);
+	unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
 	struct shadow_info *si;
-	struct hlist_node *n;
 
 	spin_lock(&tm->lock);
-	hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
+	hlist_for_each_entry(si, tm->buckets + bucket, hlist)
 		if (si->where == b) {
 			r = 1;
 			break;
@@ -71,7 +70,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 	si = kmalloc(sizeof(*si), GFP_NOIO);
 	if (si) {
 		si->where = b;
-		bucket = dm_hash_block(b, HASH_MASK);
+		bucket = dm_hash_block(b, DM_HASH_MASK);
 		spin_lock(&tm->lock);
 		hlist_add_head(&si->hlist, tm->buckets + bucket);
 		spin_unlock(&tm->lock);
@@ -81,14 +80,14 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 static void wipe_shadow_table(struct dm_transaction_manager *tm)
 {
 	struct shadow_info *si;
-	struct hlist_node *n, *tmp;
+	struct hlist_node *tmp;
 	struct hlist_head *bucket;
 	int i;
 
 	spin_lock(&tm->lock);
-	for (i = 0; i < HASH_SIZE; i++) {
+	for (i = 0; i < DM_HASH_SIZE; i++) {
 		bucket = tm->buckets + i;
-		hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
+		hlist_for_each_entry_safe(si, tmp, bucket, hlist)
 			kfree(si);
 
 		INIT_HLIST_HEAD(bucket);
@@ -115,7 +114,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
 	tm->sm = sm;
 
 	spin_lock_init(&tm->lock);
-	for (i = 0; i < HASH_SIZE; i++)
+	for (i = 0; i < DM_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(tm->buckets + i);
 
 	return tm;
@@ -155,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm)
 	if (r < 0)
 		return r;
 
-	return 0;
+	return dm_bm_flush(tm->bm);
 }
 EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
 
@@ -165,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
 		return -EWOULDBLOCK;
 
 	wipe_shadow_table(tm);
+	dm_bm_unlock(root);
 
-	return dm_bm_flush_and_unlock(tm->bm, root);
+	return dm_bm_flush(tm->bm);
 }
 EXPORT_SYMBOL_GPL(dm_tm_commit);
 
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index b5b139076ca..2772ed2a781 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac
 /*
  * We use a 2-phase commit here.
  *
- * i) In the first phase the block manager is told to start flushing, and
- * the changes to the space map are written to disk.  You should interrogate
- * your particular space map to get detail of its root node etc. to be
- * included in your superblock.
+ * i) Make all changes for the transaction *except* for the superblock.
+ * Then call dm_tm_pre_commit() to flush them to disk.
  *
- * ii) @root will be committed last.  You shouldn't use more than the
- * first 512 bytes of @root if you wish the transaction to survive a power
- * failure.  You *must* have a write lock held on @root for both stage (i)
- * and (ii).  The commit will drop the write lock.
+ * ii) Lock your superblock.  Update.  Then call dm_tm_commit() which will
+ * unlock the superblock and flush it.  No other blocks should be updated
+ * during this period.  Care should be taken to never unlock a partially
+ * updated superblock; perform any operations that could fail *before* you
+ * take the superblock lock.
  */
 int dm_tm_pre_commit(struct dm_transaction_manager *tm);
-int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
 
 /*
  * These methods are the only way to get hold of a writeable block.
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7..407a99e46f6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			rdev1->new_raid_disk = j;
 		}
 
-		if (j < 0 || j >= mddev->raid_disks) {
+		if (j < 0) {
+			printk(KERN_ERR
+			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+			       mdname(mddev));
+			goto abort;
+		}
+		if (j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
 			       "aborting!\n", mdname(mddev), j);
 			goto abort;
@@ -289,7 +295,7 @@ abort:
 	kfree(conf->strip_zone);
 	kfree(conf->devlist);
 	kfree(conf);
-	*private_conf = NULL;
+	*private_conf = ERR_PTR(err);
 	return err;
 }
 
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
 		  "%s does not support generic reshape\n", __func__);
 
 	rdev_for_each(rdev, mddev)
-		array_sectors += rdev->sectors;
+		array_sectors += (rdev->sectors &
+				  ~(sector_t)(mddev->chunk_sectors-1));
 
 	return array_sectors;
 }
@@ -494,76 +501,56 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 			unsigned int chunk_sects, struct bio *bio)
 {
 	if (likely(is_power_of_2(chunk_sects))) {
-		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
-					+ (bio->bi_size >> 9));
+		return chunk_sects >=
+			((bio->bi_iter.bi_sector & (chunk_sects-1))
+					+ bio_sectors(bio));
 	} else{
-		sector_t sector = bio->bi_sector;
+		sector_t sector = bio->bi_iter.bi_sector;
 		return chunk_sects >= (sector_div(sector, chunk_sects)
-						+ (bio->bi_size >> 9));
+						+ bio_sectors(bio));
 	}
 }
 
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
-	unsigned int chunk_sects;
-	sector_t sector_offset;
 	struct strip_zone *zone;
 	struct md_rdev *tmp_dev;
+	struct bio *split;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 
-	chunk_sects = mddev->chunk_sectors;
-	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
-		sector_t sector = bio->bi_sector;
-		struct bio_pair *bp;
-		/* Sanity check -- queue functions should prevent this happening */
-		if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-		    bio->bi_idx != 0)
-			goto bad_map;
-		/* This is a one page bio that upper layers
-		 * refuse to split for us, so we need to split it.
-		 */
-		if (likely(is_power_of_2(chunk_sects)))
-			bp = bio_split(bio, chunk_sects - (sector &
-							   (chunk_sects-1)));
-		else
-			bp = bio_split(bio, chunk_sects -
-				       sector_div(sector, chunk_sects));
-		raid0_make_request(mddev, &bp->bio1);
-		raid0_make_request(mddev, &bp->bio2);
-		bio_pair_release(bp);
-		return;
-	}
-
-	sector_offset = bio->bi_sector;
-	zone = find_zone(mddev->private, &sector_offset);
-	tmp_dev = map_sector(mddev, zone, bio->bi_sector,
-			     &sector_offset);
-	bio->bi_bdev = tmp_dev->bdev;
-	bio->bi_sector = sector_offset + zone->dev_start +
-		tmp_dev->data_offset;
-
-	if (unlikely((bio->bi_rw & REQ_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio, 0);
-		return;
-	}
+	do {
+		sector_t sector = bio->bi_iter.bi_sector;
+		unsigned chunk_sects = mddev->chunk_sectors;
 
-	generic_make_request(bio);
-	return;
+		unsigned sectors = chunk_sects -
+			(likely(is_power_of_2(chunk_sects))
+			 ? (sector & (chunk_sects-1))
+			 : sector_div(sector, chunk_sects));
 
-bad_map:
-	printk("md/raid0:%s: make_request bug: can't convert block across chunks"
-	       " or bigger than %dk %llu %d\n",
-	       mdname(mddev), chunk_sects / 2,
-	       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+		if (sectors < bio_sectors(bio)) {
+			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
 
-	bio_io_error(bio);
-	return;
+		zone = find_zone(mddev->private, &sector);
+		tmp_dev = map_sector(mddev, zone, sector, &sector);
+		split->bi_bdev = tmp_dev->bdev;
+		split->bi_iter.bi_sector = sector + zone->dev_start +
+			tmp_dev->data_offset;
+
+		if (unlikely((split->bi_rw & REQ_DISCARD) &&
+			 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
+			/* Just ignore it */
+			bio_endio(split, 0);
+		} else
+			generic_make_request(split);
+	} while (split != bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
@@ -591,6 +578,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 			       mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
+		rdev->sectors = mddev->dev_sectors;
 	}
 
 	/* Set new parameters */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8034fbd6190..56e24c072b6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,17 +85,19 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
-	struct page *page;
 	struct r1bio *r1_bio;
 	struct bio *bio;
+	int need_pages;
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
@@ -117,19 +120,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	 * RESYNC_PAGES for each bio.
 	 */
 	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
-		j = pi->raid_disks;
+		need_pages = pi->raid_disks;
 	else
-		j = 1;
-	while(j--) {
+		need_pages = 1;
+	for (j = 0; j < need_pages; j++) {
 		bio = r1_bio->bios[j];
-		for (i = 0; i < RESYNC_PAGES; i++) {
-			page = alloc_page(gfp_flags);
-			if (unlikely(!page))
-				goto out_free_pages;
+		bio->bi_vcnt = RESYNC_PAGES;
 
-			bio->bi_io_vec[i].bv_page = page;
-			bio->bi_vcnt = i+1;
-		}
+		if (bio_alloc_pages(bio, gfp_flags))
+			goto out_free_pages;
 	}
 	/* If not user-requests, copy the page pointers to all bios */
 	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -144,10 +143,13 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	return r1_bio;
 
 out_free_pages:
-	for (j=0 ; j < pi->raid_disks; j++)
-		for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
-			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
-	j = -1;
+	while (--j >= 0) {
+		struct bio_vec *bv;
+
+		bio_for_each_segment_all(bv, r1_bio->bios[j], i)
+			__free_page(bv->bv_page);
+	}
+
 out_free_bio:
 	while (++j < pi->raid_disks)
 		bio_put(r1_bio->bios[j]);
@@ -235,6 +237,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -242,6 +246,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -253,7 +262,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -265,9 +274,8 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 		pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
 			 (bio_data_dir(bio) == WRITE) ? "write" : "read",
-			 (unsigned long long) bio->bi_sector,
-			 (unsigned long long) bio->bi_sector +
-			 (bio->bi_size >> 9) - 1);
+			 (unsigned long long) bio->bi_iter.bi_sector,
+			 (unsigned long long) bio_end_sector(bio) - 1);
 
 		call_bio_endio(r1_bio);
 	}
@@ -427,7 +435,17 @@ static void raid1_end_write_request(struct bio *bio, int error)
 
 		r1_bio->bios[mirror] = NULL;
 		to_put = bio;
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
+		/*
+		 * Do not set R1BIO_Uptodate if the current device is
+		 * rebuilding or Faulty. This is because we cannot use
+		 * such device for properly reading the data back (we could
+		 * potentially use it, if the current write would have felt
+		 * before rdev->recovery_offset, but for simplicity we don't
+		 * check this here.
+		 */
+		if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
+		    !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
+			set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
 		if (is_badblock(conf->mirrors[mirror].rdev,
@@ -456,9 +474,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
 				struct bio *mbio = r1_bio->master_bio;
 				pr_debug("raid1: behind end write sectors"
 					 " %llu-%llu\n",
-					 (unsigned long long) mbio->bi_sector,
-					 (unsigned long long) mbio->bi_sector +
-					 (mbio->bi_size >> 9) - 1);
+					 (unsigned long long) mbio->bi_iter.bi_sector,
+					 (unsigned long long) bio_end_sector(mbio) - 1);
 				call_bio_endio(r1_bio);
 			}
 		}
@@ -814,23 +831,31 @@ static void flush_pending_writes(struct r1conf *conf)
  *    there is no normal IO happeing.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 
 	/* Wait until no block IO is waiting */
 	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
-			    conf->resync_lock, );
+			    conf->resync_lock);
 
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For these conditions we must wait:
+	 * A: while the array is in frozen state
+	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *    the max count which allowed.
+	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *    next resync will reach to the window which normal bios are
+	 *    handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, );
+			    !conf->array_frozen &&
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
+			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -845,10 +870,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
 {
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_iter.bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -860,56 +908,96 @@ static void wait_barrier(struct r1conf *conf)
 		 * count down.
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    !conf->array_frozen &&
+				    (!conf->barrier ||
+				    ((conf->start_next_window <
+				      conf->next_resync + RESYNC_SECTORS) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
-				    conf->resync_lock,
-			);
+				     !bio_list_empty(current->bio_list))),
+				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
+
+	if (bio && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		    <= bio->bi_iter.bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+					NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+			    <= bio->bi_iter.bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+			sector = conf->start_next_window;
+		}
+	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+			    <= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
-static void freeze_array(struct r1conf *conf)
+static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
-	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+1
+	 * We wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
 	 * pending IO requests to complete or be queued for re-try.
-	 * Thus the number queued (nr_queued) plus this request (1)
+	 * Thus the number queued (nr_queued) plus this request (extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	conf->nr_waiting++;
-	wait_event_lock_irq(conf->wait_barrier,
-			    conf->nr_pending == conf->nr_queued+1,
-			    conf->resync_lock,
-			    flush_pending_writes(conf));
+	conf->array_frozen = 1;
+	wait_event_lock_irq_cmd(conf->wait_barrier,
+				conf->nr_pending == conf->nr_queued+extra,
+				conf->resync_lock,
+				flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -926,7 +1014,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 	if (unlikely(!bvecs))
 		return;
 
-	bio_for_each_segment(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i) {
 		bvecs[i] = *bvec;
 		bvecs[i].bv_page = alloc_page(GFP_NOIO);
 		if (unlikely(!bvecs[i].bv_page))
@@ -946,7 +1034,8 @@ do_sync_io:
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+	pr_debug("%dB behind alloc failed, doing sync I/O\n",
+		 bio->bi_iter.bi_size);
 }
 
 struct raid1_plug_cb {
@@ -963,11 +1052,12 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	struct r1conf *conf = mddev->private;
 	struct bio *bio;
 
-	if (from_schedule) {
+	if (from_schedule || current->bio_list) {
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -981,7 +1071,12 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
 		bio->bi_next = NULL;
-		generic_make_request(bio);
+		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+			/* Just ignore it */
+			bio_endio(bio, 0);
+		else
+			generic_make_request(bio);
 		bio = next;
 	}
 	kfree(plug);
@@ -1001,12 +1096,14 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1017,8 +1114,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
 	if (bio_data_dir(bio) == WRITE &&
-	    bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
-	    bio->bi_sector < mddev->suspend_hi) {
+	    bio_end_sector(bio) > mddev->suspend_lo &&
+	    bio->bi_iter.bi_sector < mddev->suspend_hi) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1028,15 +1125,15 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			flush_signals(current);
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
-			if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
-			    bio->bi_sector >= mddev->suspend_hi)
+			if (bio_end_sector(bio) <= mddev->suspend_lo ||
+			    bio->bi_iter.bi_sector >= mddev->suspend_hi)
 				break;
 			schedule();
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1048,10 +1145,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
 	r1_bio->master_bio = bio;
-	r1_bio->sectors = bio->bi_size >> 9;
+	r1_bio->sectors = bio_sectors(bio);
 	r1_bio->state = 0;
 	r1_bio->mddev = mddev;
-	r1_bio->sector = bio->bi_sector;
+	r1_bio->sector = bio->bi_iter.bi_sector;
 
 	/* We might need to issue multiple reads to different
 	 * devices if there are bad blocks around, so we keep
@@ -1091,12 +1188,13 @@ read_again:
 		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
-			    max_sectors);
+		bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 
 		r1_bio->bios[rdisk] = read_bio;
 
-		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+		read_bio->bi_iter.bi_sector = r1_bio->sector +
+			mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
@@ -1108,7 +1206,7 @@ read_again:
 			 */
 
 			sectors_handled = (r1_bio->sector + max_sectors
-					   - bio->bi_sector);
+					   - bio->bi_iter.bi_sector);
 			r1_bio->sectors = max_sectors;
 			spin_lock_irq(&conf->device_lock);
 			if (bio->bi_phys_segments == 0)
@@ -1126,10 +1224,11 @@ read_again:
 			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
 			r1_bio->master_bio = bio;
-			r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+			r1_bio->sectors = bio_sectors(bio) - sectors_handled;
 			r1_bio->state = 0;
 			r1_bio->mddev = mddev;
-			r1_bio->sector = bio->bi_sector + sectors_handled;
+			r1_bio->sector = bio->bi_iter.bi_sector +
+				sectors_handled;
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
@@ -1157,6 +1256,7 @@ read_again:
 
 	disks = conf->raid_disks * 2;
  retry_write:
+	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1225,14 +1325,24 @@ read_again:
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
+		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		start_next_window = wait_barrier(conf, bio);
+		/*
+		 * We must make sure the multi r1bios of bio have
+		 * the same value of bi_phys_segments
+		 */
+		if (bio->bi_phys_segments && old &&
+		    old != start_next_window)
+			/* Wait for the former r1bio(s) to complete */
+			wait_event(conf->wait_barrier,
+				   bio->bi_phys_segments == 1);
 		goto retry_write;
 	}
 
@@ -1248,7 +1358,7 @@ read_again:
 			bio->bi_phys_segments++;
 		spin_unlock_irq(&conf->device_lock);
 	}
-	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
+	sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -1260,7 +1370,7 @@ read_again:
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1283,14 +1393,10 @@ read_again:
 			struct bio_vec *bvec;
 			int j;
 
-			/* Yes, I really want the '__' version so that
-			 * we clear any unused pointer in the io_vec, rather
-			 * than leave them unchanged.  This is important
-			 * because when we come to free the pages, we won't
-			 * know the original bi_idx, so we just free
-			 * them all
+			/*
+			 * We trimmed the bio, so _all is legit
 			 */
-			__bio_for_each_segment(bvec, mbio, j, 0)
+			bio_for_each_segment_all(bvec, mbio, j)
 				bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
@@ -1298,11 +1404,12 @@ read_again:
 
 		r1_bio->bios[i] = mbio;
 
-		mbio->bi_sector	= (r1_bio->sector +
+		mbio->bi_iter.bi_sector	= (r1_bio->sector +
 				   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+		mbio->bi_rw =
+			WRITE | do_flush_fua | do_sync | do_discard | do_same;
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
@@ -1327,17 +1434,17 @@ read_again:
 	/* Mustn't call r1_bio_write_done before this next test,
 	 * as it could result in the bio being freed.
 	 */
-	if (sectors_handled < (bio->bi_size >> 9)) {
+	if (sectors_handled < bio_sectors(bio)) {
 		r1_bio_write_done(r1_bio);
 		/* We need another r1_bio.  It has already been counted
 		 * in bio->bi_phys_segments
 		 */
 		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 		r1_bio->master_bio = bio;
-		r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
 		r1_bio->state = 0;
 		r1_bio->mddev = mddev;
-		r1_bio->sector = bio->bi_sector + sectors_handled;
+		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
 		goto retry_write;
 	}
 
@@ -1435,11 +1542,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	wait_barrier(conf, NULL);
+	allow_barrier(conf, 0, 0);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+
+	conf->next_resync = 0;
+	conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -1476,6 +1586,7 @@ static int raid1_spare_active(struct mddev *mddev)
 			}
 		}
 		if (rdev
+		    && rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
 			count++;
@@ -1516,8 +1627,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		p = conf->mirrors+mirror;
 		if (!p->rdev) {
 
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -1551,12 +1663,12 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * we wait for all outstanding requests to complete.
 		 */
 		synchronize_sched();
-		raise_barrier(conf);
-		lower_barrier(conf);
+		freeze_array(conf, 0);
+		unfreeze_array(conf);
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
-	if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 	print_conf(conf);
 	return err;
@@ -1602,11 +1714,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			 */
 			struct md_rdev *repl =
 				conf->mirrors[conf->raid_disks + number].rdev;
-			raise_barrier(conf);
+			freeze_array(conf, 0);
 			clear_bit(Replacement, &repl->flags);
 			p->rdev = repl;
 			conf->mirrors[conf->raid_disks + number].rdev = NULL;
-			lower_barrier(conf);
+			unfreeze_array(conf);
 			clear_bit(WantReplacement, &rdev->flags);
 		} else
 			clear_bit(WantReplacement, &rdev->flags);
@@ -1845,6 +1957,40 @@ static int process_checks(struct r1bio *r1_bio)
 	int i;
 	int vcnt;
 
+	/* Fix variable parts of all bios */
+	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
+	for (i = 0; i < conf->raid_disks * 2; i++) {
+		int j;
+		int size;
+		int uptodate;
+		struct bio *b = r1_bio->bios[i];
+		if (b->bi_end_io != end_sync_read)
+			continue;
+		/* fixup the bio for reuse, but preserve BIO_UPTODATE */
+		uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+		bio_reset(b);
+		if (!uptodate)
+			clear_bit(BIO_UPTODATE, &b->bi_flags);
+		b->bi_vcnt = vcnt;
+		b->bi_iter.bi_size = r1_bio->sectors << 9;
+		b->bi_iter.bi_sector = r1_bio->sector +
+			conf->mirrors[i].rdev->data_offset;
+		b->bi_bdev = conf->mirrors[i].rdev->bdev;
+		b->bi_end_io = end_sync_read;
+		b->bi_private = r1_bio;
+
+		size = b->bi_iter.bi_size;
+		for (j = 0; j < vcnt ; j++) {
+			struct bio_vec *bi;
+			bi = &b->bi_io_vec[j];
+			bi->bv_offset = 0;
+			if (size > PAGE_SIZE)
+				bi->bv_len = PAGE_SIZE;
+			else
+				bi->bv_len = size;
+			size -= PAGE_SIZE;
+		}
+	}
 	for (primary = 0; primary < conf->raid_disks * 2; primary++)
 		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
 		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
@@ -1853,17 +1999,18 @@ static int process_checks(struct r1bio *r1_bio)
 			break;
 		}
 	r1_bio->read_disk = primary;
-	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
-		int size;
+		int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
 
-		if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+		if (sbio->bi_end_io != end_sync_read)
 			continue;
+		/* Now we can 'fixup' the BIO_UPTODATE flag */
+		set_bit(BIO_UPTODATE, &sbio->bi_flags);
 
-		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+		if (uptodate) {
 			for (j = vcnt; j-- ; ) {
 				struct page *p, *s;
 				p = pbio->bi_io_vec[j].bv_page;
@@ -1878,37 +2025,14 @@ static int process_checks(struct r1bio *r1_bio)
 		if (j >= 0)
 			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
 		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+			      && uptodate)) {
 			/* No need to write to this device. */
 			sbio->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
 			continue;
 		}
-		/* fixup the bio for reuse */
-		sbio->bi_vcnt = vcnt;
-		sbio->bi_size = r1_bio->sectors << 9;
-		sbio->bi_idx = 0;
-		sbio->bi_phys_segments = 0;
-		sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-		sbio->bi_flags |= 1 << BIO_UPTODATE;
-		sbio->bi_next = NULL;
-		sbio->bi_sector = r1_bio->sector +
-			conf->mirrors[i].rdev->data_offset;
-		sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		size = sbio->bi_size;
-		for (j = 0; j < vcnt ; j++) {
-			struct bio_vec *bi;
-			bi = &sbio->bi_io_vec[j];
-			bi->bv_offset = 0;
-			if (size > PAGE_SIZE)
-				bi->bv_len = PAGE_SIZE;
-			else
-				bi->bv_len = size;
-			size -= PAGE_SIZE;
-			memcpy(page_address(bi->bv_page),
-			       page_address(pbio->bi_io_vec[j].bv_page),
-			       PAGE_SIZE);
-		}
+
+		bio_copy_data(sbio, pbio);
 	}
 	return 0;
 }
@@ -1945,7 +2069,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 		wbio->bi_rw = WRITE;
 		wbio->bi_end_io = end_sync_write;
 		atomic_inc(&r1_bio->remaining);
-		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+		md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
 
 		generic_make_request(wbio);
 	}
@@ -2057,32 +2181,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 	}
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-	struct completion event;
-	rw |= REQ_SYNC;
-
-	init_completion(&event);
-	bio->bi_private = &event;
-	bio->bi_end_io = bi_complete;
-	submit_bio(rw, bio);
-	wait_for_completion(&event);
-
-	return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
 static int narrow_write_error(struct r1bio *r1_bio, int i)
 {
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	struct md_rdev *rdev = conf->mirrors[i].rdev;
-	int vcnt, idx;
-	struct bio_vec *vec;
 
 	/* bio has the data to be written to device 'i' where
 	 * we just recently had a write error.
@@ -2110,33 +2213,35 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		   & ~(sector_t)(block_sectors - 1))
 		- sector;
 
-	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-		vcnt = r1_bio->behind_page_count;
-		vec = r1_bio->behind_bvecs;
-		idx = 0;
-		while (vec[idx].bv_page == NULL)
-			idx++;
-	} else {
-		vcnt = r1_bio->master_bio->bi_vcnt;
-		vec = r1_bio->master_bio->bi_io_vec;
-		idx = r1_bio->master_bio->bi_idx;
-	}
 	while (sect_to_write) {
 		struct bio *wbio;
 		if (sectors > sect_to_write)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors'*/
 
-		wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
-		memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-		wbio->bi_sector = r1_bio->sector;
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			unsigned vcnt = r1_bio->behind_page_count;
+			struct bio_vec *vec = r1_bio->behind_bvecs;
+
+			while (!vec->bv_page) {
+				vec++;
+				vcnt--;
+			}
+
+			wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+			memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+
+			wbio->bi_vcnt = vcnt;
+		} else {
+			wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+		}
+
 		wbio->bi_rw = WRITE;
-		wbio->bi_vcnt = vcnt;
-		wbio->bi_size = r1_bio->sectors << 9;
-		wbio->bi_idx = idx;
+		wbio->bi_iter.bi_sector = r1_bio->sector;
+		wbio->bi_iter.bi_size = r1_bio->sectors << 9;
 
-		md_trim_bio(wbio, sector - r1_bio->sector, sectors);
-		wbio->bi_sector += rdev->data_offset;
+		bio_trim(wbio, sector - r1_bio->sector, sectors);
+		wbio->bi_iter.bi_sector += rdev->data_offset;
 		wbio->bi_bdev = rdev->bdev;
 		if (submit_bio_wait(WRITE, wbio) == 0)
 			/* failure! */
@@ -2223,7 +2328,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 * frozen
 	 */
 	if (mddev->ro == 0) {
-		freeze_array(conf);
+		freeze_array(conf, 1);
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
@@ -2250,7 +2355,8 @@ read_more:
 		}
 		r1_bio->read_disk = disk;
 		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
-		md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+		bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 		r1_bio->bios[r1_bio->read_disk] = bio;
 		rdev = conf->mirrors[disk].rdev;
 		printk_ratelimited(KERN_ERR
@@ -2259,7 +2365,7 @@ read_more:
 				   mdname(mddev),
 				   (unsigned long long)r1_bio->sector,
 				   bdevname(rdev->bdev, b));
-		bio->bi_sector = r1_bio->sector + rdev->data_offset;
+		bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
 		bio->bi_bdev = rdev->bdev;
 		bio->bi_end_io = raid1_end_read_request;
 		bio->bi_rw = READ | do_sync;
@@ -2268,7 +2374,7 @@ read_more:
 			/* Drat - have to split this up more */
 			struct bio *mbio = r1_bio->master_bio;
 			int sectors_handled = (r1_bio->sector + max_sectors
-					       - mbio->bi_sector);
+					       - mbio->bi_iter.bi_sector);
 			r1_bio->sectors = max_sectors;
 			spin_lock_irq(&conf->device_lock);
 			if (mbio->bi_phys_segments == 0)
@@ -2282,12 +2388,12 @@ read_more:
 			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
 			r1_bio->master_bio = mbio;
-			r1_bio->sectors = (mbio->bi_size >> 9)
-					  - sectors_handled;
+			r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
 			r1_bio->state = 0;
 			set_bit(R1BIO_ReadError, &r1_bio->state);
 			r1_bio->mddev = mddev;
-			r1_bio->sector = mbio->bi_sector + sectors_handled;
+			r1_bio->sector = mbio->bi_iter.bi_sector +
+				sectors_handled;
 
 			goto read_more;
 		} else
@@ -2457,18 +2563,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
-
-		/* take from bio_init */
-		bio->bi_next = NULL;
-		bio->bi_flags &= ~(BIO_POOL_MASK-1);
-		bio->bi_flags |= 1 << BIO_UPTODATE;
-		bio->bi_rw = READ;
-		bio->bi_vcnt = 0;
-		bio->bi_idx = 0;
-		bio->bi_phys_segments = 0;
-		bio->bi_size = 0;
-		bio->bi_end_io = NULL;
-		bio->bi_private = NULL;
+		bio_reset(bio);
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
@@ -2522,7 +2617,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		}
 		if (bio->bi_end_io) {
 			atomic_inc(&rdev->nr_pending);
-			bio->bi_sector = sector_nr + rdev->data_offset;
+			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
 			bio->bi_private = r1_bio;
 		}
@@ -2622,7 +2717,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 							continue;
 						/* remove last page from this bio */
 						bio->bi_vcnt--;
-						bio->bi_size -= len;
+						bio->bi_iter.bi_size -= len;
 						bio->bi_flags &= ~(1<< BIO_SEG_VALID);
 					}
 					goto bio_full;
@@ -2710,7 +2805,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		    || disk_idx < 0)
 			continue;
 		if (test_bit(Replacement, &rdev->flags))
-			disk = conf->mirrors + conf->raid_disks + disk_idx;
+			disk = conf->mirrors + mddev->raid_disks + disk_idx;
 		else
 			disk = conf->mirrors + disk_idx;
 
@@ -2735,6 +2830,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	conf->start_next_window = MaxSector;
+	conf->current_window_requests = conf->next_window_requests = 0;
+
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2819,6 +2917,9 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	if (mddev->queue)
+		blk_queue_max_write_same_sectors(mddev->queue, 0);
+
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
 			continue;
@@ -2889,13 +2990,14 @@ static int stop(struct mddev *mddev)
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
-	raise_barrier(conf);
-	lower_barrier(conf);
+	freeze_array(conf, 0);
+	unfreeze_array(conf);
 
 	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	kfree(conf->mirrors);
+	safe_put_page(conf->tmppage);
 	kfree(conf->poolinfo);
 	kfree(conf);
 	mddev->private = NULL;
@@ -2999,7 +3101,7 @@ static int raid1_reshape(struct mddev *mddev)
 		return -ENOMEM;
 	}
 
-	raise_barrier(conf);
+	freeze_array(conf, 0);
 
 	/* ok, everything is stopped */
 	oldpool = conf->r1bio_pool;
@@ -3030,7 +3132,7 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	lower_barrier(conf);
+	unfreeze_array(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -3048,10 +3150,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
 		wake_up(&conf->wait_barrier);
 		break;
 	case 1:
-		raise_barrier(conf);
+		freeze_array(conf, 0);
 		break;
 	case 0:
-		lower_barrier(conf);
+		unfreeze_array(conf);
 		break;
 	}
 }
@@ -3068,7 +3170,8 @@ static void *raid1_takeover(struct mddev *mddev)
 		mddev->new_chunk_sectors = 0;
 		conf = setup_conf(mddev);
 		if (!IS_ERR(conf))
-			conf->barrier = 1;
+			/* Array must appear to be quiesced */
+			conf->array_frozen = 1;
 		return conf;
 	}
 	return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7e..9bebca7bff2 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf {
 	 */
 	sector_t		next_resync;
 
+	/* When raid1 starts resync, we divide array into four partitions
+	 * |---------|--------------|---------------------|-------------|
+	 *        next_resync   start_next_window       end_window
+	 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
+	 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
+	 * current_window_requests means the count of normalIO between
+	 *   start_next_window and end_window.
+	 * next_window_requests means the count of normalIO after end_window.
+	 * */
+	sector_t		start_next_window;
+	int			current_window_requests;
+	int			next_window_requests;
+
 	spinlock_t		device_lock;
 
 	/* list of 'struct r1bio' that need to be processed by raid1d,
@@ -65,6 +78,7 @@ struct r1conf {
 	int			nr_waiting;
 	int			nr_queued;
 	int			barrier;
+	int			array_frozen;
 
 	/* Set to 1 if a full sync is needed, (fresh device added).
 	 * Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
 						 * in this BehindIO request
 						 */
 	sector_t		sector;
+	sector_t		start_next_window;
 	int			sectors;
 	unsigned long		state;
 	struct mddev		*mddev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 906ccbd0f7d..cb882aae9e2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
  *    near_copies (stored in low byte of layout)
  *    far_copies (stored in second byte of layout)
  *    far_offset (stored in bit 16 of layout )
+ *    use_far_sets (stored in bit 17 of layout )
  *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.   In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
  */
 
 /*
@@ -82,7 +97,7 @@ static int max_queued_requests = 1024;
 
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
-static int enough(struct r10conf *conf, int ignore);
+static int _enough(struct r10conf *conf, int previous, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
@@ -377,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 * than fail the last device.  Here we redefine
 		 * "uptodate" to mean "Don't want to retry"
 		 */
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!enough(conf, rdev->raid_disk))
+		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+			     rdev->raid_disk))
 			uptodate = 1;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	if (uptodate) {
 		raid_end_bio_io(r10_bio);
@@ -475,7 +488,17 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		sector_t first_bad;
 		int bad_sectors;
 
-		set_bit(R10BIO_Uptodate, &r10_bio->state);
+		/*
+		 * Do not set R10BIO_Uptodate if the current device is
+		 * rebuilding or Faulty. This is because we cannot use
+		 * such device for properly reading the data back (we could
+		 * potentially use it, if the current write would have felt
+		 * before rdev->recovery_offset, but for simplicity we don't
+		 * check this here.
+		 */
+		if (test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
+			set_bit(R10BIO_Uptodate, &r10_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
 		if (is_badblock(rdev,
@@ -499,7 +522,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	 */
 	one_write_done(r10_bio);
 	if (dec_rdev)
-		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 }
 
 /*
@@ -535,6 +558,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);
 
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +581,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
@@ -595,6 +635,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -602,13 +656,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
@@ -952,7 +1006,7 @@ static void raise_barrier(struct r10conf *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock, );
+			    conf->resync_lock);
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -960,7 +1014,7 @@ static void raise_barrier(struct r10conf *conf, int force)
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, );
+			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -993,8 +1047,7 @@ static void wait_barrier(struct r10conf *conf)
 				    (conf->nr_pending &&
 				     current->bio_list &&
 				     !bio_list_empty(current->bio_list)),
-				    conf->resync_lock,
-			);
+				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -1010,27 +1063,27 @@ static void allow_barrier(struct r10conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void freeze_array(struct r10conf *conf)
+static void freeze_array(struct r10conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quiet.
 	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+1
+	 * wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
 	 * pending IO requests to complete or be queued for re-try.
-	 * Thus the number queued (nr_queued) plus this request (1)
+	 * Thus the number queued (nr_queued) plus this request (extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier++;
 	conf->nr_waiting++;
-	wait_event_lock_irq(conf->wait_barrier,
-			    conf->nr_pending == conf->nr_queued+1,
-			    conf->resync_lock,
-			    flush_pending_writes(conf));
+	wait_event_lock_irq_cmd(conf->wait_barrier,
+				conf->nr_pending == conf->nr_queued+extra,
+				conf->resync_lock,
+				flush_pending_writes(conf));
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -1069,11 +1122,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	struct r10conf *conf = mddev->private;
 	struct bio *bio;
 
-	if (from_schedule) {
+	if (from_schedule || current->bio_list) {
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1087,25 +1141,29 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
 		bio->bi_next = NULL;
-		generic_make_request(bio);
+		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+			/* Just ignore it */
+			bio_endio(bio, 0);
+		else
+			generic_make_request(bio);
 		bio = next;
 	}
 	kfree(plug);
 }
 
-static void make_request(struct mddev *mddev, struct bio * bio)
+static void __make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
-	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
-	int chunk_sects = chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1114,62 +1172,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int max_sectors;
 	int sectors;
 
-	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-		md_flush_request(mddev, bio);
-		return;
-	}
-
-	/* If this request crosses a chunk boundary, we need to
-	 * split it.  This will only happen for 1 PAGE (or less) requests.
-	 */
-	if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
-		     > chunk_sects
-		     && (conf->geo.near_copies < conf->geo.raid_disks
-			 || conf->prev.near_copies < conf->prev.raid_disks))) {
-		struct bio_pair *bp;
-		/* Sanity check -- queue functions should prevent this happening */
-		if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-		    bio->bi_idx != 0)
-			goto bad_map;
-		/* This is a one page bio that upper layers
-		 * refuse to split for us, so we need to split it.
-		 */
-		bp = bio_split(bio,
-			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
-
-		/* Each of these 'make_request' calls will call 'wait_barrier'.
-		 * If the first succeeds but the second blocks due to the resync
-		 * thread raising the barrier, we will deadlock because the
-		 * IO to the underlying device will be queued in generic_make_request
-		 * and will never complete, so will never reduce nr_pending.
-		 * So increment nr_waiting here so no new raise_barriers will
-		 * succeed, and so the second wait_barrier cannot block.
-		 */
-		spin_lock_irq(&conf->resync_lock);
-		conf->nr_waiting++;
-		spin_unlock_irq(&conf->resync_lock);
-
-		make_request(mddev, &bp->bio1);
-		make_request(mddev, &bp->bio2);
-
-		spin_lock_irq(&conf->resync_lock);
-		conf->nr_waiting--;
-		wake_up(&conf->wait_barrier);
-		spin_unlock_irq(&conf->resync_lock);
-
-		bio_pair_release(bp);
-		return;
-	bad_map:
-		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
-		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
-
-		bio_io_error(bio);
-		return;
-	}
-
-	md_write_start(mddev, bio);
-
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
@@ -1177,26 +1179,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	 */
 	wait_barrier(conf);
 
-	sectors = bio->bi_size >> 9;
+	sectors = bio_sectors(bio);
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_sector < conf->reshape_progress &&
-	    bio->bi_sector + sectors > conf->reshape_progress) {
+	    bio->bi_iter.bi_sector < conf->reshape_progress &&
+	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
 		/* IO spans the reshape position.  Need to wait for
 		 * reshape to pass
 		 */
 		allow_barrier(conf);
 		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_sector ||
-			   conf->reshape_progress >= bio->bi_sector + sectors);
+			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
+			   conf->reshape_progress >= bio->bi_iter.bi_sector +
+			   sectors);
 		wait_barrier(conf);
 	}
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio_data_dir(bio) == WRITE &&
 	    (mddev->reshape_backwards
-	     ? (bio->bi_sector < conf->reshape_safe &&
-		bio->bi_sector + sectors > conf->reshape_progress)
-	     : (bio->bi_sector + sectors > conf->reshape_safe &&
-		bio->bi_sector < conf->reshape_progress))) {
+	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
+		bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
+	     : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
+		bio->bi_iter.bi_sector < conf->reshape_progress))) {
 		/* Need to update reshape_position in metadata */
 		mddev->reshape_position = conf->reshape_progress;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1214,7 +1217,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	r10_bio->sectors = sectors;
 
 	r10_bio->mddev = mddev;
-	r10_bio->sector = bio->bi_sector;
+	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
 
 	/* We might need to issue multiple reads to different
@@ -1243,13 +1246,13 @@ read_again:
 		slot = r10_bio->read_slot;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
+		bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 
 		r10_bio->devs[slot].bio = read_bio;
 		r10_bio->devs[slot].rdev = rdev;
 
-		read_bio->bi_sector = r10_bio->devs[slot].addr +
+		read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
 			choose_data_offset(r10_bio, rdev);
 		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
@@ -1260,15 +1263,15 @@ read_again:
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
 			 */
-			sectors_handled = (r10_bio->sectors + max_sectors
-					   - bio->bi_sector);
+			sectors_handled = (r10_bio->sector + max_sectors
+					   - bio->bi_iter.bi_sector);
 			r10_bio->sectors = max_sectors;
 			spin_lock_irq(&conf->device_lock);
 			if (bio->bi_phys_segments == 0)
 				bio->bi_phys_segments = 2;
 			else
 				bio->bi_phys_segments++;
-			spin_unlock(&conf->device_lock);
+			spin_unlock_irq(&conf->device_lock);
 			/* Cannot call generic_make_request directly
 			 * as that will be queued in __generic_make_request
 			 * and subsequent mempool_alloc might block
@@ -1279,11 +1282,11 @@ read_again:
 			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 			r10_bio->master_bio = bio;
-			r10_bio->sectors = ((bio->bi_size >> 9)
-					    - sectors_handled);
+			r10_bio->sectors = bio_sectors(bio) - sectors_handled;
 			r10_bio->state = 0;
 			r10_bio->mddev = mddev;
-			r10_bio->sector = bio->bi_sector + sectors_handled;
+			r10_bio->sector = bio->bi_iter.bi_sector +
+				sectors_handled;
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
@@ -1334,18 +1337,21 @@ retry_write:
 			blocked_rdev = rrdev;
 			break;
 		}
+		if (rdev && (test_bit(Faulty, &rdev->flags)
+			     || test_bit(Unmerged, &rdev->flags)))
+			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)
 			      || test_bit(Unmerged, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags) ||
-		    test_bit(Unmerged, &rdev->flags)) {
+
+		if (!rdev && !rrdev) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
-		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 			sector_t first_bad;
 			sector_t dev_sector = r10_bio->devs[i].addr;
 			int bad_sectors;
@@ -1387,8 +1393,10 @@ retry_write:
 					max_sectors = good_sectors;
 			}
 		}
-		r10_bio->devs[i].bio = bio;
-		atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			r10_bio->devs[i].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
 		if (rrdev) {
 			r10_bio->devs[i].repl_bio = bio;
 			atomic_inc(&rrdev->nr_pending);
@@ -1436,7 +1444,8 @@ retry_write:
 			bio->bi_phys_segments++;
 		spin_unlock_irq(&conf->device_lock);
 	}
-	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+	sectors_handled = r10_bio->sector + max_sectors -
+		bio->bi_iter.bi_sector;
 
 	atomic_set(&r10_bio->remaining, 1);
 	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
@@ -1444,76 +1453,80 @@ retry_write:
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
-		if (!r10_bio->devs[i].bio)
-			continue;
+		if (r10_bio->devs[i].bio) {
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
+				 max_sectors);
+			r10_bio->devs[i].bio = mbio;
+
+			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr+
+					   choose_data_offset(r10_bio,
+							      rdev));
+			mbio->bi_bdev = rdev->bdev;
+			mbio->bi_end_io	= raid10_end_write_request;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
+			mbio->bi_private = r10_bio;
 
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
-		r10_bio->devs[i].bio = mbio;
+			atomic_inc(&r10_bio->remaining);
 
-		mbio->bi_sector	= (r10_bio->devs[i].addr+
-				   choose_data_offset(r10_bio,
-						      conf->mirrors[d].rdev));
-		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
-		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
-		mbio->bi_private = r10_bio;
+			cb = blk_check_plugged(raid10_unplug, mddev,
+					       sizeof(*plug));
+			if (cb)
+				plug = container_of(cb, struct raid10_plug_cb,
+						    cb);
+			else
+				plug = NULL;
+			spin_lock_irqsave(&conf->device_lock, flags);
+			if (plug) {
+				bio_list_add(&plug->pending, mbio);
+				plug->pending_cnt++;
+			} else {
+				bio_list_add(&conf->pending_bio_list, mbio);
+				conf->pending_count++;
+			}
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			if (!plug)
+				md_wakeup_thread(mddev->thread);
+		}
 
-		atomic_inc(&r10_bio->remaining);
+		if (r10_bio->devs[i].repl_bio) {
+			struct md_rdev *rdev = conf->mirrors[d].replacement;
+			if (rdev == NULL) {
+				/* Replacement just got moved to main 'rdev' */
+				smp_mb();
+				rdev = conf->mirrors[d].rdev;
+			}
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
+				 max_sectors);
+			r10_bio->devs[i].repl_bio = mbio;
+
+			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr +
+					   choose_data_offset(
+						   r10_bio, rdev));
+			mbio->bi_bdev = rdev->bdev;
+			mbio->bi_end_io	= raid10_end_write_request;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
+			mbio->bi_private = r10_bio;
 
-		cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
-		if (cb)
-			plug = container_of(cb, struct raid10_plug_cb, cb);
-		else
-			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (plug) {
-			bio_list_add(&plug->pending, mbio);
-			plug->pending_cnt++;
-		} else {
+			atomic_inc(&r10_bio->remaining);
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			if (!mddev_check_plugged(mddev))
+				md_wakeup_thread(mddev->thread);
 		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
-			md_wakeup_thread(mddev->thread);
-
-		if (!r10_bio->devs[i].repl_bio)
-			continue;
-
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
-		r10_bio->devs[i].repl_bio = mbio;
-
-		/* We are actively writing to the original device
-		 * so it cannot disappear, so the replacement cannot
-		 * become NULL here
-		 */
-		mbio->bi_sector	= (r10_bio->devs[i].addr +
-				   choose_data_offset(
-					   r10_bio,
-					   conf->mirrors[d].replacement));
-		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
-		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
-		mbio->bi_private = r10_bio;
-
-		atomic_inc(&r10_bio->remaining);
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!mddev_check_plugged(mddev))
-			md_wakeup_thread(mddev->thread);
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
 	 * after checking if we need to go around again.
 	 */
 
-	if (sectors_handled < (bio->bi_size >> 9)) {
+	if (sectors_handled < bio_sectors(bio)) {
 		one_write_done(r10_bio);
 		/* We need another r10_bio.  It has already been counted
 		 * in bio->bi_phys_segments.
@@ -1521,14 +1534,54 @@ retry_write:
 		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 		r10_bio->master_bio = bio;
-		r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
 
 		r10_bio->mddev = mddev;
-		r10_bio->sector = bio->bi_sector + sectors_handled;
+		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
 		r10_bio->state = 0;
 		goto retry_write;
 	}
 	one_write_done(r10_bio);
+}
+
+static void make_request(struct mddev *mddev, struct bio *bio)
+{
+	struct r10conf *conf = mddev->private;
+	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
+	int chunk_sects = chunk_mask + 1;
+
+	struct bio *split;
+
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
+		return;
+	}
+
+	md_write_start(mddev, bio);
+
+
+	do {
+
+		/*
+		 * If this request crosses a chunk boundary, we need to split
+		 * it.
+		 */
+		if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+			     bio_sectors(bio) > chunk_sects
+			     && (conf->geo.near_copies < conf->geo.raid_disks
+				 || conf->prev.near_copies <
+				 conf->prev.raid_disks))) {
+			split = bio_split(bio, chunk_sects -
+					  (bio->bi_iter.bi_sector &
+					   (chunk_sects - 1)),
+					  GFP_NOIO, fs_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
+
+		__make_request(mddev, split);
+	} while (split != bio);
 
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
@@ -1563,37 +1616,58 @@ static void status(struct seq_file *seq, struct mddev *mddev)
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
+static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int has_enough = 0;
+	int disks, ncopies;
+	if (previous) {
+		disks = conf->prev.raid_disks;
+		ncopies = conf->prev.near_copies;
+	} else {
+		disks = conf->geo.raid_disks;
+		ncopies = conf->geo.near_copies;
+	}
 
+	rcu_read_lock();
 	do {
 		int n = conf->copies;
 		int cnt = 0;
 		int this = first;
 		while (n--) {
-			if (conf->mirrors[this].rdev &&
-			    this != ignore)
+			struct md_rdev *rdev;
+			if (this != ignore &&
+			    (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+			    test_bit(In_sync, &rdev->flags))
 				cnt++;
-			this = (this+1) % geo->raid_disks;
+			this = (this+1) % disks;
 		}
 		if (cnt == 0)
-			return 0;
-		first = (first + geo->near_copies) % geo->raid_disks;
+			goto out;
+		first = (first + ncopies) % disks;
 	} while (first != 0);
-	return 1;
+	has_enough = 1;
+out:
+	rcu_read_unlock();
+	return has_enough;
 }
 
 static int enough(struct r10conf *conf, int ignore)
 {
-	return _enough(conf, &conf->geo, ignore) &&
-		_enough(conf, &conf->prev, ignore);
+	/* when calling 'enough', both 'prev' and 'geo' must
+	 * be stable.
+	 * This is ensured if ->reconfig_mutex or ->device_lock
+	 * is held.
+	 */
+	return _enough(conf, 0, ignore) &&
+		_enough(conf, 1, ignore);
 }
 
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r10conf *conf = mddev->private;
+	unsigned long flags;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1601,18 +1675,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	 * next level up know.
 	 * else mark the drive as failed
 	 */
+	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
-	    && !enough(conf, rdev->raid_disk))
+	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
 		 */
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		return;
+	}
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
+			/*
 		 * if recovery is running, make sure it aborts.
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -1620,6 +1694,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
@@ -1693,6 +1768,7 @@ static int raid10_spare_active(struct mddev *mddev)
 			}
 			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
 		} else if (tmp->rdev
+			   && tmp->rdev->recovery_offset == MaxSector
 			   && !test_bit(Faulty, &tmp->rdev->flags)
 			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
@@ -1722,7 +1798,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * very different from resync
 		 */
 		return -EBUSY;
-	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
+	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
@@ -1750,15 +1826,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(Replacement, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
-			disk_stack_limits(mddev->gendisk, rdev->bdev,
-					  rdev->data_offset << 9);
+			if (mddev->gendisk)
+				disk_stack_limits(mddev->gendisk, rdev->bdev,
+						  rdev->data_offset << 9);
 			conf->fullsync = 1;
 			rcu_assign_pointer(p->replacement, rdev);
 			break;
 		}
 
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1778,12 +1856,12 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * we wait for all outstanding requests to complete.
 		 */
 		synchronize_sched();
-		raise_barrier(conf, 0);
-		lower_barrier(conf);
+		freeze_array(conf, 0);
+		unfreeze_array(conf);
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
-	if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 
 	print_conf(conf);
@@ -2006,11 +2084,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
 			 */
-			for (j = 0; j < vcnt; j++)
+			int sectors = r10_bio->sectors;
+			for (j = 0; j < vcnt; j++) {
+				int len = PAGE_SIZE;
+				if (sectors < (len / 512))
+					len = sectors * 512;
 				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
 					   page_address(tbio->bi_io_vec[j].bv_page),
-					   fbio->bi_io_vec[j].bv_len))
+					   len))
 					break;
+				sectors -= len/512;
+			}
 			if (j == vcnt)
 				continue;
 			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
@@ -2023,16 +2107,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		 * First we need to fixup bv_offset, bv_len and
 		 * bi_vecs, as the read request might have corrupted these
 		 */
+		bio_reset(tbio);
+
 		tbio->bi_vcnt = vcnt;
-		tbio->bi_size = r10_bio->sectors << 9;
-		tbio->bi_idx = 0;
-		tbio->bi_phys_segments = 0;
-		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-		tbio->bi_flags |= 1 << BIO_UPTODATE;
-		tbio->bi_next = NULL;
+		tbio->bi_iter.bi_size = r10_bio->sectors << 9;
 		tbio->bi_rw = WRITE;
 		tbio->bi_private = r10_bio;
-		tbio->bi_sector = r10_bio->devs[i].addr;
+		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
 
 		for (j=0; j < vcnt ; j++) {
 			tbio->bi_io_vec[j].bv_offset = 0;
@@ -2047,9 +2128,9 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		d = r10_bio->devs[i].devnum;
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		atomic_inc(&r10_bio->remaining);
-		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
 
-		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
+		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
 		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		generic_make_request(tbio);
 	}
@@ -2072,7 +2153,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		d = r10_bio->devs[i].devnum;
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
-			     tbio->bi_size >> 9);
+			     bio_sectors(tbio));
 		generic_make_request(tbio);
 	}
 
@@ -2196,15 +2277,21 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	d = r10_bio->devs[1].devnum;
 	wbio = r10_bio->devs[1].bio;
 	wbio2 = r10_bio->devs[1].repl_bio;
+	/* Need to test wbio2->bi_end_io before we call
+	 * generic_make_request as if the former is NULL,
+	 * the latter is free to free wbio2.
+	 */
+	if (wbio2 && !wbio2->bi_end_io)
+		wbio2 = NULL;
 	if (wbio->bi_end_io) {
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-		md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
 		generic_make_request(wbio);
 	}
-	if (wbio2 && wbio2->bi_end_io) {
+	if (wbio2) {
 		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
-			     wbio2->bi_size >> 9);
+			     bio_sectors(wbio2));
 		generic_make_request(wbio2);
 	}
 }
@@ -2475,25 +2562,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 	}
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-	struct completion event;
-	rw |= REQ_SYNC;
-
-	init_completion(&event);
-	bio->bi_private = &event;
-	bio->bi_end_io = bi_complete;
-	submit_bio(rw, bio);
-	wait_for_completion(&event);
-
-	return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
 static int narrow_write_error(struct r10bio *r10_bio, int i)
 {
 	struct bio *bio = r10_bio->master_bio;
@@ -2532,8 +2600,8 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors' */
 		wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(wbio, sector - bio->bi_sector, sectors);
-		wbio->bi_sector = (r10_bio->devs[i].addr+
+		bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
+		wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
 				   choose_data_offset(r10_bio, rdev) +
 				   (sector - r10_bio->sector));
 		wbio->bi_bdev = rdev->bdev;
@@ -2575,7 +2643,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	r10_bio->devs[slot].bio = NULL;
 
 	if (mddev->ro == 0) {
-		freeze_array(conf);
+		freeze_array(conf, 1);
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	} else
@@ -2605,12 +2673,10 @@ read_more:
 		(unsigned long long)r10_bio->sector);
 	bio = bio_clone_mddev(r10_bio->master_bio,
 			      GFP_NOIO, mddev);
-	md_trim_bio(bio,
-		    r10_bio->sector - bio->bi_sector,
-		    max_sectors);
+	bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 	r10_bio->devs[slot].bio = bio;
 	r10_bio->devs[slot].rdev = rdev;
-	bio->bi_sector = r10_bio->devs[slot].addr
+	bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
 		+ choose_data_offset(r10_bio, rdev);
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_rw = READ | do_sync;
@@ -2621,7 +2687,7 @@ read_more:
 		struct bio *mbio = r10_bio->master_bio;
 		int sectors_handled =
 			r10_bio->sector + max_sectors
-			- mbio->bi_sector;
+			- mbio->bi_iter.bi_sector;
 		r10_bio->sectors = max_sectors;
 		spin_lock_irq(&conf->device_lock);
 		if (mbio->bi_phys_segments == 0)
@@ -2634,13 +2700,12 @@ read_more:
 		r10_bio = mempool_alloc(conf->r10bio_pool,
 					GFP_NOIO);
 		r10_bio->master_bio = mbio;
-		r10_bio->sectors = (mbio->bi_size >> 9)
-			- sectors_handled;
+		r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
 		r10_bio->state = 0;
 		set_bit(R10BIO_ReadError,
 			&r10_bio->state);
 		r10_bio->mddev = mddev;
-		r10_bio->sector = mbio->bi_sector
+		r10_bio->sector = mbio->bi_iter.bi_sector
 			+ sectors_handled;
 
 		goto read_more;
@@ -2857,6 +2922,21 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		if (init_resync(conf))
 			return 0;
 
+	/*
+	 * Allow skipping a full rebuild for incremental assembly
+	 * of a clean array, like RAID1 does.
+	 */
+	if (mddev->bitmap == NULL &&
+	    mddev->recovery_cp == MaxSector &&
+	    mddev->reshape_position == MaxSector &&
+	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    conf->fullsync == 0) {
+		*skipped = 1;
+		return mddev->dev_sectors - sector_nr;
+	}
+
  skipped:
 	max_sector = mddev->dev_sectors;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -3056,13 +3136,15 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 					}
 				}
 				bio = r10_bio->devs[0].bio;
+				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio->bi_rw = READ;
 				from_addr = r10_bio->devs[j].addr;
-				bio->bi_sector = from_addr + rdev->data_offset;
+				bio->bi_iter.bi_sector = from_addr +
+					rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				atomic_inc(&rdev->nr_pending);
 				/* and we write to 'i' (if not in_sync) */
@@ -3080,12 +3162,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				rdev = mirror->rdev;
 				if (!test_bit(In_sync, &rdev->flags)) {
 					bio = r10_bio->devs[1].bio;
+					bio_reset(bio);
 					bio->bi_next = biolist;
 					biolist = bio;
 					bio->bi_private = r10_bio;
 					bio->bi_end_io = end_sync_write;
 					bio->bi_rw = WRITE;
-					bio->bi_sector = to_addr
+					bio->bi_iter.bi_sector = to_addr
 						+ rdev->data_offset;
 					bio->bi_bdev = rdev->bdev;
 					atomic_inc(&r10_bio->remaining);
@@ -3108,12 +3191,14 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (rdev == NULL || bio == NULL ||
 				    test_bit(Faulty, &rdev->flags))
 					break;
+				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_write;
 				bio->bi_rw = WRITE;
-				bio->bi_sector = to_addr + rdev->data_offset;
+				bio->bi_iter.bi_sector = to_addr +
+					rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				atomic_inc(&r10_bio->remaining);
 				break;
@@ -3121,10 +3206,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (j == conf->copies) {
 				/* Cannot recover, so abort the recovery or
 				 * record a bad block */
-				put_buf(r10_bio);
-				if (rb2)
-					atomic_dec(&rb2->remaining);
-				r10_bio = rb2;
 				if (any_working) {
 					/* problem is that there are bad blocks
 					 * on other device(s)
@@ -3156,6 +3237,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 					mirror->recovery_disabled
 						= mddev->recovery_disabled;
 				}
+				put_buf(r10_bio);
+				if (rb2)
+					atomic_dec(&rb2->remaining);
+				r10_bio = rb2;
 				break;
 			}
 		}
@@ -3206,7 +3291,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
 			bio = r10_bio->devs[i].bio;
-			bio->bi_end_io = NULL;
+			bio_reset(bio);
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 			if (conf->mirrors[d].rdev == NULL ||
 			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
@@ -3231,7 +3316,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio->bi_rw = READ;
-			bio->bi_sector = sector +
+			bio->bi_iter.bi_sector = sector +
 				conf->mirrors[d].rdev->data_offset;
 			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 			count++;
@@ -3243,6 +3328,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
+			bio_reset(bio);
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 
 			sector = r10_bio->devs[i].addr;
@@ -3252,7 +3338,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_write;
 			bio->bi_rw = WRITE;
-			bio->bi_sector = sector +
+			bio->bi_iter.bi_sector = sector +
 				conf->mirrors[d].replacement->data_offset;
 			bio->bi_bdev = conf->mirrors[d].replacement->bdev;
 			count++;
@@ -3276,17 +3362,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 	}
 
-	for (bio = biolist; bio ; bio=bio->bi_next) {
-
-		bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-		if (bio->bi_end_io)
-			bio->bi_flags |= 1 << BIO_UPTODATE;
-		bio->bi_vcnt = 0;
-		bio->bi_idx = 0;
-		bio->bi_phys_segments = 0;
-		bio->bi_size = 0;
-	}
-
 	nr_sectors = 0;
 	if (sector_nr + max_sync < max_sector)
 		max_sector = sector_nr + max_sync;
@@ -3310,7 +3385,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			     bio2 = bio2->bi_next) {
 				/* remove last page from this bio */
 				bio2->bi_vcnt--;
-				bio2->bi_size -= len;
+				bio2->bi_iter.bi_size -= len;
 				bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
 			}
 			goto bio_full;
@@ -3331,6 +3406,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct(bio->bi_bdev, nr_sectors);
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			generic_make_request(bio);
 		}
 	}
@@ -3430,7 +3506,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3442,6 +3518,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
@@ -3476,7 +3553,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	/* FIXME calc properly */
 	conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
-							    max(0,mddev->delta_disks)),
+							    max(0,-mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
@@ -3563,6 +3640,7 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -3613,11 +3691,14 @@ static int run(struct mddev *mddev)
 			discard_supported = true;
 	}
 
-	if (discard_supported)
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
-	else
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
-
+	if (mddev->queue) {
+		if (discard_supported)
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+						  mddev->queue);
+	}
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
 		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
@@ -3631,7 +3712,7 @@ static int run(struct mddev *mddev)
 		    conf->geo.far_offset == 0)
 			goto out_free_conf;
 		if (conf->prev.far_copies != 1 &&
-		    conf->geo.far_offset == 0)
+		    conf->prev.far_offset == 0)
 			goto out_free_conf;
 	}
 
@@ -3654,7 +3735,8 @@ static int run(struct mddev *mddev)
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
-			if (disk->rdev)
+			if (disk->rdev &&
+			    disk->rdev->saved_raid_disk < 0)
 				conf->fullsync = 1;
 		}
 		disk->recovery_disabled = mddev->recovery_disabled - 1;
@@ -3748,6 +3830,7 @@ static int stop(struct mddev *mddev)
 
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
+	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
@@ -4290,7 +4373,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait, mddev->flags == 0 ||
-			   kthread_should_stop());
+			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+			allow_barrier(conf);
+			return sectors_done;
+		}
 		conf->reshape_safe = mddev->reshape_position;
 		allow_barrier(conf);
 	}
@@ -4319,7 +4406,7 @@ read_more:
 	read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
 
 	read_bio->bi_bdev = rdev->bdev;
-	read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
 			       + rdev->data_offset);
 	read_bio->bi_private = r10_bio;
 	read_bio->bi_end_io = end_sync_read;
@@ -4327,8 +4414,7 @@ read_more:
 	read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 	read_bio->bi_flags |= 1 << BIO_UPTODATE;
 	read_bio->bi_vcnt = 0;
-	read_bio->bi_idx = 0;
-	read_bio->bi_size = 0;
+	read_bio->bi_iter.bi_size = 0;
 	r10_bio->master_bio = read_bio;
 	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
 
@@ -4351,17 +4437,15 @@ read_more:
 		}
 		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
 			continue;
+
+		bio_reset(b);
 		b->bi_bdev = rdev2->bdev;
-		b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
+		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
+			rdev2->new_data_offset;
 		b->bi_private = r10_bio;
 		b->bi_end_io = end_reshape_write;
 		b->bi_rw = WRITE;
-		b->bi_flags &= ~(BIO_POOL_MASK - 1);
-		b->bi_flags |= 1 << BIO_UPTODATE;
 		b->bi_next = blist;
-		b->bi_vcnt = 0;
-		b->bi_idx = 0;
-		b->bi_size = 0;
 		blist = b;
 	}
 
@@ -4384,7 +4468,7 @@ read_more:
 			     bio2 = bio2->bi_next) {
 				/* Remove last page from this bio */
 				bio2->bi_vcnt--;
-				bio2->bi_size -= len;
+				bio2->bi_iter.bi_size -= len;
 				bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
 			}
 			goto bio_full;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf60234..157d69e83ff 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
 					       * far_offset, in which case it is
 					       * 1 stripe.
 					       */
+		int             far_set_size; /* The number of devices in a set,
+					       * where a 'set' are devices that
+					       * contain far/offset copies of
+					       * each other.
+					       */
 		int		chunk_shift; /* shift from chunks to sectors */
 		sector_t	chunk_mask;
 	} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c5439dce029..6234b2e8458 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,11 +53,18 @@
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/nodemask.h>
+#include <trace/events/block.h>
+
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
 #include "bitmap.h"
 
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
+static struct workqueue_struct *raid5_wq;
 /*
  * Stripe cache
  */
@@ -70,6 +77,7 @@
 #define BYPASS_THRESHOLD	1
 #define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
+#define MAX_STRIPE_BATCH	8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
@@ -77,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 	return &conf->stripe_hashtbl[hash];
 }
 
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+	spin_lock_irq(conf->hash_locks + hash);
+	spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+	spin_unlock(&conf->device_lock);
+	spin_unlock_irq(conf->hash_locks + hash);
+}
+
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+	int i;
+	local_irq_disable();
+	spin_lock(conf->hash_locks);
+	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+	spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+	int i;
+	spin_unlock(&conf->device_lock);
+	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+		spin_unlock(conf->hash_locks + i - 1);
+	local_irq_enable();
+}
+
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
  * a bio could span several devices.
@@ -88,8 +132,8 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  */
 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 {
-	int sectors = bio->bi_size >> 9;
-	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
+	int sectors = bio_sectors(bio);
+	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
 		return bio->bi_next;
 	else
 		return NULL;
@@ -181,7 +225,9 @@ static void return_io(struct bio *return_bi)
 
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
-		bi->bi_size = 0;
+		bi->bi_iter.bi_size = 0;
+		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+					 bi, 0);
 		bio_endio(bi, 0);
 		bi = return_bi;
 	}
@@ -196,21 +242,73 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
-static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5worker_group *group;
+	int thread_cnt;
+	int i, cpu = sh->cpu;
+
+	if (!cpu_online(cpu)) {
+		cpu = cpumask_any(cpu_online_mask);
+		sh->cpu = cpu;
+	}
+
+	if (list_empty(&sh->lru)) {
+		struct r5worker_group *group;
+		group = conf->worker_groups + cpu_to_group(cpu);
+		list_add_tail(&sh->lru, &group->handle_list);
+		group->stripes_cnt++;
+		sh->group = group;
+	}
+
+	if (conf->worker_cnt_per_group == 0) {
+		md_wakeup_thread(conf->mddev->thread);
+		return;
+	}
+
+	group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+	group->workers[0].working = true;
+	/* at least one worker should run to avoid race */
+	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
+
+	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
+	/* wakeup more workers */
+	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
+		if (group->workers[i].working == false) {
+			group->workers[i].working = true;
+			queue_work_on(sh->cpu, raid5_wq,
+				      &group->workers[i].work);
+			thread_cnt--;
+		}
+	}
+}
+
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+			      struct list_head *temp_inactive_list)
 {
 	BUG_ON(!list_empty(&sh->lru));
 	BUG_ON(atomic_read(&conf->active_stripes)==0);
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
-		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 			list_add_tail(&sh->lru, &conf->delayed_list);
-		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+			if (atomic_read(&conf->preread_active_stripes)
+			    < IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 			   sh->bm_seq - conf->seq_write > 0)
 			list_add_tail(&sh->lru, &conf->bitmap_list);
 		else {
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			if (conf->worker_cnt_per_group == 0) {
+				list_add_tail(&sh->lru, &conf->handle_list);
+			} else {
+				raid5_wakeup_stripe_thread(sh);
+				return;
+			}
 		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -220,30 +318,125 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
-		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-			list_add_tail(&sh->lru, &conf->inactive_list);
-			wake_up(&conf->wait_for_stripe);
-			if (conf->retry_read_aligned)
-				md_wakeup_thread(conf->mddev->thread);
-		}
+		if (!test_bit(STRIPE_EXPANDING, &sh->state))
+			list_add_tail(&sh->lru, temp_inactive_list);
 	}
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+			     struct list_head *temp_inactive_list)
 {
 	if (atomic_dec_and_test(&sh->count))
-		do_release_stripe(conf, sh);
+		do_release_stripe(conf, sh, temp_inactive_list);
+}
+
+/*
+ * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
+ *
+ * Be careful: Only one task can add/delete stripes from temp_inactive_list at
+ * given time. Adding stripes only takes device lock, while deleting stripes
+ * only takes hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+					 struct list_head *temp_inactive_list,
+					 int hash)
+{
+	int size;
+	bool do_wakeup = false;
+	unsigned long flags;
+
+	if (hash == NR_STRIPE_HASH_LOCKS) {
+		size = NR_STRIPE_HASH_LOCKS;
+		hash = NR_STRIPE_HASH_LOCKS - 1;
+	} else
+		size = 1;
+	while (size) {
+		struct list_head *list = &temp_inactive_list[size - 1];
+
+		/*
+		 * We don't hold any lock here yet, get_active_stripe() might
+		 * remove stripes from the list
+		 */
+		if (!list_empty_careful(list)) {
+			spin_lock_irqsave(conf->hash_locks + hash, flags);
+			if (list_empty(conf->inactive_list + hash) &&
+			    !list_empty(list))
+				atomic_dec(&conf->empty_inactive_list_nr);
+			list_splice_tail_init(list, conf->inactive_list + hash);
+			do_wakeup = true;
+			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+		}
+		size--;
+		hash--;
+	}
+
+	if (do_wakeup) {
+		wake_up(&conf->wait_for_stripe);
+		if (conf->retry_read_aligned)
+			md_wakeup_thread(conf->mddev->thread);
+	}
+}
+
+/* should hold conf->device_lock already */
+static int release_stripe_list(struct r5conf *conf,
+			       struct list_head *temp_inactive_list)
+{
+	struct stripe_head *sh;
+	int count = 0;
+	struct llist_node *head;
+
+	head = llist_del_all(&conf->released_stripes);
+	head = llist_reverse_order(head);
+	while (head) {
+		int hash;
+
+		sh = llist_entry(head, struct stripe_head, release_list);
+		head = llist_next(head);
+		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
+		smp_mb();
+		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
+		/*
+		 * Don't worry the bit is set here, because if the bit is set
+		 * again, the count is always > 1. This is true for
+		 * STRIPE_ON_UNPLUG_LIST bit too.
+		 */
+		hash = sh->hash_lock_index;
+		__release_stripe(conf, sh, &temp_inactive_list[hash]);
+		count++;
+	}
+
+	return count;
 }
 
 static void release_stripe(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
+	struct list_head list;
+	int hash;
+	bool wakeup;
 
+	/* Avoid release_list until the last reference.
+	 */
+	if (atomic_add_unless(&sh->count, -1, 1))
+		return;
+
+	if (unlikely(!conf->mddev->thread) ||
+		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
+		goto slow_path;
+	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
+	if (wakeup)
+		md_wakeup_thread(conf->mddev->thread);
+	return;
+slow_path:
 	local_irq_save(flags);
+	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
 	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
-		do_release_stripe(conf, sh);
+		INIT_LIST_HEAD(&list);
+		hash = sh->hash_lock_index;
+		do_release_stripe(conf, sh, &list);
 		spin_unlock(&conf->device_lock);
+		release_inactive_stripe_list(conf, &list, hash);
 	}
 	local_irq_restore(flags);
 }
@@ -268,18 +461,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 
 
 /* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(struct r5conf *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 {
 	struct stripe_head *sh = NULL;
 	struct list_head *first;
 
-	if (list_empty(&conf->inactive_list))
+	if (list_empty(conf->inactive_list + hash))
 		goto out;
-	first = conf->inactive_list.next;
+	first = (conf->inactive_list + hash)->next;
 	sh = list_entry(first, struct stripe_head, lru);
 	list_del_init(first);
 	remove_hash(sh);
 	atomic_inc(&conf->active_stripes);
+	BUG_ON(hash != sh->hash_lock_index);
+	if (list_empty(conf->inactive_list + hash))
+		atomic_inc(&conf->empty_inactive_list_nr);
 out:
 	return sh;
 }
@@ -291,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)
 	int num = sh->raid_conf->pool_size;
 
 	for (i = 0; i < num ; i++) {
+		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -311,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)
 			return 1;
 		}
 		sh->dev[i].page = page;
+		sh->dev[i].orig_page = page;
 	}
 	return 0;
 }
@@ -322,7 +520,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
 	struct r5conf *conf = sh->raid_conf;
-	int i;
+	int i, seq;
 
 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -332,7 +530,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		(unsigned long long)sh->sector);
 
 	remove_hash(sh);
-
+retry:
+	seq = read_seqcount_begin(&conf->gen_lock);
 	sh->generation = conf->generation - previous;
 	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 	sh->sector = sector;
@@ -354,17 +553,19 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		dev->flags = 0;
 		raid5_build_block(sh, i, previous);
 	}
+	if (read_seqcount_retry(&conf->gen_lock, seq))
+		goto retry;
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 					 short generation)
 {
 	struct stripe_head *sh;
-	struct hlist_node *hn;
 
 	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
-	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
+	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
 		if (sh->sector == sector && sh->generation == generation)
 			return sh;
 	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -458,53 +659,55 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
 {
 	struct stripe_head *sh;
+	int hash = stripe_hash_locks_hash(sector);
 
 	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 
-	spin_lock_irq(&conf->device_lock);
+	spin_lock_irq(conf->hash_locks + hash);
 
 	do {
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    conf->quiesce == 0 || noquiesce,
-				    conf->device_lock, /* nothing */);
+				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
 			if (!conf->inactive_blocked)
-				sh = get_free_stripe(conf);
+				sh = get_free_stripe(conf, hash);
 			if (noblock && sh == NULL)
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
-				wait_event_lock_irq(conf->wait_for_stripe,
-						    !list_empty(&conf->inactive_list) &&
-						    (atomic_read(&conf->active_stripes)
-						     < (conf->max_nr_stripes *3/4)
-						     || !conf->inactive_blocked),
-						    conf->device_lock,
-						    );
+				wait_event_lock_irq(
+					conf->wait_for_stripe,
+					!list_empty(conf->inactive_list + hash) &&
+					(atomic_read(&conf->active_stripes)
+					 < (conf->max_nr_stripes * 3 / 4)
+					 || !conf->inactive_blocked),
+					*(conf->hash_locks + hash));
 				conf->inactive_blocked = 0;
-			} else
-				init_stripe(sh, sector, previous);
-		} else {
-			if (atomic_read(&sh->count)) {
-				BUG_ON(!list_empty(&sh->lru)
-				    && !test_bit(STRIPE_EXPANDING, &sh->state)
-				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
 			} else {
+				init_stripe(sh, sector, previous);
+				atomic_inc(&sh->count);
+			}
+		} else if (!atomic_inc_not_zero(&sh->count)) {
+			spin_lock(&conf->device_lock);
+			if (!atomic_read(&sh->count)) {
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
-				if (list_empty(&sh->lru) &&
-				    !test_bit(STRIPE_EXPANDING, &sh->state))
-					BUG();
+				BUG_ON(list_empty(&sh->lru) &&
+				       !test_bit(STRIPE_EXPANDING, &sh->state));
 				list_del_init(&sh->lru);
+				if (sh->group) {
+					sh->group->stripes_cnt--;
+					sh->group = NULL;
+				}
 			}
+			atomic_inc(&sh->count);
+			spin_unlock(&conf->device_lock);
 		}
 	} while (sh == NULL);
 
-	if (sh)
-		atomic_inc(&sh->count);
-
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
 }
 
@@ -567,14 +770,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		bi = &sh->dev[i].req;
 		rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
-		bi->bi_rw = rw;
-		rbi->bi_rw = rw;
-		if (rw & WRITE) {
-			bi->bi_end_io = raid5_end_write_request;
-			rbi->bi_end_io = raid5_end_write_request;
-		} else
-			bi->bi_end_io = raid5_end_read_request;
-
 		rcu_read_lock();
 		rrdev = rcu_dereference(conf->disks[i].replacement);
 		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
@@ -649,28 +844,47 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 			set_bit(STRIPE_IO_STARTED, &sh->state);
 
+			bio_reset(bi);
 			bi->bi_bdev = rdev->bdev;
+			bi->bi_rw = rw;
+			bi->bi_end_io = (rw & WRITE)
+				? raid5_end_write_request
+				: raid5_end_read_request;
+			bi->bi_private = sh;
+
 			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
 				__func__, (unsigned long long)sh->sector,
 				bi->bi_rw, i);
 			atomic_inc(&sh->count);
 			if (use_new_offset(conf, sh))
-				bi->bi_sector = (sh->sector
+				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->new_data_offset);
 			else
-				bi->bi_sector = (sh->sector
+				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->data_offset);
 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
-				bi->bi_rw |= REQ_FLUSH;
+				bi->bi_rw |= REQ_NOMERGE;
 
-			bi->bi_flags = 1 << BIO_UPTODATE;
-			bi->bi_idx = 0;
+			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].vec.bv_page = sh->dev[i].page;
+			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
-			bi->bi_size = STRIPE_SIZE;
-			bi->bi_next = NULL;
+			bi->bi_iter.bi_size = STRIPE_SIZE;
+			/*
+			 * If this is discard request, set bi_vcnt 0. We don't
+			 * want to confuse SCSI because SCSI will replace payload
+			 */
+			if (rw & REQ_DISCARD)
+				bi->bi_vcnt = 0;
 			if (rrdev)
 				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
+
+			if (conf->mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+						      bi, disk_devt(conf->mddev->gendisk),
+						      sh->dev[i].sector);
 			generic_make_request(bi);
 		}
 		if (rrdev) {
@@ -680,24 +894,41 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 			set_bit(STRIPE_IO_STARTED, &sh->state);
 
+			bio_reset(rbi);
 			rbi->bi_bdev = rrdev->bdev;
+			rbi->bi_rw = rw;
+			BUG_ON(!(rw & WRITE));
+			rbi->bi_end_io = raid5_end_write_request;
+			rbi->bi_private = sh;
+
 			pr_debug("%s: for %llu schedule op %ld on "
 				 "replacement disc %d\n",
 				__func__, (unsigned long long)sh->sector,
 				rbi->bi_rw, i);
 			atomic_inc(&sh->count);
 			if (use_new_offset(conf, sh))
-				rbi->bi_sector = (sh->sector
+				rbi->bi_iter.bi_sector = (sh->sector
 						  + rrdev->new_data_offset);
 			else
-				rbi->bi_sector = (sh->sector
+				rbi->bi_iter.bi_sector = (sh->sector
 						  + rrdev->data_offset);
-			rbi->bi_flags = 1 << BIO_UPTODATE;
-			rbi->bi_idx = 0;
+			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].rvec.bv_page = sh->dev[i].page;
+			rbi->bi_vcnt = 1;
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
-			rbi->bi_size = STRIPE_SIZE;
-			rbi->bi_next = NULL;
+			rbi->bi_iter.bi_size = STRIPE_SIZE;
+			/*
+			 * If this is discard request, set bi_vcnt 0. We don't
+			 * want to confuse SCSI because SCSI will replace payload
+			 */
+			if (rw & REQ_DISCARD)
+				rbi->bi_vcnt = 0;
+			if (conf->mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+						      rbi, disk_devt(conf->mddev->gendisk),
+						      sh->dev[i].sector);
 			generic_make_request(rbi);
 		}
 		if (!rdev && !rrdev) {
@@ -712,27 +943,28 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 }
 
 static struct dma_async_tx_descriptor *
-async_copy_data(int frombio, struct bio *bio, struct page *page,
-	sector_t sector, struct dma_async_tx_descriptor *tx)
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+	sector_t sector, struct dma_async_tx_descriptor *tx,
+	struct stripe_head *sh)
 {
-	struct bio_vec *bvl;
+	struct bio_vec bvl;
+	struct bvec_iter iter;
 	struct page *bio_page;
-	int i;
 	int page_offset;
 	struct async_submit_ctl submit;
 	enum async_tx_flags flags = 0;
 
-	if (bio->bi_sector >= sector)
-		page_offset = (signed)(bio->bi_sector - sector) * 512;
+	if (bio->bi_iter.bi_sector >= sector)
+		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
 	else
-		page_offset = (signed)(sector - bio->bi_sector) * -512;
+		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
 
 	if (frombio)
 		flags |= ASYNC_TX_FENCE;
 	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
 
-	bio_for_each_segment(bvl, bio, i) {
-		int len = bvl->bv_len;
+	bio_for_each_segment(bvl, bio, iter) {
+		int len = bvl.bv_len;
 		int clen;
 		int b_offset = 0;
 
@@ -748,13 +980,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 			clen = len;
 
 		if (clen > 0) {
-			b_offset += bvl->bv_offset;
-			bio_page = bvl->bv_page;
-			if (frombio)
-				tx = async_memcpy(page, bio_page, page_offset,
+			b_offset += bvl.bv_offset;
+			bio_page = bvl.bv_page;
+			if (frombio) {
+				if (sh->raid_conf->skip_copy &&
+				    b_offset == 0 && page_offset == 0 &&
+				    clen == STRIPE_SIZE)
+					*page = bio_page;
+				else
+					tx = async_memcpy(*page, bio_page, page_offset,
 						  b_offset, clen, &submit);
-			else
-				tx = async_memcpy(bio_page, page, b_offset,
+			} else
+				tx = async_memcpy(bio_page, *page, b_offset,
 						  page_offset, clen, &submit);
 		}
 		/* chain the operations */
@@ -792,7 +1029,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			BUG_ON(!dev->read);
 			rbi = dev->read;
 			dev->read = NULL;
-			while (rbi && rbi->bi_sector <
+			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
 				if (!raid5_dec_bi_active_stripes(rbi)) {
@@ -828,10 +1065,10 @@ static void ops_run_biofill(struct stripe_head *sh)
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
 			spin_unlock_irq(&sh->stripe_lock);
-			while (rbi && rbi->bi_sector <
+			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
-				tx = async_copy_data(0, rbi, dev->page,
-					dev->sector, tx);
+				tx = async_copy_data(0, rbi, &dev->page,
+					dev->sector, tx, sh);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1169,8 +1406,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
+			WARN_ON(dev->page != dev->orig_page);
 
-			while (wbi && wbi->bi_sector <
+			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				if (wbi->bi_rw & REQ_FUA)
 					set_bit(R5_WantFUA, &dev->flags);
@@ -1178,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 					set_bit(R5_SyncIO, &dev->flags);
 				if (wbi->bi_rw & REQ_DISCARD)
 					set_bit(R5_Discard, &dev->flags);
-				else
-					tx = async_copy_data(1, wbi, dev->page,
-						dev->sector, tx);
+				else {
+					tx = async_copy_data(1, wbi, &dev->page,
+						dev->sector, tx, sh);
+					if (dev->page != dev->orig_page) {
+						set_bit(R5_SkipCopy, &dev->flags);
+						clear_bit(R5_UPTODATE, &dev->flags);
+						clear_bit(R5_OVERWRITE, &dev->flags);
+					}
+				}
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1211,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			if (!discard)
+			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
 				set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
@@ -1397,7 +1641,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -1462,37 +1706,7 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
-	struct stripe_head *sh = param;
-	unsigned long ops_request = sh->ops.request;
-
-	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
-	wake_up(&sh->ops.wait_for_ops);
-
-	__raid_run_ops(sh, ops_request);
-	release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
-	/* since handle_stripe can be called outside of raid5d context
-	 * we need to ensure sh->ops.request is de-staged before another
-	 * request arrives
-	 */
-	wait_event(sh->ops.wait_for_ops,
-		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
-	sh->ops.request = ops_request;
-
-	atomic_inc(&sh->count);
-	async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
-static int grow_one_stripe(struct r5conf *conf)
+static int grow_one_stripe(struct r5conf *conf, int hash)
 {
 	struct stripe_head *sh;
 	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1500,9 +1714,6 @@ static int grow_one_stripe(struct r5conf *conf)
 		return 0;
 
 	sh->raid_conf = conf;
-	#ifdef CONFIG_MULTICORE_RAID456
-	init_waitqueue_head(&sh->ops.wait_for_ops);
-	#endif
 
 	spin_lock_init(&sh->stripe_lock);
 
@@ -1511,6 +1722,7 @@ static int grow_one_stripe(struct r5conf *conf)
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
+	sh->hash_lock_index = hash;
 	/* we just created an active stripe so... */
 	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
@@ -1523,6 +1735,7 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
 	struct kmem_cache *sc;
 	int devs = max(conf->raid_disks, conf->previous_raid_disks);
+	int hash;
 
 	if (conf->mddev->gendisk)
 		sprintf(conf->cache_name[0],
@@ -1540,9 +1753,13 @@ static int grow_stripes(struct r5conf *conf, int num)
 		return 1;
 	conf->slab_cache = sc;
 	conf->pool_size = devs;
-	while (num--)
-		if (!grow_one_stripe(conf))
+	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+	while (num--) {
+		if (!grow_one_stripe(conf, hash))
 			return 1;
+		conf->max_nr_stripes++;
+		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
+	}
 	return 0;
 }
 
@@ -1576,7 +1793,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 * This happens in stages:
 	 * 1/ create a new kmem_cache and allocate the required number of
 	 *    stripe_heads.
-	 * 2/ gather all the old stripe_heads and tranfer the pages across
+	 * 2/ gather all the old stripe_heads and transfer the pages across
 	 *    to the new stripe_heads.  This will have the side effect of
 	 *    freezing the array as once all stripe_heads have been collected,
 	 *    no IO will be possible.  Old stripe heads are freed once their
@@ -1600,6 +1817,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	int err;
 	struct kmem_cache *sc;
 	int i;
+	int hash, cnt;
 
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
@@ -1621,9 +1839,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			break;
 
 		nsh->raid_conf = conf;
-		#ifdef CONFIG_MULTICORE_RAID456
-		init_waitqueue_head(&nsh->ops.wait_for_ops);
-		#endif
 		spin_lock_init(&nsh->stripe_lock);
 
 		list_add(&nsh->lru, &newstripes);
@@ -1642,20 +1857,31 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 * OK, we have enough stripes, start collecting inactive
 	 * stripes and copying them over
 	 */
+	hash = 0;
+	cnt = 0;
 	list_for_each_entry(nsh, &newstripes, lru) {
-		spin_lock_irq(&conf->device_lock);
-		wait_event_lock_irq(conf->wait_for_stripe,
-				    !list_empty(&conf->inactive_list),
-				    conf->device_lock,
-				    );
-		osh = get_free_stripe(conf);
-		spin_unlock_irq(&conf->device_lock);
+		lock_device_hash_lock(conf, hash);
+		wait_event_cmd(conf->wait_for_stripe,
+				    !list_empty(conf->inactive_list + hash),
+				    unlock_device_hash_lock(conf, hash),
+				    lock_device_hash_lock(conf, hash));
+		osh = get_free_stripe(conf, hash);
+		unlock_device_hash_lock(conf, hash);
 		atomic_set(&nsh->count, 1);
-		for(i=0; i<conf->pool_size; i++)
+		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
+			nsh->dev[i].orig_page = osh->dev[i].page;
+		}
 		for( ; i<newsize; i++)
 			nsh->dev[i].page = NULL;
+		nsh->hash_lock_index = hash;
 		kmem_cache_free(conf->slab_cache, osh);
+		cnt++;
+		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+			hash++;
+			cnt = 0;
+		}
 	}
 	kmem_cache_destroy(conf->slab_cache);
 
@@ -1701,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			if (nsh->dev[i].page == NULL) {
 				struct page *p = alloc_page(GFP_NOIO);
 				nsh->dev[i].page = p;
+				nsh->dev[i].orig_page = p;
 				if (!p)
 					err = -ENOMEM;
 			}
@@ -1714,13 +1941,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf)
+static int drop_one_stripe(struct r5conf *conf, int hash)
 {
 	struct stripe_head *sh;
 
-	spin_lock_irq(&conf->device_lock);
-	sh = get_free_stripe(conf);
-	spin_unlock_irq(&conf->device_lock);
+	spin_lock_irq(conf->hash_locks + hash);
+	sh = get_free_stripe(conf, hash);
+	spin_unlock_irq(conf->hash_locks + hash);
 	if (!sh)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
@@ -1732,8 +1959,10 @@ static int drop_one_stripe(struct r5conf *conf)
 
 static void shrink_stripes(struct r5conf *conf)
 {
-	while (drop_one_stripe(conf))
-		;
+	int hash;
+	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
+		while (drop_one_stripe(conf, hash))
+			;
 
 	if (conf->slab_cache)
 		kmem_cache_destroy(conf->slab_cache);
@@ -1838,6 +2067,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			       mdname(conf->mddev), bdn);
 		else
 			retry = 1;
+		if (set_bad && test_bit(In_sync, &rdev->flags)
+		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+			retry = 1;
 		if (retry)
 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
 				set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -1906,6 +2138,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
 	} else {
 		if (!uptodate) {
+			set_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
 			if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -1913,8 +2146,15 @@ static void raid5_end_write_request(struct bio *bi, int error)
 					&rdev->mddev->recovery);
 		} else if (is_badblock(rdev, sh->sector,
 				       STRIPE_SECTORS,
-				       &first_bad, &bad_sectors))
+				       &first_bad, &bad_sectors)) {
 			set_bit(R5_MadeGood, &sh->dev[i].flags);
+			if (test_bit(R5_ReadError, &sh->dev[i].flags))
+				/* That was a successful write so make
+				 * sure it looks like we already did
+				 * a re-write.
+				 */
+				set_bit(R5_ReWrite, &sh->dev[i].flags);
+		}
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
@@ -1925,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)
 }
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-	
+
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
 
 	bio_init(&dev->req);
 	dev->req.bi_io_vec = &dev->vec;
-	dev->req.bi_vcnt++;
-	dev->req.bi_max_vecs++;
+	dev->req.bi_max_vecs = 1;
 	dev->req.bi_private = sh;
-	dev->vec.bv_page = dev->page;
 
 	bio_init(&dev->rreq);
 	dev->rreq.bi_io_vec = &dev->rvec;
-	dev->rreq.bi_vcnt++;
-	dev->rreq.bi_max_vecs++;
+	dev->rreq.bi_max_vecs = 1;
 	dev->rreq.bi_private = sh;
-	dev->rvec.bv_page = dev->page;
 
 	dev->flags = 0;
 	dev->sector = compute_blocknr(sh, i, previous);
@@ -2311,17 +2547,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 	int level = conf->level;
 
 	if (rcw) {
-		/* if we are not expanding this is a proper write request, and
-		 * there will be bios with new data to be drained into the
-		 * stripe cache
-		 */
-		if (!expand) {
-			sh->reconstruct_state = reconstruct_state_drain_run;
-			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-		} else
-			sh->reconstruct_state = reconstruct_state_run;
-
-		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2334,6 +2559,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				s->locked++;
 			}
 		}
+		/* if we are not expanding this is a proper write request, and
+		 * there will be bios with new data to be drained into the
+		 * stripe cache
+		 */
+		if (!expand) {
+			if (!s->locked)
+				/* False alarm, nothing to do */
+				return;
+			sh->reconstruct_state = reconstruct_state_drain_run;
+			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		} else
+			sh->reconstruct_state = reconstruct_state_run;
+
+		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+
 		if (s->locked + conf->max_degraded == disks)
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 				atomic_inc(&conf->pending_full_writes);
@@ -2342,11 +2582,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
-		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
-		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
-		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
-
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (i == pd_idx)
@@ -2361,6 +2596,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				s->locked++;
 			}
 		}
+		if (!s->locked)
+			/* False alarm - nothing to do */
+			return;
+		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 	}
 
 	/* keep the parity disk(s) locked while asynchronous operations
@@ -2396,7 +2638,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	int firstwrite=0;
 
 	pr_debug("adding bi b#%llu to stripe s#%llu\n",
-		(unsigned long long)bi->bi_sector,
+		(unsigned long long)bi->bi_iter.bi_sector,
 		(unsigned long long)sh->sector);
 
 	/*
@@ -2414,12 +2656,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 			firstwrite = 1;
 	} else
 		bip = &sh->dev[dd_idx].toread;
-	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
+		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
 			goto overlap;
 		bip = & (*bip)->bi_next;
 	}
-	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
 	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2433,17 +2675,17 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		sector_t sector = sh->dev[dd_idx].sector;
 		for (bi=sh->dev[dd_idx].towrite;
 		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
-			     bi && bi->bi_sector <= sector;
+			     bi && bi->bi_iter.bi_sector <= sector;
 		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
-			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
-				sector = bi->bi_sector + (bi->bi_size>>9);
+			if (bio_end_sector(bi) >= sector)
+				sector = bio_end_sector(bi);
 		}
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-		(unsigned long long)(*bip)->bi_sector,
+		(unsigned long long)(*bip)->bi_iter.bi_sector,
 		(unsigned long long)sh->sector, dd_idx);
 	spin_unlock_irq(&sh->stripe_lock);
 
@@ -2518,7 +2760,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
 
-		while (bi && bi->bi_sector <
+		while (bi && bi->bi_iter.bi_sector <
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2536,8 +2778,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
+		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].page = sh->dev[i].orig_page;
+		}
+
 		if (bi) bitmap_end = 1;
-		while (bi && bi->bi_sector <
+		while (bi && bi->bi_iter.bi_sector <
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2561,7 +2808,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			spin_unlock_irq(&sh->stripe_lock);
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				wake_up(&conf->wait_for_overlap);
-			while (bi && bi->bi_sector <
+			while (bi && bi->bi_iter.bi_sector <
 			       sh->dev[i].sector + STRIPE_SECTORS) {
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
@@ -2595,6 +2842,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
 	int i;
 
 	clear_bit(STRIPE_SYNCING, &sh->state);
+	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+		wake_up(&conf->wait_for_overlap);
 	s->syncing = 0;
 	s->replacing = 0;
 	/* There is nothing more to do for sync/check/repair.
@@ -2670,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 	     (s->failed >= 1 && fdev[0]->toread) ||
 	     (s->failed >= 2 && fdev[1]->toread) ||
 	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+	      (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
 	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
-	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
+	     (sh->raid_conf->level == 6 && s->failed && s->to_write &&
+	      s->to_write < sh->raid_conf->raid_disks - 2 &&
+	      (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
 		/* we would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
 		 */
@@ -2768,19 +3020,27 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 {
 	int i;
 	struct r5dev *dev;
+	int discard_pending = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
 			    (test_bit(R5_UPTODATE, &dev->flags) ||
-			     test_and_clear_bit(R5_Discard, &dev->flags))) {
+			     test_bit(R5_Discard, &dev->flags) ||
+			     test_bit(R5_SkipCopy, &dev->flags))) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				pr_debug("Return write for disc %d\n", i);
+				if (test_and_clear_bit(R5_Discard, &dev->flags))
+					clear_bit(R5_UPTODATE, &dev->flags);
+				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+					dev->page = dev->orig_page;
+				}
 				wbi = dev->written;
 				dev->written = NULL;
-				while (wbi && wbi->bi_sector <
+				while (wbi && wbi->bi_iter.bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (!raid5_dec_bi_active_stripes(wbi)) {
@@ -2794,8 +3054,33 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
-			}
+			} else if (test_bit(R5_Discard, &dev->flags))
+				discard_pending = 1;
+			WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+			WARN_ON(dev->page != dev->orig_page);
+		}
+	if (!discard_pending &&
+	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+		if (sh->qd_idx >= 0) {
+			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
 		}
+		/* now that discard is done we can proceed with any sync */
+		clear_bit(STRIPE_DISCARD, &sh->state);
+		/*
+		 * SCSI discard will change some bio fields and the stripe has
+		 * no updated data, so remove it from hash list and the stripe
+		 * will be reinitialized
+		 */
+		spin_lock_irq(&conf->device_lock);
+		remove_hash(sh);
+		spin_unlock_irq(&conf->device_lock);
+		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+			set_bit(STRIPE_HANDLE, &sh->state);
+
+	}
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2844,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		    test_bit(R5_Wantcompute, &dev->flags))) {
-			if (test_bit(R5_Insync, &dev->flags)) rcw++;
+			if (test_bit(R5_Insync, &dev->flags))
+				rcw++;
 			else
 				rcw += 2*disks;
 		}
@@ -2852,8 +3138,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
-	if (rmw < rcw && rmw > 0)
+	if (rmw < rcw && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
+		if (conf->mddev->queue)
+			blk_add_trace_msg(conf->mddev->queue,
+					  "raid5 rmw %llu %d",
+					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite || i == sh->pd_idx) &&
@@ -2861,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			    test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
-				if (
-				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-					pr_debug("Read_old block "
-						"%d for r-m-w\n", i);
+				if (test_bit(STRIPE_PREREAD_ACTIVE,
+					     &sh->state)) {
+					pr_debug("Read_old block %d for r-m-w\n",
+						 i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
 					s->locked++;
@@ -2874,8 +3164,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 				}
 			}
 		}
+	}
 	if (rcw <= rmw && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
+		int qread =0;
 		rcw = 0;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2885,21 +3177,25 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
-				if (!test_bit(R5_Insync, &dev->flags))
-					continue; /* it's a failed drive */
-				if (
-				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+				if (test_bit(R5_Insync, &dev->flags) &&
+				    test_bit(STRIPE_PREREAD_ACTIVE,
+					     &sh->state)) {
 					pr_debug("Read_old block "
 						"%d for Reconstruct\n", i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
 					s->locked++;
+					qread++;
 				} else {
 					set_bit(STRIPE_DELAYED, &sh->state);
 					set_bit(STRIPE_HANDLE, &sh->state);
 				}
 			}
 		}
+		if (rcw && conf->mddev->queue)
+			blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
+					  (unsigned long long)sh->sector,
+					  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
 	}
 	/* now if nothing is locked, and if we have enough data,
 	 * we can start a write request
@@ -3221,10 +3517,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 
 		}
 	/* done submitting copies, wait for them to complete */
-	if (tx) {
-		async_tx_ack(tx);
-		dma_wait_for_async_tx(tx);
-	}
+	async_tx_quiesce(&tx);
 }
 
 /*
@@ -3355,7 +3648,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			 */
 			set_bit(R5_Insync, &dev->flags);
 
-		if (rdev && test_bit(R5_WriteError, &dev->flags)) {
+		if (test_bit(R5_WriteError, &dev->flags)) {
 			/* This flag does not apply to '.replacement'
 			 * only to .rdev, so make sure to check that*/
 			struct md_rdev *rdev2 = rcu_dereference(
@@ -3368,7 +3661,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			} else
 				clear_bit(R5_WriteError, &dev->flags);
 		}
-		if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
+		if (test_bit(R5_MadeGood, &dev->flags)) {
 			/* This flag does not apply to '.replacement'
 			 * only to .rdev, so make sure to check that*/
 			struct md_rdev *rdev2 = rcu_dereference(
@@ -3439,9 +3732,16 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		spin_lock(&sh->stripe_lock);
+		/* Cannot process 'sync' concurrently with 'discard' */
+		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+			set_bit(STRIPE_SYNCING, &sh->state);
+			clear_bit(STRIPE_INSYNC, &sh->state);
+			clear_bit(STRIPE_REPLACED, &sh->state);
+		}
+		spin_unlock(&sh->stripe_lock);
 	}
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
@@ -3490,40 +3790,6 @@ static void handle_stripe(struct stripe_head *sh)
 			handle_failed_sync(conf, sh, &s);
 	}
 
-	/*
-	 * might be able to return some write requests if the parity blocks
-	 * are safe, or on a failed drive
-	 */
-	pdev = &sh->dev[sh->pd_idx];
-	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
-	qdev = &sh->dev[sh->qd_idx];
-	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
-		|| conf->level < 6;
-
-	if (s.written &&
-	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
-			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
-				 test_bit(R5_Discard, &pdev->flags))))) &&
-	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
-			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
-				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (s.to_read || s.non_overwrite
-	    || (conf->level == 6 && s.to_write && s.failed)
-	    || (s.syncing && (s.uptodate + s.compute < disks))
-	    || s.replacing
-	    || s.expanding)
-		handle_stripe_fill(sh, &s, disks);
-
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
@@ -3561,6 +3827,40 @@ static void handle_stripe(struct stripe_head *sh)
 			s.dec_preread_active = 1;
 	}
 
+	/*
+	 * might be able to return some write requests if the parity blocks
+	 * are safe, or on a failed drive
+	 */
+	pdev = &sh->dev[sh->pd_idx];
+	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+	qdev = &sh->dev[sh->qd_idx];
+	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+		|| conf->level < 6;
+
+	if (s.written &&
+	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+			     && !test_bit(R5_LOCKED, &pdev->flags)
+			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
+				 test_bit(R5_Discard, &pdev->flags))))) &&
+	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && !test_bit(R5_LOCKED, &qdev->flags)
+			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
+				 test_bit(R5_Discard, &qdev->flags))))))
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+	/* Now we might consider reading some blocks, either to check/generate
+	 * parity, or to satisfy requests
+	 * or to load a block that is being partially written.
+	 */
+	if (s.to_read || s.non_overwrite
+	    || (conf->level == 6 && s.to_write && s.failed)
+	    || (s.syncing && (s.uptodate + s.compute < disks))
+	    || s.replacing
+	    || s.expanding)
+		handle_stripe_fill(sh, &s, disks);
+
 	/* Now to consider new write requests and what else, if anything
 	 * should be read.  We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
@@ -3585,22 +3885,28 @@ static void handle_stripe(struct stripe_head *sh)
 			handle_parity_checks5(conf, sh, &s, disks);
 	}
 
-	if (s.replacing && s.locked == 0
-	    && !test_bit(STRIPE_INSYNC, &sh->state)) {
+	if ((s.replacing || s.syncing) && s.locked == 0
+	    && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
+	    && !test_bit(STRIPE_REPLACED, &sh->state)) {
 		/* Write out to replacement devices where possible */
 		for (i = 0; i < conf->raid_disks; i++)
-			if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
-			    test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+			if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+				WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
 				set_bit(R5_WantReplace, &sh->dev[i].flags);
 				set_bit(R5_LOCKED, &sh->dev[i].flags);
 				s.locked++;
 			}
-		set_bit(STRIPE_INSYNC, &sh->state);
+		if (s.replacing)
+			set_bit(STRIPE_INSYNC, &sh->state);
+		set_bit(STRIPE_REPLACED, &sh->state);
 	}
 	if ((s.syncing || s.replacing) && s.locked == 0 &&
+	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	    test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+			wake_up(&conf->wait_for_overlap);
 	}
 
 	/* If the failed drives are just a ReadError, then we might need
@@ -3750,11 +4056,13 @@ static void raid5_activate_delayed(struct r5conf *conf)
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			raid5_wakeup_stripe_thread(sh);
 		}
 	}
 }
 
-static void activate_bit_delay(struct r5conf *conf)
+static void activate_bit_delay(struct r5conf *conf,
+	struct list_head *temp_inactive_list)
 {
 	/* device_lock is held */
 	struct list_head head;
@@ -3762,9 +4070,11 @@ static void activate_bit_delay(struct r5conf *conf)
 	list_del_init(&conf->bitmap_list);
 	while (!list_empty(&head)) {
 		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+		int hash;
 		list_del_init(&sh->lru);
 		atomic_inc(&sh->count);
-		__release_stripe(conf, sh);
+		hash = sh->hash_lock_index;
+		__release_stripe(conf, sh, &temp_inactive_list[hash]);
 	}
 }
 
@@ -3780,7 +4090,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
 		return 1;
 	if (conf->quiesce)
 		return 1;
-	if (list_empty_careful(&conf->inactive_list))
+	if (atomic_read(&conf->empty_inactive_list_nr))
 		return 1;
 
 	return 0;
@@ -3824,9 +4134,9 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 
 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
-	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
 	unsigned int chunk_sectors = mddev->chunk_sectors;
-	unsigned int bio_sectors = bio->bi_size >> 9;
+	unsigned int bio_sectors = bio_sectors(bio);
 
 	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
 		chunk_sectors = mddev->new_chunk_sectors;
@@ -3900,6 +4210,8 @@ static void raid5_align_endio(struct bio *bi, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error && uptodate) {
+		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+					 raid_bi, 0);
 		bio_endio(raid_bi, 0);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_stripe);
@@ -3916,7 +4228,7 @@ static int bio_fits_rdev(struct bio *bi)
 {
 	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-	if ((bi->bi_size>>9) > queue_max_sectors(q))
+	if (bio_sectors(bi) > queue_max_sectors(q))
 		return 0;
 	blk_recount_segments(q, bi);
 	if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3959,11 +4271,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 	/*
 	 *	compute position
 	 */
-	align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
-						    0,
-						    &dd_idx, NULL);
+	align_bi->bi_iter.bi_sector =
+		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+				     0, &dd_idx, NULL);
 
-	end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+	end_sector = bio_end_sector(align_bi);
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
 	if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -3986,7 +4298,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 		if (!bio_fits_rdev(align_bi) ||
-		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+		    is_badblock(rdev, align_bi->bi_iter.bi_sector,
+				bio_sectors(align_bi),
 				&first_bad, &bad_sectors)) {
 			/* too big in some way, or has a known bad block */
 			bio_put(align_bi);
@@ -3995,15 +4308,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		}
 
 		/* No reshape active, so we can trust rdev->data_offset */
-		align_bi->bi_sector += rdev->data_offset;
+		align_bi->bi_iter.bi_sector += rdev->data_offset;
 
 		spin_lock_irq(&conf->device_lock);
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    conf->quiesce == 0,
-				    conf->device_lock, /* nothing */);
+				    conf->device_lock);
 		atomic_inc(&conf->active_aligned_reads);
 		spin_unlock_irq(&conf->device_lock);
 
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+					      align_bi, disk_devt(mddev->gendisk),
+					      raid_bio->bi_iter.bi_sector);
 		generic_make_request(align_bi);
 		return 1;
 	} else {
@@ -4023,18 +4340,35 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-	struct stripe_head *sh;
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list = NULL;
+	struct r5worker_group *wg = NULL;
+
+	if (conf->worker_cnt_per_group == 0) {
+		handle_list = &conf->handle_list;
+	} else if (group != ANY_GROUP) {
+		handle_list = &conf->worker_groups[group].handle_list;
+		wg = &conf->worker_groups[group];
+	} else {
+		int i;
+		for (i = 0; i < conf->group_cnt; i++) {
+			handle_list = &conf->worker_groups[i].handle_list;
+			wg = &conf->worker_groups[i];
+			if (!list_empty(handle_list))
+				break;
+		}
+	}
 
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  list_empty(handle_list) ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
 		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
 
-	if (!list_empty(&conf->handle_list)) {
-		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+	if (!list_empty(handle_list)) {
+		sh = list_entry(handle_list->next, typeof(*sh), lru);
 
 		if (list_empty(&conf->hold_list))
 			conf->bypass_count = 0;
@@ -4052,23 +4386,41 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
 		   ((conf->bypass_threshold &&
 		     conf->bypass_count > conf->bypass_threshold) ||
 		    atomic_read(&conf->pending_full_writes) == 0)) {
-		sh = list_entry(conf->hold_list.next,
-				typeof(*sh), lru);
-		conf->bypass_count -= conf->bypass_threshold;
-		if (conf->bypass_count < 0)
-			conf->bypass_count = 0;
-	} else
+
+		list_for_each_entry(tmp, &conf->hold_list,  lru) {
+			if (conf->worker_cnt_per_group == 0 ||
+			    group == ANY_GROUP ||
+			    !cpu_online(tmp->cpu) ||
+			    cpu_to_group(tmp->cpu) == group) {
+				sh = tmp;
+				break;
+			}
+		}
+
+		if (sh) {
+			conf->bypass_count -= conf->bypass_threshold;
+			if (conf->bypass_count < 0)
+				conf->bypass_count = 0;
+		}
+		wg = NULL;
+	}
+
+	if (!sh)
 		return NULL;
 
+	if (wg) {
+		wg->stripes_cnt--;
+		sh->group = NULL;
+	}
 	list_del_init(&sh->lru);
-	atomic_inc(&sh->count);
-	BUG_ON(atomic_read(&sh->count) != 1);
+	BUG_ON(atomic_inc_return(&sh->count) != 1);
 	return sh;
 }
 
 struct raid5_plug_cb {
 	struct blk_plug_cb	cb;
 	struct list_head	list;
+	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 };
 
 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4078,6 +4430,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
 	struct stripe_head *sh;
 	struct mddev *mddev = cb->cb.data;
 	struct r5conf *conf = mddev->private;
+	int cnt = 0;
+	int hash;
 
 	if (cb->list.next && !list_empty(&cb->list)) {
 		spin_lock_irq(&conf->device_lock);
@@ -4089,12 +4443,22 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
 			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
 			 * is still in our list
 			 */
-			smp_mb__before_clear_bit();
+			smp_mb__before_atomic();
 			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
-			__release_stripe(conf, sh);
+			/*
+			 * STRIPE_ON_RELEASE_LIST could be set here. In that
+			 * case, the count is always > 1 here
+			 */
+			hash = sh->hash_lock_index;
+			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
+			cnt++;
 		}
 		spin_unlock_irq(&conf->device_lock);
 	}
+	release_inactive_stripe_list(conf, cb->temp_inactive_list,
+				     NR_STRIPE_HASH_LOCKS);
+	if (mddev->queue)
+		trace_block_unplug(mddev->queue, cnt, !from_schedule);
 	kfree(cb);
 }
 
@@ -4113,8 +4477,12 @@ static void release_stripe_plug(struct mddev *mddev,
 
 	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
 
-	if (cb->list.next == NULL)
+	if (cb->list.next == NULL) {
+		int i;
 		INIT_LIST_HEAD(&cb->list);
+		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+			INIT_LIST_HEAD(cb->temp_inactive_list + i);
+	}
 
 	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
 		list_add_tail(&sh->lru, &cb->list);
@@ -4134,8 +4502,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		/* Skip discard while reshape is happening */
 		return;
 
-	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	last_sector = bi->bi_sector + (bi->bi_size>>9);
+	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -4157,6 +4525,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
 		prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
+		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+		if (test_bit(STRIPE_SYNCING, &sh->state)) {
+			release_stripe(sh);
+			schedule();
+			goto again;
+		}
+		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
 		spin_lock_irq(&sh->stripe_lock);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4169,6 +4544,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				goto again;
 			}
 		}
+		set_bit(STRIPE_DISCARD, &sh->state);
 		finish_wait(&conf->wait_for_overlap, &w);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4213,6 +4589,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
+	DEFINE_WAIT(w);
+	bool do_prepare;
 
 	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bi);
@@ -4231,18 +4609,23 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		return;
 	}
 
-	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	last_sector = bi->bi_sector + (bi->bi_size>>9);
+	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
+	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
-		DEFINE_WAIT(w);
 		int previous;
+		int seq;
 
+		do_prepare = false;
 	retry:
+		seq = read_seqcount_begin(&conf->gen_lock);
 		previous = 0;
-		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+		if (do_prepare)
+			prepare_to_wait(&conf->wait_for_overlap, &w,
+				TASK_UNINTERRUPTIBLE);
 		if (unlikely(conf->reshape_progress != MaxSector)) {
 			/* spinlock is needed as reshape_progress may be
 			 * 64bit on a 32bit platform, and so it might be
@@ -4263,6 +4646,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
 					schedule();
+					do_prepare = true;
 					goto retry;
 				}
 			}
@@ -4273,7 +4657,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 						  previous,
 						  &dd_idx, NULL);
 		pr_debug("raid456: make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector, 
+			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
 
 		sh = get_active_stripe(conf, new_sector, previous,
@@ -4299,9 +4683,17 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 				if (must_retry) {
 					release_stripe(sh);
 					schedule();
+					do_prepare = true;
 					goto retry;
 				}
 			}
+			if (read_seqcount_retry(&conf->gen_lock, seq)) {
+				/* Might have got the wrong stripe_head
+				 * by accident
+				 */
+				release_stripe(sh);
+				goto retry;
+			}
 
 			if (rw == WRITE &&
 			    logical_sector >= mddev->suspend_lo &&
@@ -4315,8 +4707,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 				prepare_to_wait(&conf->wait_for_overlap,
 						&w, TASK_INTERRUPTIBLE);
 				if (logical_sector >= mddev->suspend_lo &&
-				    logical_sector < mddev->suspend_hi)
+				    logical_sector < mddev->suspend_hi) {
 					schedule();
+					do_prepare = true;
+				}
 				goto retry;
 			}
 
@@ -4329,9 +4723,9 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 				md_wakeup_thread(mddev->thread);
 				release_stripe(sh);
 				schedule();
+				do_prepare = true;
 				goto retry;
 			}
-			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((bi->bi_rw & REQ_SYNC) &&
@@ -4341,10 +4735,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			finish_wait(&conf->wait_for_overlap, &w);
 			break;
 		}
 	}
+	finish_wait(&conf->wait_for_overlap, &w);
 
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
@@ -4352,6 +4746,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 
+		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+					 bi, 0);
 		bio_endio(bi, 0);
 	}
 }
@@ -4478,14 +4874,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
-			   atomic_read(&conf->reshape_stripes)==0);
+			   atomic_read(&conf->reshape_stripes)==0
+			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+		if (atomic_read(&conf->reshape_stripes) != 0)
+			return 0;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait, mddev->flags == 0 ||
-			   kthread_should_stop());
+			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			return 0;
 		spin_lock_irq(&conf->device_lock);
 		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
@@ -4568,7 +4969,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	    >= mddev->resync_max - mddev->curr_resync_completed) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
-			   atomic_read(&conf->reshape_stripes) == 0);
+			   atomic_read(&conf->reshape_stripes) == 0
+			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+		if (atomic_read(&conf->reshape_stripes) != 0)
+			goto ret;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
@@ -4576,13 +4980,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
-			   || kthread_should_stop());
+			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			goto ret;
 		spin_lock_irq(&conf->device_lock);
 		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
+ret:
 	return reshape_sectors;
 }
 
@@ -4636,9 +5043,10 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
 		*skipped = 1;
 		return rv;
 	}
-	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
-	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
-	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    !conf->fullsync &&
+	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    sync_blocks >= STRIPE_SECTORS) {
 		/* we can skip this block, and probably more */
 		sync_blocks /= STRIPE_SECTORS;
 		*skipped = 1;
@@ -4666,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
 	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
 
 	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
+	set_bit(STRIPE_HANDLE, &sh->state);
 
-	handle_stripe(sh);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -4692,10 +5100,11 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	int remaining;
 	int handled = 0;
 
-	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	logical_sector = raid_bio->bi_iter.bi_sector &
+		~((sector_t)STRIPE_SECTORS-1);
 	sector = raid5_compute_sector(conf, logical_sector,
 				      0, &dd_idx, NULL);
-	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+	last_sector = bio_end_sector(raid_bio);
 
 	for (; logical_sector < last_sector;
 	     logical_sector += STRIPE_SECTORS,
@@ -4706,7 +5115,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 			/* already done this stripe */
 			continue;
 
-		sh = get_active_stripe(conf, sector, 0, 1, 0);
+		sh = get_active_stripe(conf, sector, 0, 1, 1);
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
@@ -4728,38 +5137,93 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		handled++;
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0)
+	if (remaining == 0) {
+		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+					 raid_bio, 0);
 		bio_endio(raid_bio, 0);
+	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;
 }
 
-#define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, int group,
+				 struct r5worker *worker,
+				 struct list_head *temp_inactive_list)
 {
 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
-	int i, batch_size = 0;
+	int i, batch_size = 0, hash;
+	bool release_inactive = false;
 
 	while (batch_size < MAX_STRIPE_BATCH &&
-			(sh = __get_priority_stripe(conf)) != NULL)
+			(sh = __get_priority_stripe(conf, group)) != NULL)
 		batch[batch_size++] = sh;
 
-	if (batch_size == 0)
-		return batch_size;
+	if (batch_size == 0) {
+		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+			if (!list_empty(temp_inactive_list + i))
+				break;
+		if (i == NR_STRIPE_HASH_LOCKS)
+			return batch_size;
+		release_inactive = true;
+	}
 	spin_unlock_irq(&conf->device_lock);
 
+	release_inactive_stripe_list(conf, temp_inactive_list,
+				     NR_STRIPE_HASH_LOCKS);
+
+	if (release_inactive) {
+		spin_lock_irq(&conf->device_lock);
+		return 0;
+	}
+
 	for (i = 0; i < batch_size; i++)
 		handle_stripe(batch[i]);
 
 	cond_resched();
 
 	spin_lock_irq(&conf->device_lock);
-	for (i = 0; i < batch_size; i++)
-		__release_stripe(conf, batch[i]);
+	for (i = 0; i < batch_size; i++) {
+		hash = batch[i]->hash_lock_index;
+		__release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+	}
 	return batch_size;
 }
 
+static void raid5_do_work(struct work_struct *work)
+{
+	struct r5worker *worker = container_of(work, struct r5worker, work);
+	struct r5worker_group *group = worker->group;
+	struct r5conf *conf = group->conf;
+	int group_id = group - conf->worker_groups;
+	int handled;
+	struct blk_plug plug;
+
+	pr_debug("+++ raid5worker active\n");
+
+	blk_start_plug(&plug);
+	handled = 0;
+	spin_lock_irq(&conf->device_lock);
+	while (1) {
+		int batch_size, released;
+
+		released = release_stripe_list(conf, worker->temp_inactive_list);
+
+		batch_size = handle_active_stripes(conf, group_id, worker,
+						   worker->temp_inactive_list);
+		worker->working = false;
+		if (!batch_size && !released)
+			break;
+		handled += batch_size;
+	}
+	pr_debug("%d stripes handled\n", handled);
+
+	spin_unlock_irq(&conf->device_lock);
+	blk_finish_plug(&plug);
+
+	pr_debug("--- raid5worker inactive\n");
+}
+
 /*
  * This is our raid5 kernel thread.
  *
@@ -4783,7 +5247,9 @@ static void raid5d(struct md_thread *thread)
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct bio *bio;
-		int batch_size;
+		int batch_size, released;
+
+		released = release_stripe_list(conf, conf->temp_inactive_list);
 
 		if (
 		    !list_empty(&conf->bitmap_list)) {
@@ -4793,7 +5259,7 @@ static void raid5d(struct md_thread *thread)
 			bitmap_unplug(mddev->bitmap);
 			spin_lock_irq(&conf->device_lock);
 			conf->seq_write = conf->seq_flush;
-			activate_bit_delay(conf);
+			activate_bit_delay(conf, conf->temp_inactive_list);
 		}
 		raid5_activate_delayed(conf);
 
@@ -4807,8 +5273,9 @@ static void raid5d(struct md_thread *thread)
 			handled++;
 		}
 
-		batch_size = handle_active_stripes(conf);
-		if (!batch_size)
+		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+						   conf->temp_inactive_list);
+		if (!batch_size && !released)
 			break;
 		handled += batch_size;
 
@@ -4843,22 +5310,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
 	struct r5conf *conf = mddev->private;
 	int err;
+	int hash;
 
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
+	hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
 	while (size < conf->max_nr_stripes) {
-		if (drop_one_stripe(conf))
+		if (drop_one_stripe(conf, hash))
 			conf->max_nr_stripes--;
 		else
 			break;
+		hash--;
+		if (hash < 0)
+			hash = NR_STRIPE_HASH_LOCKS - 1;
 	}
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
+	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
 	while (size > conf->max_nr_stripes) {
-		if (grow_one_stripe(conf))
+		if (grow_one_stripe(conf, hash))
 			conf->max_nr_stripes++;
 		else break;
+		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
 	}
 	return 0;
 }
@@ -4876,7 +5350,7 @@ raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
 	if (!conf)
 		return -ENODEV;
 
-	if (strict_strtoul(page, 10, &new))
+	if (kstrtoul(page, 10, &new))
 		return -EINVAL;
 	err = raid5_set_cache_size(mddev, new);
 	if (err)
@@ -4909,7 +5383,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 	if (!conf)
 		return -ENODEV;
 
-	if (strict_strtoul(page, 10, &new))
+	if (kstrtoul(page, 10, &new))
 		return -EINVAL;
 	if (new > conf->max_nr_stripes)
 		return -EINVAL;
@@ -4924,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
 					raid5_store_preread_threshold);
 
 static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->skip_copy);
+	else
+		return 0;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+	new = !!new;
+	if (new == conf->skip_copy)
+		return len;
+
+	mddev_suspend(mddev);
+	conf->skip_copy = new;
+	if (new)
+		mddev->queue->backing_dev_info.capabilities |=
+						BDI_CAP_STABLE_WRITES;
+	else
+		mddev->queue->backing_dev_info.capabilities &=
+						~BDI_CAP_STABLE_WRITES;
+	mddev_resume(mddev);
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+					raid5_show_skip_copy,
+					raid5_store_skip_copy);
+
+
+static ssize_t
 stripe_cache_active_show(struct mddev *mddev, char *page)
 {
 	struct r5conf *conf = mddev->private;
@@ -4936,10 +5454,79 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
 static struct md_sysfs_entry
 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 
+static ssize_t
+raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->worker_cnt_per_group);
+	else
+		return 0;
+}
+
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+			       int *group_cnt,
+			       int *worker_cnt_per_group,
+			       struct r5worker_group **worker_groups);
+static ssize_t
+raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	int err;
+	struct r5worker_group *new_groups, *old_groups;
+	int group_cnt, worker_cnt_per_group;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new == conf->worker_cnt_per_group)
+		return len;
+
+	mddev_suspend(mddev);
+
+	old_groups = conf->worker_groups;
+	if (old_groups)
+		flush_workqueue(raid5_wq);
+
+	err = alloc_thread_groups(conf, new,
+				  &group_cnt, &worker_cnt_per_group,
+				  &new_groups);
+	if (!err) {
+		spin_lock_irq(&conf->device_lock);
+		conf->group_cnt = group_cnt;
+		conf->worker_cnt_per_group = worker_cnt_per_group;
+		conf->worker_groups = new_groups;
+		spin_unlock_irq(&conf->device_lock);
+
+		if (old_groups)
+			kfree(old_groups[0].workers);
+		kfree(old_groups);
+	}
+
+	mddev_resume(mddev);
+
+	if (err)
+		return err;
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
+				raid5_show_group_thread_cnt,
+				raid5_store_group_thread_cnt);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
+	&raid5_group_thread_cnt.attr,
+	&raid5_skip_copy.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -4947,6 +5534,61 @@ static struct attribute_group raid5_attrs_group = {
 	.attrs = raid5_attrs,
 };
 
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+			       int *group_cnt,
+			       int *worker_cnt_per_group,
+			       struct r5worker_group **worker_groups)
+{
+	int i, j, k;
+	ssize_t size;
+	struct r5worker *workers;
+
+	*worker_cnt_per_group = cnt;
+	if (cnt == 0) {
+		*group_cnt = 0;
+		*worker_groups = NULL;
+		return 0;
+	}
+	*group_cnt = num_possible_nodes();
+	size = sizeof(struct r5worker) * cnt;
+	workers = kzalloc(size * *group_cnt, GFP_NOIO);
+	*worker_groups = kzalloc(sizeof(struct r5worker_group) *
+				*group_cnt, GFP_NOIO);
+	if (!*worker_groups || !workers) {
+		kfree(workers);
+		kfree(*worker_groups);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < *group_cnt; i++) {
+		struct r5worker_group *group;
+
+		group = &(*worker_groups)[i];
+		INIT_LIST_HEAD(&group->handle_list);
+		group->conf = conf;
+		group->workers = workers + i * cnt;
+
+		for (j = 0; j < cnt; j++) {
+			struct r5worker *worker = group->workers + j;
+			worker->group = group;
+			INIT_WORK(&worker->work, raid5_do_work);
+
+			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+				INIT_LIST_HEAD(worker->temp_inactive_list + k);
+		}
+	}
+
+	return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+	if (conf->worker_groups)
+		kfree(conf->worker_groups[0].workers);
+	kfree(conf->worker_groups);
+	conf->worker_groups = NULL;
+}
+
 static sector_t
 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
@@ -4963,23 +5605,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
+static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+	safe_put_page(percpu->spare_page);
+	kfree(percpu->scribble);
+	percpu->spare_page = NULL;
+	percpu->scribble = NULL;
+}
+
+static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+	if (conf->level == 6 && !percpu->spare_page)
+		percpu->spare_page = alloc_page(GFP_KERNEL);
+	if (!percpu->scribble)
+		percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
+		free_scratch_buffer(conf, percpu);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
 static void raid5_free_percpu(struct r5conf *conf)
 {
-	struct raid5_percpu *percpu;
 	unsigned long cpu;
 
 	if (!conf->percpu)
 		return;
 
-	get_online_cpus();
-	for_each_possible_cpu(cpu) {
-		percpu = per_cpu_ptr(conf->percpu, cpu);
-		safe_put_page(percpu->spare_page);
-		kfree(percpu->scribble);
-	}
 #ifdef CONFIG_HOTPLUG_CPU
 	unregister_cpu_notifier(&conf->cpu_notify);
 #endif
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu)
+		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
 	put_online_cpus();
 
 	free_percpu(conf->percpu);
@@ -4987,6 +5649,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
 	kfree(conf->disks);
@@ -5005,15 +5668,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (conf->level == 6 && !percpu->spare_page)
-			percpu->spare_page = alloc_page(GFP_KERNEL);
-		if (!percpu->scribble)
-			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
-
-		if (!percpu->scribble ||
-		    (conf->level == 6 && !percpu->spare_page)) {
-			safe_put_page(percpu->spare_page);
-			kfree(percpu->scribble);
+		if (alloc_scratch_buffer(conf, percpu)) {
 			pr_err("%s: failed memory allocation for cpu%ld\n",
 			       __func__, cpu);
 			return notifier_from_errno(-ENOMEM);
@@ -5021,10 +5676,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		safe_put_page(percpu->spare_page);
-		kfree(percpu->scribble);
-		percpu->spare_page = NULL;
-		percpu->scribble = NULL;
+		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
 		break;
 	default:
 		break;
@@ -5036,40 +5688,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 static int raid5_alloc_percpu(struct r5conf *conf)
 {
 	unsigned long cpu;
-	struct page *spare_page;
-	struct raid5_percpu __percpu *allcpus;
-	void *scribble;
-	int err;
+	int err = 0;
 
-	allcpus = alloc_percpu(struct raid5_percpu);
-	if (!allcpus)
+	conf->percpu = alloc_percpu(struct raid5_percpu);
+	if (!conf->percpu)
 		return -ENOMEM;
-	conf->percpu = allcpus;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	conf->cpu_notify.notifier_call = raid456_cpu_notify;
+	conf->cpu_notify.priority = 0;
+	err = register_cpu_notifier(&conf->cpu_notify);
+	if (err)
+		return err;
+#endif
 
 	get_online_cpus();
-	err = 0;
 	for_each_present_cpu(cpu) {
-		if (conf->level == 6) {
-			spare_page = alloc_page(GFP_KERNEL);
-			if (!spare_page) {
-				err = -ENOMEM;
-				break;
-			}
-			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
-		}
-		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
-		if (!scribble) {
-			err = -ENOMEM;
+		err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+		if (err) {
+			pr_err("%s: failed memory allocation for cpu%ld\n",
+			       __func__, cpu);
 			break;
 		}
-		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
 	}
-#ifdef CONFIG_HOTPLUG_CPU
-	conf->cpu_notify.notifier_call = raid456_cpu_notify;
-	conf->cpu_notify.priority = 0;
-	if (err == 0)
-		err = register_cpu_notifier(&conf->cpu_notify);
-#endif
 	put_online_cpus();
 
 	return err;
@@ -5082,6 +5723,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	struct md_rdev *rdev;
 	struct disk_info *disk;
 	char pers_name[6];
+	int i;
+	int group_cnt, worker_cnt_per_group;
+	struct r5worker_group *new_group;
 
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
@@ -5115,14 +5759,23 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
+	/* Don't enable multi-threading by default*/
+	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
+				 &new_group)) {
+		conf->group_cnt = group_cnt;
+		conf->worker_cnt_per_group = worker_cnt_per_group;
+		conf->worker_groups = new_group;
+	} else
+		goto abort;
 	spin_lock_init(&conf->device_lock);
+	seqcount_init(&conf->gen_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
-	INIT_LIST_HEAD(&conf->inactive_list);
+	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
@@ -5147,6 +5800,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
 
+	/* We init hash_locks[0] separately to that it can be used
+	 * as the reference lock in the spin_lock_nest_lock() call
+	 * in lock_all_device_hash_locks_irq in order to convince
+	 * lockdep that we know what we are doing.
+	 */
+	spin_lock_init(conf->hash_locks);
+	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+		spin_lock_init(conf->hash_locks + i);
+
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+		INIT_LIST_HEAD(conf->inactive_list + i);
+
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+		INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
 	conf->level = mddev->new_level;
 	if (raid5_alloc_percpu(conf) != 0)
 		goto abort;
@@ -5187,7 +5855,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	else
 		conf->max_degraded = 1;
 	conf->algorithm = mddev->new_layout;
-	conf->max_nr_stripes = NR_STRIPES;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
 		conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5196,7 +5863,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-	if (grow_stripes(conf, conf->max_nr_stripes)) {
+	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
+	if (grow_stripes(conf, NR_STRIPES)) {
 		printk(KERN_ERR
 		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
 		       mdname(mddev), memory);
@@ -5418,7 +6086,7 @@ static int run(struct mddev *mddev)
 		if (mddev->major_version == 0 &&
 		    mddev->minor_version > 90)
 			rdev->recovery_offset = reshape_offset;
-			
+
 		if (rdev->recovery_offset < reshape_offset) {
 			/* We need to check old and new layout */
 			if (!only_parity(rdev->raid_disk,
@@ -5524,11 +6192,16 @@ static int run(struct mddev *mddev)
 		blk_queue_io_min(mddev->queue, chunk_size);
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
+		mddev->queue->limits.raid_partial_stripes_expensive = 1;
 		/*
 		 * We can only discard a whole stripe. It doesn't make sense to
 		 * discard data disk but write parity disk
 		 */
 		stripe = stripe * PAGE_SIZE;
+		/* Round up to power of 2, as discard handling
+		 * currently assumes that */
+		while ((stripe-1) & stripe)
+			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
 		/*
@@ -5537,6 +6210,8 @@ static int run(struct mddev *mddev)
 		 */
 		mddev->queue->limits.discard_zeroes_data = 0;
 
+		blk_queue_max_write_same_sectors(mddev->queue, 0);
+
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
@@ -5860,7 +6535,7 @@ static int check_reshape(struct mddev *mddev)
 		return 0; /* nothing to do */
 	if (has_failed(conf))
 		return -EINVAL;
-	if (mddev->delta_disks < 0) {
+	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
 		/* We might be able to shrink, but the devices must
 		 * be made bigger first.
 		 * For raid6, 4 is the minimum size.
@@ -5921,6 +6596,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 
 	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
+	write_seqcount_begin(&conf->gen_lock);
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
 	conf->prev_chunk_sectors = conf->chunk_sectors;
@@ -5937,8 +6613,16 @@ static int raid5_start_reshape(struct mddev *mddev)
 	else
 		conf->reshape_progress = 0;
 	conf->reshape_safe = conf->reshape_progress;
+	write_seqcount_end(&conf->gen_lock);
 	spin_unlock_irq(&conf->device_lock);
 
+	/* Now make sure any requests that proceeded on the assumption
+	 * the reshape wasn't running - like Discard or Read - have
+	 * completed.
+	 */
+	mddev_suspend(mddev);
+	mddev_resume(mddev);
+
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
 	 * Don't add devices if we are reducing the number of
@@ -5987,12 +6671,18 @@ static int raid5_start_reshape(struct mddev *mddev)
 	if (!mddev->sync_thread) {
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
+		write_seqcount_begin(&conf->gen_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		mddev->new_chunk_sectors =
+			conf->chunk_sectors = conf->prev_chunk_sectors;
+		mddev->new_layout = conf->algorithm = conf->prev_algo;
 		rdev_for_each(rdev, mddev)
 			rdev->new_data_offset = rdev->data_offset;
 		smp_wmb();
+		conf->generation --;
 		conf->reshape_progress = MaxSector;
 		mddev->reshape_position = MaxSector;
+		write_seqcount_end(&conf->gen_lock);
 		spin_unlock_irq(&conf->device_lock);
 		return -EAGAIN;
 	}
@@ -6080,27 +6770,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		break;
 
 	case 1: /* stop all writes */
-		spin_lock_irq(&conf->device_lock);
+		lock_all_device_hash_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
 		conf->quiesce = 2;
-		wait_event_lock_irq(conf->wait_for_stripe,
+		wait_event_cmd(conf->wait_for_stripe,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
-				    conf->device_lock, /* nothing */);
+				    unlock_all_device_hash_locks_irq(conf),
+				    lock_all_device_hash_locks_irq(conf));
 		conf->quiesce = 1;
-		spin_unlock_irq(&conf->device_lock);
+		unlock_all_device_hash_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
 		break;
 
 	case 0: /* re-enable writes */
-		spin_lock_irq(&conf->device_lock);
+		lock_all_device_hash_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_stripe);
 		wake_up(&conf->wait_for_overlap);
-		spin_unlock_irq(&conf->device_lock);
+		unlock_all_device_hash_locks_irq(conf);
 		break;
 	}
 }
@@ -6413,6 +7104,10 @@ static struct md_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+	raid5_wq = alloc_workqueue("raid5wq",
+		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+	if (!raid5_wq)
+		return -ENOMEM;
 	register_md_personality(&raid6_personality);
 	register_md_personality(&raid5_personality);
 	register_md_personality(&raid4_personality);
@@ -6424,6 +7119,7 @@ static void raid5_exit(void)
 	unregister_md_personality(&raid6_personality);
 	unregister_md_personality(&raid5_personality);
 	unregister_md_personality(&raid4_personality);
+	destroy_workqueue(raid5_wq);
 }
 
 module_init(raid5_init);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1f..bc72cd4be5f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -49,7 +49,7 @@
  * can't distinguish between a clean block that has been generated
  * from parity calculations, and a clean block that has been
  * successfully written to the spare ( or to parity when resyncing).
- * To distingush these states we have a stripe bit STRIPE_INSYNC that
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
  * is set whenever a write is scheduled to the spare, or to the parity
  * disc if there is no spare.  A sync request clears this bit, and
  * when we find it set with no buffers locked, we know the sync is
@@ -197,6 +197,7 @@ enum reconstruct_states {
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;	      /* inactive_list or handle_list */
+	struct llist_node	release_list;
 	struct r5conf		*raid_conf;
 	short			generation;	/* increments with every
 						 * reshape */
@@ -204,6 +205,7 @@ struct stripe_head {
 	short			pd_idx;		/* parity disk index */
 	short			qd_idx;		/* 'Q' disk index for raid6 */
 	short			ddf_layout;/* use DDF ordering to calculate Q */
+	short			hash_lock_index;
 	unsigned long		state;		/* state flags */
 	atomic_t		count;	      /* nr of active thread/requests */
 	int			bm_seq;	/* sequence number for bitmap flushes */
@@ -211,6 +213,8 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
+	struct r5worker_group	*group;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -221,10 +225,6 @@ struct stripe_head {
 	struct stripe_operations {
 		int 		     target, target2;
 		enum sum_check_flags zero_sum_result;
-		#ifdef CONFIG_MULTICORE_RAID456
-		unsigned long	     request;
-		wait_queue_head_t    wait_for_ops;
-		#endif
 	} ops;
 	struct r5dev {
 		/* rreq and rvec are used for the replacement device when
@@ -232,7 +232,7 @@ struct stripe_head {
 		 */
 		struct bio	req, rreq;
 		struct bio_vec	vec, rvec;
-		struct page	*page;
+		struct page	*page, *orig_page;
 		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
@@ -299,6 +299,7 @@ enum r5dev_flags {
 			 * data in, and now is a good time to write it out.
 			 */
 	R5_Discard,	/* Discard the stripe */
+	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
 };
 
 /*
@@ -310,6 +311,7 @@ enum {
 	STRIPE_SYNC_REQUESTED,
 	STRIPE_SYNCING,
 	STRIPE_INSYNC,
+	STRIPE_REPLACED,
 	STRIPE_PREREAD_ACTIVE,
 	STRIPE_DELAYED,
 	STRIPE_DEGRADED,
@@ -323,6 +325,8 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_DISCARD,
+	STRIPE_ON_RELEASE_LIST,
 };
 
 /*
@@ -365,8 +369,32 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
+struct r5worker {
+	struct work_struct work;
+	struct r5worker_group *group;
+	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+	bool working;
+};
+
+struct r5worker_group {
+	struct list_head handle_list;
+	struct r5conf *conf;
+	struct r5worker *workers;
+	int stripes_cnt;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
+	/* only protect corresponding hash list and inactive_list */
+	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
 	struct mddev		*mddev;
 	int			chunk_sectors;
 	int			level, algorithm;
@@ -388,6 +416,7 @@ struct r5conf {
 	int			prev_chunk_sectors;
 	int			prev_algo;
 	short			generation; /* increments with every reshape */
+	seqcount_t		gen_lock;	/* lock against generation changes */
 	unsigned long		reshape_checkpoint; /* Time we last updated
 						     * metadata */
 	long long		min_offset_diff; /* minimum difference between
@@ -408,6 +437,7 @@ struct r5conf {
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
+	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
@@ -446,7 +476,9 @@ struct r5conf {
 	 * Free stripes pool
 	 */
 	atomic_t		active_stripes;
-	struct list_head	inactive_list;
+	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
+	atomic_t		empty_inactive_list_nr;
+	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	int			inactive_blocked;	/* release of inactive stripes blocked,
@@ -460,6 +492,10 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+	struct r5worker_group	*worker_groups;
+	int			group_cnt;
+	int			worker_cnt_per_group;
 };
 
 /*