aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-11-02 17:02:37 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-11-02 17:02:37 -0700
commit43672a0784707d795556b1f93925da8b8e797d03 (patch)
tree5c92aabd211281300f89fc2e69e9ee7e58bcc449 /drivers/md
parent2380078cdb7e6d520e33dcf834e0be979d542e48 (diff)
parent2e727c3ca1beff05f27b6207a795790f222bf8d8 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/linux-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/steve/linux-dm: dm: raid fix device status indicator when array initializing dm log userspace: add log device dependency dm log userspace: fix comment hyphens dm: add thin provisioning target dm: add persistent data library dm: add bufio dm: export dm get md dm table: add immutable feature dm table: add always writeable feature dm table: add singleton feature dm kcopyd: add dm_kcopyd_zero to zero an area dm: remove superfluous smp_mb dm: use local printk ratelimit dm table: propagate non rotational flag
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig36
-rw-r--r--drivers/md/Makefile4
-rw-r--r--drivers/md/dm-bufio.c1699
-rw-r--r--drivers/md/dm-bufio.h112
-rw-r--r--drivers/md/dm-ioctl.c11
-rw-r--r--drivers/md/dm-kcopyd.c31
-rw-r--r--drivers/md/dm-log-userspace-base.c37
-rw-r--r--drivers/md/dm-raid.c48
-rw-r--r--drivers/md/dm-table.c73
-rw-r--r--drivers/md/dm-thin-metadata.c1391
-rw-r--r--drivers/md/dm-thin-metadata.h156
-rw-r--r--drivers/md/dm-thin.c2428
-rw-r--r--drivers/md/dm.c21
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/persistent-data/Kconfig8
-rw-r--r--drivers/md/persistent-data/Makefile11
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c620
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h123
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h137
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c566
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c244
-rw-r--r--drivers/md/persistent-data/dm-btree.c805
-rw-r--r--drivers/md/persistent-data/dm-btree.h145
-rw-r--r--drivers/md/persistent-data/dm-persistent-data-internal.h19
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.c437
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.h26
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c705
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h126
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c335
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.h25
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c596
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.h33
-rw-r--r--drivers/md/persistent-data/dm-space-map.h134
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c400
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h130
35 files changed, 11653 insertions, 21 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f75a66e7d31..faa4741df6d 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -208,6 +208,16 @@ config DM_DEBUG
If unsure, say N.
+config DM_BUFIO
+ tristate
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ This interface allows you to do buffered I/O on a device and acts
+ as a cache, holding recently-read blocks in memory and performing
+ delayed writes.
+
+source "drivers/md/persistent-data/Kconfig"
+
config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM
@@ -233,6 +243,32 @@ config DM_SNAPSHOT
---help---
Allow volume managers to take writable snapshots of a device.
+config DM_THIN_PROVISIONING
+ tristate "Thin provisioning target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select DM_PERSISTENT_DATA
+ ---help---
+ Provides thin provisioning and snapshots that share a data store.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+ boolean "Keep stack trace of thin provisioning block lock holders"
+ depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
+ select STACKTRACE
+ ---help---
+ Enable this for messages that may help debug problems with the
+ block manager locking used by thin provisioning.
+
+ If unsure, say N.
+
+config DM_DEBUG_SPACE_MAPS
+ boolean "Extra validation for thin provisioning space maps"
+ depends on DM_THIN_PROVISIONING
+ ---help---
+ Enable this for messages that may help debug problems with the
+ space maps used by thin provisioning.
+
+ If unsure, say N.
+
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 448838b1f92..046860c7a16 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -10,6 +10,7 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
+dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
+obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
@@ -34,10 +36,12 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
+obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
+obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
new file mode 100644
index 00000000000..cb246667dd5
--- /dev/null
+++ b/drivers/md/dm-bufio.c
@@ -0,0 +1,1699 @@
+/*
+ * Copyright (C) 2009-2011 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bufio.h"
+
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+#include <linux/shrinker.h>
+
+#define DM_MSG_PREFIX "bufio"
+
+/*
+ * Memory management policy:
+ * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
+ * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
+ * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
+ * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
+ * dirty buffers.
+ */
+#define DM_BUFIO_MIN_BUFFERS 8
+
+#define DM_BUFIO_MEMORY_PERCENT 2
+#define DM_BUFIO_VMALLOC_PERCENT 25
+#define DM_BUFIO_WRITEBACK_PERCENT 75
+
+/*
+ * Check buffer ages in this interval (seconds)
+ */
+#define DM_BUFIO_WORK_TIMER_SECS 10
+
+/*
+ * Free buffers when they are older than this (seconds)
+ */
+#define DM_BUFIO_DEFAULT_AGE_SECS 60
+
+/*
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
+ */
+#define DM_BUFIO_INLINE_VECS 16
+
+/*
+ * Buffer hash
+ */
+#define DM_BUFIO_HASH_BITS 20
+#define DM_BUFIO_HASH(block) \
+ ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
+ ((1 << DM_BUFIO_HASH_BITS) - 1))
+
+/*
+ * Don't try to use kmem_cache_alloc for blocks larger than this.
+ * For explanation, see alloc_buffer_data below.
+ */
+#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
+#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
+
+/*
+ * dm_buffer->list_mode
+ */
+#define LIST_CLEAN 0
+#define LIST_DIRTY 1
+#define LIST_SIZE 2
+
+/*
+ * Linking of buffers:
+ * All buffers are linked to cache_hash with their hash_list field.
+ *
+ * Clean buffers that are not being written (B_WRITING not set)
+ * are linked to lru[LIST_CLEAN] with their lru_list field.
+ *
+ * Dirty and clean buffers that are being written are linked to
+ * lru[LIST_DIRTY] with their lru_list field. When the write
+ * finishes, the buffer cannot be relinked immediately (because we
+ * are in an interrupt context and relinking requires process
+ * context), so some clean-not-writing buffers can be held on
+ * dirty_lru too. They are later added to lru in the process
+ * context.
+ */
+struct dm_bufio_client {
+ struct mutex lock;
+
+ struct list_head lru[LIST_SIZE];
+ unsigned long n_buffers[LIST_SIZE];
+
+ struct block_device *bdev;
+ unsigned block_size;
+ unsigned char sectors_per_block_bits;
+ unsigned char pages_per_block_bits;
+ unsigned char blocks_per_page_bits;
+ unsigned aux_size;
+ void (*alloc_callback)(struct dm_buffer *);
+ void (*write_callback)(struct dm_buffer *);
+
+ struct dm_io_client *dm_io;
+
+ struct list_head reserved_buffers;
+ unsigned need_reserved_buffers;
+
+ struct hlist_head *cache_hash;
+ wait_queue_head_t free_buffer_wait;
+
+ int async_write_error;
+
+ struct list_head client_list;
+ struct shrinker shrinker;
+};
+
+/*
+ * Buffer state bits.
+ */
+#define B_READING 0
+#define B_WRITING 1
+#define B_DIRTY 2
+
+/*
+ * Describes how the block was allocated:
+ * kmem_cache_alloc(), __get_free_pages() or vmalloc().
+ * See the comment at alloc_buffer_data.
+ */
+enum data_mode {
+ DATA_MODE_SLAB = 0,
+ DATA_MODE_GET_FREE_PAGES = 1,
+ DATA_MODE_VMALLOC = 2,
+ DATA_MODE_LIMIT = 3
+};
+
+struct dm_buffer {
+ struct hlist_node hash_list;
+ struct list_head lru_list;
+ sector_t block;
+ void *data;
+ enum data_mode data_mode;
+ unsigned char list_mode; /* LIST_* */
+ unsigned hold_count;
+ int read_error;
+ int write_error;
+ unsigned long state;
+ unsigned long last_accessed;
+ struct dm_bufio_client *c;
+ struct bio bio;
+ struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
+};
+
+/*----------------------------------------------------------------*/
+
+static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
+static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
+
+static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
+{
+ unsigned ret = c->blocks_per_page_bits - 1;
+
+ BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
+
+ return ret;
+}
+
+#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
+#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
+
+#define dm_bufio_in_request() (!!current->bio_list)
+
+static void dm_bufio_lock(struct dm_bufio_client *c)
+{
+ mutex_lock_nested(&c->lock, dm_bufio_in_request());
+}
+
+static int dm_bufio_trylock(struct dm_bufio_client *c)
+{
+ return mutex_trylock(&c->lock);
+}
+
+static void dm_bufio_unlock(struct dm_bufio_client *c)
+{
+ mutex_unlock(&c->lock);
+}
+
+/*
+ * FIXME Move to sched.h?
+ */
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+# define dm_bufio_cond_resched() \
+do { \
+ if (unlikely(need_resched())) \
+ _cond_resched(); \
+} while (0)
+#else
+# define dm_bufio_cond_resched() do { } while (0)
+#endif
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Default cache size: available memory divided by the ratio.
+ */
+static unsigned long dm_bufio_default_cache_size;
+
+/*
+ * Total cache size set by the user.
+ */
+static unsigned long dm_bufio_cache_size;
+
+/*
+ * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
+ * at any time. If it disagrees, the user has changed cache size.
+ */
+static unsigned long dm_bufio_cache_size_latch;
+
+static DEFINE_SPINLOCK(param_spinlock);
+
+/*
+ * Buffers are freed after this timeout
+ */
+static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+
+static unsigned long dm_bufio_peak_allocated;
+static unsigned long dm_bufio_allocated_kmem_cache;
+static unsigned long dm_bufio_allocated_get_free_pages;
+static unsigned long dm_bufio_allocated_vmalloc;
+static unsigned long dm_bufio_current_allocated;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
+ */
+static unsigned long dm_bufio_cache_size_per_client;
+
+/*
+ * The current number of clients.
+ */
+static int dm_bufio_client_count;
+
+/*
+ * The list of all clients.
+ */
+static LIST_HEAD(dm_bufio_all_clients);
+
+/*
+ * This mutex protects dm_bufio_cache_size_latch,
+ * dm_bufio_cache_size_per_client and dm_bufio_client_count
+ */
+static DEFINE_MUTEX(dm_bufio_clients_lock);
+
+/*----------------------------------------------------------------*/
+
+static void adjust_total_allocated(enum data_mode data_mode, long diff)
+{
+ static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
+ &dm_bufio_allocated_kmem_cache,
+ &dm_bufio_allocated_get_free_pages,
+ &dm_bufio_allocated_vmalloc,
+ };
+
+ spin_lock(&param_spinlock);
+
+ *class_ptr[data_mode] += diff;
+
+ dm_bufio_current_allocated += diff;
+
+ if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
+ dm_bufio_peak_allocated = dm_bufio_current_allocated;
+
+ spin_unlock(&param_spinlock);
+}
+
+/*
+ * Change the number of clients and recalculate per-client limit.
+ */
+static void __cache_size_refresh(void)
+{
+ BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
+ BUG_ON(dm_bufio_client_count < 0);
+
+ dm_bufio_cache_size_latch = dm_bufio_cache_size;
+
+ barrier();
+
+ /*
+ * Use default if set to 0 and report the actual cache size used.
+ */
+ if (!dm_bufio_cache_size_latch) {
+ (void)cmpxchg(&dm_bufio_cache_size, 0,
+ dm_bufio_default_cache_size);
+ dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
+ }
+
+ dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
+ (dm_bufio_client_count ? : 1);
+}
+
+/*
+ * Allocating buffer data.
+ *
+ * Small buffers are allocated with kmem_cache, to use space optimally.
+ *
+ * For large buffers, we choose between get_free_pages and vmalloc.
+ * Each has advantages and disadvantages.
+ *
+ * __get_free_pages can randomly fail if the memory is fragmented.
+ * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
+ * as low as 128M) so using it for caching is not appropriate.
+ *
+ * If the allocation may fail we use __get_free_pages. Memory fragmentation
+ * won't have a fatal effect here, but it just causes flushes of some other
+ * buffers and more I/O will be performed. Don't use __get_free_pages if it
+ * always fails (i.e. order >= MAX_ORDER).
+ *
+ * If the allocation shouldn't fail we use __vmalloc. This is only for the
+ * initial reserve allocation, so there's no risk of wasting all vmalloc
+ * space.
+ */
+static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
+ enum data_mode *data_mode)
+{
+ if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
+ *data_mode = DATA_MODE_SLAB;
+ return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
+ }
+
+ if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
+ gfp_mask & __GFP_NORETRY) {
+ *data_mode = DATA_MODE_GET_FREE_PAGES;
+ return (void *)__get_free_pages(gfp_mask,
+ c->pages_per_block_bits);
+ }
+
+ *data_mode = DATA_MODE_VMALLOC;
+ return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+}
+
+/*
+ * Free buffer's data.
+ */
+static void free_buffer_data(struct dm_bufio_client *c,
+ void *data, enum data_mode data_mode)
+{
+ switch (data_mode) {
+ case DATA_MODE_SLAB:
+ kmem_cache_free(DM_BUFIO_CACHE(c), data);
+ break;
+
+ case DATA_MODE_GET_FREE_PAGES:
+ free_pages((unsigned long)data, c->pages_per_block_bits);
+ break;
+
+ case DATA_MODE_VMALLOC:
+ vfree(data);
+ break;
+
+ default:
+ DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
+ data_mode);
+ BUG();
+ }
+}
+
+/*
+ * Allocate buffer and its data.
+ */
+static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+ struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
+ gfp_mask);
+
+ if (!b)
+ return NULL;
+
+ b->c = c;
+
+ b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
+ if (!b->data) {
+ kfree(b);
+ return NULL;
+ }
+
+ adjust_total_allocated(b->data_mode, (long)c->block_size);
+
+ return b;
+}
+
+/*
+ * Free buffer and its data.
+ */
+static void free_buffer(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ adjust_total_allocated(b->data_mode, -(long)c->block_size);
+
+ free_buffer_data(c, b->data, b->data_mode);
+ kfree(b);
+}
+
+/*
+ * Link buffer to the hash list and clean or dirty queue.
+ */
+static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
+{
+ struct dm_bufio_client *c = b->c;
+
+ c->n_buffers[dirty]++;
+ b->block = block;
+ b->list_mode = dirty;
+ list_add(&b->lru_list, &c->lru[dirty]);
+ hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+ b->last_accessed = jiffies;
+}
+
+/*
+ * Unlink buffer from the hash list and dirty or clean queue.
+ */
+static void __unlink_buffer(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ BUG_ON(!c->n_buffers[b->list_mode]);
+
+ c->n_buffers[b->list_mode]--;
+ hlist_del(&b->hash_list);
+ list_del(&b->lru_list);
+}
+
+/*
+ * Place the buffer to the head of dirty or clean LRU queue.
+ */
+static void __relink_lru(struct dm_buffer *b, int dirty)
+{
+ struct dm_bufio_client *c = b->c;
+
+ BUG_ON(!c->n_buffers[b->list_mode]);
+
+ c->n_buffers[b->list_mode]--;
+ c->n_buffers[dirty]++;
+ b->list_mode = dirty;
+ list_del(&b->lru_list);
+ list_add(&b->lru_list, &c->lru[dirty]);
+}
+
+/*----------------------------------------------------------------
+ * Submit I/O on the buffer.
+ *
+ * Bio interface is faster but it has some problems:
+ * the vector list is limited (increasing this limit increases
+ * memory-consumption per buffer, so it is not viable);
+ *
+ * the memory must be direct-mapped, not vmalloced;
+ *
+ * the I/O driver can reject requests spuriously if it thinks that
+ * the requests are too big for the device or if they cross a
+ * controller-defined memory boundary.
+ *
+ * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
+ * it is not vmalloced, try using the bio interface.
+ *
+ * If the buffer is big, if it is vmalloced or if the underlying device
+ * rejects the bio because it is too large, use dm-io layer to do the I/O.
+ * The dm-io layer splits the I/O into multiple requests, avoiding the above
+ * shortcomings.
+ *--------------------------------------------------------------*/
+
+/*
+ * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
+ * that the request was handled directly with bio interface.
+ */
+static void dmio_complete(unsigned long error, void *context)
+{
+ struct dm_buffer *b = context;
+
+ b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
+}
+
+static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
+ bio_end_io_t *end_io)
+{
+ int r;
+ struct dm_io_request io_req = {
+ .bi_rw = rw,
+ .notify.fn = dmio_complete,
+ .notify.context = b,
+ .client = b->c->dm_io,
+ };
+ struct dm_io_region region = {
+ .bdev = b->c->bdev,
+ .sector = block << b->c->sectors_per_block_bits,
+ .count = b->c->block_size >> SECTOR_SHIFT,
+ };
+
+ if (b->data_mode != DATA_MODE_VMALLOC) {
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = b->data;
+ } else {
+ io_req.mem.type = DM_IO_VMA;
+ io_req.mem.ptr.vma = b->data;
+ }
+
+ b->bio.bi_end_io = end_io;
+
+ r = dm_io(&io_req, 1, &region, NULL);
+ if (r)
+ end_io(&b->bio, r);
+}
+
+static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
+ bio_end_io_t *end_io)
+{
+ char *ptr;
+ int len;
+
+ bio_init(&b->bio);
+ b->bio.bi_io_vec = b->bio_vec;
+ b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
+ b->bio.bi_sector = block << b->c->sectors_per_block_bits;
+ b->bio.bi_bdev = b->c->bdev;
+ b->bio.bi_end_io = end_io;
+
+ /*
+ * We assume that if len >= PAGE_SIZE ptr is page-aligned.
+ * If len < PAGE_SIZE the buffer doesn't cross page boundary.
+ */
+ ptr = b->data;
+ len = b->c->block_size;
+
+ if (len >= PAGE_SIZE)
+ BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
+ else
+ BUG_ON((unsigned long)ptr & (len - 1));
+
+ do {
+ if (!bio_add_page(&b->bio, virt_to_page(ptr),
+ len < PAGE_SIZE ? len : PAGE_SIZE,
+ virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
+ BUG_ON(b->c->block_size <= PAGE_SIZE);
+ use_dmio(b, rw, block, end_io);
+ return;
+ }
+
+ len -= PAGE_SIZE;
+ ptr += PAGE_SIZE;
+ } while (len > 0);
+
+ submit_bio(rw, &b->bio);
+}
+
+static void submit_io(struct dm_buffer *b, int rw, sector_t block,
+ bio_end_io_t *end_io)
+{
+ if (rw == WRITE && b->c->write_callback)
+ b->c->write_callback(b);
+
+ if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
+ b->data_mode != DATA_MODE_VMALLOC)
+ use_inline_bio(b, rw, block, end_io);
+ else
+ use_dmio(b, rw, block, end_io);
+}
+
+/*----------------------------------------------------------------
+ * Writing dirty buffers
+ *--------------------------------------------------------------*/
+
+/*
+ * The endio routine for write.
+ *
+ * Set the error, clear B_WRITING bit and wake anyone who was waiting on
+ * it.
+ */
+static void write_endio(struct bio *bio, int error)
+{
+ struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+
+ b->write_error = error;
+ if (error) {
+ struct dm_bufio_client *c = b->c;
+ (void)cmpxchg(&c->async_write_error, 0, error);
+ }
+
+ BUG_ON(!test_bit(B_WRITING, &b->state));
+
+ smp_mb__before_clear_bit();
+ clear_bit(B_WRITING, &b->state);
+ smp_mb__after_clear_bit();
+
+ wake_up_bit(&b->state, B_WRITING);
+}
+
+/*
+ * This function is called when wait_on_bit is actually waiting.
+ */
+static int do_io_schedule(void *word)
+{
+ io_schedule();
+
+ return 0;
+}
+
+/*
+ * Initiate a write on a dirty buffer, but don't wait for it.
+ *
+ * - If the buffer is not dirty, exit.
+ * - If there some previous write going on, wait for it to finish (we can't
+ * have two writes on the same buffer simultaneously).
+ * - Submit our write and don't wait on it. We set B_WRITING indicating
+ * that there is a write in progress.
+ */
+static void __write_dirty_buffer(struct dm_buffer *b)
+{
+ if (!test_bit(B_DIRTY, &b->state))
+ return;
+
+ clear_bit(B_DIRTY, &b->state);
+ wait_on_bit_lock(&b->state, B_WRITING,
+ do_io_schedule, TASK_UNINTERRUPTIBLE);
+
+ submit_io(b, WRITE, b->block, write_endio);
+}
+
+/*
+ * Wait until any activity on the buffer finishes. Possibly write the
+ * buffer if it is dirty. When this function finishes, there is no I/O
+ * running on the buffer and the buffer is not dirty.
+ */
+static void __make_buffer_clean(struct dm_buffer *b)
+{
+ BUG_ON(b->hold_count);
+
+ if (!b->state) /* fast case */
+ return;
+
+ wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+ __write_dirty_buffer(b);
+ wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Find some buffer that is not held by anybody, clean it, unlink it and
+ * return it.
+ */
+static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+
+ list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
+ BUG_ON(test_bit(B_WRITING, &b->state));
+ BUG_ON(test_bit(B_DIRTY, &b->state));
+
+ if (!b->hold_count) {
+ __make_buffer_clean(b);
+ __unlink_buffer(b);
+ return b;
+ }
+ dm_bufio_cond_resched();
+ }
+
+ list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
+ BUG_ON(test_bit(B_READING, &b->state));
+
+ if (!b->hold_count) {
+ __make_buffer_clean(b);
+ __unlink_buffer(b);
+ return b;
+ }
+ dm_bufio_cond_resched();
+ }
+
+ return NULL;
+}
+
+/*
+ * Wait until some other threads free some buffer or release hold count on
+ * some buffer.
+ *
+ * This function is entered with c->lock held, drops it and regains it
+ * before exiting.
+ */
+static void __wait_for_free_buffer(struct dm_bufio_client *c)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&c->free_buffer_wait, &wait);
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ dm_bufio_unlock(c);
+
+ io_schedule();
+
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&c->free_buffer_wait, &wait);
+
+ dm_bufio_lock(c);
+}
+
+/*
+ * Allocate a new buffer. If the allocation is not possible, wait until
+ * some other thread frees a buffer.
+ *
+ * May drop the lock and regain it.
+ */
+static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+
+ /*
+ * dm-bufio is resistant to allocation failures (it just keeps
+ * one buffer reserved in cases all the allocations fail).
+ * So set flags to not try too hard:
+ * GFP_NOIO: don't recurse into the I/O layer
+ * __GFP_NORETRY: don't retry and rather return failure
+ * __GFP_NOMEMALLOC: don't use emergency reserves
+ * __GFP_NOWARN: don't print a warning in case of failure
+ *
+ * For debugging, if we set the cache size to 1, no new buffers will
+ * be allocated.
+ */
+ while (1) {
+ if (dm_bufio_cache_size_latch != 1) {
+ b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (b)
+ return b;
+ }
+
+ if (!list_empty(&c->reserved_buffers)) {
+ b = list_entry(c->reserved_buffers.next,
+ struct dm_buffer, lru_list);
+ list_del(&b->lru_list);
+ c->need_reserved_buffers++;
+
+ return b;
+ }
+
+ b = __get_unclaimed_buffer(c);
+ if (b)
+ return b;
+
+ __wait_for_free_buffer(c);
+ }
+}
+
+static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);
+
+ if (c->alloc_callback)
+ c->alloc_callback(b);
+
+ return b;
+}
+
+/*
+ * Free a buffer and wake other threads waiting for free buffers.
+ */
+static void __free_buffer_wake(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ if (!c->need_reserved_buffers)
+ free_buffer(b);
+ else {
+ list_add(&b->lru_list, &c->r