aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-11-02 17:02:37 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-11-02 17:02:37 -0700
commit43672a0784707d795556b1f93925da8b8e797d03 (patch)
tree5c92aabd211281300f89fc2e69e9ee7e58bcc449
parent2380078cdb7e6d520e33dcf834e0be979d542e48 (diff)
parent2e727c3ca1beff05f27b6207a795790f222bf8d8 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/linux-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/steve/linux-dm: dm: raid fix device status indicator when array initializing dm log userspace: add log device dependency dm log userspace: fix comment hyphens dm: add thin provisioning target dm: add persistent data library dm: add bufio dm: export dm get md dm table: add immutable feature dm table: add always writeable feature dm table: add singleton feature dm kcopyd: add dm_kcopyd_zero to zero an area dm: remove superfluous smp_mb dm: use local printk ratelimit dm table: propagate non rotational flag
-rw-r--r--Documentation/device-mapper/dm-log.txt2
-rw-r--r--Documentation/device-mapper/persistent-data.txt84
-rw-r--r--Documentation/device-mapper/thin-provisioning.txt285
-rw-r--r--drivers/md/Kconfig36
-rw-r--r--drivers/md/Makefile4
-rw-r--r--drivers/md/dm-bufio.c1699
-rw-r--r--drivers/md/dm-bufio.h112
-rw-r--r--drivers/md/dm-ioctl.c11
-rw-r--r--drivers/md/dm-kcopyd.c31
-rw-r--r--drivers/md/dm-log-userspace-base.c37
-rw-r--r--drivers/md/dm-raid.c48
-rw-r--r--drivers/md/dm-table.c73
-rw-r--r--drivers/md/dm-thin-metadata.c1391
-rw-r--r--drivers/md/dm-thin-metadata.h156
-rw-r--r--drivers/md/dm-thin.c2428
-rw-r--r--drivers/md/dm.c21
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/persistent-data/Kconfig8
-rw-r--r--drivers/md/persistent-data/Makefile11
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c620
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h123
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h137
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c566
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c244
-rw-r--r--drivers/md/persistent-data/dm-btree.c805
-rw-r--r--drivers/md/persistent-data/dm-btree.h145
-rw-r--r--drivers/md/persistent-data/dm-persistent-data-internal.h19
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.c437
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.h26
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c705
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h126
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c335
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.h25
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c596
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.h33
-rw-r--r--drivers/md/persistent-data/dm-space-map.h134
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c400
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h130
-rw-r--r--include/linux/device-mapper.h45
-rw-r--r--include/linux/dm-ioctl.h4
-rw-r--r--include/linux/dm-kcopyd.h4
-rw-r--r--include/linux/dm-log-userspace.h18
42 files changed, 12079 insertions, 37 deletions
diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
index 994dd75475a..c155ac569c4 100644
--- a/Documentation/device-mapper/dm-log.txt
+++ b/Documentation/device-mapper/dm-log.txt
@@ -48,7 +48,7 @@ kernel and userspace, 'connector' is used as the interface for
communication.
There are currently two userspace log implementations that leverage this
-framework - "clustered_disk" and "clustered_core". These implementations
+framework - "clustered-disk" and "clustered-core". These implementations
provide a cluster-coherent log for shared-storage. Device-mapper mirroring
can be used in a shared-storage environment when the cluster log implementations
are employed.
diff --git a/Documentation/device-mapper/persistent-data.txt b/Documentation/device-mapper/persistent-data.txt
new file mode 100644
index 00000000000..0e5df9b04ad
--- /dev/null
+++ b/Documentation/device-mapper/persistent-data.txt
@@ -0,0 +1,84 @@
+Introduction
+============
+
+The more-sophisticated device-mapper targets require complex metadata
+that is managed in kernel. In late 2010 we were seeing that various
+different targets were rolling their own data strutures, for example:
+
+- Mikulas Patocka's multisnap implementation
+- Heinz Mauelshagen's thin provisioning target
+- Another btree-based caching target posted to dm-devel
+- Another multi-snapshot target based on a design of Daniel Phillips
+
+Maintaining these data structures takes a lot of work, so if possible
+we'd like to reduce the number.
+
+The persistent-data library is an attempt to provide a re-usable
+framework for people who want to store metadata in device-mapper
+targets. It's currently used by the thin-provisioning target and an
+upcoming hierarchical storage target.
+
+Overview
+========
+
+The main documentation is in the header files which can all be found
+under drivers/md/persistent-data.
+
+The block manager
+-----------------
+
+dm-block-manager.[hc]
+
+This provides access to the data on disk in fixed sized-blocks. There
+is a read/write locking interface to prevent concurrent accesses, and
+keep data that is being used in the cache.
+
+Clients of persistent-data are unlikely to use this directly.
+
+The transaction manager
+-----------------------
+
+dm-transaction-manager.[hc]
+
+This restricts access to blocks and enforces copy-on-write semantics.
+The only way you can get hold of a writable block through the
+transaction manager is by shadowing an existing block (ie. doing
+copy-on-write) or allocating a fresh one. Shadowing is elided within
+the same transaction so performance is reasonable. The commit method
+ensures that all data is flushed before it writes the superblock.
+On power failure your metadata will be as it was when last committed.
+
+The Space Maps
+--------------
+
+dm-space-map.h
+dm-space-map-metadata.[hc]
+dm-space-map-disk.[hc]
+
+On-disk data structures that keep track of reference counts of blocks.
+Also acts as the allocator of new blocks. Currently two
+implementations: a simpler one for managing blocks on a different
+device (eg. thinly-provisioned data blocks); and one for managing
+the metadata space. The latter is complicated by the need to store
+its own data within the space it's managing.
+
+The data structures
+-------------------
+
+dm-btree.[hc]
+dm-btree-remove.c
+dm-btree-spine.c
+dm-btree-internal.h
+
+Currently there is only one data structure, a hierarchical btree.
+There are plans to add more. For example, something with an
+array-like interface would see a lot of use.
+
+The btree is 'hierarchical' in that you can define it to be composed
+of nested btrees, and take multiple keys. For example, the
+thin-provisioning target uses a btree with two levels of nesting.
+The first maps a device id to a mapping tree, and that in turn maps a
+virtual block to a physical block.
+
+Values stored in the btrees can have arbitrary size. Keys are always
+64bits, although nesting allows you to use multiple keys.
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
new file mode 100644
index 00000000000..801d9d1cf82
--- /dev/null
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -0,0 +1,285 @@
+Introduction
+============
+
+This document descibes a collection of device-mapper targets that
+between them implement thin-provisioning and snapshots.
+
+The main highlight of this implementation, compared to the previous
+implementation of snapshots, is that it allows many virtual devices to
+be stored on the same data volume. This simplifies administration and
+allows the sharing of data between volumes, thus reducing disk usage.
+
+Another significant feature is support for an arbitrary depth of
+recursive snapshots (snapshots of snapshots of snapshots ...). The
+previous implementation of snapshots did this by chaining together
+lookup tables, and so performance was O(depth). This new
+implementation uses a single data structure to avoid this degradation
+with depth. Fragmentation may still be an issue, however, in some
+scenarios.
+
+Metadata is stored on a separate device from data, giving the
+administrator some freedom, for example to:
+
+- Improve metadata resilience by storing metadata on a mirrored volume
+ but data on a non-mirrored one.
+
+- Improve performance by storing the metadata on SSD.
+
+Status
+======
+
+These targets are very much still in the EXPERIMENTAL state. Please
+do not yet rely on them in production. But do experiment and offer us
+feedback. Different use cases will have different performance
+characteristics, for example due to fragmentation of the data volume.
+
+If you find this software is not performing as expected please mail
+dm-devel@redhat.com with details and we'll try our best to improve
+things for you.
+
+Userspace tools for checking and repairing the metadata are under
+development.
+
+Cookbook
+========
+
+This section describes some quick recipes for using thin provisioning.
+They use the dmsetup program to control the device-mapper driver
+directly. End users will be advised to use a higher-level volume
+manager such as LVM2 once support has been added.
+
+Pool device
+-----------
+
+The pool device ties together the metadata volume and the data volume.
+It maps I/O linearly to the data volume and updates the metadata via
+two mechanisms:
+
+- Function calls from the thin targets
+
+- Device-mapper 'messages' from userspace which control the creation of new
+ virtual devices amongst other things.
+
+Setting up a fresh pool device
+------------------------------
+
+Setting up a pool device requires a valid metadata device, and a
+data device. If you do not have an existing metadata device you can
+make one by zeroing the first 4k to indicate empty metadata.
+
+ dd if=/dev/zero of=$metadata_dev bs=4096 count=1
+
+The amount of metadata you need will vary according to how many blocks
+are shared between thin devices (i.e. through snapshots). If you have
+less sharing than average you'll need a larger-than-average metadata device.
+
+As a guide, we suggest you calculate the number of bytes to use in the
+metadata device as 48 * $data_dev_size / $data_block_size but round it up
+to 2MB if the answer is smaller. The largest size supported is 16GB.
+
+If you're creating large numbers of snapshots which are recording large
+amounts of change, you may need find you need to increase this.
+
+Reloading a pool table
+----------------------
+
+You may reload a pool's table, indeed this is how the pool is resized
+if it runs out of space. (N.B. While specifying a different metadata
+device when reloading is not forbidden at the moment, things will go
+wrong if it does not route I/O to exactly the same on-disk location as
+previously.)
+
+Using an existing pool device
+-----------------------------
+
+ dmsetup create pool \
+ --table "0 20971520 thin-pool $metadata_dev $data_dev \
+ $data_block_size $low_water_mark"
+
+$data_block_size gives the smallest unit of disk space that can be
+allocated at a time expressed in units of 512-byte sectors. People
+primarily interested in thin provisioning may want to use a value such
+as 1024 (512KB). People doing lots of snapshotting may want a smaller value
+such as 128 (64KB). If you are not zeroing newly-allocated data,
+a larger $data_block_size in the region of 256000 (128MB) is suggested.
+$data_block_size must be the same for the lifetime of the
+metadata device.
+
+$low_water_mark is expressed in blocks of size $data_block_size. If
+free space on the data device drops below this level then a dm event
+will be triggered which a userspace daemon should catch allowing it to
+extend the pool device. Only one such event will be sent.
+Resuming a device with a new table itself triggers an event so the
+userspace daemon can use this to detect a situation where a new table
+already exceeds the threshold.
+
+Thin provisioning
+-----------------
+
+i) Creating a new thinly-provisioned volume.
+
+ To create a new thinly- provisioned volume you must send a message to an
+ active pool device, /dev/mapper/pool in this example.
+
+ dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ Here '0' is an identifier for the volume, a 24-bit number. It's up
+ to the caller to allocate and manage these identifiers. If the
+ identifier is already in use, the message will fail with -EEXIST.
+
+ii) Using a thinly-provisioned volume.
+
+ Thinly-provisioned volumes are activated using the 'thin' target:
+
+ dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
+
+ The last parameter is the identifier for the thinp device.
+
+Internal snapshots
+------------------
+
+i) Creating an internal snapshot.
+
+ Snapshots are created with another message to the pool.
+
+ N.B. If the origin device that you wish to snapshot is active, you
+ must suspend it before creating the snapshot to avoid corruption.
+ This is NOT enforced at the moment, so please be careful!
+
+ dmsetup suspend /dev/mapper/thin
+ dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
+ dmsetup resume /dev/mapper/thin
+
+ Here '1' is the identifier for the volume, a 24-bit number. '0' is the
+ identifier for the origin device.
+
+ii) Using an internal snapshot.
+
+ Once created, the user doesn't have to worry about any connection
+ between the origin and the snapshot. Indeed the snapshot is no
+ different from any other thinly-provisioned device and can be
+ snapshotted itself via the same method. It's perfectly legal to
+ have only one of them active, and there's no ordering requirement on
+ activating or removing them both. (This differs from conventional
+ device-mapper snapshots.)
+
+ Activate it exactly the same way as any other thinly-provisioned volume:
+
+ dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
+
+Deactivation
+------------
+
+All devices using a pool must be deactivated before the pool itself
+can be.
+
+ dmsetup remove thin
+ dmsetup remove snap
+ dmsetup remove pool
+
+Reference
+=========
+
+'thin-pool' target
+------------------
+
+i) Constructor
+
+ thin-pool <metadata dev> <data dev> <data block size (sectors)> \
+ <low water mark (blocks)> [<number of feature args> [<arg>]*]
+
+ Optional feature arguments:
+ - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
+
+ Data block size must be between 64KB (128 sectors) and 1GB
+ (2097152 sectors) inclusive.
+
+
+ii) Status
+
+ <transaction id> <used metadata blocks>/<total metadata blocks>
+ <used data blocks>/<total data blocks> <held metadata root>
+
+
+ transaction id:
+ A 64-bit number used by userspace to help synchronise with metadata
+ from volume managers.
+
+ used data blocks / total data blocks
+ If the number of free blocks drops below the pool's low water mark a
+ dm event will be sent to userspace. This event is edge-triggered and
+ it will occur only once after each resume so volume manager writers
+ should register for the event and then check the target's status.
+
+ held metadata root:
+ The location, in sectors, of the metadata root that has been
+ 'held' for userspace read access. '-' indicates there is no
+ held root. This feature is not yet implemented so '-' is
+ always returned.
+
+iii) Messages
+
+ create_thin <dev id>
+
+ Create a new thinly-provisioned device.
+ <dev id> is an arbitrary unique 24-bit identifier chosen by
+ the caller.
+
+ create_snap <dev id> <origin id>
+
+ Create a new snapshot of another thinly-provisioned device.
+ <dev id> is an arbitrary unique 24-bit identifier chosen by
+ the caller.
+ <origin id> is the identifier of the thinly-provisioned device
+ of which the new device will be a snapshot.
+
+ delete <dev id>
+
+ Deletes a thin device. Irreversible.
+
+ trim <dev id> <new size in sectors>
+
+ Delete mappings from the end of a thin device. Irreversible.
+ You might want to use this if you're reducing the size of
+ your thinly-provisioned device. In many cases, due to the
+ sharing of blocks between devices, it is not possible to
+ determine in advance how much space 'trim' will release. (In
+ future a userspace tool might be able to perform this
+ calculation.)
+
+ set_transaction_id <current id> <new id>
+
+ Userland volume managers, such as LVM, need a way to
+ synchronise their external metadata with the internal metadata of the
+ pool target. The thin-pool target offers to store an
+ arbitrary 64-bit transaction id and return it on the target's
+ status line. To avoid races you must provide what you think
+ the current transaction id is when you change it with this
+ compare-and-swap message.
+
+'thin' target
+-------------
+
+i) Constructor
+
+ thin <pool dev> <dev id>
+
+ pool dev:
+ the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
+
+ dev id:
+ the internal device identifier of the device to be
+ activated.
+
+The pool doesn't store any size against the thin devices. If you
+load a thin target that is smaller than you've been using previously,
+then you'll have no access to blocks mapped beyond the end. If you
+load a target that is bigger than before, then extra blocks will be
+provisioned as and when needed.
+
+If you wish to reduce the size of your thin device and potentially
+regain some space then send the 'trim' message to the pool.
+
+ii) Status
+
+ <nr mapped sectors> <highest mapped sector>
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f75a66e7d31..faa4741df6d 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -208,6 +208,16 @@ config DM_DEBUG
If unsure, say N.
+config DM_BUFIO
+ tristate
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ This interface allows you to do buffered I/O on a device and acts
+ as a cache, holding recently-read blocks in memory and performing
+ delayed writes.
+
+source "drivers/md/persistent-data/Kconfig"
+
config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM
@@ -233,6 +243,32 @@ config DM_SNAPSHOT
---help---
Allow volume managers to take writable snapshots of a device.
+config DM_THIN_PROVISIONING
+ tristate "Thin provisioning target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select DM_PERSISTENT_DATA
+ ---help---
+ Provides thin provisioning and snapshots that share a data store.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+ boolean "Keep stack trace of thin provisioning block lock holders"
+ depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
+ select STACKTRACE
+ ---help---
+ Enable this for messages that may help debug problems with the
+ block manager locking used by thin provisioning.
+
+ If unsure, say N.
+
+config DM_DEBUG_SPACE_MAPS
+ boolean "Extra validation for thin provisioning space maps"
+ depends on DM_THIN_PROVISIONING
+ ---help---
+ Enable this for messages that may help debug problems with the
+ space maps used by thin provisioning.
+
+ If unsure, say N.
+
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 448838b1f92..046860c7a16 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -10,6 +10,7 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
+dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
+obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
@@ -34,10 +36,12 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
+obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
+obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
new file mode 100644
index 00000000000..cb246667dd5
--- /dev/null
+++ b/drivers/md/dm-bufio.c
@@ -0,0 +1,1699 @@
+/*
+ * Copyright (C) 2009-2011 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bufio.h"
+
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+#include <linux/shrinker.h>
+
+#define DM_MSG_PREFIX "bufio"
+
+/*
+ * Memory management policy:
+ * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
+ * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
+ * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
+ * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
+ * dirty buffers.
+ */
+#define DM_BUFIO_MIN_BUFFERS 8
+
+#define DM_BUFIO_MEMORY_PERCENT 2
+#define DM_BUFIO_VMALLOC_PERCENT 25
+#define DM_BUFIO_WRITEBACK_PERCENT 75
+
+/*
+ * Check buffer ages in this interval (seconds)
+ */
+#define DM_BUFIO_WORK_TIMER_SECS 10
+
+/*
+ * Free buffers when they are older than this (seconds)
+ */
+#define DM_BUFIO_DEFAULT_AGE_SECS 60
+
+/*
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
+ */
+#define DM_BUFIO_INLINE_VECS 16
+
+/*
+ * Buffer hash
+ */
+#define DM_BUFIO_HASH_BITS 20
+#define DM_BUFIO_HASH(block) \
+ ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
+ ((1 << DM_BUFIO_HASH_BITS) - 1))
+
+/*
+ * Don't try to use kmem_cache_alloc for blocks larger than this.
+ * For explanation, see alloc_buffer_data below.
+ */
+#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
+#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
+
+/*
+ * dm_buffer->list_mode
+ */
+#define LIST_CLEAN 0
+#define LIST_DIRTY 1
+#define LIST_SIZE 2
+
+/*
+ * Linking of buffers:
+ * All buffers are linked to cache_hash with their hash_list field.
+ *
+ * Clean buffers that are not being written (B_WRITING not set)
+ * are linked to lru[LIST_CLEAN] with their lru_list field.
+ *
+ * Dirty and clean buffers that are being written are linked to
+ * lru[LIST_DIRTY] with their lru_list field. When the write
+ * finishes, the buffer cannot be relinked immediately (because we
+ * are in an interrupt context and relinking requires process
+ * context), so some clean-not-writing buffers can be held on
+ * dirty_lru too. They are later added to lru in the process
+ * context.
+ */
+struct dm_bufio_client {
+ struct mutex lock;
+
+ struct list_head lru[LIST_SIZE];
+ unsigned long n_buffers[LIST_SIZE];
+
+ struct block_device *bdev;
+ unsigned block_size;
+ unsigned char sectors_per_block_bits;
+ unsigned char pages_per_block_bits;
+ unsigned char blocks_per_page_bits;
+ unsigned aux_size;
+ void (*alloc_callback)(struct dm_buffer *);
+ void (*write_callback)(struct dm_buffer *);
+
+ struct dm_io_client *dm_io;
+
+ struct list_head reserved_buffers;
+ unsigned need_reserved_buffers;
+
+ struct hlist_head *cache_hash;
+ wait_queue_head_t free_buffer_wait;
+
+ int async_write_error;
+
+ struct list_head client_list;
+ struct shrinker shrinker;
+};
+
+/*
+ * Buffer state bits.
+ */
+#define B_READING 0
+#define B_WRITING 1
+#define B_DIRTY 2
+
+/*
+ * Describes how the block was allocated:
+ * kmem_cache_alloc(), __get_free_pages() or vmalloc().
+ * See the comment at alloc_buffer_data.
+ */
+enum data_mode {
+ DATA_MODE_SLAB = 0,
+ DATA_MODE_GET_FREE_PAGES = 1,
+ DATA_MODE_VMALLOC = 2,
+ DATA_MODE_LIMIT = 3
+};
+
+struct dm_buffer {
+ struct hlist_node hash_list;
+ struct list_head lru_list;
+ sector_t block;
+ void *data;
+ enum data_mode data_mode;
+ unsigned char list_mode; /* LIST_* */
+ unsigned hold_count;
+ int read_error;
+ int write_error;
+ unsigned long state;
+ unsigned long last_accessed;
+ struct dm_bufio_client *c;
+ struct bio bio;
+ struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
+};
+
+/*----------------------------------------------------------------*/
+
+static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
+static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
+
+static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
+{
+ unsigned ret = c->blocks_per_page_bits - 1;
+
+ BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
+
+ return ret;
+}
+
+#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
+#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
+
+#define dm_bufio_in_request() (!!current->bio_list)
+
+static void dm_bufio_lock(struct dm_bufio_client *c)
+{
+ mutex_lock_nested(&c->lock, dm_bufio_in_request());
+}
+
+static int dm_bufio_trylock(struct dm_bufio_client *c)
+{
+ return mutex_trylock(&c->lock);
+}
+
+static void dm_bufio_unlock(struct dm_bufio_client *c)
+{
+ mutex_unlock(&c->lock);
+}
+
+/*
+ * FIXME Move to sched.h?
+ */
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+# define dm_bufio_cond_resched() \
+do { \
+ if (unlikely(need_resched())) \
+ _cond_resched(); \
+} while (0)
+#else
+# define dm_bufio_cond_resched() do { } while (0)
+#endif
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Default cache size: available memory divided by the ratio.
+ */
+static unsigned long dm_bufio_default_cache_size;
+
+/*
+ * Total cache size set by the user.
+ */
+static unsigned long dm_bufio_cache_size;
+
+/*
+ * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
+ * at any time. If it disagrees, the user has changed cache size.
+ */
+static unsigned long dm_bufio_cache_size_latch;
+
+static DEFINE_SPINLOCK(param_spinlock);
+
+/*
+ * Buffers are freed after this timeout
+ */
+static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+
+static unsigned long dm_bufio_peak_allocated;
+static unsigned long dm_bufio_allocated_kmem_cache;
+static unsigned long dm_bufio_allocated_get_free_pages;
+static unsigned long dm_bufio_allocated_vmalloc;
+static unsigned long dm_bufio_current_allocated;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
+ */<