aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-12-15 09:12:01 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-15 09:12:01 -0800
commit53365383c4667aba55385cd1858582c19a7a8a36 (patch)
treeb290d003534b3947834762c2fb492d9d0beb985f
parent51b736b85155a56543fda8aeca5f8592795d7983 (diff)
parentd2fdb776e08d4231d7e86a879cc663a93913c202 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits) dm snapshot: use merge origin if snapshot invalid dm snapshot: report merge failure in status dm snapshot: merge consecutive chunks together dm snapshot: trigger exceptions in remaining snapshots during merge dm snapshot: delay merging a chunk until writes to it complete dm snapshot: queue writes to chunks being merged dm snapshot: add merging dm snapshot: permit only one merge at once dm snapshot: support barriers in snapshot merge target dm snapshot: avoid allocating exceptions in merge dm snapshot: rework writing to origin dm snapshot: add merge target dm exception store: add merge specific methods dm snapshot: create function for chunk_is_tracked wait dm snapshot: make bio optional in __origin_write dm mpath: reject messages when device is suspended dm: export suspended state to targets dm: rename dm_suspended to dm_suspended_md dm: swap target postsuspend call and setting suspended flag dm crypt: add plain64 iv ...
-rw-r--r--Documentation/device-mapper/snapshot.txt60
-rw-r--r--drivers/md/dm-crypt.c207
-rw-r--r--drivers/md/dm-exception-store.c33
-rw-r--r--drivers/md/dm-exception-store.h62
-rw-r--r--drivers/md/dm-io.c120
-rw-r--r--drivers/md/dm-ioctl.c123
-rw-r--r--drivers/md/dm-kcopyd.c5
-rw-r--r--drivers/md/dm-log.c77
-rw-r--r--drivers/md/dm-mpath.c95
-rw-r--r--drivers/md/dm-raid1.c219
-rw-r--r--drivers/md/dm-region-hash.c31
-rw-r--r--drivers/md/dm-snap-persistent.c195
-rw-r--r--drivers/md/dm-snap-transient.c24
-rw-r--r--drivers/md/dm-snap.c1279
-rw-r--r--drivers/md/dm-sysfs.c10
-rw-r--r--drivers/md/dm-table.c3
-rw-r--r--drivers/md/dm-uevent.c9
-rw-r--r--drivers/md/dm.c643
-rw-r--r--drivers/md/dm.h13
-rw-r--r--include/linux/device-mapper.h8
-rw-r--r--include/linux/dm-dirty-log.h6
-rw-r--r--include/linux/dm-ioctl.h13
-rw-r--r--include/linux/dm-region-hash.h3
23 files changed, 2349 insertions, 889 deletions
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
index a5009c8300f..e3a77b21513 100644
--- a/Documentation/device-mapper/snapshot.txt
+++ b/Documentation/device-mapper/snapshot.txt
@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
original content;
*) To create device "forks", i.e. multiple different versions of the
same data stream.
+*) To merge a snapshot of a block device back into the snapshot's origin
+device.
+In the first two cases, dm copies only the chunks of data that get
+changed and uses a separate copy-on-write (COW) block device for
+storage.
-In both cases, dm copies only the chunks of data that get changed and
-uses a separate copy-on-write (COW) block device for storage.
+For snapshot merge the contents of the COW storage are merged back into
+the origin device.
-There are two dm targets available: snapshot and snapshot-origin.
+There are three dm targets available:
+snapshot, snapshot-origin, and snapshot-merge.
*) snapshot-origin <origin>
@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
saved on disk - they can be kept in memory by the kernel.
-How this is used by LVM2
-========================
+* snapshot-merge <origin> <COW device> <persistent> <chunksize>
+
+takes the same table arguments as the snapshot target except it only
+works with persistent snapshots. This target assumes the role of the
+"snapshot-origin" target and must not be loaded if the "snapshot-origin"
+is still present for <origin>.
+
+Creates a merging snapshot that takes control of the changed chunks
+stored in the <COW device> of an existing snapshot, through a handover
+procedure, and merges these chunks back into the <origin>. Once merging
+has started (in the background) the <origin> may be opened and the merge
+will continue while I/O is flowing to it. Changes to the <origin> are
+deferred until the merging snapshot's corresponding chunk(s) have been
+merged. Once merging has started the snapshot device, associated with
+the "snapshot" target, will return -EIO when accessed.
+
+
+How snapshot is used by LVM2
+============================
When you create the first LVM2 snapshot of a volume, four dm devices are used:
1) a device containing the original mapping table of the source volume;
@@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
+
+How snapshot-merge is used by LVM2
+==================================
+A merging snapshot assumes the role of the "snapshot-origin" while
+merging. As such the "snapshot-origin" is replaced with
+"snapshot-merge". The "-real" device is not changed and the "-cow"
+device is renamed to <origin name>-cow to aid LVM2's cleanup of the
+merging snapshot after it completes. The "snapshot" that hands over its
+COW device to the "snapshot-merge" is deactivated (unless using lvchange
+--refresh); but if it is left active it will simply return I/O errors.
+
+A snapshot will merge into its origin with the following command:
+
+lvconvert --merge volumeGroup/snap
+
+we'll now have this situation:
+
+# dmsetup table|grep volumeGroup
+
+volumeGroup-base-real: 0 2097152 linear 8:19 384
+volumeGroup-base-cow: 0 204800 linear 8:19 2097536
+volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
+
+# ls -lL /dev/mapper/volumeGroup-*
+brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
+brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e412980763b..a93637223c8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
const char *opts);
void (*dtr)(struct crypt_config *cc);
- const char *(*status)(struct crypt_config *cc);
+ int (*init)(struct crypt_config *cc);
+ int (*wipe)(struct crypt_config *cc);
int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
};
+struct iv_essiv_private {
+ struct crypto_cipher *tfm;
+ struct crypto_hash *hash_tfm;
+ u8 *salt;
+};
+
+struct iv_benbi_private {
+ int shift;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
struct crypt_iv_operations *iv_gen_ops;
char *iv_mode;
union {
- struct crypto_cipher *essiv_tfm;
- int benbi_shift;
+ struct iv_essiv_private essiv;
+ struct iv_benbi_private benbi;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
* plain: the initial vector is the 32-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
+ * plain64: the initial vector is the 64-bit little-endian version of the sector
+ * number, padded with zeros if necessary.
+ *
* essiv: "encrypted sector|salt initial vector", the sector number is
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
return 0;
}
-static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
- const char *opts)
+static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
+ sector_t sector)
{
- struct crypto_cipher *essiv_tfm;
- struct crypto_hash *hash_tfm;
+ memset(iv, 0, cc->iv_size);
+ *(u64 *)iv = cpu_to_le64(sector);
+
+ return 0;
+}
+
+/* Initialise ESSIV - compute salt but no local memory allocations */
+static int crypt_iv_essiv_init(struct crypt_config *cc)
+{
+ struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
struct hash_desc desc;
struct scatterlist sg;
- unsigned int saltsize;
- u8 *salt;
int err;
- if (opts == NULL) {
+ sg_init_one(&sg, cc->key, cc->key_size);
+ desc.tfm = essiv->hash_tfm;
+ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
+ if (err)
+ return err;
+
+ return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+ crypto_hash_digestsize(essiv->hash_tfm));
+}
+
+/* Wipe salt and reset key derived from volume key */
+static int crypt_iv_essiv_wipe(struct crypt_config *cc)
+{
+ struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+ unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+
+ memset(essiv->salt, 0, salt_size);
+
+ return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+}
+
+static void crypt_iv_essiv_dtr(struct crypt_config *cc)
+{
+ struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+
+ crypto_free_cipher(essiv->tfm);
+ essiv->tfm = NULL;
+
+ crypto_free_hash(essiv->hash_tfm);
+ essiv->hash_tfm = NULL;
+
+ kzfree(essiv->salt);
+ essiv->salt = NULL;
+}
+
+static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct crypto_cipher *essiv_tfm = NULL;
+ struct crypto_hash *hash_tfm = NULL;
+ u8 *salt = NULL;
+ int err;
+
+ if (!opts) {
ti->error = "Digest algorithm missing for ESSIV mode";
return -EINVAL;
}
- /* Hash the cipher key with the given hash algorithm */
+ /* Allocate hash algorithm */
hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(hash_tfm)) {
ti->error = "Error initializing ESSIV hash";
- return PTR_ERR(hash_tfm);
+ err = PTR_ERR(hash_tfm);
+ goto bad;
}
- saltsize = crypto_hash_digestsize(hash_tfm);
- salt = kmalloc(saltsize, GFP_KERNEL);
- if (salt == NULL) {
+ salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
+ if (!salt) {
ti->error = "Error kmallocing salt storage in ESSIV";
- crypto_free_hash(hash_tfm);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto bad;
}
- sg_init_one(&sg, cc->key, cc->key_size);
- desc.tfm = hash_tfm;
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
- crypto_free_hash(hash_tfm);
-
- if (err) {
- ti->error = "Error calculating hash in ESSIV";
- kfree(salt);
- return err;
- }
-
- /* Setup the essiv_tfm with the given salt */
+ /* Allocate essiv_tfm */
essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(essiv_tfm)) {
ti->error = "Error allocating crypto tfm for ESSIV";
- kfree(salt);
- return PTR_ERR(essiv_tfm);
+ err = PTR_ERR(essiv_tfm);
+ goto bad;
}
if (crypto_cipher_blocksize(essiv_tfm) !=
crypto_ablkcipher_ivsize(cc->tfm)) {
ti->error = "Block size of ESSIV cipher does "
"not match IV size of block cipher";
- crypto_free_cipher(essiv_tfm);
- kfree(salt);
- return -EINVAL;
+ err = -EINVAL;
+ goto bad;
}
- err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
- if (err) {
- ti->error = "Failed to set key for ESSIV cipher";
- crypto_free_cipher(essiv_tfm);
- kfree(salt);
- return err;
- }
- kfree(salt);
- cc->iv_gen_private.essiv_tfm = essiv_tfm;
+ cc->iv_gen_private.essiv.salt = salt;
+ cc->iv_gen_private.essiv.tfm = essiv_tfm;
+ cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
+
return 0;
-}
-static void crypt_iv_essiv_dtr(struct crypt_config *cc)
-{
- crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
- cc->iv_gen_private.essiv_tfm = NULL;
+bad:
+ if (essiv_tfm && !IS_ERR(essiv_tfm))
+ crypto_free_cipher(essiv_tfm);
+ if (hash_tfm && !IS_ERR(hash_tfm))
+ crypto_free_hash(hash_tfm);
+ kfree(salt);
+ return err;
}
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
memset(iv, 0, cc->iv_size);
*(u64 *)iv = cpu_to_le64(sector);
- crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
+ crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
return 0;
}
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
- cc->iv_gen_private.benbi_shift = 9 - log;
+ cc->iv_gen_private.benbi.shift = 9 - log;
return 0;
}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
- val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
+ val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
+static struct crypt_iv_operations crypt_iv_plain64_ops = {
+ .generator = crypt_iv_plain64_gen
+};
+
static struct crypt_iv_operations crypt_iv_essiv_ops = {
.ctr = crypt_iv_essiv_ctr,
.dtr = crypt_iv_essiv_dtr,
+ .init = crypt_iv_essiv_init,
+ .wipe = crypt_iv_essiv_wipe,
.generator = crypt_iv_essiv_gen
};
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- return 0;
+ return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
static int crypt_wipe_key(struct crypt_config *cc)
{
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
- return 0;
+ return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
/*
@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
- if (crypt_set_key(cc, argv[1])) {
- ti->error = "Error decoding key";
- goto bad_cipher;
- }
-
/* Compatibility mode for old dm-crypt cipher strings */
if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
chainmode = "cbc";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
strcpy(cc->chainmode, chainmode);
cc->tfm = tfm;
+ if (crypt_set_key(cc, argv[1]) < 0) {
+ ti->error = "Error decoding and setting key";
+ goto bad_ivmode;
+ }
+
/*
* Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
* See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops = NULL;
else if (strcmp(ivmode, "plain") == 0)
cc->iv_gen_ops = &crypt_iv_plain_ops;
+ else if (strcmp(ivmode, "plain64") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain64_ops;
else if (strcmp(ivmode, "essiv") == 0)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
goto bad_ivmode;
+ if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
+ cc->iv_gen_ops->init(cc) < 0) {
+ ti->error = "Error initialising IV";
+ goto bad_slab_pool;
+ }
+
cc->iv_size = crypto_ablkcipher_ivsize(tfm);
if (cc->iv_size)
/* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_bs;
}
- if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
- ti->error = "Error setting key";
- goto bad_device;
- }
-
if (sscanf(argv[2], "%llu", &tmpll) != 1) {
ti->error = "Invalid iv_offset sector";
goto bad_device;
@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
{
struct crypt_config *cc = ti->private;
+ int ret = -EINVAL;
if (argc < 2)
goto error;
@@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
DMWARN("not suspended during key manipulation.");
return -EINVAL;
}
- if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
- return crypt_set_key(cc, argv[2]);
- if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
+ if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+ ret = crypt_set_key(cc, argv[2]);
+ if (ret)
+ return ret;
+ if (cc->iv_gen_ops && cc->iv_gen_ops->init)
+ ret = cc->iv_gen_ops->init(cc);
+ return ret;
+ }
+ if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+ if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
+ ret = cc->iv_gen_ops->wipe(cc);
+ if (ret)
+ return ret;
+ }
return crypt_wipe_key(cc);
+ }
}
error:
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5..2b7907b6dd0 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
/* Validate the chunk size against the device block size */
- if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
+ if (chunk_size %
+ (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
*error = "Chunk size is not a multiple of device blocksize";
return -EINVAL;
}
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ struct dm_snapshot *snap,
unsigned *args_used,
struct dm_exception_store **store)
{
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_exception_store *tmp_store;
char persistent;
- if (argc < 3) {
+ if (argc < 2) {
ti->error = "Insufficient exception store arguments";
return -EINVAL;
}
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
return -ENOMEM;
}
- persistent = toupper(*argv[1]);
+ persistent = toupper(*argv[0]);
if (persistent == 'P')
type = get_type("P");
else if (persistent == 'N')
type = get_type("N");
else {
ti->error = "Persistent flag is not P or N";
- return -EINVAL;
+ r = -EINVAL;
+ goto bad_type;
}
if (!type) {
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
}
tmp_store->type = type;
- tmp_store->ti = ti;
-
- r = dm_get_device(ti, argv[0], 0, 0,
- FMODE_READ | FMODE_WRITE, &tmp_store->cow);
- if (r) {
- ti->error = "Cannot get COW device";
- goto bad_cow;
- }
+ tmp_store->snap = snap;
- r = set_chunk_size(tmp_store, argv[2], &ti->error);
+ r = set_chunk_size(tmp_store, argv[1], &ti->error);
if (r)
- goto bad_ctr;
+ goto bad;
r = type->ctr(tmp_store, 0, NULL);
if (r) {
ti->error = "Exception store type constructor failed";
- goto bad_ctr;
+ goto bad;
}
- *args_used = 3;
+ *args_used = 2;
*store = tmp_store;
return 0;
-bad_ctr:
- dm_put_device(ti, tmp_store->cow);
-bad_cow:
+bad:
put_type(type);
bad_type:
kfree(tmp_store);
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
void dm_exception_store_destroy(struct dm_exception_store *store)
{
store->type->dtr(store);
- dm_put_device(store->ti, store->cow);
put_type(store->type);
kfree(store);
}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 8a223a48802..e8dfa06af3b 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
* of chunks that follow contiguously. Remaining bits hold the number of the
* chunk within the device.
*/
-struct dm_snap_exception {
+struct dm_exception {
struct list_head hash_list;
chunk_t old_chunk;
@@ -64,17 +64,34 @@ struct dm_exception_store_type {
* Find somewhere to store the next exception.
*/
int (*prepare_exception) (struct dm_exception_store *store,
- struct dm_snap_exception *e);
+ struct dm_exception *e);
/*
* Update the metadata with this exception.
*/
void (*commit_exception) (struct dm_exception_store *store,
- struct dm_snap_exception *e,
+ struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context);
/*
+ * Returns 0 if the exception store is empty.
+ *
+ * If there are exceptions still to be merged, sets
+ * *last_old_chunk and *last_new_chunk to the most recent
+ * still-to-be-merged chunk and returns the number of
+ * consecutive previous ones.
+ */
+ int (*prepare_merge) (struct dm_exception_store *store,
+ chunk_t *last_old_chunk, chunk_t *last_new_chunk);
+
+ /*
+ * Clear the last n exceptions.
+ * nr_merged must be <= the value returned by prepare_merge.
+ */
+ int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
+
+ /*
* The snapshot is invalid, note this in the metadata.
*/
void (*drop_snapshot) (struct dm_exception_store *store);
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
/*
* Return how full the snapshot is.
*/
- void (*fraction_full) (struct dm_exception_store *store,
- sector_t *numerator,
- sector_t *denominator);
+ void (*usage) (struct dm_exception_store *store,
+ sector_t *total_sectors, sector_t *sectors_allocated,
+ sector_t *metadata_sectors);
/* For internal device-mapper use only. */
struct list_head list;
};
+struct dm_snapshot;
+
struct dm_exception_store {
struct dm_exception_store_type *type;
- struct dm_target *ti;
-
- struct dm_dev *cow;
+ struct dm_snapshot *snap;
/* Size of data blocks saved - must be a power of 2 */
unsigned chunk_size;
@@ -109,6 +126,11 @@ struct dm_exception_store {
};
/*
+ * Obtain the cow device used by a given snapshot.
+ */
+struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
+
+/*
* Funtions to manipulate consecutive chunks
*/
# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
BUG_ON(!dm_consecutive_chunk_count(e));
}
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
+{
+ BUG_ON(!dm_consecutive_chunk_count(e));
+
+ e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
+}
+
# else
# define DM_CHUNK_CONSECUTIVE_BITS 0
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
return chunk;
}
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
return 0;
}
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
+{
+}
+
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
}
@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
sector_t sector)
{
- return (sector & ~store->chunk_mask) >> store->chunk_shift;
+ return sector >> store->chunk_shift;
}
int dm_exception_store_type_register(struct dm_exception_store_type *type);
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
char **error);
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ struct dm_snapshot *snap,
unsigned *args_used,
struct dm_exception_store **store);
void dm_exception_store_destroy(struct dm_exception_store *store);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bd..10f457ca6af 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
* This file is released under the GPL.
*/
+#include "dm.h"
+
#include <linux/device-mapper.h>
#include <linux/bio.h>
@@ -14,12 +16,19 @@
#include <linux/slab.h>
#include <linux/dm-io.h>
+#define DM_MSG_PREFIX "io"
+
+#define DM_IO_MAX_REGIONS BITS_PER_LONG
+
struct dm_io_client {
mempool_t *pool;
struct bio_set *bios;
};
-/* FIXME: can we shrink this ? */
+/*
+ * Aligning 'struct io' reduces the number of bits required to store
+ * its address. Refer to store_io_and_region_in_bio() below.
+ */
struct io {
unsigned long error_bits;
unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
struct dm_io_client *client;
io_notify_fn callback;
void *context;
-};
+} __attribute__((aligned(DM_IO_MAX_REGIONS)));
+
+static struct kmem_cache *_dm_io_cache;
/*
* io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
if (!client)
return ERR_PTR(-ENOMEM);
- client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
+ client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
if (!client->pool)
goto bad;
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
/*-----------------------------------------------------------------
* We need to keep track of which region a bio is doing io for.
- * In order to save a memory allocation we store this the last
- * bvec which we know is unused (blech).
- * XXX This is ugly and can OOPS with some configs... find another way.
+ * To avoid a memory allocation to store just 5 or 6 bits, we
+ * ensure the 'struct io' pointer is aligned so enough low bits are
+ * always zero and then combine it with the region number directly in
+ * bi_private.
*---------------------------------------------------------------*/
-static inline void bio_set_region(struct bio *bio, unsigned region)
+static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
+ unsigned region)
{
- bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
+ if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
+ DMCRIT("Unaligned struct io pointer %p", io);
+ BUG();
+ }
+
+ bio->bi_private = (void *)((unsigned long)io | region);
}
-static inline unsigned bio_get_region(struct bio *bio)
+static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
+ unsigned *region)
{
- return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
+ unsigned long val = (unsigned long)bio->bi_private;
+
+ *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
+ *region = val & (DM_IO_MAX_REGIONS - 1);
}
/*-----------------------------------------------------------------
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
/*
* The bio destructor in bio_put() may use the io object.
*/
- io = bio->bi_private;
- region = bio_get_region(bio);
+ retrieve_io_and_region_from_bio(bio, &io, &region);
- bio->bi_max_vecs++;
bio_put(bio);
dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
static void dm_bio_destructor(struct bio *bio)
{
- struct io *io = bio->bi_private;
+ unsigned region;
+ struct io *io;
+
+ retrieve_io_and_region_from_bio(bio, &io, &region);
bio_free(bio, io->client->bios);
}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned num_bvecs;
sector_t remaining = where->count;
- while (remaining) {
+ /*
+ * where->count may be zero if rw holds a write barrier and we
+ * need to send a zero-sized barrier.
+ */
+ do {
/*
- * Allocate a suitably sized-bio: we add an extra
- * bvec for bio_get/set_region() and decrement bi_max_vecs
- * to hide it from bio_add_page().
+ * Allocate a suitably sized-bio.
*/
num_bvecs = dm_sector_div_up(remaining,
(PAGE_SIZE >> SECTOR_SHIFT));
- num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
- num_bvecs);
- if (unlikely(num_bvecs > BIO_MAX_PAGES))
- num_bvecs = BIO_MAX_PAGES;
+ num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
- bio->bi_private = io;
bio->bi_destructor = dm_bio_destructor;
- bio->bi_max_vecs--;
- bio_set_region(bio, region);
+ store_io_and_region_in_bio(bio, io, region);
/*
* Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
atomic_inc(&io->count);
submit_bio(rw, bio);
- }
+ } while (remaining);
}
static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
int i;
struct dpages old_pages = *dp;
+ BUG_ON(num_regions > DM_IO_MAX_REGIONS);
+
if (sync)
rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
*/
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
- if (where[i].count)
+ if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
do_region(rw, i, where + i, dp, io);
}
@@ -357,7 +379,14 @@ static int sync