diff options
-rw-r--r-- | Documentation/device-mapper/snapshot.txt | 60 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 207 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.c | 33 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.h | 62 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 120 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 123 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 77 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 95 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 219 | ||||
-rw-r--r-- | drivers/md/dm-region-hash.c | 31 | ||||
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 195 | ||||
-rw-r--r-- | drivers/md/dm-snap-transient.c | 24 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 1279 | ||||
-rw-r--r-- | drivers/md/dm-sysfs.c | 10 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-uevent.c | 9 | ||||
-rw-r--r-- | drivers/md/dm.c | 643 | ||||
-rw-r--r-- | drivers/md/dm.h | 13 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 8 | ||||
-rw-r--r-- | include/linux/dm-dirty-log.h | 6 | ||||
-rw-r--r-- | include/linux/dm-ioctl.h | 13 | ||||
-rw-r--r-- | include/linux/dm-region-hash.h | 3 |
23 files changed, 2349 insertions, 889 deletions
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index a5009c8300f..e3a77b21513 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt @@ -8,13 +8,19 @@ the block device which are also writable without interfering with the original content; *) To create device "forks", i.e. multiple different versions of the same data stream. +*) To merge a snapshot of a block device back into the snapshot's origin +device. +In the first two cases, dm copies only the chunks of data that get +changed and uses a separate copy-on-write (COW) block device for +storage. -In both cases, dm copies only the chunks of data that get changed and -uses a separate copy-on-write (COW) block device for storage. +For snapshot merge the contents of the COW storage are merged back into +the origin device. -There are two dm targets available: snapshot and snapshot-origin. +There are three dm targets available: +snapshot, snapshot-origin, and snapshot-merge. *) snapshot-origin <origin> @@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be saved on disk - they can be kept in memory by the kernel. -How this is used by LVM2 -======================== +* snapshot-merge <origin> <COW device> <persistent> <chunksize> + +takes the same table arguments as the snapshot target except it only +works with persistent snapshots. This target assumes the role of the +"snapshot-origin" target and must not be loaded if the "snapshot-origin" +is still present for <origin>. + +Creates a merging snapshot that takes control of the changed chunks +stored in the <COW device> of an existing snapshot, through a handover +procedure, and merges these chunks back into the <origin>. Once merging +has started (in the background) the <origin> may be opened and the merge +will continue while I/O is flowing to it. Changes to the <origin> are +deferred until the merging snapshot's corresponding chunk(s) have been +merged. Once merging has started the snapshot device, associated with +the "snapshot" target, will return -EIO when accessed. + + +How snapshot is used by LVM2 +============================ When you create the first LVM2 snapshot of a volume, four dm devices are used: 1) a device containing the original mapping table of the source volume; @@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base + +How snapshot-merge is used by LVM2 +================================== +A merging snapshot assumes the role of the "snapshot-origin" while +merging. As such the "snapshot-origin" is replaced with +"snapshot-merge". The "-real" device is not changed and the "-cow" +device is renamed to <origin name>-cow to aid LVM2's cleanup of the +merging snapshot after it completes. The "snapshot" that hands over its +COW device to the "snapshot-merge" is deactivated (unless using lvchange +--refresh); but if it is left active it will simply return I/O errors. + +A snapshot will merge into its origin with the following command: + +lvconvert --merge volumeGroup/snap + +we'll now have this situation: + +# dmsetup table|grep volumeGroup + +volumeGroup-base-real: 0 2097152 linear 8:19 384 +volumeGroup-base-cow: 0 204800 linear 8:19 2097536 +volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 + +# ls -lL /dev/mapper/volumeGroup-* +brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real +brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow +brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e412980763b..a93637223c8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -71,10 +71,21 @@ struct crypt_iv_operations { int (*ctr)(struct crypt_config *cc, struct dm_target *ti, const char *opts); void (*dtr)(struct crypt_config *cc); - const char *(*status)(struct crypt_config *cc); + int (*init)(struct crypt_config *cc); + int (*wipe)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); }; +struct iv_essiv_private { + struct crypto_cipher *tfm; + struct crypto_hash *hash_tfm; + u8 *salt; +}; + +struct iv_benbi_private { + int shift; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -102,8 +113,8 @@ struct crypt_config { struct crypt_iv_operations *iv_gen_ops; char *iv_mode; union { - struct crypto_cipher *essiv_tfm; - int benbi_shift; + struct iv_essiv_private essiv; + struct iv_benbi_private benbi; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * plain: the initial vector is the 32-bit little-endian version of the sector * number, padded with zeros if necessary. * + * plain64: the initial vector is the 64-bit little-endian version of the sector + * number, padded with zeros if necessary. + * * essiv: "encrypted sector|salt initial vector", the sector number is * encrypted with the bulk cipher using a salt as key. The salt * should be derived from the bulk cipher's key via hashing. @@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) return 0; } -static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, - const char *opts) +static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, + sector_t sector) { - struct crypto_cipher *essiv_tfm; - struct crypto_hash *hash_tfm; + memset(iv, 0, cc->iv_size); + *(u64 *)iv = cpu_to_le64(sector); + + return 0; +} + +/* Initialise ESSIV - compute salt but no local memory allocations */ +static int crypt_iv_essiv_init(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - unsigned int saltsize; - u8 *salt; int err; - if (opts == NULL) { + sg_init_one(&sg, cc->key, cc->key_size); + desc.tfm = essiv->hash_tfm; + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); + if (err) + return err; + + return crypto_cipher_setkey(essiv->tfm, essiv->salt, + crypto_hash_digestsize(essiv->hash_tfm)); +} + +/* Wipe salt and reset key derived from volume key */ +static int crypt_iv_essiv_wipe(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + + memset(essiv->salt, 0, salt_size); + + return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); +} + +static void crypt_iv_essiv_dtr(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + + crypto_free_cipher(essiv->tfm); + essiv->tfm = NULL; + + crypto_free_hash(essiv->hash_tfm); + essiv->hash_tfm = NULL; + + kzfree(essiv->salt); + essiv->salt = NULL; +} + +static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct crypto_cipher *essiv_tfm = NULL; + struct crypto_hash *hash_tfm = NULL; + u8 *salt = NULL; + int err; + + if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; return -EINVAL; } - /* Hash the cipher key with the given hash algorithm */ + /* Allocate hash algorithm */ hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; - return PTR_ERR(hash_tfm); + err = PTR_ERR(hash_tfm); + goto bad; } - saltsize = crypto_hash_digestsize(hash_tfm); - salt = kmalloc(saltsize, GFP_KERNEL); - if (salt == NULL) { + salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); + if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; - crypto_free_hash(hash_tfm); - return -ENOMEM; + err = -ENOMEM; + goto bad; } - sg_init_one(&sg, cc->key, cc->key_size); - desc.tfm = hash_tfm; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); - crypto_free_hash(hash_tfm); - - if (err) { - ti->error = "Error calculating hash in ESSIV"; - kfree(salt); - return err; - } - - /* Setup the essiv_tfm with the given salt */ + /* Allocate essiv_tfm */ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(essiv_tfm)) { ti->error = "Error allocating crypto tfm for ESSIV"; - kfree(salt); - return PTR_ERR(essiv_tfm); + err = PTR_ERR(essiv_tfm); + goto bad; } if (crypto_cipher_blocksize(essiv_tfm) != crypto_ablkcipher_ivsize(cc->tfm)) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; - crypto_free_cipher(essiv_tfm); - kfree(salt); - return -EINVAL; + err = -EINVAL; + goto bad; } - err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); - if (err) { - ti->error = "Failed to set key for ESSIV cipher"; - crypto_free_cipher(essiv_tfm); - kfree(salt); - return err; - } - kfree(salt); - cc->iv_gen_private.essiv_tfm = essiv_tfm; + cc->iv_gen_private.essiv.salt = salt; + cc->iv_gen_private.essiv.tfm = essiv_tfm; + cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + return 0; -} -static void crypt_iv_essiv_dtr(struct crypt_config *cc) -{ - crypto_free_cipher(cc->iv_gen_private.essiv_tfm); - cc->iv_gen_private.essiv_tfm = NULL; +bad: + if (essiv_tfm && !IS_ERR(essiv_tfm)) + crypto_free_cipher(essiv_tfm); + if (hash_tfm && !IS_ERR(hash_tfm)) + crypto_free_hash(hash_tfm); + kfree(salt); + return err; } static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); + crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); return 0; } @@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - cc->iv_gen_private.benbi_shift = 9 - log; + cc->iv_gen_private.benbi.shift = 9 - log; return 0; } @@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); + val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; @@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; +static struct crypt_iv_operations crypt_iv_plain64_ops = { + .generator = crypt_iv_plain64_gen +}; + static struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, + .init = crypt_iv_essiv_init, + .wipe = crypt_iv_essiv_wipe, .generator = crypt_iv_essiv_gen }; @@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key) set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return 0; + return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return 0; + return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } /* @@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - if (crypt_set_key(cc, argv[1])) { - ti->error = "Error decoding key"; - goto bad_cipher; - } - /* Compatibility mode for old dm-crypt cipher strings */ if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { chainmode = "cbc"; @@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) strcpy(cc->chainmode, chainmode); cc->tfm = tfm; + if (crypt_set_key(cc, argv[1]) < 0) { + ti->error = "Error decoding and setting key"; + goto bad_ivmode; + } + /* * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". * See comments at iv code @@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) cc->iv_gen_ops = &crypt_iv_plain_ops; + else if (strcmp(ivmode, "plain64") == 0) + cc->iv_gen_ops = &crypt_iv_plain64_ops; else if (strcmp(ivmode, "essiv") == 0) cc->iv_gen_ops = &crypt_iv_essiv_ops; else if (strcmp(ivmode, "benbi") == 0) @@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) goto bad_ivmode; + if (cc->iv_gen_ops && cc->iv_gen_ops->init && + cc->iv_gen_ops->init(cc) < 0) { + ti->error = "Error initialising IV"; + goto bad_slab_pool; + } + cc->iv_size = crypto_ablkcipher_ivsize(tfm); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ @@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_bs; } - if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { - ti->error = "Error setting key"; - goto bad_device; - } - if (sscanf(argv[2], "%llu", &tmpll) != 1) { ti->error = "Invalid iv_offset sector"; goto bad_device; @@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti) static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) { struct crypt_config *cc = ti->private; + int ret = -EINVAL; if (argc < 2) goto error; @@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) DMWARN("not suspended during key manipulation."); return -EINVAL; } - if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) - return crypt_set_key(cc, argv[2]); - if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) + if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { + ret = crypt_set_key(cc, argv[2]); + if (ret) + return ret; + if (cc->iv_gen_ops && cc->iv_gen_ops->init) + ret = cc->iv_gen_ops->init(cc); + return ret; + } + if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { + ret = cc->iv_gen_ops->wipe(cc); + if (ret) + return ret; + } return crypt_wipe_key(cc); + } } error: diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 7dbe652efb5..2b7907b6dd0 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, } /* Validate the chunk size against the device block size */ - if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { + if (chunk_size % + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } @@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, } int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, unsigned *args_used, struct dm_exception_store **store) { @@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, struct dm_exception_store *tmp_store; char persistent; - if (argc < 3) { + if (argc < 2) { ti->error = "Insufficient exception store arguments"; return -EINVAL; } @@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, return -ENOMEM; } - persistent = toupper(*argv[1]); + persistent = toupper(*argv[0]); if (persistent == 'P') type = get_type("P"); else if (persistent == 'N') type = get_type("N"); else { ti->error = "Persistent flag is not P or N"; - return -EINVAL; + r = -EINVAL; + goto bad_type; } if (!type) { @@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, } tmp_store->type = type; - tmp_store->ti = ti; - - r = dm_get_device(ti, argv[0], 0, 0, - FMODE_READ | FMODE_WRITE, &tmp_store->cow); - if (r) { - ti->error = "Cannot get COW device"; - goto bad_cow; - } + tmp_store->snap = snap; - r = set_chunk_size(tmp_store, argv[2], &ti->error); + r = set_chunk_size(tmp_store, argv[1], &ti->error); if (r) - goto bad_ctr; + goto bad; r = type->ctr(tmp_store, 0, NULL); if (r) { ti->error = "Exception store type constructor failed"; - goto bad_ctr; + goto bad; } - *args_used = 3; + *args_used = 2; *store = tmp_store; return 0; -bad_ctr: - dm_put_device(ti, tmp_store->cow); -bad_cow: +bad: put_type(type); bad_type: kfree(tmp_store); @@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create); void dm_exception_store_destroy(struct dm_exception_store *store) { store->type->dtr(store); - dm_put_device(store->ti, store->cow); put_type(store->type); kfree(store); } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 8a223a48802..e8dfa06af3b 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -26,7 +26,7 @@ typedef sector_t chunk_t; * of chunks that follow contiguously. Remaining bits hold the number of the * chunk within the device. */ -struct dm_snap_exception { +struct dm_exception { struct list_head hash_list; chunk_t old_chunk; @@ -64,17 +64,34 @@ struct dm_exception_store_type { * Find somewhere to store the next exception. */ int (*prepare_exception) (struct dm_exception_store *store, - struct dm_snap_exception *e); + struct dm_exception *e); /* * Update the metadata with this exception. */ void (*commit_exception) (struct dm_exception_store *store, - struct dm_snap_exception *e, + struct dm_exception *e, void (*callback) (void *, int success), void *callback_context); /* + * Returns 0 if the exception store is empty. + * + * If there are exceptions still to be merged, sets + * *last_old_chunk and *last_new_chunk to the most recent + * still-to-be-merged chunk and returns the number of + * consecutive previous ones. + */ + int (*prepare_merge) (struct dm_exception_store *store, + chunk_t *last_old_chunk, chunk_t *last_new_chunk); + + /* + * Clear the last n exceptions. + * nr_merged must be <= the value returned by prepare_merge. + */ + int (*commit_merge) (struct dm_exception_store *store, int nr_merged); + + /* * The snapshot is invalid, note this in the metadata. */ void (*drop_snapshot) (struct dm_exception_store *store); @@ -86,19 +103,19 @@ struct dm_exception_store_type { /* * Return how full the snapshot is. */ - void (*fraction_full) (struct dm_exception_store *store, - sector_t *numerator, - sector_t *denominator); + void (*usage) (struct dm_exception_store *store, + sector_t *total_sectors, sector_t *sectors_allocated, + sector_t *metadata_sectors); /* For internal device-mapper use only. */ struct list_head list; }; +struct dm_snapshot; + struct dm_exception_store { struct dm_exception_store_type *type; - struct dm_target *ti; - - struct dm_dev *cow; + struct dm_snapshot *snap; /* Size of data blocks saved - must be a power of 2 */ unsigned chunk_size; @@ -109,6 +126,11 @@ struct dm_exception_store { }; /* + * Obtain the cow device used by a given snapshot. + */ +struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); + +/* * Funtions to manipulate consecutive chunks */ # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) @@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); } -static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) { return e->new_chunk >> DM_CHUNK_NUMBER_BITS; } -static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) { e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); BUG_ON(!dm_consecutive_chunk_count(e)); } +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ + BUG_ON(!dm_consecutive_chunk_count(e)); + + e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); +} + # else # define DM_CHUNK_CONSECUTIVE_BITS 0 @@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) return chunk; } -static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) { return 0; } -static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) +{ +} + +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) { } @@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev) static inline chunk_t sector_to_chunk(struct dm_exception_store *store, sector_t sector) { - return (sector & ~store->chunk_mask) >> store->chunk_shift; + return sector >> store->chunk_shift; } int dm_exception_store_type_register(struct dm_exception_store_type *type); @@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, char **error); int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, unsigned *args_used, struct dm_exception_store **store); void dm_exception_store_destroy(struct dm_exception_store *store); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3a2e6a2f8bd..10f457ca6af 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -5,6 +5,8 @@ * This file is released under the GPL. */ +#include "dm.h" + #include <linux/device-mapper.h> #include <linux/bio.h> @@ -14,12 +16,19 @@ #include <linux/slab.h> #include <linux/dm-io.h> +#define DM_MSG_PREFIX "io" + +#define DM_IO_MAX_REGIONS BITS_PER_LONG + struct dm_io_client { mempool_t *pool; struct bio_set *bios; }; -/* FIXME: can we shrink this ? */ +/* + * Aligning 'struct io' reduces the number of bits required to store + * its address. Refer to store_io_and_region_in_bio() below. + */ struct io { unsigned long error_bits; unsigned long eopnotsupp_bits; @@ -28,7 +37,9 @@ struct io { struct dm_io_client *client; io_notify_fn callback; void *context; -}; +} __attribute__((aligned(DM_IO_MAX_REGIONS))); + +static struct kmem_cache *_dm_io_cache; /* * io contexts are only dynamically allocated for asynchronous @@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) if (!client) return ERR_PTR(-ENOMEM); - client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); + client->pool = mempool_create_slab_pool(ios, _dm_io_cache); if (!client->pool) goto bad; @@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy); /*----------------------------------------------------------------- * We need to keep track of which region a bio is doing io for. - * In order to save a memory allocation we store this the last - * bvec which we know is unused (blech). - * XXX This is ugly and can OOPS with some configs... find another way. + * To avoid a memory allocation to store just 5 or 6 bits, we + * ensure the 'struct io' pointer is aligned so enough low bits are + * always zero and then combine it with the region number directly in + * bi_private. *---------------------------------------------------------------*/ -static inline void bio_set_region(struct bio *bio, unsigned region) +static void store_io_and_region_in_bio(struct bio *bio, struct io *io, + unsigned region) { - bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; + if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { + DMCRIT("Unaligned struct io pointer %p", io); + BUG(); + } + + bio->bi_private = (void *)((unsigned long)io | region); } -static inline unsigned bio_get_region(struct bio *bio) +static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, + unsigned *region) { - return bio->bi_io_vec[bio->bi_max_vecs].bv_len; + unsigned long val = (unsigned long)bio->bi_private; + + *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); + *region = val & (DM_IO_MAX_REGIONS - 1); } /*----------------------------------------------------------------- @@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error) /* * The bio destructor in bio_put() may use the io object. */ - io = bio->bi_private; - region = bio_get_region(bio); + retrieve_io_and_region_from_bio(bio, &io, ®ion); - bio->bi_max_vecs++; bio_put(bio); dec_count(io, region, error); @@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data) static void dm_bio_destructor(struct bio *bio) { - struct io *io = bio->bi_private; + unsigned region; + struct io *io; + + retrieve_io_and_region_from_bio(bio, &io, ®ion); bio_free(bio, io->client->bios); } @@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, unsigned num_bvecs; sector_t remaining = where->count; - while (remaining) { + /* + * where->count may be zero if rw holds a write barrier and we + * need to send a zero-sized barrier. + */ + do { /* - * Allocate a suitably sized-bio: we add an extra - * bvec for bio_get/set_region() and decrement bi_max_vecs - * to hide it from bio_add_page(). + * Allocate a suitably sized-bio. */ num_bvecs = dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)); - num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), - num_bvecs); - if (unlikely(num_bvecs > BIO_MAX_PAGES)) - num_bvecs = BIO_MAX_PAGES; + num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; - bio->bi_private = io; bio->bi_destructor = dm_bio_destructor; - bio->bi_max_vecs--; - bio_set_region(bio, region); + store_io_and_region_in_bio(bio, io, region); /* * Try and add as many pages as possible. @@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, atomic_inc(&io->count); submit_bio(rw, bio); - } + } while (remaining); } static void dispatch_io(int rw, unsigned int num_regions, @@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions, int i; struct dpages old_pages = *dp; + BUG_ON(num_regions > DM_IO_MAX_REGIONS); + if (sync) rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); @@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count) + if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) do_region(rw, i, where + i, dp, io); } @@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { - struct io io; + /* + * gcc <= 4.3 can't do the alignment for stack variables, so we must + * align it on our own. + * volatile prevents the optimizer from removing or reusing + * "io_" field from the stack frame (allowed in ANSI C). + */ + volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; + struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); @@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, } retry: - io.error_bits = 0; - io.eopnotsupp_bits = 0; - atomic_set(&io.count, 1); /* see dispatch_io() */ - io.sleeper = current; - io.client = client; + io->error_bits = 0; + io->eopnotsupp_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = current; + io->client = client; - dispatch_io(rw, num_regions, where, dp, &io, 1); + dispatch_io(rw, num_regions, where, dp, io, 1); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&io.count)) + if (!atomic_read(&io->count)) break; io_schedule(); } set_current_state(TASK_RUNNING); - if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { + if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { rw &= ~(1 << BIO_RW_BARRIER); goto retry; |