diff options
Diffstat (limited to 'drivers/md/persistent-data')
20 files changed, 1939 insertions, 223 deletions
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index ceb359050a5..0c2dec7aec2 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -1,8 +1,18 @@ config DM_PERSISTENT_DATA tristate - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM select LIBCRC32C select DM_BUFIO ---help--- Library providing immutable on-disk data structure support for device-mapper targets such as the thin provisioning target. + +config DM_DEBUG_BLOCK_STACK_TRACING + boolean "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA + select STACKTRACE + ---help--- + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning and caching. + + If unsure, say N. diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index d8e7cb767c1..ff528792c35 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ + dm-array.o \ + dm-bitset.o \ dm-block-manager.o \ dm-space-map-common.o \ dm-space-map-disk.o \ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c new file mode 100644 index 00000000000..1d75b1dc1e2 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.c @@ -0,0 +1,819 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-array.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "array" + +/*----------------------------------------------------------------*/ + +/* + * The array is implemented as a fully populated btree, which points to + * blocks that contain the packed values. This is more space efficient + * than just using a btree since we don't store 1 key per value. + */ +struct array_block { + __le32 csum; + __le32 max_entries; + __le32 nr_entries; + __le32 value_size; + __le64 blocknr; /* Block this node is supposed to live in. */ +} __packed; + +/*----------------------------------------------------------------*/ + +/* + * Validator methods. As usual we calculate a checksum, and also write the + * block location into the header (paranoia about ssds remapping areas by + * mistake). + */ +#define CSUM_XOR 595846735 + +static void array_block_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + + bh_le->blocknr = cpu_to_le64(dm_block_location(b)); + bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); +} + +static int array_block_check(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { + DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", + (unsigned long long) le64_to_cpu(bh_le->blocknr), + (unsigned long long) dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); + if (csum_disk != bh_le->csum) { + DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", + (unsigned) le32_to_cpu(csum_disk), + (unsigned) le32_to_cpu(bh_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator array_validator = { + .name = "array", + .prepare_for_write = array_block_prepare_for_write, + .check = array_block_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Functions for manipulating the array blocks. + */ + +/* + * Returns a pointer to a value within an array block. + * + * index - The index into _this_ specific block. + */ +static void *element_at(struct dm_array_info *info, struct array_block *ab, + unsigned index) +{ + unsigned char *entry = (unsigned char *) (ab + 1); + + entry += index * info->value_type.size; + + return entry; +} + +/* + * Utility function that calls one of the value_type methods on every value + * in an array block. + */ +static void on_entries(struct dm_array_info *info, struct array_block *ab, + void (*fn)(void *, const void *)) +{ + unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); + + for (i = 0; i < nr_entries; i++) + fn(info->value_type.context, element_at(info, ab, i)); +} + +/* + * Increment every value in an array block. + */ +static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->inc) + on_entries(info, ab, vt->inc); +} + +/* + * Decrement every value in an array block. + */ +static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->dec) + on_entries(info, ab, vt->dec); +} + +/* + * Each array block can hold this many values. + */ +static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) +{ + return (size_of_block - sizeof(struct array_block)) / value_size; +} + +/* + * Allocate a new array block. The caller will need to unlock block. + */ +static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); + if (r) + return r; + + (*ab) = dm_block_data(*block); + (*ab)->max_entries = cpu_to_le32(max_entries); + (*ab)->nr_entries = cpu_to_le32(0); + (*ab)->value_size = cpu_to_le32(info->value_type.size); + + return 0; +} + +/* + * Pad an array block out with a particular value. Every instance will + * cause an increment of the value_type. new_nr must always be more than + * the current number of entries. + */ +static void fill_ablock(struct dm_array_info *info, struct array_block *ab, + const void *value, unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i < new_nr; i++) { + if (vt->inc) + vt->inc(vt->context, value); + memcpy(element_at(info, ab, i), value, vt->size); + } + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Remove some entries from the back of an array block. Every value + * removed will be decremented. new_nr must be <= the current number of + * entries. + */ +static void trim_ablock(struct dm_array_info *info, struct array_block *ab, + unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i > new_nr; i--) + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, i - 1)); + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Read locks a block, and coerces it to an array block. The caller must + * unlock 'block' when finished. + */ +static int get_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); + if (r) + return r; + + *ab = dm_block_data(*block); + return 0; +} + +/* + * Unlocks an array block. + */ +static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) +{ + return dm_tm_unlock(info->btree_info.tm, block); +} + +/*----------------------------------------------------------------*/ + +/* + * Btree manipulation. + */ + +/* + * Looks up an array block in the btree, and then read locks it. + * + * index is the index of the index of the array_block, (ie. the array index + * / max_entries). + */ +static int lookup_ablock(struct dm_array_info *info, dm_block_t root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r; + uint64_t key = index; + __le64 block_le; + + r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); + if (r) + return r; + + return get_ablock(info, le64_to_cpu(block_le), block, ab); +} + +/* + * Insert an array block into the btree. The block is _not_ unlocked. + */ +static int insert_ablock(struct dm_array_info *info, uint64_t index, + struct dm_block *block, dm_block_t *root) +{ + __le64 block_le = cpu_to_le64(dm_block_location(block)); + + __dm_bless_for_disk(block_le); + return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); +} + +/* + * Looks up an array block in the btree. Then shadows it, and updates the + * btree to point to this new shadow. 'root' is an input/output parameter + * for both the current root block, and the new one. + */ +static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r, inc; + uint64_t key = index; + dm_block_t b; + __le64 block_le; + + /* + * lookup + */ + r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); + if (r) + return r; + b = le64_to_cpu(block_le); + + /* + * shadow + */ + r = dm_tm_shadow_block(info->btree_info.tm, b, + &array_validator, block, &inc); + if (r) + return r; + + *ab = dm_block_data(*block); + if (inc) + inc_ablock_entries(info, *ab); + + /* + * Reinsert. + * + * The shadow op will often be a noop. Only insert if it really + * copied data. + */ + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); + r = insert_ablock(info, index, *block, root); + } + + return r; +} + +/* + * Allocate an new array block, and fill it with some values. + */ +static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + unsigned block_index, uint32_t nr, + const void *value, dm_block_t *root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + return r; + + fill_ablock(info, ab, value, nr); + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + + return r; +} + +static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, + unsigned begin_block, unsigned end_block, + unsigned max_entries, const void *value, + dm_block_t *root) +{ + int r = 0; + + for (; !r && begin_block != end_block; begin_block++) + r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); + + return r; +} + +/* + * There are a bunch of functions involved with resizing an array. This + * structure holds information that commonly needed by them. Purely here + * to reduce parameter count. + */ +struct resize { + /* + * Describes the array. + */ + struct dm_array_info *info; + + /* + * The current root of the array. This gets updated. + */ + dm_block_t root; + + /* + * Metadata block size. Used to calculate the nr entries in an + * array block. + */ + size_t size_of_block; + + /* + * Maximum nr entries in an array block. + */ + unsigned max_entries; + + /* + * nr of completely full blocks in the array. + * + * 'old' refers to before the resize, 'new' after. + */ + unsigned old_nr_full_blocks, new_nr_full_blocks; + + /* + * Number of entries in the final block. 0 iff only full blocks in + * the array. + */ + unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block; + + /* + * The default value used when growing the array. + */ + const void *value; +}; + +/* + * Removes a consecutive set of array blocks from the btree. The values + * in block are decremented as a side effect of the btree remove. + * + * begin_index - the index of the first array block to remove. + * end_index - the one-past-the-end value. ie. this block is not removed. + */ +static int drop_blocks(struct resize *resize, unsigned begin_index, + unsigned end_index) +{ + int r; + + while (begin_index != end_index) { + uint64_t key = begin_index++; + r = dm_btree_remove(&resize->info->btree_info, resize->root, + &key, &resize->root); + if (r) + return r; + } + + return 0; +} + +/* + * Calculates how many blocks are needed for the array. + */ +static unsigned total_nr_blocks_needed(unsigned nr_full_blocks, + unsigned nr_entries_in_last_block) +{ + return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); +} + +/* + * Shrink an array. + */ +static int shrink(struct resize *resize) +{ + int r; + unsigned begin, end; + struct dm_block *block; + struct array_block *ab; + + /* + * Lose some blocks from the back? + */ + if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { + begin = total_nr_blocks_needed(resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block); + end = total_nr_blocks_needed(resize->old_nr_full_blocks, + resize->old_nr_entries_in_last_block); + + r = drop_blocks(resize, begin, end); + if (r) + return r; + } + + /* + * Trim the new tail block + */ + if (resize->new_nr_entries_in_last_block) { + r = shadow_ablock(resize->info, &resize->root, + resize->new_nr_full_blocks, &block, &ab); + if (r) + return r; + + trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); + unlock_ablock(resize->info, block); + } + + return 0; +} + +/* + * Grow an array. + */ +static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = shadow_ablock(resize->info, &resize->root, + resize->old_nr_full_blocks, &block, &ab); + if (r) + return r; + + fill_ablock(resize->info, ab, resize->value, new_nr_entries); + unlock_ablock(resize->info, block); + + return r; +} + +static int grow_add_tail_block(struct resize *resize) +{ + return insert_new_ablock(resize->info, resize->size_of_block, + resize->max_entries, + resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block, + resize->value, &resize->root); +} + +static int grow_needs_more_blocks(struct resize *resize) +{ + int r; + unsigned old_nr_blocks = resize->old_nr_full_blocks; + + if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + + r = grow_extend_tail_block(resize, resize->max_entries); + if (r) + return r; + } + + r = insert_full_ablocks(resize->info, resize->size_of_block, + old_nr_blocks, + resize->new_nr_full_blocks, + resize->max_entries, resize->value, + &resize->root); + if (r) + return r; + + if (resize->new_nr_entries_in_last_block) + r = grow_add_tail_block(resize); + + return r; +} + +static int grow(struct resize *resize) +{ + if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) + return grow_needs_more_blocks(resize); + + else if (resize->old_nr_entries_in_last_block) + return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); + + else + return grow_add_tail_block(resize); +} + +/*----------------------------------------------------------------*/ + +/* + * These are the value_type functions for the btree elements, which point + * to array blocks. + */ +static void block_inc(void *context, const void *value) +{ + __le64 block_le; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); +} + +static void block_dec(void *context, const void *value) +{ + int r; + uint64_t b; + __le64 block_le; + uint32_t ref_count; + struct dm_block *block; + struct array_block *ab; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + b = le64_to_cpu(block_le); + + r = dm_tm_ref(info->btree_info.tm, b, &ref_count); + if (r) { + DMERR_LIMIT("couldn't get reference count for block %llu", + (unsigned long long) b); + return; + } + + if (ref_count == 1) { + /* + * We're about to drop the last reference to this ablock. + * So we need to decrement the ref count of the contents. + */ + r = get_ablock(info, b, &block, &ab); + if (r) { + DMERR_LIMIT("couldn't get array block %llu", + (unsigned long long) b); + return; + } + + dec_ablock_entries(info, ab); + unlock_ablock(info, block); + } + + dm_tm_dec(info->btree_info.tm, b); +} + +static int block_equal(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(__le64)); +} + +/*----------------------------------------------------------------*/ + +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + struct dm_btree_value_type *bvt = &info->btree_info.value_type; + + memcpy(&info->value_type, vt, sizeof(info->value_type)); + info->btree_info.tm = tm; + info->btree_info.levels = 1; + + bvt->context = info; + bvt->size = sizeof(__le64); + bvt->inc = block_inc; + bvt->dec = block_dec; + bvt->equal = block_equal; +} +EXPORT_SYMBOL_GPL(dm_array_info_init); + +int dm_array_empty(struct dm_array_info *info, dm_block_t *root) +{ + return dm_btree_empty(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_empty); + +static int array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) +{ + int r; + struct resize resize; + + if (old_size == new_size) + return 0; + + resize.info = info; + resize.root = root; + resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + resize.max_entries = calc_max_entries(info->value_type.size, + resize.size_of_block); + + resize.old_nr_full_blocks = old_size / resize.max_entries; + resize.old_nr_entries_in_last_block = old_size % resize.max_entries; + resize.new_nr_full_blocks = new_size / resize.max_entries; + resize.new_nr_entries_in_last_block = new_size % resize.max_entries; + resize.value = value; + + r = ((new_size > old_size) ? grow : shrink)(&resize); + if (r) + return r; + + *new_root = resize.root; + return 0; +} + +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r = array_resize(info, root, old_size, new_size, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_resize); + +int dm_array_del(struct dm_array_info *info, dm_block_t root) +{ + return dm_btree_del(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_del); + +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value_le) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned entry, max_entries; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = lookup_ablock(info, root, index / max_entries, &block, &ab); + if (r) + return r; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) + r = -ENODATA; + else + memcpy(value_le, element_at(info, ab, entry), + info->value_type.size); + + unlock_ablock(info, block); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_get_value); + +static int array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned max_entries; + unsigned entry; + void *old_value; + struct dm_btree_value_type *vt = &info->value_type; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = shadow_ablock(info, &root, index / max_entries, &block, &ab); + if (r) + return r; + *new_root = root; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) { + r = -ENODATA; + goto out; + } + + old_value = element_at(info, ab, entry); + if (vt->dec && + (!vt->equal || !vt->equal(vt->context, old_value, value))) { + vt->dec(vt->context, old_value); + if (vt->inc) + vt->inc(vt->context, value); + } + + memcpy(old_value, value, info->value_type.size); + +out: + unlock_ablock(info, block); + return r; +} + +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r; + + r = array_set_value(info, root, index, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_set_value); + +struct walk_info { + struct dm_array_info *info; + int (*fn)(void *context, uint64_t key, void *leaf); + void *context; +}; + +static int walk_ablock(void *context, uint64_t *keys, void *leaf) +{ + struct walk_info *wi = context; + + int r; + unsigned i; + __le64 block_le; + unsigned nr_entries, max_entries; + struct dm_block *block; + struct array_block *ab; + + memcpy(&block_le, leaf, sizeof(block_le)); + r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); + if (r) + return r; + + max_entries = le32_to_cpu(ab->max_entries); + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = 0; i < nr_entries; i++) { + r = wi->fn(wi->context, keys[0] * max_entries + i, + element_at(wi->info, ab, i)); + + if (r) + break; + } + + unlock_ablock(wi->info, block); + return r; +} + +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *, uint64_t key, void *leaf), + void *context) +{ + struct walk_info wi; + + wi.info = info; + wi.fn = fn; + wi.context = context; + + return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); +} +EXPORT_SYMBOL_GPL(dm_array_walk); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h new file mode 100644 index 00000000000..ea177d6fa58 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_ARRAY_H +#define _LINUX_DM_ARRAY_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * The dm-array is a persistent version of an array. It packs the data + * more efficiently than a btree which will result in less disk space use, + * and a performance boost. The element get and set operations are still + * O(ln(n)), but with a much smaller constant. + * + * The value type structure is reused from the btree type to support proper + * reference counting of values. + * + * The arrays implicitly know their length, and bounds are checked for + * lookups and updated. It doesn't store this in an accessible place + * because it would waste a whole metadata block. Make sure you store the + * size along with the array root in your encompassing data. + * + * Array entries are indexed via an unsigned integer starting from zero. + * Arrays are not sparse; if you resize an array to have 'n' entries then + * 'n - 1' will be the last valid index. + * + * Typical use: + * + * a) initialise a dm_array_info structure. This describes the array + * values and ties it into a specific transaction manager. It holds no + * instance data; the same info can be used for many similar arrays if + * you wish. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an array. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty array with dm_array_empty(). + * + * Like the other data structures in this library, dm_array objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * c) resize an array with dm_array_resize(). + * + * d) Get a value from the array with dm_array_get_value(). + * + * e) Set a value in the array with dm_array_set_value(). + * + * f) Walk an array of values in index order with dm_array_walk(). More + * efficient than making many calls to dm_array_get_value(). + * + * g) Destroy the array with dm_array_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_array_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Describes an array. Don't initialise this structure yourself, use the + * init function below. + */ +struct dm_array_info { + struct dm_transaction_manager *tm; + struct dm_btree_value_type value_type; + struct dm_btree_info btree_info; +}; + +/* + * Sets up a dm_array_info structure. You don't need to do anything with + * this structure when you finish using it. + * + * info - the structure being filled in. + * tm - the transaction manager that should supervise this structure. + * vt - describes the leaf values. + */ +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + +/* + * Create an empty, zero length array. + * + * info - describes the array + * root - on success this will be filled out with the root block + */ +int dm_array_empty(struct dm_array_info *info, dm_block_t *root); + +/* + * Resizes the array. + * + * info - describes the array + * root - the root block of the array on disk + * old_size - the caller is responsible for remembering the size of + * the array + * new_size - can be bigger or smaller than old_size + * value - if we're growing the array the new entries will have this value + * new_root - on success, points to the new root block + * + * If growing the inc function for 'value' will be called the appropriate + * number of times. So if the caller is holding a reference they may want + * to drop it. + */ +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Frees a whole array. The value_type's decrement operation will be called + * for all values in the array + */ +int dm_array_del(struct dm_array_info *info, dm_block_t root); + +/* + * Lookup a value in the array + * + * info - describes the array + * root - root block of the array + * index - array index + * value - the value to be read. Will be in on-disk format of course. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value); + +/* + * Set an entry in the array. + * + * info - describes the array + * root - root block of the array + * index - array index + * value - value to be written to disk. Make sure you confirm the value is + * in on-disk format with__dm_bless_for_disk() before calling. + * new_root - the new root block + * + * The old value being overwritten will be decremented, the new value + * incremented. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Walk through all the entries in an array. + * + * info - describes the array + * root - root block of the array + * fn - called back for every element + * context - passed to the callback + */ +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t key, void *leaf), + void *context); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_ARRAY_H */ diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c new file mode 100644 index 00000000000..36f7cc2c710 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-bitset.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "bitset" +#define BITS_PER_ARRAY_ENTRY 64 + +/*----------------------------------------------------------------*/ + +static struct dm_btree_value_type bitset_bvt = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL, +}; + +/*----------------------------------------------------------------*/ + +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info) +{ + dm_array_info_init(&info->array_info, tm, &bitset_bvt); + info->current_index_set = false; +} +EXPORT_SYMBOL_GPL(dm_disk_bitset_init); + +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) +{ + return dm_array_empty(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_empty); + +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root) +{ + uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY); + uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY); + __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0); + + __dm_bless_for_disk(&value); + return dm_array_resize(&info->array_info, root, old_blocks, new_blocks, + &value, new_root); +} +EXPORT_SYMBOL_GPL(dm_bitset_resize); + +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root) +{ + return dm_array_del(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_del); + +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root) +{ + int r; + __le64 value; + + if (!info->current_index_set || !info->dirty) + return 0; + + value = cpu_to_le64(info->current_bits); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&info->array_info, root, info->current_index, + &value, new_root); + if (r) + return r; + + info->current_index_set = false; + info->dirty = false; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_flush); + +static int read_bits(struct dm_disk_bitset *info, dm_block_t root, + uint32_t array_index) +{ + int r; + __le64 value; + + r = dm_array_get_value(&info->array_info, root, array_index, &value); + if (r) + return r; + + info->current_bits = le64_to_cpu(value); + info->current_index_set = true; + info->current_index = array_index; + info->dirty = false; + + return 0; +} + +static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned array_index = index / BITS_PER_ARRAY_ENTRY; + + if (info->current_index_set) { + if (info->current_index == array_index) + return 0; + + r = dm_bitset_flush(info, root, new_root); + if (r) + return r; + } + + return read_bits(info, root, array_index); +} + +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + set_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_set_bit); + +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + clear_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); + +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + *result = test_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_test_bit); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h new file mode 100644 index 00000000000..c2287d672ef --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BITSET_H +#define _LINUX_DM_BITSET_H + +#include "dm-array.h" + +/*----------------------------------------------------------------*/ + +/* + * This bitset type is a thin wrapper round a dm_array of 64bit words. It + * uses a tiny, one word cache to reduce the number of array lookups and so + * increase performance. + * + * Like the dm-array that it's based on, the caller needs to keep track of + * the size of the bitset separately. The underlying dm-array implicitly + * knows how many words it's storing and will return -ENODATA if you try + * and access an out of bounds word. However, an out of bounds bit in the + * final word will _not_ be detected, you have been warned. + * + * Bits are indexed from zero. + + * Typical use: + * + * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init(). + * This describes the bitset and includes the cache. It's not called it + * dm_bitset_info in line with other data structures because it does + * include instance data. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an bitset. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty bitset with dm_bitset_empty(). + * + * Like the other data structures in this library, dm_bitset objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * Even read operations may trigger the cache to be flushed and as such + * return a root for a new, updated bitset. + * + * c) resize a bitset with dm_bitset_resize(). + * + * d) Set a bit with dm_bitset_set_bit(). + * + * e) Clear a bit with dm_bitset_clear_bit(). + * + * f) Test a bit with dm_bitset_test_bit(). + * + * g) Flush all updates from the cache with dm_bitset_flush(). + * + * h) Destroy the bitset with dm_bitset_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_bitset_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Opaque object. Unlike dm_array_info, you should have one of these per + * bitset. Initialise with dm_disk_bitset_init(). + */ +struct dm_disk_bitset { + struct dm_array_info array_info; + + uint32_t current_index; + uint64_t current_bits; + + bool current_index_set:1; + bool dirty:1; +}; + +/* + * Sets up a dm_disk_bitset structure. You don't need to do anything with + * this structure when you finish using it. + * + * tm - the transaction manager that should supervise this structure + * info - the structure being initialised + */ +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info); + +/* + * Create an empty, zero length bitset. + * + * info - describes the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); + +/* + * Resize the bitset. + * + * info - describes the bitset + * old_root - the root block of the array on disk + * old_nr_entries - the number of bits in the old bitset + * new_nr_entries - the number of bits you want in the new bitset + * default_value - the value for any new bits + * new_root - on success, points to the new root block + */ +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root); + +/* + * Frees the bitset. + */ +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root); + +/* + * Set a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Clears a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Tests a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block (cached values may have been written) + * result - the bit value you're after + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result); + +/* + * Flush any cached changes to disk. + * + * info - describes the bitset + * root - the root block of the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 5ba277768d9..087411c95ff 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -25,7 +25,7 @@ * may be held at once. This is just an implementation detail. * * ii) Recursive locking attempts are detected and return EINVAL. A stack - * trace is also emitted for the previous lock aquisition. + * trace is also emitted for the previous lock acquisition. * * iii) Priority is given to write locks. */ @@ -104,12 +104,12 @@ static int __check_holder(struct block_lock *lock) for (i = 0; i < MAX_HOLDERS; i++) { if (lock->holders[i] == current) { - DMERR("recursive lock detected in pool metadata"); + DMERR("recursive lock detected in metadata"); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING DMERR("previously held here:"); print_stack_trace(lock->traces + i, 4); - DMERR("subsequent aquisition attempted here:"); + DMERR("subsequent acquisition attempted here:"); t.nr_entries = 0; t.max_entries = MAX_STACK; t.entries = entries; @@ -428,15 +428,17 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm, if (!v) return 0; r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio)); - if (unlikely(r)) + if (unlikely(r)) { + DMERR_LIMIT("%s validator check failed for block %llu", v->name, + (unsigned long long) dm_bufio_get_block_number(buf)); return r; + } aux->validator = v; } else { if (unlikely(aux->validator != v)) { - DMERR("validator mismatch (old=%s vs new=%s) for block %llu", - aux->validator->name, v ? v->name : "NULL", - (unsigned long long) - dm_bufio_get_block_number(buf)); + DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu", + aux->validator->name, v ? v->name : "NULL", + (unsigned long long) dm_bufio_get_block_number(buf)); return -EINVAL; } } @@ -593,24 +595,19 @@ int dm_bm_unlock(struct dm_block *b) } EXPORT_SYMBOL_GPL(dm_bm_unlock); -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock) +int dm_bm_flush(struct dm_block_manager *bm) { - int r; - if (bm->read_only) return -EPERM; - r = dm_bufio_write_dirty_buffers(bm->bufio); - if (unlikely(r)) { - dm_bm_unlock(superblock); - return r; - } - - dm_bm_unlock(superblock); - return dm_bufio_write_dirty_buffers(bm->bufio); } +EXPORT_SYMBOL_GPL(dm_bm_flush); + +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) +{ + dm_bufio_prefetch(bm->bufio, b, 1); +} void dm_bm_set_read_only(struct dm_block_manager *bm) { @@ -618,6 +615,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm) } EXPORT_SYMBOL_GPL(dm_bm_set_read_only); +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + bm->read_only = false; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); + u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) { return crc32c(~(u32) 0, data, len) ^ init_xor; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index be5bff61be2..1b95dfc1778 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -105,8 +105,12 @@ int dm_bm_unlock(struct dm_block *b); * * This method always blocks. */ -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock); +int dm_bm_flush(struct dm_block_manager *bm); + +/* + * Request data is prefetched into the cache. + */ +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); /* * Switches the bm to a read only mode. Once read-only mode @@ -120,6 +124,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm, * be returned if you do. */ void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 5709bfeab1e..37d367bb9aa 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -36,13 +36,13 @@ struct node_header { __le32 padding; } __packed; -struct node { +struct btree_node { struct node_header header; __le64 keys[0]; } __packed; -void inc_children(struct dm_transaction_manager *tm, struct node *n, +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt); int new_block(struct dm_btree_info *info, struct dm_block **result); @@ -64,7 +64,8 @@ struct ro_spine { void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); int exit_ro_spine(struct ro_spine *s); int ro_step(struct ro_spine *s, dm_block_t new_child); -struct node *ro_node(struct ro_spine *s); +void ro_pop(struct ro_spine *s); +struct btree_node *ro_node(struct ro_spine *s); struct shadow_spine { struct dm_btree_info *info; @@ -98,17 +99,17 @@ int shadow_root(struct shadow_spine *s); /* * Some inlines. */ -static inline __le64 *key_ptr(struct node *n, uint32_t index) +static inline __le64 *key_ptr(struct btree_node *n, uint32_t index) { return n->keys + index; } -static inline void *value_base(struct node *n) +static inline void *value_base(struct btree_node *n) { return &n->keys[le32_to_cpu(n->header.max_entries)]; } -static inline void *value_ptr(struct node *n, uint32_t index) +static inline void *value_ptr(struct btree_node *n, uint32_t index) { uint32_t value_size = le32_to_cpu(n->header.value_size); return value_base(n) + (value_size * index); @@ -117,7 +118,7 @@ static inline void *value_ptr(struct node *n, uint32_t index) /* * Assumes the values are suitably-aligned and converts to core format. */ -static inline uint64_t value64(struct node *n, uint32_t index) +static inline uint64_t value64(struct btree_node *n, uint32_t index) { __le64 *values_le = value_base(n); @@ -127,7 +128,7 @@ static inline uint64_t value64(struct node *n, uint32_t index) /* * Searching for a key within a single node. */ -int lower_bound(struct node *n, uint64_t key); +int lower_bound(struct btree_node *n, uint64_t key); extern struct dm_block_validator btree_node_validator; diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index aa71e2359a0..b88757cd0d1 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -53,7 +53,7 @@ /* * Some little utilities for moving node data around. */ -static void node_shift(struct node *n, int shift) +static void node_shift(struct btree_node *n, int shift) { uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); uint32_t value_size = le32_to_cpu(n->header.value_size); @@ -79,7 +79,7 @@ static void node_shift(struct node *n, int shift) } } -static void node_copy(struct node *left, struct node *right, int shift) +static void node_copy(struct btree_node *left, struct btree_node *right, int shift) { uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t value_size = le32_to_cpu(left->header.value_size); @@ -108,7 +108,7 @@ static void node_copy(struct node *left, struct node *right, int shift) /* * Delete a specific entry from a leaf node. */ -static void delete_at(struct node *n, unsigned index) +static void delete_at(struct btree_node *n, unsigned index) { unsigned nr_entries = le32_to_cpu(n->header.nr_entries); unsigned nr_to_copy = nr_entries - (index + 1); @@ -128,7 +128,7 @@ static void delete_at(struct node *n, unsigned index) n->header.nr_entries = cpu_to_le32(nr_entries - 1); } -static unsigned merge_threshold(struct node *n) +static unsigned merge_threshold(struct btree_node *n) { return le32_to_cpu(n->header.max_entries) / 3; } @@ -136,18 +136,11 @@ static unsigned merge_threshold(struct node *n) struct child { unsigned index; struct dm_block *block; - struct node *n; + struct btree_node *n; }; -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, struct child *result) { int r, inc; @@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent, result->n = dm_block_data(result->block); if (inc) - inc_children(info->tm, result->n, &le64_type); + inc_children(info->tm, result->n, vt); *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); @@ -177,7 +170,7 @@ static int exit_child(struct dm_btree_info *info, struct child *c) return dm_tm_unlock(info->tm, c->block); } -static void shift(struct node *left, struct node *right, int count) +static void shift(struct btree_node *left, struct btree_node *right, int count) { uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); @@ -203,11 +196,11 @@ static void shift(struct node *left, struct node *right, int count) right->header.nr_entries = cpu_to_le32(nr_right + count); } -static void __rebalance2(struct dm_btree_info *info, struct node *parent, +static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, struct child *l, struct child *r) { - struct node *left = l->n; - struct node *right = r->n; + struct btree_node *left = l->n; + struct btree_node *right = r->n; uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); unsigned threshold = 2 * merge_threshold(left) + 1; @@ -236,19 +229,19 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, } static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; - struct node *parent; + struct btree_node *parent; struct child left, right; parent = dm_block_data(shadow_current(s)); - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, &right); + r = init_child(info, vt, parent, left_index + 1, &right); if (r) { exit_child(info, &left); return r; @@ -270,9 +263,9 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, * in right, then rebalance2. This wastes some cpu, but I want something * simple atm. */ -static void delete_center_node(struct dm_btree_info *info, struct node *parent, +static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent, struct child *l, struct child *c, struct child *r, - struct node *left, struct node *center, struct node *right, + struct btree_node *left, struct btree_node *center, struct btree_node *right, uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) { uint32_t max_entries = le32_to_cpu(left->header.max_entries); @@ -301,9 +294,9 @@ static void delete_center_node(struct dm_btree_info *info, struct node *parent, /* * Redistributes entries among 3 sibling nodes. */ -static void redistribute3(struct dm_btree_info *info, struct node *parent, +static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, struct child *l, struct child *c, struct child *r, - struct node *left, struct node *center, struct node *right, + struct btree_node *left, struct btree_node *center, struct btree_node *right, uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) { int s; @@ -343,12 +336,12 @@ static void redistribute3(struct dm_btree_info *info, struct node *parent, *key_ptr(parent, r->index) = right->keys[0]; } -static void __rebalance3(struct dm_btree_info *info, struct node *parent, +static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, struct child *l, struct child *c, struct child *r) { - struct node *left = l->n; - struct node *center = c->n; - struct node *right = r->n; + struct btree_node *left = l->n; + struct btree_node *center = c->n; + struct btree_node *right = r->n; uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_center = le32_to_cpu(center->header.nr_entries); @@ -368,26 +361,26 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent, } static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; - struct node *parent = dm_block_data(shadow_current(s)); + struct btree_node *parent = dm_block_data(shadow_current(s)); struct child left, center, right; /* * FIXME: fill out an array? */ - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, ¢er); + r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } - r = init_child(info, parent, left_index + 2, &right); + r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); @@ -421,7 +414,7 @@ static int get_nr_entries(struct dm_transaction_manager *tm, { int r; struct dm_block *block; - struct node *n; + struct btree_node *n; r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); if (r) @@ -434,11 +427,12 @@ static int get_nr_entries(struct dm_transaction_manager *tm, } static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; uint32_t child_entries; - struct node *n; + struct btree_node *n; n = dm_block_data(shadow_current(s)); @@ -472,18 +466,18 @@ static int rebalance_children(struct shadow_spine *s, has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) - r = rebalance2(s, info, i); + r = rebalance2(s, info, vt, i); else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); + r = rebalance2(s, info, vt, i - 1); else - r = rebalance3(s, info, i - 1); + r = rebalance3(s, info, vt, i - 1); return r; } -static int do_leaf(struct node *n, uint64_t key, unsigned *index) +static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index) { int i = lower_bound(n, key); @@ -506,7 +500,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, uint64_t key, unsigned *index) { int i = *index, r; - struct node *n; + struct btree_node *n; for (;;) { r = shadow_step(s, root, vt); @@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); - r = rebalance_children(s, info, key); + r = rebalance_children(s, info, vt, key); if (r) break; @@ -550,13 +544,21 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { unsigned level, last_level = info->levels - 1; int index = 0, r = 0; struct shadow_spine spine; - struct node *n; + struct btree_node *n; init_shadow_spine(&spine, info); for (level = 0; level < info->levels; level++) { diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index d9a7912ee8e..cf9fd676ae4 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -23,7 +23,7 @@ static void node_prepare_for_write(struct dm_block_validator *v, struct dm_block *b, size_t block_size) { - struct node *n = dm_block_data(b); + struct btree_node *n = dm_block_data(b); struct node_header *h = &n->header; h->blocknr = cpu_to_le64(dm_block_location(b)); @@ -38,15 +38,15 @@ static int node_check(struct dm_block_validator *v, struct dm_block *b, size_t block_size) { - struct node *n = dm_block_data(b); + struct btree_node *n = dm_block_data(b); struct node_header *h = &n->header; size_t value_size; __le32 csum_disk; uint32_t flags; if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { - DMERR("node_check failed blocknr %llu wanted %llu", - le64_to_cpu(h->blocknr), dm_block_location(b)); + DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(h->blocknr), dm_block_location(b)); return -ENOTBLK; } @@ -54,8 +54,8 @@ static int node_check(struct dm_block_validator *v, block_size - sizeof(__le32), BTREE_CSUM_XOR)); if (csum_disk != h->csum) { - DMERR("node_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); + DMERR_LIMIT("node_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); return -EILSEQ; } @@ -63,12 +63,12 @@ static int node_check(struct dm_block_validator *v, if (sizeof(struct node_header) + (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) { - DMERR("node_check failed: max_entries too large"); + DMERR_LIMIT("node_check failed: max_entries too large"); return -EILSEQ; } if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) { - DMERR("node_check failed, too many entries"); + DMERR_LIMIT("node_check failed: too many entries"); return -EILSEQ; } @@ -77,7 +77,7 @@ static int node_check(struct dm_block_validator *v, */ flags = le32_to_cpu(h->flags); if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { - DMERR("node_check failed, node is neither INTERNAL or LEAF"); + DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF"); return -EILSEQ; } @@ -164,7 +164,14 @@ int ro_step(struct ro_spine *s, dm_block_t new_child) return r; } -struct node *ro_node(struct ro_spine *s) +void ro_pop(struct ro_spine *s) +{ + BUG_ON(!s->count); + --s->count; + unlock_block(s->info, s->nodes[s->count]); +} + +struct btree_node *ro_node(struct ro_spine *s) { struct dm_block *block; diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index d12b2cc51f1..416060c2570 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -38,7 +38,7 @@ static void array_insert(void *base, size_t elt_size, unsigned nr_elts, /*----------------------------------------------------------------*/ /* makes the assumption that no two keys are the same. */ -static int bsearch(struct node *n, uint64_t key, int want_hi) +static int bsearch(struct btree_node *n, uint64_t key, int want_hi) { int lo = -1, hi = le32_to_cpu(n->header.nr_entries); @@ -58,12 +58,12 @@ static int bsearch(struct node *n, uint64_t key, int want_hi) return want_hi ? hi : lo; } -int lower_bound(struct node *n, uint64_t key) +int lower_bound(struct btree_node *n, uint64_t key) { return bsearch(n, key, 0); } -void inc_children(struct dm_transaction_manager *tm, struct node *n, +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { unsigned i; @@ -77,7 +77,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n, vt->inc(vt->context, value_ptr(n, i)); } -static int insert_at(size_t value_size, struct node *node, unsigned index, +static int insert_at(size_t value_size, struct btree_node *node, unsigned index, uint64_t key, void *value) __dm_written_to_disk(value) { @@ -122,7 +122,7 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) { int r; struct dm_block *b; - struct node *n; + struct btree_node *n; size_t block_size; uint32_t max_entries; @@ -154,13 +154,14 @@ EXPORT_SYMBOL_GPL(dm_btree_empty); #define MAX_SPINE_DEPTH 64 struct frame { struct dm_block *b; - struct node *n; + struct btree_node *n; unsigned level; unsigned nr_children; unsigned current_child; }; struct del_stack { + struct dm_btree_info *info; struct dm_transaction_manager *tm; int top; struct frame spine[MAX_SPINE_DEPTH]; @@ -183,6 +184,20 @@ static int unprocessed_frames(struct del_stack *s) return s->top >= 0; } +static void prefetch_children(struct del_stack *s, struct frame *f) +{ + unsigned i; + struct dm_block_manager *bm = dm_tm_get_bm(s->tm); + + for (i = 0; i < f->nr_children; i++) + dm_bm_prefetch(bm, value64(f->n, i)); +} + +static bool is_internal_level(struct dm_btree_info *info, struct frame *f) +{ + return f->level < (info->levels - 1); +} + static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) { int r; @@ -205,6 +220,7 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) dm_tm_dec(s->tm, b); else { + uint32_t flags; struct frame *f = s->spine + ++s->top; r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); @@ -217,6 +233,10 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) f->level = level; f->nr_children = le32_to_cpu(f->n->header.nr_entries); f->current_child = 0; + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE || is_internal_level(s->info, f)) + prefetch_children(s, f); } return 0; @@ -238,10 +258,11 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; + s->info = info; s->tm = info->tm; s->top = -1; - r = push_frame(s, root, 1); + r = push_frame(s, root, 0); if (r) goto out; @@ -267,7 +288,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) if (r) goto out; - } else if (f->level != (info->levels - 1)) { + } else if (is_internal_level(info, f)) { b = value64(f->n, f->current_child); f->current_child++; r = push_frame(s, b, f->level + 1); @@ -282,7 +303,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) info->value_type.dec(info->value_type.context, value_ptr(f->n, i)); } - f->current_child = f->nr_children; + pop_frame(s); } } @@ -295,7 +316,7 @@ EXPORT_SYMBOL_GPL(dm_btree_del); /*----------------------------------------------------------------*/ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, - int (*search_fn)(struct node *, uint64_t), + int (*search_fn)(struct btree_node *, uint64_t), uint64_t *result_key, void *v, size_t value_size) { int i, r; @@ -406,7 +427,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, size_t size; unsigned nr_left, nr_right; struct dm_block *left, *right, *parent; - struct node *ln, *rn, *pn; + struct btree_node *ln, *rn, *pn; __le64 location; left = shadow_current(s); @@ -491,7 +512,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) size_t size; unsigned nr_left, nr_right; struct dm_block *left, *right, *new_parent; - struct node *pn, *ln, *rn; + struct btree_node *pn, *ln, *rn; __le64 val; new_parent = shadow_current(s); @@ -576,7 +597,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, uint64_t key, unsigned *index) { int r, i = *index, top = 1; - struct node *node; + struct btree_node *node; for (;;) { r = shadow_step(s, root, vt); @@ -643,7 +664,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, unsigned level, index = -1, last_level = info->levels - 1; dm_block_t block = root; struct shadow_spine spine; - struct node *n; + struct btree_node *n; struct dm_btree_value_type le64_type; le64_type.context = NULL; @@ -749,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify); /*----------------------------------------------------------------*/ -static int find_highest_key(struct ro_spine *s, dm_block_t block, - uint64_t *result_key, dm_block_t *next_block) +static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, + uint64_t *result_key, dm_block_t *next_block) { int i, r; uint32_t flags; @@ -767,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, else i--; - *result_key = le64_to_cpu(ro_node(s)->keys[i]); + if (find_highest) + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + else + *result_key = le64_to_cpu(ro_node(s)->keys[0]); + if (next_block || flags & INTERNAL_NODE) block = value64(ro_node(s), i); @@ -778,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, return 0; } -int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, - uint64_t *result_keys) +static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, + bool find_highest, uint64_t *result_keys) { int r = 0, count = 0, level; struct ro_spine spine; init_ro_spine(&spine, info); for (level = 0; level < info->levels; level++) { - r = find_highest_key(&spine, root, result_keys + level, - level == info->levels - 1 ? NULL : &root); + r = find_key(&spine, root, find_highest, result_keys + level, + level == info->levels - 1 ? NULL : &root); if (r == -ENODATA) { r = 0; break; @@ -801,4 +826,71 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, return r ? r : count; } + +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, true, result_keys); +} EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); + +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, false, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); + +/*----------------------------------------------------------------*/ + +/* + * FIXME: We shouldn't use a recursive algorithm when we have limited stack + * space. Also this only works for single level trees. + */ +static int walk_node(struct ro_spine *s, dm_block_t block, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + unsigned i, nr; + struct btree_node *n; + uint64_t keys; + + r = ro_step(s, block); + n = ro_node(s); + + nr = le32_to_cpu(n->header.nr_entries); + for (i = 0; i < nr; i++) { + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { + r = walk_node(s, value64(n, i), fn, context); + if (r) + goto out; + } else { + keys = le64_to_cpu(*key_ptr(n, i)); + r = fn(context, &keys, value_ptr(n, i)); + if (r) + goto out; + } + } + +out: + ro_pop(s); + return r; +} + +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + struct ro_spine spine; + + BUG_ON(info->levels > 1); + + init_ro_spine(&spine, info); + r = walk_node(&spine, root, fn, context); + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index ae02c84410f..dacfc34180b 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -35,7 +35,7 @@ struct dm_transaction_manager; */ /* - * Infomation about the values stored within the btree. + * Information about the values stored within the btree. */ struct dm_btree_value_type { void *context; @@ -58,21 +58,21 @@ struct dm_btree_value_type { * somewhere.) This method is _not_ called for insertion of a new * value: It is assumed the ref count is already 1. */ - void (*inc)(void *context, void *value); + void (*inc)(void *context, const void *value); /* * This value is being deleted. The btree takes care of freeing * the memory pointed to by @value. Often the del function just * needs to decrement a reference count somewhere. */ - void (*dec)(void *context, void *value); + void (*dec)(void *context, const void *value); /* * A test for equality between two values. When a value is * overwritten with a new one, the old one has the dec method * called _unless_ the new and old value are deemed equal. */ - int (*equal)(void *context, void *value1, void *value2); + int (*equal)(void *context, const void *value1, const void *value2); }; /* @@ -137,9 +137,26 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, /* * Returns < 0 on failure. Otherwise the number of key entries that have * been filled out. Remember trees can have zero entries, and as such have + * no lowest key. + */ +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. Remember trees can have zero entries, and as such have * no highest key. */ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, uint64_t *result_keys); +/* + * Iterate through the a btree, calling fn() on each entry. + * It only works for single level trees and is internally recursive, so + * monitor stack usage carefully. + */ +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context); + #endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index f3a9af8cdec..aacbe70c2c2 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -39,8 +39,8 @@ static int index_check(struct dm_block_validator *v, __le32 csum_disk; if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { - DMERR("index_check failed blocknr %llu wanted %llu", - le64_to_cpu(mi_le->blocknr), dm_block_location(b)); + DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(mi_le->blocknr), dm_block_location(b)); return -ENOTBLK; } @@ -48,8 +48,8 @@ static int index_check(struct dm_block_validator *v, block_size - sizeof(__le32), INDEX_CSUM_XOR)); if (csum_disk != mi_le->csum) { - DMERR("index_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); + DMERR_LIMIT("index_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); return -EILSEQ; } @@ -89,8 +89,8 @@ static int bitmap_check(struct dm_block_validator *v, __le32 csum_disk; if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { - DMERR("bitmap check failed blocknr %llu wanted %llu", - le64_to_cpu(disk_header->blocknr), dm_block_location(b)); + DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu", + le64_to_cpu(disk_header->blocknr), dm_block_location(b)); return -ENOTBLK; } @@ -98,8 +98,8 @@ static int bitmap_check(struct dm_block_validator *v, block_size - sizeof(__le32), BITMAP_CSUM_XOR)); if (csum_disk != disk_header->csum) { - DMERR("bitmap check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); + DMERR_LIMIT("bitmap check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); return -EILSEQ; } @@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return -EINVAL; } + /* + * We need to set this before the dm_tm_new_block() call below. + */ + ll->nr_blocks = nr_blocks; for (i = old_blocks; i < blocks; i++) { struct dm_block *b; struct disk_index_entry idx; @@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); if (r < 0) return r; + idx.blocknr = cpu_to_le64(dm_block_location(b)); r = dm_tm_unlock(ll->tm, b); @@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return r; } - ll->nr_blocks = nr_blocks; return 0; } @@ -292,16 +296,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) return dm_tm_unlock(ll->tm, blk); } -int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b, + uint32_t *result) { __le32 le_rc; - int r = sm_ll_lookup_bitmap(ll, b, result); - - if (r) - return r; - - if (*result != 3) - return r; + int r; r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); if (r < 0) @@ -312,6 +311,19 @@ int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) return r; } +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r = sm_ll_lookup_bitmap(ll, b, result); + + if (r) + return r; + + if (*result != 3) + return r; + + return sm_ll_lookup_big_ref_count(ll, b, result); +} + int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result) { @@ -372,11 +384,12 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, return -ENOSPC; } -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, - uint32_t ref_count, enum allocation_event *ev) +static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, + int (*mutator)(void *context, uint32_t old, uint32_t *new), + void *context, enum allocation_event *ev) { int r; - uint32_t bit, old; + uint32_t bit, old, ref_count; struct dm_block *nb; dm_block_t index = b; struct disk_index_entry ie_disk; @@ -399,6 +412,20 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, bm_le = dm_bitmap_data(nb); old = sm_lookup_bitmap(bm_le, bit); + if (old > 2) { + r = sm_ll_lookup_big_ref_count(ll, b, &old); + if (r < 0) { + dm_tm_unlock(ll->tm, nb); + return r; + } + } + + r = mutator(context, old, &ref_count); + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } + if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); @@ -448,31 +475,43 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, return ll->save_ie(ll, index, &ie_disk); } -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int set_ref_count(void *context, uint32_t old, uint32_t *new) { - int r; - uint32_t rc; - - r = sm_ll_lookup(ll, b, &rc); - if (r) - return r; + *new = *((uint32_t *) context); + return 0; +} - return sm_ll_insert(ll, b, rc + 1, ev); +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); } -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int inc_ref_count(void *context, uint32_t old, uint32_t *new) { - int r; - uint32_t rc; + *new = old + 1; + return 0; +} - r = sm_ll_lookup(ll, b, &rc); - if (r) - return r; +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); +} - if (!rc) +static int dec_ref_count(void *context, uint32_t old, uint32_t *new) +{ + if (!old) { + DMERR_LIMIT("unable to decrement a reference count below 0"); return -EINVAL; + } - return sm_ll_insert(ll, b, rc - 1, ev); + *new = old - 1; + return 0; +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); } int sm_ll_commit(struct ll_disk *ll) diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index f6d29e614ab..cfbf9617e46 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { - int r; - uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - } - - return r; + return sm_ll_dec(&smd->ll, b, &ev); } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) @@ -248,7 +232,8 @@ static struct dm_space_map ops = { .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, - .copy_root = sm_disk_copy_root + .copy_root = sm_disk_copy_root, + .register_threshold_callback = NULL }; struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index e89ae5e7a51..786b689bdfc 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -17,6 +17,55 @@ /*----------------------------------------------------------------*/ /* + * An edge triggered threshold. + */ +struct threshold { + bool threshold_set; + bool value_set; + dm_block_t threshold; + dm_block_t current_value; + dm_sm_threshold_fn fn; + void *context; +}; + +static void threshold_init(struct threshold *t) +{ + t->threshold_set = false; + t->value_set = false; +} + +static void set_threshold(struct threshold *t, dm_block_t value, + dm_sm_threshold_fn fn, void *context) +{ + t->threshold_set = true; + t->threshold = value; + t->fn = fn; + t->context = context; +} + +static bool below_threshold(struct threshold *t, dm_block_t value) +{ + return t->threshold_set && value <= t->threshold; +} + +static bool threshold_already_triggered(struct threshold *t) +{ + return t->value_set && below_threshold(t, t->current_value); +} + +static void check_threshold(struct threshold *t, dm_block_t value) +{ + if (below_threshold(t, value) && + !threshold_already_triggered(t)) + t->fn(t->context); + + t->value_set = true; + t->current_value = value; +} + +/*----------------------------------------------------------------*/ + +/* * Space map interface. * * The low level disk format is written using the standard btree and @@ -42,6 +91,69 @@ struct block_op { dm_block_t block; }; +struct bop_ring_buffer { + unsigned begin; + unsigned end; + struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; +}; + +static void brb_init(struct bop_ring_buffer *brb) +{ + brb->begin = 0; + brb->end = 0; +} + +static bool brb_empty(struct bop_ring_buffer *brb) +{ + return brb->begin == brb->end; +} + +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) +{ + unsigned r = old + 1; + return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; +} + +static int brb_push(struct bop_ring_buffer *brb, + enum block_op_type type, dm_block_t b) +{ + struct block_op *bop; + unsigned next = brb_next(brb, brb->end); + + /* + * We don't allow the last bop to be filled, this way we can + * differentiate between full and empty. + */ + if (next == brb->begin) + return -ENOMEM; + + bop = brb->bops + brb->end; + bop->type = type; + bop->block = b; + + brb->end = next; + + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) +{ + struct block_op *bop; + + if (brb_empty(brb)) + return -ENODATA; + + bop = brb->bops + brb->begin; + result->type = bop->type; + result->block = bop->block; + + brb->begin = brb_next(brb, brb->begin); + + return 0; +} + +/*----------------------------------------------------------------*/ + struct sm_metadata { struct dm_space_map sm; @@ -52,23 +164,20 @@ struct sm_metadata { unsigned recursion_count; unsigned allocated_this_transaction; - unsigned nr_uncommitted; - struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; + struct bop_ring_buffer uncommitted; + + struct threshold threshold; }; static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) { - struct block_op *op; + int r = brb_push(&smm->uncommitted, type, b); - if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { + if (r) { DMERR("too many recursive allocations"); return -ENOMEM; } - op = smm->uncommitted + smm->nr_uncommitted++; - op->type = type; - op->block = b; - return 0; } @@ -107,11 +216,17 @@ static int out(struct sm_metadata *smm) return -ENOMEM; } - if (smm->recursion_count == 1 && smm->nr_uncommitted) { - while (smm->nr_uncommitted && !r) { - smm->nr_uncommitted--; - r = commit_bop(smm, smm->uncommitted + - smm->nr_uncommitted); + if (smm->recursion_count == 1) { + while (!brb_empty(&smm->uncommitted)) { + struct block_op bop; + + r = brb_pop(&smm->uncommitted, &bop); + if (r) { + DMERR("bug in bop ring buffer"); + break; + } + + r = commit_bop(smm, &bop); if (r) break; } @@ -144,12 +259,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm) kfree(smm); } -static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - DMERR("doesn't support extend"); - return -EINVAL; -} - static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); @@ -172,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) { - int r, i; + int r; + unsigned i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); unsigned adjustment = 0; @@ -180,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, * We may have some uncommitted adjustments to add. This list * should always be really short. */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + struct block_op *op = smm->uncommitted.bops + i; if (op->block != b) continue; @@ -209,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) { - int r, i, adjustment = 0; + int r, adjustment = 0; + unsigned i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); uint32_t rc; @@ -217,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, * We may have some uncommitted adjustments to add. This list * should always be really short. */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + + struct block_op *op = smm->uncommitted.bops + i; if (op->block != b) continue; @@ -335,9 +451,23 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) { + dm_block_t count; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + int r = sm_metadata_new_block_(sm, b); - if (r) - DMERR("out of metadata space"); + if (r) { + DMERR_LIMIT("unable to allocate new metadata block"); + return r; + } + + r = sm_metadata_get_nr_free(sm, &count); + if (r) { + DMERR_LIMIT("couldn't get free block count"); + return r; + } + + check_threshold(&smm->threshold, count); + return r; } @@ -357,6 +487,18 @@ static int sm_metadata_commit(struct dm_space_map *sm) return 0; } +static int sm_metadata_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + set_threshold(&smm->threshold, threshold, fn, context); + + return 0; +} + static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) { *result = sizeof(struct disk_sm_root); @@ -382,6 +524,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t return 0; } +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); + static struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, @@ -395,7 +539,8 @@ static struct dm_space_map ops = { .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size = sm_metadata_root_size, - .copy_root = sm_metadata_copy_root + .copy_root = sm_metadata_copy_root, + .register_threshold_callback = sm_metadata_register_threshold_callback }; /*----------------------------------------------------------------*/ @@ -410,7 +555,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm) static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - DMERR("boostrap doesn't support extend"); + DMERR("bootstrap doesn't support extend"); return -EINVAL; } @@ -450,7 +595,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { - DMERR("boostrap doesn't support set_count"); + DMERR("bootstrap doesn't support set_count"); return -EINVAL; } @@ -491,7 +636,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm) static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) { - DMERR("boostrap doesn't support root_size"); + DMERR("bootstrap doesn't support root_size"); return -EINVAL; } @@ -499,7 +644,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, size_t max) { - DMERR("boostrap doesn't support copy_root"); + DMERR("bootstrap doesn't support copy_root"); return -EINVAL; } @@ -517,11 +662,60 @@ static struct dm_space_map bootstrap_ops = { .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, - .copy_root = sm_bootstrap_copy_root + .copy_root = sm_bootstrap_copy_root, + .register_threshold_callback = NULL }; /*----------------------------------------------------------------*/ +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + int r, i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + dm_block_t old_len = smm->ll.nr_blocks; + + /* + * Flick into a mode where all blocks get allocated in the new area. + */ + smm->begin = old_len; + memcpy(sm, &bootstrap_ops, sizeof(*sm)); + + /* + * Extend. + */ + r = sm_ll_extend(&smm->ll, extra_blocks); + if (r) + goto out; + + /* + * We repeatedly increment then commit until the commit doesn't + * allocate any new blocks. + */ + do { + for (i = old_len; !r && i < smm->begin; i++) { + r = sm_ll_inc(&smm->ll, i, &ev); + if (r) + goto out; + } + old_len = smm->begin; + + r = sm_ll_commit(&smm->ll); + if (r) + goto out; + + } while (old_len != smm->begin); + +out: + /* + * Switch back to normal behaviour. + */ + memcpy(sm, &ops, sizeof(*sm)); + return r; +} + +/*----------------------------------------------------------------*/ + struct dm_space_map *dm_sm_metadata_init(void) { struct sm_metadata *smm; @@ -548,7 +742,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, smm->begin = superblock + 1; smm->recursion_count = 0; smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); @@ -556,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, if (r) return r; + if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) + nr_blocks = DM_SM_METADATA_MAX_BLOCKS; r = sm_ll_extend(&smm->ll, nr_blocks); if (r) return r; @@ -589,7 +786,8 @@ int dm_sm_metadata_open(struct dm_space_map *sm, smm->begin = 0; smm->recursion_count = 0; smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); return 0; diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h index 39bba0801cf..64df923974d 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.h +++ b/drivers/md/persistent-data/dm-space-map-metadata.h @@ -9,6 +9,17 @@ #include "dm-transaction-manager.h" +#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) + +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about ~16k metadata blocks. + */ +#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) +#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) + /* * Unfortunately we have to use two-phase construction due to the cycle * between the tm and sm. diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 1cbfc6b1638..3e6d1153b7c 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -9,6 +9,8 @@ #include "dm-block-manager.h" +typedef void (*dm_sm_threshold_fn)(void *context); + /* * struct dm_space_map keeps a record of how many times each block in a device * is referenced. It needs to be fixed on disk as part of the transaction. @@ -59,6 +61,15 @@ struct dm_space_map { */ int (*root_size)(struct dm_space_map *sm, size_t *result); int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); + + /* + * You can register one threshold callback which is edge-triggered + * when the free space in the space map drops below the threshold. + */ + int (*register_threshold_callback)(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); }; /*----------------------------------------------------------------*/ @@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le return sm->copy_root(sm, copy_to_here_le, len); } +static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + if (sm->register_threshold_callback) + return sm->register_threshold_callback(sm, threshold, fn, context); + + return -EINVAL; +} + + #endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index d247a35da3c..3bc30a0ae3d 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -25,8 +25,8 @@ struct shadow_info { /* * It would be nice if we scaled with the size of transaction. */ -#define HASH_SIZE 256 -#define HASH_MASK (HASH_SIZE - 1) +#define DM_HASH_SIZE 256 +#define DM_HASH_MASK (DM_HASH_SIZE - 1) struct dm_transaction_manager { int is_clone; @@ -36,7 +36,7 @@ struct dm_transaction_manager { struct dm_space_map *sm; spinlock_t lock; - struct hlist_head buckets[HASH_SIZE]; + struct hlist_head buckets[DM_HASH_SIZE]; }; /*----------------------------------------------------------------*/ @@ -44,12 +44,11 @@ struct dm_transaction_manager { static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) { int r = 0; - unsigned bucket = dm_hash_block(b, HASH_MASK); + unsigned bucket = dm_hash_block(b, DM_HASH_MASK); struct shadow_info *si; - struct hlist_node *n; spin_lock(&tm->lock); - hlist_for_each_entry(si, n, tm->buckets + bucket, hlist) + hlist_for_each_entry(si, tm->buckets + bucket, hlist) if (si->where == b) { r = 1; break; @@ -71,7 +70,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) si = kmalloc(sizeof(*si), GFP_NOIO); if (si) { si->where = b; - bucket = dm_hash_block(b, HASH_MASK); + bucket = dm_hash_block(b, DM_HASH_MASK); spin_lock(&tm->lock); hlist_add_head(&si->hlist, tm->buckets + bucket); spin_unlock(&tm->lock); @@ -81,14 +80,14 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) static void wipe_shadow_table(struct dm_transaction_manager *tm) { struct shadow_info *si; - struct hlist_node *n, *tmp; + struct hlist_node *tmp; struct hlist_head *bucket; int i; spin_lock(&tm->lock); - for (i = 0; i < HASH_SIZE; i++) { + for (i = 0; i < DM_HASH_SIZE; i++) { bucket = tm->buckets + i; - hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) + hlist_for_each_entry_safe(si, tmp, bucket, hlist) kfree(si); INIT_HLIST_HEAD(bucket); @@ -115,7 +114,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, tm->sm = sm; spin_lock_init(&tm->lock); - for (i = 0; i < HASH_SIZE; i++) + for (i = 0; i < DM_HASH_SIZE; i++) INIT_HLIST_HEAD(tm->buckets + i); return tm; @@ -155,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm) if (r < 0) return r; - return 0; + return dm_bm_flush(tm->bm); } EXPORT_SYMBOL_GPL(dm_tm_pre_commit); @@ -165,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) return -EWOULDBLOCK; wipe_shadow_table(tm); + dm_bm_unlock(root); - return dm_bm_flush_and_unlock(tm->bm, root); + return dm_bm_flush(tm->bm); } EXPORT_SYMBOL_GPL(dm_tm_commit); diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index b5b139076ca..2772ed2a781 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac /* * We use a 2-phase commit here. * - * i) In the first phase the block manager is told to start flushing, and - * the changes to the space map are written to disk. You should interrogate - * your particular space map to get detail of its root node etc. to be - * included in your superblock. + * i) Make all changes for the transaction *except* for the superblock. + * Then call dm_tm_pre_commit() to flush them to disk. * - * ii) @root will be committed last. You shouldn't use more than the - * first 512 bytes of @root if you wish the transaction to survive a power - * failure. You *must* have a write lock held on @root for both stage (i) - * and (ii). The commit will drop the write lock. + * ii) Lock your superblock. Update. Then call dm_tm_commit() which will + * unlock the superblock and flush it. No other blocks should be updated + * during this period. Care should be taken to never unlock a partially + * updated superblock; perform any operations that could fail *before* you + * take the superblock lock. */ int dm_tm_pre_commit(struct dm_transaction_manager *tm); -int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock); /* * These methods are the only way to get hold of a writeable block. |
