diff options
Diffstat (limited to 'drivers/md/persistent-data')
23 files changed, 2184 insertions, 921 deletions
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index ceb359050a5..0c2dec7aec2 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -1,8 +1,18 @@ config DM_PERSISTENT_DATA tristate - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM select LIBCRC32C select DM_BUFIO ---help--- Library providing immutable on-disk data structure support for device-mapper targets such as the thin provisioning target. + +config DM_DEBUG_BLOCK_STACK_TRACING + boolean "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA + select STACKTRACE + ---help--- + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning and caching. + + If unsure, say N. diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index cfa95f66223..ff528792c35 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ + dm-array.o \ + dm-bitset.o \ dm-block-manager.o \ - dm-space-map-checker.o \ dm-space-map-common.o \ dm-space-map-disk.o \ dm-space-map-metadata.o \ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c new file mode 100644 index 00000000000..1d75b1dc1e2 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.c @@ -0,0 +1,819 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-array.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "array" + +/*----------------------------------------------------------------*/ + +/* + * The array is implemented as a fully populated btree, which points to + * blocks that contain the packed values. This is more space efficient + * than just using a btree since we don't store 1 key per value. + */ +struct array_block { + __le32 csum; + __le32 max_entries; + __le32 nr_entries; + __le32 value_size; + __le64 blocknr; /* Block this node is supposed to live in. */ +} __packed; + +/*----------------------------------------------------------------*/ + +/* + * Validator methods. As usual we calculate a checksum, and also write the + * block location into the header (paranoia about ssds remapping areas by + * mistake). + */ +#define CSUM_XOR 595846735 + +static void array_block_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + + bh_le->blocknr = cpu_to_le64(dm_block_location(b)); + bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); +} + +static int array_block_check(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { + DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", + (unsigned long long) le64_to_cpu(bh_le->blocknr), + (unsigned long long) dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); + if (csum_disk != bh_le->csum) { + DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", + (unsigned) le32_to_cpu(csum_disk), + (unsigned) le32_to_cpu(bh_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator array_validator = { + .name = "array", + .prepare_for_write = array_block_prepare_for_write, + .check = array_block_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Functions for manipulating the array blocks. + */ + +/* + * Returns a pointer to a value within an array block. + * + * index - The index into _this_ specific block. + */ +static void *element_at(struct dm_array_info *info, struct array_block *ab, + unsigned index) +{ + unsigned char *entry = (unsigned char *) (ab + 1); + + entry += index * info->value_type.size; + + return entry; +} + +/* + * Utility function that calls one of the value_type methods on every value + * in an array block. + */ +static void on_entries(struct dm_array_info *info, struct array_block *ab, + void (*fn)(void *, const void *)) +{ + unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); + + for (i = 0; i < nr_entries; i++) + fn(info->value_type.context, element_at(info, ab, i)); +} + +/* + * Increment every value in an array block. + */ +static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->inc) + on_entries(info, ab, vt->inc); +} + +/* + * Decrement every value in an array block. + */ +static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->dec) + on_entries(info, ab, vt->dec); +} + +/* + * Each array block can hold this many values. + */ +static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) +{ + return (size_of_block - sizeof(struct array_block)) / value_size; +} + +/* + * Allocate a new array block. The caller will need to unlock block. + */ +static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); + if (r) + return r; + + (*ab) = dm_block_data(*block); + (*ab)->max_entries = cpu_to_le32(max_entries); + (*ab)->nr_entries = cpu_to_le32(0); + (*ab)->value_size = cpu_to_le32(info->value_type.size); + + return 0; +} + +/* + * Pad an array block out with a particular value. Every instance will + * cause an increment of the value_type. new_nr must always be more than + * the current number of entries. + */ +static void fill_ablock(struct dm_array_info *info, struct array_block *ab, + const void *value, unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i < new_nr; i++) { + if (vt->inc) + vt->inc(vt->context, value); + memcpy(element_at(info, ab, i), value, vt->size); + } + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Remove some entries from the back of an array block. Every value + * removed will be decremented. new_nr must be <= the current number of + * entries. + */ +static void trim_ablock(struct dm_array_info *info, struct array_block *ab, + unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i > new_nr; i--) + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, i - 1)); + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Read locks a block, and coerces it to an array block. The caller must + * unlock 'block' when finished. + */ +static int get_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); + if (r) + return r; + + *ab = dm_block_data(*block); + return 0; +} + +/* + * Unlocks an array block. + */ +static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) +{ + return dm_tm_unlock(info->btree_info.tm, block); +} + +/*----------------------------------------------------------------*/ + +/* + * Btree manipulation. + */ + +/* + * Looks up an array block in the btree, and then read locks it. + * + * index is the index of the index of the array_block, (ie. the array index + * / max_entries). + */ +static int lookup_ablock(struct dm_array_info *info, dm_block_t root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r; + uint64_t key = index; + __le64 block_le; + + r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); + if (r) + return r; + + return get_ablock(info, le64_to_cpu(block_le), block, ab); +} + +/* + * Insert an array block into the btree. The block is _not_ unlocked. + */ +static int insert_ablock(struct dm_array_info *info, uint64_t index, + struct dm_block *block, dm_block_t *root) +{ + __le64 block_le = cpu_to_le64(dm_block_location(block)); + + __dm_bless_for_disk(block_le); + return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); +} + +/* + * Looks up an array block in the btree. Then shadows it, and updates the + * btree to point to this new shadow. 'root' is an input/output parameter + * for both the current root block, and the new one. + */ +static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r, inc; + uint64_t key = index; + dm_block_t b; + __le64 block_le; + + /* + * lookup + */ + r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); + if (r) + return r; + b = le64_to_cpu(block_le); + + /* + * shadow + */ + r = dm_tm_shadow_block(info->btree_info.tm, b, + &array_validator, block, &inc); + if (r) + return r; + + *ab = dm_block_data(*block); + if (inc) + inc_ablock_entries(info, *ab); + + /* + * Reinsert. + * + * The shadow op will often be a noop. Only insert if it really + * copied data. + */ + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); + r = insert_ablock(info, index, *block, root); + } + + return r; +} + +/* + * Allocate an new array block, and fill it with some values. + */ +static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + unsigned block_index, uint32_t nr, + const void *value, dm_block_t *root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + return r; + + fill_ablock(info, ab, value, nr); + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + + return r; +} + +static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, + unsigned begin_block, unsigned end_block, + unsigned max_entries, const void *value, + dm_block_t *root) +{ + int r = 0; + + for (; !r && begin_block != end_block; begin_block++) + r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); + + return r; +} + +/* + * There are a bunch of functions involved with resizing an array. This + * structure holds information that commonly needed by them. Purely here + * to reduce parameter count. + */ +struct resize { + /* + * Describes the array. + */ + struct dm_array_info *info; + + /* + * The current root of the array. This gets updated. + */ + dm_block_t root; + + /* + * Metadata block size. Used to calculate the nr entries in an + * array block. + */ + size_t size_of_block; + + /* + * Maximum nr entries in an array block. + */ + unsigned max_entries; + + /* + * nr of completely full blocks in the array. + * + * 'old' refers to before the resize, 'new' after. + */ + unsigned old_nr_full_blocks, new_nr_full_blocks; + + /* + * Number of entries in the final block. 0 iff only full blocks in + * the array. + */ + unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block; + + /* + * The default value used when growing the array. + */ + const void *value; +}; + +/* + * Removes a consecutive set of array blocks from the btree. The values + * in block are decremented as a side effect of the btree remove. + * + * begin_index - the index of the first array block to remove. + * end_index - the one-past-the-end value. ie. this block is not removed. + */ +static int drop_blocks(struct resize *resize, unsigned begin_index, + unsigned end_index) +{ + int r; + + while (begin_index != end_index) { + uint64_t key = begin_index++; + r = dm_btree_remove(&resize->info->btree_info, resize->root, + &key, &resize->root); + if (r) + return r; + } + + return 0; +} + +/* + * Calculates how many blocks are needed for the array. + */ +static unsigned total_nr_blocks_needed(unsigned nr_full_blocks, + unsigned nr_entries_in_last_block) +{ + return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); +} + +/* + * Shrink an array. + */ +static int shrink(struct resize *resize) +{ + int r; + unsigned begin, end; + struct dm_block *block; + struct array_block *ab; + + /* + * Lose some blocks from the back? + */ + if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { + begin = total_nr_blocks_needed(resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block); + end = total_nr_blocks_needed(resize->old_nr_full_blocks, + resize->old_nr_entries_in_last_block); + + r = drop_blocks(resize, begin, end); + if (r) + return r; + } + + /* + * Trim the new tail block + */ + if (resize->new_nr_entries_in_last_block) { + r = shadow_ablock(resize->info, &resize->root, + resize->new_nr_full_blocks, &block, &ab); + if (r) + return r; + + trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); + unlock_ablock(resize->info, block); + } + + return 0; +} + +/* + * Grow an array. + */ +static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = shadow_ablock(resize->info, &resize->root, + resize->old_nr_full_blocks, &block, &ab); + if (r) + return r; + + fill_ablock(resize->info, ab, resize->value, new_nr_entries); + unlock_ablock(resize->info, block); + + return r; +} + +static int grow_add_tail_block(struct resize *resize) +{ + return insert_new_ablock(resize->info, resize->size_of_block, + resize->max_entries, + resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block, + resize->value, &resize->root); +} + +static int grow_needs_more_blocks(struct resize *resize) +{ + int r; + unsigned old_nr_blocks = resize->old_nr_full_blocks; + + if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + + r = grow_extend_tail_block(resize, resize->max_entries); + if (r) + return r; + } + + r = insert_full_ablocks(resize->info, resize->size_of_block, + old_nr_blocks, + resize->new_nr_full_blocks, + resize->max_entries, resize->value, + &resize->root); + if (r) + return r; + + if (resize->new_nr_entries_in_last_block) + r = grow_add_tail_block(resize); + + return r; +} + +static int grow(struct resize *resize) +{ + if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) + return grow_needs_more_blocks(resize); + + else if (resize->old_nr_entries_in_last_block) + return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); + + else + return grow_add_tail_block(resize); +} + +/*----------------------------------------------------------------*/ + +/* + * These are the value_type functions for the btree elements, which point + * to array blocks. + */ +static void block_inc(void *context, const void *value) +{ + __le64 block_le; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); +} + +static void block_dec(void *context, const void *value) +{ + int r; + uint64_t b; + __le64 block_le; + uint32_t ref_count; + struct dm_block *block; + struct array_block *ab; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + b = le64_to_cpu(block_le); + + r = dm_tm_ref(info->btree_info.tm, b, &ref_count); + if (r) { + DMERR_LIMIT("couldn't get reference count for block %llu", + (unsigned long long) b); + return; + } + + if (ref_count == 1) { + /* + * We're about to drop the last reference to this ablock. + * So we need to decrement the ref count of the contents. + */ + r = get_ablock(info, b, &block, &ab); + if (r) { + DMERR_LIMIT("couldn't get array block %llu", + (unsigned long long) b); + return; + } + + dec_ablock_entries(info, ab); + unlock_ablock(info, block); + } + + dm_tm_dec(info->btree_info.tm, b); +} + +static int block_equal(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(__le64)); +} + +/*----------------------------------------------------------------*/ + +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + struct dm_btree_value_type *bvt = &info->btree_info.value_type; + + memcpy(&info->value_type, vt, sizeof(info->value_type)); + info->btree_info.tm = tm; + info->btree_info.levels = 1; + + bvt->context = info; + bvt->size = sizeof(__le64); + bvt->inc = block_inc; + bvt->dec = block_dec; + bvt->equal = block_equal; +} +EXPORT_SYMBOL_GPL(dm_array_info_init); + +int dm_array_empty(struct dm_array_info *info, dm_block_t *root) +{ + return dm_btree_empty(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_empty); + +static int array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) +{ + int r; + struct resize resize; + + if (old_size == new_size) + return 0; + + resize.info = info; + resize.root = root; + resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + resize.max_entries = calc_max_entries(info->value_type.size, + resize.size_of_block); + + resize.old_nr_full_blocks = old_size / resize.max_entries; + resize.old_nr_entries_in_last_block = old_size % resize.max_entries; + resize.new_nr_full_blocks = new_size / resize.max_entries; + resize.new_nr_entries_in_last_block = new_size % resize.max_entries; + resize.value = value; + + r = ((new_size > old_size) ? grow : shrink)(&resize); + if (r) + return r; + + *new_root = resize.root; + return 0; +} + +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r = array_resize(info, root, old_size, new_size, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_resize); + +int dm_array_del(struct dm_array_info *info, dm_block_t root) +{ + return dm_btree_del(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_del); + +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value_le) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned entry, max_entries; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = lookup_ablock(info, root, index / max_entries, &block, &ab); + if (r) + return r; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) + r = -ENODATA; + else + memcpy(value_le, element_at(info, ab, entry), + info->value_type.size); + + unlock_ablock(info, block); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_get_value); + +static int array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned max_entries; + unsigned entry; + void *old_value; + struct dm_btree_value_type *vt = &info->value_type; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = shadow_ablock(info, &root, index / max_entries, &block, &ab); + if (r) + return r; + *new_root = root; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) { + r = -ENODATA; + goto out; + } + + old_value = element_at(info, ab, entry); + if (vt->dec && + (!vt->equal || !vt->equal(vt->context, old_value, value))) { + vt->dec(vt->context, old_value); + if (vt->inc) + vt->inc(vt->context, value); + } + + memcpy(old_value, value, info->value_type.size); + +out: + unlock_ablock(info, block); + return r; +} + +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r; + + r = array_set_value(info, root, index, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_set_value); + +struct walk_info { + struct dm_array_info *info; + int (*fn)(void *context, uint64_t key, void *leaf); + void *context; +}; + +static int walk_ablock(void *context, uint64_t *keys, void *leaf) +{ + struct walk_info *wi = context; + + int r; + unsigned i; + __le64 block_le; + unsigned nr_entries, max_entries; + struct dm_block *block; + struct array_block *ab; + + memcpy(&block_le, leaf, sizeof(block_le)); + r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); + if (r) + return r; + + max_entries = le32_to_cpu(ab->max_entries); + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = 0; i < nr_entries; i++) { + r = wi->fn(wi->context, keys[0] * max_entries + i, + element_at(wi->info, ab, i)); + + if (r) + break; + } + + unlock_ablock(wi->info, block); + return r; +} + +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *, uint64_t key, void *leaf), + void *context) +{ + struct walk_info wi; + + wi.info = info; + wi.fn = fn; + wi.context = context; + + return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); +} +EXPORT_SYMBOL_GPL(dm_array_walk); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h new file mode 100644 index 00000000000..ea177d6fa58 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_ARRAY_H +#define _LINUX_DM_ARRAY_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * The dm-array is a persistent version of an array. It packs the data + * more efficiently than a btree which will result in less disk space use, + * and a performance boost. The element get and set operations are still + * O(ln(n)), but with a much smaller constant. + * + * The value type structure is reused from the btree type to support proper + * reference counting of values. + * + * The arrays implicitly know their length, and bounds are checked for + * lookups and updated. It doesn't store this in an accessible place + * because it would waste a whole metadata block. Make sure you store the + * size along with the array root in your encompassing data. + * + * Array entries are indexed via an unsigned integer starting from zero. + * Arrays are not sparse; if you resize an array to have 'n' entries then + * 'n - 1' will be the last valid index. + * + * Typical use: + * + * a) initialise a dm_array_info structure. This describes the array + * values and ties it into a specific transaction manager. It holds no + * instance data; the same info can be used for many similar arrays if + * you wish. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an array. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty array with dm_array_empty(). + * + * Like the other data structures in this library, dm_array objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * c) resize an array with dm_array_resize(). + * + * d) Get a value from the array with dm_array_get_value(). + * + * e) Set a value in the array with dm_array_set_value(). + * + * f) Walk an array of values in index order with dm_array_walk(). More + * efficient than making many calls to dm_array_get_value(). + * + * g) Destroy the array with dm_array_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_array_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Describes an array. Don't initialise this structure yourself, use the + * init function below. + */ +struct dm_array_info { + struct dm_transaction_manager *tm; + struct dm_btree_value_type value_type; + struct dm_btree_info btree_info; +}; + +/* + * Sets up a dm_array_info structure. You don't need to do anything with + * this structure when you finish using it. + * + * info - the structure being filled in. + * tm - the transaction manager that should supervise this structure. + * vt - describes the leaf values. + */ +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + +/* + * Create an empty, zero length array. + * + * info - describes the array + * root - on success this will be filled out with the root block + */ +int dm_array_empty(struct dm_array_info *info, dm_block_t *root); + +/* + * Resizes the array. + * + * info - describes the array + * root - the root block of the array on disk + * old_size - the caller is responsible for remembering the size of + * the array + * new_size - can be bigger or smaller than old_size + * value - if we're growing the array the new entries will have this value + * new_root - on success, points to the new root block + * + * If growing the inc function for 'value' will be called the appropriate + * number of times. So if the caller is holding a reference they may want + * to drop it. + */ +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Frees a whole array. The value_type's decrement operation will be called + * for all values in the array + */ +int dm_array_del(struct dm_array_info *info, dm_block_t root); + +/* + * Lookup a value in the array + * + * info - describes the array + * root - root block of the array + * index - array index + * value - the value to be read. Will be in on-disk format of course. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value); + +/* + * Set an entry in the array. + * + * info - describes the array + * root - root block of the array + * index - array index + * value - value to be written to disk. Make sure you confirm the value is + * in on-disk format with__dm_bless_for_disk() before calling. + * new_root - the new root block + * + * The old value being overwritten will be decremented, the new value + * incremented. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Walk through all the entries in an array. + * + * info - describes the array + * root - root block of the array + * fn - called back for every element + * context - passed to the callback + */ +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t key, void *leaf), + void *context); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_ARRAY_H */ diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c new file mode 100644 index 00000000000..36f7cc2c710 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-bitset.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "bitset" +#define BITS_PER_ARRAY_ENTRY 64 + +/*----------------------------------------------------------------*/ + +static struct dm_btree_value_type bitset_bvt = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL, +}; + +/*----------------------------------------------------------------*/ + +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info) +{ + dm_array_info_init(&info->array_info, tm, &bitset_bvt); + info->current_index_set = false; +} +EXPORT_SYMBOL_GPL(dm_disk_bitset_init); + +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) +{ + return dm_array_empty(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_empty); + +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root) +{ + uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY); + uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY); + __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0); + + __dm_bless_for_disk(&value); + return dm_array_resize(&info->array_info, root, old_blocks, new_blocks, + &value, new_root); +} +EXPORT_SYMBOL_GPL(dm_bitset_resize); + +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root) +{ + return dm_array_del(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_del); + +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root) +{ + int r; + __le64 value; + + if (!info->current_index_set || !info->dirty) + return 0; + + value = cpu_to_le64(info->current_bits); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&info->array_info, root, info->current_index, + &value, new_root); + if (r) + return r; + + info->current_index_set = false; + info->dirty = false; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_flush); + +static int read_bits(struct dm_disk_bitset *info, dm_block_t root, + uint32_t array_index) +{ + int r; + __le64 value; + + r = dm_array_get_value(&info->array_info, root, array_index, &value); + if (r) + return r; + + info->current_bits = le64_to_cpu(value); + info->current_index_set = true; + info->current_index = array_index; + info->dirty = false; + + return 0; +} + +static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned array_index = index / BITS_PER_ARRAY_ENTRY; + + if (info->current_index_set) { + if (info->current_index == array_index) + return 0; + + r = dm_bitset_flush(info, root, new_root); + if (r) + return r; + } + + return read_bits(info, root, array_index); +} + +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + set_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_set_bit); + +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + clear_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); + +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + *result = test_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_test_bit); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h new file mode 100644 index 00000000000..c2287d672ef --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BITSET_H +#define _LINUX_DM_BITSET_H + +#include "dm-array.h" + +/*----------------------------------------------------------------*/ + +/* + * This bitset type is a thin wrapper round a dm_array of 64bit words. It + * uses a tiny, one word cache to reduce the number of array lookups and so + * increase performance. + * + * Like the dm-array that it's based on, the caller needs to keep track of + * the size of the bitset separately. The underlying dm-array implicitly + * knows how many words it's storing and will return -ENODATA if you try + * and access an out of bounds word. However, an out of bounds bit in the + * final word will _not_ be detected, you have been warned. + * + * Bits are indexed from zero. + + * Typical use: + * + * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init(). + * This describes the bitset and includes the cache. It's not called it + * dm_bitset_info in line with other data structures because it does + * include instance data. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an bitset. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty bitset with dm_bitset_empty(). + * + * Like the other data structures in this library, dm_bitset objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * Even read operations may trigger the cache to be flushed and as such + * return a root for a new, updated bitset. + * + * c) resize a bitset with dm_bitset_resize(). + * + * d) Set a bit with dm_bitset_set_bit(). + * + * e) Clear a bit with dm_bitset_clear_bit(). + * + * f) Test a bit with dm_bitset_test_bit(). + * + * g) Flush all updates from the cache with dm_bitset_flush(). + * + * h) Destroy the bitset with dm_bitset_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_bitset_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Opaque object. Unlike dm_array_info, you should have one of these per + * bitset. Initialise with dm_disk_bitset_init(). + */ +struct dm_disk_bitset { + struct dm_array_info array_info; + + uint32_t current_index; + uint64_t current_bits; + + bool current_index_set:1; + bool dirty:1; +}; + +/* + * Sets up a dm_disk_bitset structure. You don't need to do anything with + * this structure when you finish using it. + * + * tm - the transaction manager that should supervise this structure + * info - the structure being initialised + */ +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info); + +/* + * Create an empty, zero length bitset. + * + * info - describes the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); + +/* + * Resize the bitset. + * + * info - describes the bitset + * old_root - the root block of the array on disk + * old_nr_entries - the number of bits in the old bitset + * new_nr_entries - the number of bits you want in the new bitset + * default_value - the value for any new bits + * new_root - on success, points to the new root block + */ +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root); + +/* + * Frees the bitset. + */ +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root); + +/* + * Set a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Clears a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Tests a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block (cached values may have been written) + * result - the bit value you're after + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result); + +/* + * Flush any cached changes to disk. + * + * info - describes the bitset + * root - the root block of the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 0317ecdc6e5..087411c95ff 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -25,7 +25,7 @@ * may be held at once. This is just an implementation detail. * * ii) Recursive locking attempts are detected and return EINVAL. A stack - * trace is also emitted for the previous lock aquisition. + * trace is also emitted for the previous lock acquisition. * * iii) Priority is given to write locks. */ @@ -104,12 +104,12 @@ static int __check_holder(struct block_lock *lock) for (i = 0; i < MAX_HOLDERS; i++) { if (lock->holders[i] == current) { - DMERR("recursive lock detected in pool metadata"); + DMERR("recursive lock detected in metadata"); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING DMERR("previously held here:"); print_stack_trace(lock->traces + i, 4); - DMERR("subsequent aquisition attempted here:"); + DMERR("subsequent acquisition attempted here:"); t.nr_entries = 0; t.max_entries = MAX_STACK; t.entries = entries; @@ -325,11 +325,6 @@ static struct dm_buffer *to_buffer(struct dm_block *b) return (struct dm_buffer *) b; } -static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm) -{ - return (struct dm_bufio_client *) bm; -} - dm_block_t dm_block_location(struct dm_block *b) { return dm_bufio_get_block_number(to_buffer(b)); @@ -367,34 +362,60 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf) /*---------------------------------------------------------------- * Public interface *--------------------------------------------------------------*/ +struct dm_block_manager { + struct dm_bufio_client *bufio; + bool read_only:1; +}; + struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, unsigned block_size, unsigned cache_size, unsigned max_held_per_thread) { - return (struct dm_block_manager *) - dm_bufio_client_create(bdev, block_size, max_held_per_thread, - sizeof(struct buffer_aux), - dm_block_manager_alloc_callback, - dm_block_manager_write_callback); + int r; + struct dm_block_manager *bm; + + bm = kmalloc(sizeof(*bm), GFP_KERNEL); + if (!bm) { + r = -ENOMEM; + goto bad; + } + + bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, + sizeof(struct buffer_aux), + dm_block_manager_alloc_callback, + dm_block_manager_write_callback); + if (IS_ERR(bm->bufio)) { + r = PTR_ERR(bm->bufio); + kfree(bm); + goto bad; + } + + bm->read_only = false; + + return bm; + +bad: + return ERR_PTR(r); } EXPORT_SYMBOL_GPL(dm_block_manager_create); void dm_block_manager_destroy(struct dm_block_manager *bm) { - return dm_bufio_client_destroy(to_bufio(bm)); + dm_bufio_client_destroy(bm->bufio); + kfree(bm); } EXPORT_SYMBOL_GPL(dm_block_manager_destroy); unsigned dm_bm_block_size(struct dm_block_manager *bm) { - return dm_bufio_get_block_size(to_bufio(bm)); + return dm_bufio_get_block_size(bm->bufio); } EXPORT_SYMBOL_GPL(dm_bm_block_size); dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) { - return dm_bufio_get_device_size(to_bufio(bm)); + return dm_bufio_get_device_size(bm->bufio); } static int dm_bm_validate_buffer(struct dm_block_manager *bm, @@ -406,16 +427,18 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm, int r; if (!v) return 0; - r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); - if (unlikely(r)) + r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio)); + if (unlikely(r)) { + DMERR_LIMIT("%s validator check failed for block %llu", v->name, + (unsigned long long) dm_bufio_get_block_number(buf)); return r; + } aux->validator = v; } else { if (unlikely(aux->validator != v)) { - DMERR("validator mismatch (old=%s vs new=%s) for block %llu", - aux->validator->name, v ? v->name : "NULL", - (unsigned long long) - dm_bufio_get_block_number(buf)); + DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu", + aux->validator->name, v ? v->name : "NULL", + (unsigned long long) dm_bufio_get_block_number(buf)); return -EINVAL; } } @@ -430,7 +453,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, void *p; int r; - p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); + p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); if (unlikely(IS_ERR(p))) return PTR_ERR(p); @@ -463,7 +486,10 @@ int dm_bm_write_lock(struct dm_block_manager *bm, void *p; int r; - p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); + if (bm->read_only) + return -EPERM; + + p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); if (unlikely(IS_ERR(p))) return PTR_ERR(p); @@ -496,7 +522,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, void *p; int r; - p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); + p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); if (unlikely(IS_ERR(p))) return PTR_ERR(p); if (unlikely(!p)) @@ -529,7 +555,10 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, struct buffer_aux *aux; void *p; - p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); + if (bm->read_only) + return -EPERM; + + p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); if (unlikely(IS_ERR(p))) return PTR_ERR(p); @@ -547,6 +576,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, return 0; } +EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero); int dm_bm_unlock(struct dm_block *b) { @@ -565,45 +595,31 @@ int dm_bm_unlock(struct dm_block *b) } EXPORT_SYMBOL_GPL(dm_bm_unlock); -int dm_bm_unlock_move(struct dm_block *b, dm_block_t n) +int dm_bm_flush(struct dm_block_manager *bm) { - struct buffer_aux *aux; - - aux = dm_bufio_get_aux_data(to_buffer(b)); + if (bm->read_only) + return -EPERM; - if (aux->write_locked) { - dm_bufio_mark_buffer_dirty(to_buffer(b)); - bl_up_write(&aux->lock); - } else - bl_up_read(&aux->lock); - - dm_bufio_release_move(to_buffer(b), n); - return 0; + return dm_bufio_write_dirty_buffers(bm->bufio); } +EXPORT_SYMBOL_GPL(dm_bm_flush); -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock) +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) { - int r; - - r = dm_bufio_write_dirty_buffers(to_bufio(bm)); - if (unlikely(r)) - return r; - r = dm_bufio_issue_flush(to_bufio(bm)); - if (unlikely(r)) - return r; - - dm_bm_unlock(superblock); + dm_bufio_prefetch(bm->bufio, b, 1); +} - r = dm_bufio_write_dirty_buffers(to_bufio(bm)); - if (unlikely(r)) - return r; - r = dm_bufio_issue_flush(to_bufio(bm)); - if (unlikely(r)) - return r; +void dm_bm_set_read_only(struct dm_block_manager *bm) +{ + bm->read_only = true; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_only); - return 0; +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + bm->read_only = false; } +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) { diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 924833d2dfa..1b95dfc1778 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -97,14 +97,6 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, int dm_bm_unlock(struct dm_block *b); /* - * An optimisation; we often want to copy a block's contents to a new - * block. eg, as part of the shadowing operation. It's far better for - * bufio to do this move behind the scenes than hold 2 locks and memcpy the - * data. - */ -int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); - -/* * It's a common idiom to have a superblock that should be committed last. * * @superblock should be write-locked on entry. It will be unlocked during @@ -113,8 +105,26 @@ int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); * * This method always blocks. */ -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock); +int dm_bm_flush(struct dm_block_manager *bm); + +/* + * Request data is prefetched into the cache. + */ +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); + +/* + * Switches the bm to a read only mode. Once read-only mode + * has been entered the following functions will return -EPERM. + * + * dm_bm_write_lock + * dm_bm_write_lock_zero + * dm_bm_flush_and_unlock + * + * Additionally you should not use dm_bm_unlock_move, however no error will + * be returned if you do. + */ +void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index d279c768f8f..37d367bb9aa 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -36,13 +36,13 @@ struct node_header { __le32 padding; } __packed; -struct node { +struct btree_node { struct node_header header; __le64 keys[0]; } __packed; -void inc_children(struct dm_transaction_manager *tm, struct node *n, +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt); int new_block(struct dm_btree_info *info, struct dm_block **result); @@ -64,7 +64,8 @@ struct ro_spine { void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); int exit_ro_spine(struct ro_spine *s); int ro_step(struct ro_spine *s, dm_block_t new_child); -struct node *ro_node(struct ro_spine *s); +void ro_pop(struct ro_spine *s); +struct btree_node *ro_node(struct ro_spine *s); struct shadow_spine { struct dm_btree_info *info; @@ -98,29 +99,26 @@ int shadow_root(struct shadow_spine *s); /* * Some inlines. */ -static inline __le64 *key_ptr(struct node *n, uint32_t index) +static inline __le64 *key_ptr(struct btree_node *n, uint32_t index) { return n->keys + index; } -static inline void *value_base(struct node *n) +static inline void *value_base(struct btree_node *n) { return &n->keys[le32_to_cpu(n->header.max_entries)]; } -/* - * FIXME: Now that value size is stored in node we don't need the third parm. - */ -static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size) +static inline void *value_ptr(struct btree_node *n, uint32_t index) { - BUG_ON(value_size != le32_to_cpu(n->header.value_size)); + uint32_t value_size = le32_to_cpu(n->header.value_size); return value_base(n) + (value_size * index); } /* * Assumes the values are suitably-aligned and converts to core format. */ -static inline uint64_t value64(struct node *n, uint32_t index) +static inline uint64_t value64(struct btree_node *n, uint32_t index) { __le64 *values_le = value_base(n); @@ -130,7 +128,7 @@ static inline uint64_t value64(struct node *n, uint32_t index) /* * Searching for a key within a single node. */ -int lower_bound(struct node *n, uint64_t key); +int lower_bound(struct btree_node *n, uint64_t key); extern struct dm_block_validator btree_node_validator; diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 023fbc2d389..b88757cd0d1 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -53,7 +53,7 @@ /* * Some little utilities for moving node data around. */ -static void node_shift(struct node *n, int shift) +static void node_shift(struct btree_node *n, int shift) { uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); uint32_t value_size = le32_to_cpu(n->header.value_size); @@ -61,25 +61,25 @@ static void node_shift(struct node *n, int shift) if (shift < 0) { shift = -shift; BUG_ON(shift > nr_entries); - BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); + BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); memmove(key_ptr(n, 0), key_ptr(n, shift), (nr_entries - shift) * sizeof(__le64)); - memmove(value_ptr(n, 0, value_size), - value_ptr(n, shift, value_size), + memmove(value_ptr(n, 0), + value_ptr(n, shift), (nr_entries - shift) * value_size); } else { BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); memmove(key_ptr(n, shift), key_ptr(n, 0), nr_entries * sizeof(__le64)); - memmove(value_ptr(n, shift, value_size), - value_ptr(n, 0, value_size), + memmove(value_ptr(n, shift), + value_ptr(n, 0), nr_entries * value_size); } } -static void node_copy(struct node *left, struct node *right, int shift) +static void node_copy(struct btree_node *left, struct btree_node *right, int shift) { uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t value_size = le32_to_cpu(left->header.value_size); @@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift) memcpy(key_ptr(left, nr_left), key_ptr(right, 0), shift * sizeof(__le64)); - memcpy(value_ptr(left, nr_left, value_size), - value_ptr(right, 0, value_size), + memcpy(value_ptr(left, nr_left), + value_ptr(right, 0), shift * value_size); } else { BUG_ON(shift > le32_to_cpu(right->header.max_entries)); memcpy(key_ptr(right, 0), key_ptr(left, nr_left - shift), shift * sizeof(__le64)); - memcpy(value_ptr(right, 0, value_size), - value_ptr(left, nr_left - shift, value_size), + memcpy(value_ptr(right, 0), + value_ptr(left, nr_left - shift), shift * value_size); } } @@ -108,7 +108,7 @@ static void node_copy(struct node *left, struct node *right, int shift) /* * Delete a specific entry from a leaf node. */ -static void delete_at(struct node *n, unsigned index) +static void delete_at(struct btree_node *n, unsigned index) { unsigned nr_entries = le32_to_cpu(n->header.nr_entries); unsigned nr_to_copy = nr_entries - (index + 1); @@ -120,43 +120,27 @@ static void delete_at(struct node *n, unsigned index) key_ptr(n, index + 1), nr_to_copy * sizeof(__le64)); - memmove(value_ptr(n, index, value_size), - value_ptr(n, index + 1, value_size), + memmove(value_ptr(n, index), + value_ptr(n, index + 1), nr_to_copy * value_size); } n->header.nr_entries = cpu_to_le32(nr_entries - 1); } -static unsigned del_threshold(struct node *n) +static unsigned merge_threshold(struct btree_node *n) { return le32_to_cpu(n->header.max_entries) / 3; } -static unsigned merge_threshold(struct node *n) -{ - /* - * The extra one is because we know we're potentially going to - * delete an entry. - */ - return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1; -} - struct child { unsigned index; struct dm_block *block; - struct node *n; + struct btree_node *n; }; -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, struct child *result) { int r, inc; @@ -173,9 +157,9 @@ static int init_child(struct dm_btree_info *info, struct node *parent, result->n = dm_block_data(result->block); if (inc) - inc_children(info->tm, result->n, &le64_type); + inc_children(info->tm, result->n, vt); - *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = + *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); return 0; @@ -186,8 +170,17 @@ static int exit_child(struct dm_btree_info *info, struct child *c) return dm_tm_unlock(info->tm, c->block); } -static void shift(struct node *left, struct node *right, int count) +static void shift(struct btree_node *left, struct btree_node *right, int count) { + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); + + BUG_ON(max_entries != r_max_entries); + BUG_ON(nr_left - count > max_entries); + BUG_ON(nr_right + count > max_entries); + if (!count) return; @@ -199,24 +192,20 @@ static void shift(struct node *left, struct node *right, int count) node_shift(right, count); } - left->header.nr_entries = - cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); - BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries)); - - right->header.nr_entries = - cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count); - BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries)); + left->header.nr_entries = cpu_to_le32(nr_left - count); + right->header.nr_entries = cpu_to_le32(nr_right + count); } -static void __rebalance2(struct dm_btree_info *info, struct node *parent, +static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, struct child *l, struct child *r) { - struct node *left = l->n; - struct node *right = r->n; + struct btree_node *left = l->n; + struct btree_node *right = r->n; uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + unsigned threshold = 2 * merge_threshold(left) + 1; - if (nr_left + nr_right <= merge_threshold(left)) { + if (nr_left + nr_right < threshold) { /* * Merge */ @@ -234,28 +223,25 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, * Rebalance. */ unsigned target_left = (nr_left + nr_right) / 2; - unsigned shift_ = nr_left - target_left; - BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_); - BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_); shift(left, right, nr_left - target_left); *key_ptr(parent, r->index) = right->keys[0]; } } static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; - struct node *parent; + struct btree_node *parent; struct child left, right; parent = dm_block_data(shadow_current(s)); - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, &right); + r = init_child(info, vt, parent, left_index + 1, &right); if (r) { exit_child(info, &left); return r; @@ -272,95 +258,129 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, return exit_child(info, &right); } -static void __rebalance3(struct dm_btree_info *info, struct node *parent, - struct child *l, struct child *c, struct child *r) +/* + * We dump as many entries from center as possible into left, then the rest + * in right, then rebalance2. This wastes some cpu, but I want something + * simple atm. + */ +static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r, + struct btree_node *left, struct btree_node *center, struct btree_node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) { - struct node *left = l->n; - struct node *center = c->n; - struct node *right = r->n; - - uint32_t nr_left = le32_to_cpu(left->header.nr_entries); - uint32_t nr_center = le32_to_cpu(center->header.nr_entries); - uint32_t nr_right = le32_to_cpu(right->header.nr_entries); uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned shift = min(max_entries - nr_left, nr_center); + + BUG_ON(nr_left + shift > max_entries); + node_copy(left, center, -shift); + left->header.nr_entries = cpu_to_le32(nr_left + shift); + + if (shift != nr_center) { + shift = nr_center - shift; + BUG_ON((nr_right + shift) > max_entries); + node_shift(right, shift); + node_copy(center, right, shift); + right->header.nr_entries = cpu_to_le32(nr_right + shift); + } + *key_ptr(parent, r->index) = right->keys[0]; - unsigned target; + delete_at(parent, c->index); + r->index--; - BUG_ON(left->header.max_entries != center->header.max_entries); - BUG_ON(center->header.max_entries != right->header.max_entries); - - if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { - /* - * Delete center node: - * - * We dump as many entries from center as possible into - * left, then the rest in right, then rebalance2. This - * wastes some cpu, but I want something simple atm. - */ - unsigned shift = min(max_entries - nr_left, nr_center); - - BUG_ON(nr_left + shift > max_entries); - node_copy(left, center, -shift); - left->header.nr_entries = cpu_to_le32(nr_left + shift); - - if (shift != nr_center) { - shift = nr_center - shift; - BUG_ON((nr_right + shift) >= max_entries); - node_shift(right, shift); - node_copy(center, right, shift); - right->header.nr_entries = cpu_to_le32(nr_right + shift); - } - *key_ptr(parent, r->index) = right->keys[0]; + dm_tm_dec(info->tm, dm_block_location(c->block)); + __rebalance2(info, parent, l, r); +} - delete_at(parent, c->index); - r->index--; +/* + * Redistributes entries among 3 sibling nodes. + */ +static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r, + struct btree_node *left, struct btree_node *center, struct btree_node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + int s; + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned target = (nr_left + nr_center + nr_right) / 3; + BUG_ON(target > max_entries); - dm_tm_dec(info->tm, dm_block_location(c->block)); - __rebalance2(info, parent, l, r); + if (nr_left < nr_right) { + s = nr_left - target; - return; - } + if (s < 0 && nr_center < -s) { + /* not enough in central node */ + shift(left, center, nr_center); + s = nr_center - target; + shift(left, right, s); + nr_right += s; + } else + shift(left, center, s); - /* - * Rebalance - */ - target = (nr_left + nr_center + nr_right) / 3; - BUG_ON(target > max_entries); + shift(center, right, target - nr_right); - /* - * Adjust the left node - */ - shift(left, center, nr_left - target); + } else { + s = target - nr_right; + if (s > 0 && nr_center < s) { + /* not enough in central node */ + shift(center, right, nr_center); + s = target - nr_center; + shift(left, right, s); + nr_left -= s; + } else + shift(center, right, s); + + shift(left, center, nr_left - target); + } - /* - * Adjust the right node - */ - shift(center, right, target - nr_right); *key_ptr(parent, c->index) = center->keys[0]; *key_ptr(parent, r->index) = right->keys[0]; } +static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r) +{ + struct btree_node *left = l->n; + struct btree_node *center = c->n; + struct btree_node *right = r->n; + + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_center = le32_to_cpu(center->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + + unsigned threshold = merge_threshold(left) * 4 + 1; + + BUG_ON(left->header.max_entries != center->header.max_entries); + BUG_ON(center->header.max_entries != right->header.max_entries); + + if ((nr_left + nr_center + nr_right) < threshold) + delete_center_node(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); + else + redistribute3(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); +} + static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; - struct node *parent = dm_block_data(shadow_current(s)); + struct btree_node *parent = dm_block_data(shadow_current(s)); struct child left, center, right; /* * FIXME: fill out an array? */ - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, ¢er); + r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } - r = init_child(info, parent, left_index + 2, &right); + r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); @@ -394,7 +414,7 @@ static int get_nr_entries(struct dm_transaction_manager *tm, { int r; struct dm_block *block; - struct node *n; + struct btree_node *n; r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); if (r) @@ -407,11 +427,12 @@ static int get_nr_entries(struct dm_transaction_manager *tm, } static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; uint32_t child_entries; - struct node *n; + struct btree_node *n; n = dm_block_data(shadow_current(s)); @@ -441,25 +462,22 @@ static int rebalance_children(struct shadow_spine *s, if (r) return r; - if (child_entries > del_threshold(n)) - return 0; - has_left_sibling = i > 0; has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) - r = rebalance2(s, info, i); + r = rebalance2(s, info, vt, i); else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); + r = rebalance2(s, info, vt, i - 1); else - r = rebalance3(s, info, i - 1); + r = rebalance3(s, info, vt, i - 1); return r; } -static int do_leaf(struct node *n, uint64_t key, unsigned *index) +static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index) { int i = lower_bound(n, key); @@ -482,7 +500,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, uint64_t key, unsigned *index) { int i = *index, r; - struct node *n; + struct btree_node *n; for (;;) { r = shadow_step(s, root, vt); @@ -496,7 +514,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, */ if (shadow_has_parent(s)) { __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); - memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), &location, sizeof(__le64)); } @@ -505,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); - r = rebalance_children(s, info, key); + r = rebalance_children(s, info, vt, key); if (r) break; @@ -526,13 +544,21 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { unsigned level, last_level = info->levels - 1; int index = 0, r = 0; struct shadow_spine spine; - struct node *n; + struct btree_node *n; init_shadow_spine(&spine, info); for (level = 0; level < info->levels; level++) { @@ -553,7 +579,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index, info->value_type.size)); + value_ptr(n, index)); delete_at(n, index); } diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index d9a7912ee8e..cf9fd676ae4 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -23,7 +23,7 @@ static void node_prepare_for_write(struct dm_block_validator *v, struct dm_block *b, size_t block_size) { - struct node *n = dm_block_data(b); + struct btree_node *n = dm_block_data(b); struct node_header *h = &n->header; h->blocknr = cpu_to_le64(dm_block_location(b)); @@ -38,15 +38,15 @@ static int node_check(struct dm_block_validator *v, struct dm_block *b, size_t block_size) { - struct node *n = dm_block_data(b); + struct btree_node *n = dm_block_data(b); struct node_header *h = &n->header; size_t value_size; __le32 csum_disk; uint32_t flags; if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { - DMERR("node_check failed blocknr %llu wanted %llu", - le64_to_cpu(h->blocknr), dm_block_location(b)); + DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(h->blocknr), dm_block_location(b)); return -ENOTBLK; } @@ -54,8 +54,8 @@ static int node_check(struct dm_block_validator *v, block_size - sizeof(__le32), BTREE_CSUM_XOR)); if (csum_disk != h->csum) { - DMERR("node_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); + DMERR_LIMIT("node_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); return -EILSEQ; } @@ -63,12 +63,12 @@ static int node_check(struct dm_block_validator *v, if (sizeof(struct node_header) + (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) { - DMERR("node_check failed: max_entries too large"); + DMERR_LIMIT("node_check failed: max_entries too large"); return -EILSEQ; } if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) { - DMERR("node_check failed, too many entries"); + DMERR_LIMIT("node_check failed: too many entries"); return -EILSEQ; } @@ -77,7 +77,7 @@ static int node_check(struct dm_block_validator *v, */ flags = le32_to_cpu(h->flags); if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { - DMERR("node_check failed, node is neither INTERNAL or LEAF"); + DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF"); return -EILSEQ; } @@ -164,7 +164,14 @@ int ro_step(struct ro_spine *s, dm_block_t new_child) return r; } -struct node *ro_node(struct ro_spine *s) +void ro_pop(struct ro_spine *s) +{ + BUG_ON(!s->count); + --s->count; + unlock_block(s->info, s->nodes[s->count]); +} + +struct btree_node *ro_node(struct ro_spine *s) { struct dm_block *block; diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index bd1e7ffbe26..416060c2570 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -38,7 +38,7 @@ static void array_insert(void *base, size_t elt_size, unsigned nr_elts, /*----------------------------------------------------------------*/ /* makes the assumption that no two keys are the same. */ -static int bsearch(struct node *n, uint64_t key, int want_hi) +static int bsearch(struct btree_node *n, uint64_t key, int want_hi) { int lo = -1, hi = le32_to_cpu(n->header.nr_entries); @@ -58,12 +58,12 @@ static int bsearch(struct node *n, uint64_t key, int want_hi) return want_hi ? hi : lo; } -int lower_bound(struct node *n, uint64_t key) +int lower_bound(struct btree_node *n, uint64_t key) { return bsearch(n, key, 0); } -void inc_children(struct dm_transaction_manager *tm, struct node *n, +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { unsigned i; @@ -74,11 +74,10 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n, dm_tm_inc(tm, value64(n, i)); else if (vt->inc) for (i = 0; i < nr_entries; i++) - vt->inc(vt->context, - value_ptr(n, i, vt->size)); + vt->inc(vt->context, value_ptr(n, i)); } -static int insert_at(size_t value_size, struct node *node, unsigned index, +static int insert_at(size_t value_size, struct btree_node *node, unsigned index, uint64_t key, void *value) __dm_written_to_disk(value) { @@ -123,7 +122,7 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) { int r; struct dm_block *b; - struct node *n; + struct btree_node *n; size_t block_size; uint32_t max_entries; @@ -155,13 +154,14 @@ EXPORT_SYMBOL_GPL(dm_btree_empty); #define MAX_SPINE_DEPTH 64 struct frame { struct dm_block *b; - struct node *n; + struct btree_node *n; unsigned level; unsigned nr_children; unsigned current_child; }; struct del_stack { + struct dm_btree_info *info; struct dm_transaction_manager *tm; int top; struct frame spine[MAX_SPINE_DEPTH]; @@ -184,6 +184,20 @@ static int unprocessed_frames(struct del_stack *s) return s->top >= 0; } +static void prefetch_children(struct del_stack *s, struct frame *f) +{ + unsigned i; + struct dm_block_manager *bm = dm_tm_get_bm(s->tm); + + for (i = 0; i < f->nr_children; i++) + dm_bm_prefetch(bm, value64(f->n, i)); +} + +static bool is_internal_level(struct dm_btree_info *info, struct frame *f) +{ + return f->level < (info->levels - 1); +} + static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) { int r; @@ -206,6 +220,7 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) dm_tm_dec(s->tm, b); else { + uint32_t flags; struct frame *f = s->spine + ++s->top; r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); @@ -218,6 +233,10 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) f->level = level; f->nr_children = le32_to_cpu(f->n->header.nr_entries); f->current_child = 0; + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE || is_internal_level(s->info, f)) + prefetch_children(s, f); } return 0; @@ -239,10 +258,11 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; + s->info = info; s->tm = info->tm; s->top = -1; - r = push_frame(s, root, 1); + r = push_frame(s, root, 0); if (r) goto out; @@ -268,7 +288,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) if (r) goto out; - } else if (f->level != (info->levels - 1)) { + } else if (is_internal_level(info, f)) { b = value64(f->n, f->current_child); f->current_child++; r = push_frame(s, b, f->level + 1); @@ -281,9 +301,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) for (i = 0; i < f->nr_children; i++) info->value_type.dec(info->value_type.context, - value_ptr(f->n, i, info->value_type.size)); + value_ptr(f->n, i)); } - f->current_child = f->nr_children; + pop_frame(s); } } @@ -296,7 +316,7 @@ EXPORT_SYMBOL_GPL(dm_btree_del); /*----------------------------------------------------------------*/ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, - int (*search_fn)(struct node *, uint64_t), + int (*search_fn)(struct btree_node *, uint64_t), uint64_t *result_key, void *v, size_t value_size) { int i, r; @@ -320,7 +340,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, } while (!(flags & LEAF_NODE)); *result_key = le64_to_cpu(ro_node(s)->keys[i]); - memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); + memcpy(v, value_ptr(ro_node(s), i), value_size); return 0; } @@ -407,7 +427,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, size_t size; unsigned nr_left, nr_right; struct dm_block *left, *right, *parent; - struct node *ln, *rn, *pn; + struct btree_node *ln, *rn, *pn; __le64 location; left = shadow_current(s); @@ -432,7 +452,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? sizeof(uint64_t) : s->info->value_type.size; - memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), + memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), size * nr_right); /* @@ -443,7 +463,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, pn = dm_block_data(parent); location = cpu_to_le64(dm_block_location(left)); __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), + memcpy_disk(value_ptr(pn, parent_index), &location, sizeof(__le64)); location = cpu_to_le64(dm_block_location(right)); @@ -492,7 +512,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) size_t size; unsigned nr_left, nr_right; struct dm_block *left, *right, *new_parent; - struct node *pn, *ln, *rn; + struct btree_node *pn, *ln, *rn; __le64 val; new_parent = shadow_current(s); @@ -529,8 +549,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? sizeof(__le64) : s->info->value_type.size; - memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); - memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), nr_right * size); /* new_parent should just point to l and r now */ @@ -545,12 +565,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) val = cpu_to_le64(dm_block_location(left)); __dm_bless_for_disk(&val); pn->keys[0] = ln->keys[0]; - memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); + memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); val = cpu_to_le64(dm_block_location(right)); __dm_bless_for_disk(&val); pn->keys[1] = rn->keys[0]; - memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); + memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); /* * rejig the spine. This is ugly, since it knows too @@ -577,7 +597,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, uint64_t key, unsigned *index) { int r, i = *index, top = 1; - struct node *node; + struct btree_node *node; for (;;) { r = shadow_step(s, root, vt); @@ -595,7 +615,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), &location, sizeof(__le64)); } @@ -644,7 +664,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, unsigned level, index = -1, last_level = info->levels - 1; dm_block_t block = root; struct shadow_spine spine; - struct node *n; + struct btree_node *n; struct dm_btree_value_type le64_type; le64_type.context = NULL; @@ -710,12 +730,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root, (!info->value_type.equal || !info->value_type.equal( info->value_type.context, - value_ptr(n, index, info->value_type.size), + value_ptr(n, index), value))) { info->value_type.dec(info->value_type.context, - value_ptr(n, index, info->value_type.size)); + value_ptr(n, index)); } - memcpy_disk(value_ptr(n, index, info->value_type.size), + memcpy_disk(value_ptr(n, index), value, info->value_type.size); } @@ -750,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify); /*----------------------------------------------------------------*/ -static int find_highest_key(struct ro_spine *s, dm_block_t block, - uint64_t *result_key, dm_block_t *next_block) +static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, + uint64_t *result_key, dm_block_t *next_block) { int i, r; uint32_t flags; @@ -768,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, else i--; - *result_key = le64_to_cpu(ro_node(s)->keys[i]); + if (find_highest) + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + else + *result_key = le64_to_cpu(ro_node(s)->keys[0]); + if (next_block || flags & INTERNAL_NODE) block = value64(ro_node(s), i); @@ -779,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, return 0; } -int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, - uint64_t *result_keys) +static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, + bool find_highest, uint64_t *result_keys) { int r = 0, count = 0, level; struct ro_spine spine; init_ro_spine(&spine, info); for (level = 0; level < info->levels; level++) { - r = find_highest_key(&spine, root, result_keys + level, - level == info->levels - 1 ? NULL : &root); + r = find_key(&spine, root, find_highest, result_keys + level, + level == info->levels - 1 ? NULL : &root); if (r == -ENODATA) { r = 0; break; @@ -802,4 +826,71 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, return r ? r : count; } + +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, true, result_keys); +} EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); + +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, false, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); + +/*----------------------------------------------------------------*/ + +/* + * FIXME: We shouldn't use a recursive algorithm when we have limited stack + * space. Also this only works for single level trees. + */ +static int walk_node(struct ro_spine *s, dm_block_t block, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + unsigned i, nr; + struct btree_node *n; + uint64_t keys; + + r = ro_step(s, block); + n = ro_node(s); + + nr = le32_to_cpu(n->header.nr_entries); + for (i = 0; i < nr; i++) { + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { + r = walk_node(s, value64(n, i), fn, context); + if (r) + goto out; + } else { + keys = le64_to_cpu(*key_ptr(n, i)); + r = fn(context, &keys, value_ptr(n, i)); + if (r) + goto out; + } + } + +out: + ro_pop(s); + return r; +} + +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + struct ro_spine spine; + + BUG_ON(info->levels > 1); + + init_ro_spine(&spine, info); + r = walk_node(&spine, root, fn, context); + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index ae02c84410f..dacfc34180b 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -35,7 +35,7 @@ struct dm_transaction_manager; */ /* - * Infomation about the values stored within the btree. + * Information about the values stored within the btree. */ struct dm_btree_value_type { void *context; @@ -58,21 +58,21 @@ struct dm_btree_value_type { * somewhere.) This method is _not_ called for insertion of a new * value: It is assumed the ref count is already 1. */ - void (*inc)(void *context, void *value); + void (*inc)(void *context, const void *value); /* * This value is being deleted. The btree takes care of freeing * the memory pointed to by @value. Often the del function just * needs to decrement a reference count somewhere. */ - void (*dec)(void *context, void *value); + void (*dec)(void *context, const void *value); /* * A test for equality between two values. When a value is * overwritten with a new one, the old one has the dec method * called _unless_ the new and old value are deemed equal. */ - int (*equal)(void *context, void *value1, void *value2); + int (*equal)(void *context, const void *value1, const void *value2); }; /* @@ -137,9 +137,26 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, /* * Returns < 0 on failure. Otherwise the number of key entries that have * been filled out. Remember trees can have zero entries, and as such have + * no lowest key. + */ +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. Remember trees can have zero entries, and as such have * no highest key. */ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, uint64_t *result_keys); +/* + * Iterate through the a btree, calling fn() on each entry. + * It only works for single level trees and is internally recursive, so + * monitor stack usage carefully. + */ +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context); + #endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c deleted file mode 100644 index 50ed53bf4aa..00000000000 --- a/drivers/md/persistent-data/dm-space-map-checker.c +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-space-map-checker.h" - -#include <linux/device-mapper.h> -#include <linux/export.h> - -#ifdef CONFIG_DM_DEBUG_SPACE_MAPS - -#define DM_MSG_PREFIX "space map checker" - -/*----------------------------------------------------------------*/ - -struct count_array { - dm_block_t nr; - dm_block_t nr_free; - - uint32_t *counts; -}; - -static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count) -{ - if (b >= ca->nr) - return -EINVAL; - - *count = ca->counts[b]; - return 0; -} - -static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r) -{ - if (b >= ca->nr) - return -EINVAL; - - *r = ca->counts[b] > 1; - return 0; -} - -static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count) -{ - uint32_t old_count; - - if (b >= ca->nr) - return -EINVAL; - - old_count = ca->counts[b]; - - if (!count && old_count) - ca->nr_free++; - - else if (count && !old_count) - ca->nr_free--; - - ca->counts[b] = count; - return 0; -} - -static int ca_inc_block(struct count_array *ca, dm_block_t b) -{ - if (b >= ca->nr) - return -EINVAL; - - ca_set_count(ca, b, ca->counts[b] + 1); - return 0; -} - -static int ca_dec_block(struct count_array *ca, dm_block_t b) -{ - if (b >= ca->nr) - return -EINVAL; - - BUG_ON(ca->counts[b] == 0); - ca_set_count(ca, b, ca->counts[b] - 1); - return 0; -} - -static int ca_create(struct count_array *ca, struct dm_space_map *sm) -{ - int r; - dm_block_t nr_blocks; - - r = dm_sm_get_nr_blocks(sm, &nr_blocks); - if (r) - return r; - - ca->nr = nr_blocks; - ca->nr_free = nr_blocks; - ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL); - if (!ca->counts) - return -ENOMEM; - - return 0; -} - -static int ca_load(struct count_array *ca, struct dm_space_map *sm) -{ - int r; - uint32_t count; - dm_block_t nr_blocks, i; - - r = dm_sm_get_nr_blocks(sm, &nr_blocks); - if (r) - return r; - - BUG_ON(ca->nr != nr_blocks); - - DMWARN("Loading debug space map from disk. This may take some time"); - for (i = 0; i < nr_blocks; i++) { - r = dm_sm_get_count(sm, i, &count); - if (r) { - DMERR("load failed"); - return r; - } - - ca_set_count(ca, i, count); - } - DMWARN("Load complete"); - - return 0; -} - -static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) -{ - dm_block_t nr_blocks = ca->nr + extra_blocks; - uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL); - if (!counts) - return -ENOMEM; - - memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); - kfree(ca->counts); - ca->nr = nr_blocks; - ca->nr_free += extra_blocks; - ca->counts = counts; - return 0; -} - -static int ca_commit(struct count_array *old, struct count_array *new) -{ - if (old->nr != new->nr) { - BUG_ON(old->nr > new->nr); - ca_extend(old, new->nr - old->nr); - } - - BUG_ON(old->nr != new->nr); - old->nr_free = new->nr_free; - memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr); - return 0; -} - -static void ca_destroy(struct count_array *ca) -{ - kfree(ca->counts); -} - -/*----------------------------------------------------------------*/ - -struct sm_checker { - struct dm_space_map sm; - - struct count_array old_counts; - struct count_array counts; - - struct dm_space_map *real_sm; -}; - -static void sm_checker_destroy(struct dm_space_map *sm) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - - dm_sm_destroy(smc->real_sm); - ca_destroy(&smc->old_counts); - ca_destroy(&smc->counts); - kfree(smc); -} - -static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_get_nr_blocks(smc->real_sm, count); - if (!r) - BUG_ON(smc->old_counts.nr != *count); - return r; -} - -static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_get_nr_free(smc->real_sm, count); - if (!r) { - /* - * Slow, but we know it's correct. - */ - dm_block_t b, n = 0; - for (b = 0; b < smc->old_counts.nr; b++) - if (smc->old_counts.counts[b] == 0 && - smc->counts.counts[b] == 0) - n++; - - if (n != *count) - DMERR("free block counts differ, checker %u, sm-disk:%u", - (unsigned) n, (unsigned) *count); - } - return r; -} - -static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_new_block(smc->real_sm, b); - - if (!r) { - BUG_ON(*b >= smc->old_counts.nr); - BUG_ON(smc->old_counts.counts[*b] != 0); - BUG_ON(*b >= smc->counts.nr); - BUG_ON(smc->counts.counts[*b] != 0); - ca_set_count(&smc->counts, *b, 1); - } - - return r; -} - -static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_inc_block(smc->real_sm, b); - int r2 = ca_inc_block(&smc->counts, b); - BUG_ON(r != r2); - return r; -} - -static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_dec_block(smc->real_sm, b); - int r2 = ca_dec_block(&smc->counts, b); - BUG_ON(r != r2); - return r; -} - -static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - uint32_t result2 = 0; - int r = dm_sm_get_count(smc->real_sm, b, result); - int r2 = ca_get_count(&smc->counts, b, &result2); - - BUG_ON(r != r2); - if (!r) - BUG_ON(*result != result2); - return r; -} - -static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int result2 = 0; - int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result); - int r2 = ca_count_more_than_one(&smc->counts, b, &result2); - - BUG_ON(r != r2); - if (!r) - BUG_ON(!(*result) && result2); - return r; -} - -static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - uint32_t old_rc; - int r = dm_sm_set_count(smc->real_sm, b, count); - int r2; - - BUG_ON(b >= smc->counts.nr); - old_rc = smc->counts.counts[b]; - r2 = ca_set_count(&smc->counts, b, count); - BUG_ON(r != r2); - - return r; -} - -static int sm_checker_commit(struct dm_space_map *sm) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r; - - r = dm_sm_commit(smc->real_sm); - if (r) - return r; - - r = ca_commit(&smc->old_counts, &smc->counts); - if (r) - return r; - - return 0; -} - -static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_extend(smc->real_sm, extra_blocks); - if (r) - return r; - - return ca_extend(&smc->counts, extra_blocks); -} - -static int sm_checker_root_size(struct dm_space_map *sm, size_t *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - return dm_sm_root_size(smc->real_sm, result); -} - -static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len); -} - -/*----------------------------------------------------------------*/ - -static struct dm_space_map ops_ = { - .destroy = sm_checker_destroy, - .get_nr_blocks = sm_checker_get_nr_blocks, - .get_nr_free = sm_checker_get_nr_free, - .inc_block = sm_checker_inc_block, - .dec_block = sm_checker_dec_block, - .new_block = sm_checker_new_block, - .get_count = sm_checker_get_count, - .count_is_more_than_one = sm_checker_count_more_than_one, - .set_count = sm_checker_set_count, - .commit = sm_checker_commit, - .extend = sm_checker_extend, - .root_size = sm_checker_root_size, - .copy_root = sm_checker_copy_root -}; - -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) -{ - int r; - struct sm_checker *smc; - - if (!sm) - return NULL; - - smc = kmalloc(sizeof(*smc), GFP_KERNEL); - if (!smc) - return NULL; - - memcpy(&smc->sm, &ops_, sizeof(smc->sm)); - r = ca_create(&smc->old_counts, sm); - if (r) { - kfree(smc); - return NULL; - } - - r = ca_create(&smc->counts, sm); - if (r) { - ca_destroy(&smc->old_counts); - kfree(smc); - return NULL; - } - - smc->real_sm = sm; - - r = ca_load(&smc->counts, sm); - if (r) { - ca_destroy(&smc->counts); - ca_destroy(&smc->old_counts); - kfree(smc); - return NULL; - } - - r = ca_commit(&smc->old_counts, &smc->counts); - if (r) { - ca_destroy(&smc->counts); - ca_destroy(&smc->old_counts); - kfree(smc); - return NULL; - } - - return &smc->sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create); - -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) -{ - int r; - struct sm_checker *smc; - - if (!sm) - return NULL; - - smc = kmalloc(sizeof(*smc), GFP_KERNEL); - if (!smc) - return NULL; - - memcpy(&smc->sm, &ops_, sizeof(smc->sm)); - r = ca_create(&smc->old_counts, sm); - if (r) { - kfree(smc); - return NULL; - } - - r = ca_create(&smc->counts, sm); - if (r) { - ca_destroy(&smc->old_counts); - kfree(smc); - return NULL; - } - - smc->real_sm = sm; - return &smc->sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); - -/*----------------------------------------------------------------*/ - -#else - -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) -{ - return sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create); - -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) -{ - return sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h deleted file mode 100644 index 444dccf6688..00000000000 --- a/drivers/md/persistent-data/dm-space-map-checker.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H -#define SNAPSHOTS_SPACE_MAP_CHECKER_H - -#include "dm-space-map.h" - -/*----------------------------------------------------------------*/ - -/* - * This space map wraps a real on-disk space map, and verifies all of its - * operations. It uses a lot of memory, so only use if you have a specific - * problem that you're debugging. - * - * Ownership of @sm passes. - */ -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm); -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index df2494c06cd..aacbe70c2c2 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -39,8 +39,8 @@ static int index_check(struct dm_block_validator *v, __le32 csum_disk; if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { - DMERR("index_check failed blocknr %llu wanted %llu", - le64_to_cpu(mi_le->blocknr), dm_block_location(b)); + DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(mi_le->blocknr), dm_block_location(b)); return -ENOTBLK; } @@ -48,8 +48,8 @@ static int index_check(struct dm_block_validator *v, block_size - sizeof(__le32), INDEX_CSUM_XOR)); if (csum_disk != mi_le->csum) { - DMERR("index_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); + DMERR_LIMIT("index_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); return -EILSEQ; } @@ -89,8 +89,8 @@ static int bitmap_check(struct dm_block_validator *v, __le32 csum_disk; if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { - DMERR("bitmap check failed blocknr %llu wanted %llu", - le64_to_cpu(disk_header->blocknr), dm_block_location(b)); + DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu", + le64_to_cpu(disk_header->blocknr), dm_block_location(b)); return -ENOTBLK; } @@ -98,8 +98,8 @@ static int bitmap_check(struct dm_block_validator *v, block_size - sizeof(__le32), BITMAP_CSUM_XOR)); if (csum_disk != disk_header->csum) { - DMERR("bitmap check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); + DMERR_LIMIT("bitmap check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); return -EILSEQ; } @@ -224,6 +224,7 @@ static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) ll->nr_blocks = 0; ll->bitmap_root = 0; ll->ref_count_root = 0; + ll->bitmap_index_changed = false; return 0; } @@ -244,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return -EINVAL; } + /* + * We need to set this before the dm_tm_new_block() call below. + */ + ll->nr_blocks = nr_blocks; for (i = old_blocks; i < blocks; i++) { struct dm_block *b; struct disk_index_entry idx; @@ -251,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); if (r < 0) return r; + idx.blocknr = cpu_to_le64(dm_block_location(b)); r = dm_tm_unlock(ll->tm, b); @@ -265,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return r; } - ll->nr_blocks = nr_blocks; return 0; } @@ -291,16 +296,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) return dm_tm_unlock(ll->tm, blk); } -int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b, + uint32_t *result) { __le32 le_rc; - int r = sm_ll_lookup_bitmap(ll, b, result); - - if (r) - return r; - - if (*result != 3) - return r; + int r; r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); if (r < 0) @@ -311,6 +311,19 @@ int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) return r; } +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r = sm_ll_lookup_bitmap(ll, b, result); + + if (r) + return r; + + if (*result != 3) + return r; + + return sm_ll_lookup_big_ref_count(ll, b, result); +} + int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result) { @@ -371,11 +384,12 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, return -ENOSPC; } -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, - uint32_t ref_count, enum allocation_event *ev) +static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, + int (*mutator)(void *context, uint32_t old, uint32_t *new), + void *context, enum allocation_event *ev) { int r; - uint32_t bit, old; + uint32_t bit, old, ref_count; struct dm_block *nb; dm_block_t index = b; struct disk_index_entry ie_disk; @@ -398,6 +412,20 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, bm_le = dm_bitmap_data(nb); old = sm_lookup_bitmap(bm_le, bit); + if (old > 2) { + r = sm_ll_lookup_big_ref_count(ll, b, &old); + if (r < 0) { + dm_tm_unlock(ll->tm, nb); + return r; + } + } + + r = mutator(context, old, &ref_count); + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } + if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); @@ -405,8 +433,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, if (r < 0) return r; -#if 0 - /* FIXME: dm_btree_remove doesn't handle this yet */ if (old > 2) { r = dm_btree_remove(&ll->ref_count_info, ll->ref_count_root, @@ -414,7 +440,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, if (r) return r; } -#endif } else { __le32 le_rc = cpu_to_le32(ref_count); @@ -436,50 +461,70 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, if (ref_count && !old) { *ev = SM_ALLOC; ll->nr_allocated++; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1); + le32_add_cpu(&ie_disk.nr_free, -1); if (le32_to_cpu(ie_disk.none_free_before) == bit) ie_disk.none_free_before = cpu_to_le32(bit + 1); } else if (old && !ref_count) { *ev = SM_FREE; ll->nr_allocated--; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1); + le32_add_cpu(&ie_disk.nr_free, 1); ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); } return ll->save_ie(ll, index, &ie_disk); } -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int set_ref_count(void *context, uint32_t old, uint32_t *new) { - int r; - uint32_t rc; - - r = sm_ll_lookup(ll, b, &rc); - if (r) - return r; + *new = *((uint32_t *) context); + return 0; +} - return sm_ll_insert(ll, b, rc + 1, ev); +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); } -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int inc_ref_count(void *context, uint32_t old, uint32_t *new) { - int r; - uint32_t rc; + *new = old + 1; + return 0; +} - r = sm_ll_lookup(ll, b, &rc); - if (r) - return r; +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); +} - if (!rc) +static int dec_ref_count(void *context, uint32_t old, uint32_t *new) +{ + if (!old) { + DMERR_LIMIT("unable to decrement a reference count below 0"); return -EINVAL; + } + + *new = old - 1; + return 0; +} - return sm_ll_insert(ll, b, rc - 1, ev); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); } int sm_ll_commit(struct ll_disk *ll) { - return ll->commit(ll); + int r = 0; + + if (ll->bitmap_index_changed) { + r = ll->commit(ll); + if (!r) + ll->bitmap_index_changed = false; + } + + return r; } /*----------------------------------------------------------------*/ @@ -494,6 +539,7 @@ static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { + ll->bitmap_index_changed = true; memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); return 0; } diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 8f220821a9a..b3078d5eda0 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -78,6 +78,7 @@ struct ll_disk { open_index_fn open_index; max_index_entries_fn max_entries; commit_fn commit; + bool bitmap_index_changed:1; }; struct disk_sm_root { diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index fc469ba9f62..cfbf9617e46 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -4,7 +4,6 @@ * This file is released under the GPL. */ -#include "dm-space-map-checker.h" #include "dm-space-map-common.h" #include "dm-space-map-disk.h" #include "dm-space-map.h" @@ -141,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { - int r; - uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - } - - return r; + return sm_ll_dec(&smd->ll, b, &ev); } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) @@ -249,12 +232,12 @@ static struct dm_space_map ops = { .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, - .copy_root = sm_disk_copy_root + .copy_root = sm_disk_copy_root, + .register_threshold_callback = NULL }; -static struct dm_space_map *dm_sm_disk_create_real( - struct dm_transaction_manager *tm, - dm_block_t nr_blocks) +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) { int r; struct sm_disk *smd; @@ -285,18 +268,10 @@ bad: kfree(smd); return ERR_PTR(r); } - -struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, - dm_block_t nr_blocks) -{ - struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); - return dm_sm_checker_create_fresh(sm); -} EXPORT_SYMBOL_GPL(dm_sm_disk_create); -static struct dm_space_map *dm_sm_disk_open_real( - struct dm_transaction_manager *tm, - void *root_le, size_t len) +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root_le, size_t len) { int r; struct sm_disk *smd; @@ -323,13 +298,6 @@ bad: kfree(smd); return ERR_PTR(r); } - -struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - return dm_sm_checker_create( - dm_sm_disk_open_real(tm, root_le, len)); -} EXPORT_SYMBOL_GPL(dm_sm_disk_open); /*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index e89ae5e7a51..786b689bdfc 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -17,6 +17,55 @@ /*----------------------------------------------------------------*/ /* + * An edge triggered threshold. + */ +struct threshold { + bool threshold_set; + bool value_set; + dm_block_t threshold; + dm_block_t current_value; + dm_sm_threshold_fn fn; + void *context; +}; + +static void threshold_init(struct threshold *t) +{ + t->threshold_set = false; + t->value_set = false; +} + +static void set_threshold(struct threshold *t, dm_block_t value, + dm_sm_threshold_fn fn, void *context) +{ + t->threshold_set = true; + t->threshold = value; + t->fn = fn; + t->context = context; +} + +static bool below_threshold(struct threshold *t, dm_block_t value) +{ + return t->threshold_set && value <= t->threshold; +} + +static bool threshold_already_triggered(struct threshold *t) +{ + return t->value_set && below_threshold(t, t->current_value); +} + +static void check_threshold(struct threshold *t, dm_block_t value) +{ + if (below_threshold(t, value) && + !threshold_already_triggered(t)) + t->fn(t->context); + + t->value_set = true; + t->current_value = value; +} + +/*----------------------------------------------------------------*/ + +/* * Space map interface. * * The low level disk format is written using the standard btree and @@ -42,6 +91,69 @@ struct block_op { dm_block_t block; }; +struct bop_ring_buffer { + unsigned begin; + unsigned end; + struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; +}; + +static void brb_init(struct bop_ring_buffer *brb) +{ + brb->begin = 0; + brb->end = 0; +} + +static bool brb_empty(struct bop_ring_buffer *brb) +{ + return brb->begin == brb->end; +} + +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) +{ + unsigned r = old + 1; + return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; +} + +static int brb_push(struct bop_ring_buffer *brb, + enum block_op_type type, dm_block_t b) +{ + struct block_op *bop; + unsigned next = brb_next(brb, brb->end); + + /* + * We don't allow the last bop to be filled, this way we can + * differentiate between full and empty. + */ + if (next == brb->begin) + return -ENOMEM; + + bop = brb->bops + brb->end; + bop->type = type; + bop->block = b; + + brb->end = next; + + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) +{ + struct block_op *bop; + + if (brb_empty(brb)) + return -ENODATA; + + bop = brb->bops + brb->begin; + result->type = bop->type; + result->block = bop->block; + + brb->begin = brb_next(brb, brb->begin); + + return 0; +} + +/*----------------------------------------------------------------*/ + struct sm_metadata { struct dm_space_map sm; @@ -52,23 +164,20 @@ struct sm_metadata { unsigned recursion_count; unsigned allocated_this_transaction; - unsigned nr_uncommitted; - struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; + struct bop_ring_buffer uncommitted; + + struct threshold threshold; }; static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) { - struct block_op *op; + int r = brb_push(&smm->uncommitted, type, b); - if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { + if (r) { DMERR("too many recursive allocations"); return -ENOMEM; } - op = smm->uncommitted + smm->nr_uncommitted++; - op->type = type; - op->block = b; - return 0; } @@ -107,11 +216,17 @@ static int out(struct sm_metadata *smm) return -ENOMEM; } - if (smm->recursion_count == 1 && smm->nr_uncommitted) { - while (smm->nr_uncommitted && !r) { - smm->nr_uncommitted--; - r = commit_bop(smm, smm->uncommitted + - smm->nr_uncommitted); + if (smm->recursion_count == 1) { + while (!brb_empty(&smm->uncommitted)) { + struct block_op bop; + + r = brb_pop(&smm->uncommitted, &bop); + if (r) { + DMERR("bug in bop ring buffer"); + break; + } + + r = commit_bop(smm, &bop); if (r) break; } @@ -144,12 +259,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm) kfree(smm); } -static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - DMERR("doesn't support extend"); - return -EINVAL; -} - static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); @@ -172,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) { - int r, i; + int r; + unsigned i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); unsigned adjustment = 0; @@ -180,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, * We may have some uncommitted adjustments to add. This list * should always be really short. */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + struct block_op *op = smm->uncommitted.bops + i; if (op->block != b) continue; @@ -209,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) { - int r, i, adjustment = 0; + int r, adjustment = 0; + unsigned i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); uint32_t rc; @@ -217,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, * We may have some uncommitted adjustments to add. This list * should always be really short. */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + + struct block_op *op = smm->uncommitted.bops + i; if (op->block != b) continue; @@ -335,9 +451,23 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) { + dm_block_t count; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + int r = sm_metadata_new_block_(sm, b); - if (r) - DMERR("out of metadata space"); + if (r) { + DMERR_LIMIT("unable to allocate new metadata block"); + return r; + } + + r = sm_metadata_get_nr_free(sm, &count); + if (r) { + DMERR_LIMIT("couldn't get free block count"); + return r; + } + + check_threshold(&smm->threshold, count); + return r; } @@ -357,6 +487,18 @@ static int sm_metadata_commit(struct dm_space_map *sm) return 0; } +static int sm_metadata_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + set_threshold(&smm->threshold, threshold, fn, context); + + return 0; +} + static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) { *result = sizeof(struct disk_sm_root); @@ -382,6 +524,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t return 0; } +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); + static struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, @@ -395,7 +539,8 @@ static struct dm_space_map ops = { .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size = sm_metadata_root_size, - .copy_root = sm_metadata_copy_root + .copy_root = sm_metadata_copy_root, + .register_threshold_callback = sm_metadata_register_threshold_callback }; /*----------------------------------------------------------------*/ @@ -410,7 +555,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm) static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - DMERR("boostrap doesn't support extend"); + DMERR("bootstrap doesn't support extend"); return -EINVAL; } @@ -450,7 +595,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { - DMERR("boostrap doesn't support set_count"); + DMERR("bootstrap doesn't support set_count"); return -EINVAL; } @@ -491,7 +636,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm) static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) { - DMERR("boostrap doesn't support root_size"); + DMERR("bootstrap doesn't support root_size"); return -EINVAL; } @@ -499,7 +644,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, size_t max) { - DMERR("boostrap doesn't support copy_root"); + DMERR("bootstrap doesn't support copy_root"); return -EINVAL; } @@ -517,11 +662,60 @@ static struct dm_space_map bootstrap_ops = { .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, - .copy_root = sm_bootstrap_copy_root + .copy_root = sm_bootstrap_copy_root, + .register_threshold_callback = NULL }; /*----------------------------------------------------------------*/ +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + int r, i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + dm_block_t old_len = smm->ll.nr_blocks; + + /* + * Flick into a mode where all blocks get allocated in the new area. + */ + smm->begin = old_len; + memcpy(sm, &bootstrap_ops, sizeof(*sm)); + + /* + * Extend. + */ + r = sm_ll_extend(&smm->ll, extra_blocks); + if (r) + goto out; + + /* + * We repeatedly increment then commit until the commit doesn't + * allocate any new blocks. + */ + do { + for (i = old_len; !r && i < smm->begin; i++) { + r = sm_ll_inc(&smm->ll, i, &ev); + if (r) + goto out; + } + old_len = smm->begin; + + r = sm_ll_commit(&smm->ll); + if (r) + goto out; + + } while (old_len != smm->begin); + +out: + /* + * Switch back to normal behaviour. + */ + memcpy(sm, &ops, sizeof(*sm)); + return r; +} + +/*----------------------------------------------------------------*/ + struct dm_space_map *dm_sm_metadata_init(void) { struct sm_metadata *smm; @@ -548,7 +742,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, smm->begin = superblock + 1; smm->recursion_count = 0; smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); @@ -556,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, if (r) return r; + if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) + nr_blocks = DM_SM_METADATA_MAX_BLOCKS; r = sm_ll_extend(&smm->ll, nr_blocks); if (r) return r; @@ -589,7 +786,8 @@ int dm_sm_metadata_open(struct dm_space_map *sm, smm->begin = 0; smm->recursion_count = 0; smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); return 0; diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h index 39bba0801cf..64df923974d 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.h +++ b/drivers/md/persistent-data/dm-space-map-metadata.h @@ -9,6 +9,17 @@ #include "dm-transaction-manager.h" +#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) + +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about ~16k metadata blocks. + */ +#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) +#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) + /* * Unfortunately we have to use two-phase construction due to the cycle * between the tm and sm. diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 1cbfc6b1638..3e6d1153b7c 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -9,6 +9,8 @@ #include "dm-block-manager.h" +typedef void (*dm_sm_threshold_fn)(void *context); + /* * struct dm_space_map keeps a record of how many times each block in a device * is referenced. It needs to be fixed on disk as part of the transaction. @@ -59,6 +61,15 @@ struct dm_space_map { */ int (*root_size)(struct dm_space_map *sm, size_t *result); int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); + + /* + * You can register one threshold callback which is edge-triggered + * when the free space in the space map drops below the threshold. + */ + int (*register_threshold_callback)(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); }; /*----------------------------------------------------------------*/ @@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le return sm->copy_root(sm, copy_to_here_le, len); } +static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + if (sm->register_threshold_callback) + return sm->register_threshold_callback(sm, threshold, fn, context); + + return -EINVAL; +} + + #endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 6f8d38747d7..3bc30a0ae3d 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -5,7 +5,6 @@ */ #include "dm-transaction-manager.h" #include "dm-space-map.h" -#include "dm-space-map-checker.h" #include "dm-space-map-disk.h" #include "dm-space-map-metadata.h" #include "dm-persistent-data-internal.h" @@ -26,8 +25,8 @@ struct shadow_info { /* * It would be nice if we scaled with the size of transaction. */ -#define HASH_SIZE 256 -#define HASH_MASK (HASH_SIZE - 1) +#define DM_HASH_SIZE 256 +#define DM_HASH_MASK (DM_HASH_SIZE - 1) struct dm_transaction_manager { int is_clone; @@ -37,7 +36,7 @@ struct dm_transaction_manager { struct dm_space_map *sm; spinlock_t lock; - struct hlist_head buckets[HASH_SIZE]; + struct hlist_head buckets[DM_HASH_SIZE]; }; /*----------------------------------------------------------------*/ @@ -45,12 +44,11 @@ struct dm_transaction_manager { static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) { int r = 0; - unsigned bucket = dm_hash_block(b, HASH_MASK); + unsigned bucket = dm_hash_block(b, DM_HASH_MASK); struct shadow_info *si; - struct hlist_node *n; spin_lock(&tm->lock); - hlist_for_each_entry(si, n, tm->buckets + bucket, hlist) + hlist_for_each_entry(si, tm->buckets + bucket, hlist) if (si->where == b) { r = 1; break; @@ -72,7 +70,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) si = kmalloc(sizeof(*si), GFP_NOIO); if (si) { si->where = b; - bucket = dm_hash_block(b, HASH_MASK); + bucket = dm_hash_block(b, DM_HASH_MASK); spin_lock(&tm->lock); hlist_add_head(&si->hlist, tm->buckets + bucket); spin_unlock(&tm->lock); @@ -82,14 +80,14 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) static void wipe_shadow_table(struct dm_transaction_manager *tm) { struct shadow_info *si; - struct hlist_node *n, *tmp; + struct hlist_node *tmp; struct hlist_head *bucket; int i; spin_lock(&tm->lock); - for (i = 0; i < HASH_SIZE; i++) { + for (i = 0; i < DM_HASH_SIZE; i++) { bucket = tm->buckets + i; - hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) + hlist_for_each_entry_safe(si, tmp, bucket, hlist) kfree(si); INIT_HLIST_HEAD(bucket); @@ -116,7 +114,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, tm->sm = sm; spin_lock_init(&tm->lock); - for (i = 0; i < HASH_SIZE; i++) + for (i = 0; i < DM_HASH_SIZE; i++) INIT_HLIST_HEAD(tm->buckets + i); return tm; @@ -138,6 +136,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); void dm_tm_destroy(struct dm_transaction_manager *tm) { + if (!tm->is_clone) + wipe_shadow_table(tm); + kfree(tm); } EXPORT_SYMBOL_GPL(dm_tm_destroy); @@ -153,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm) if (r < 0) return r; - return 0; + return dm_bm_flush(tm->bm); } EXPORT_SYMBOL_GPL(dm_tm_pre_commit); @@ -163,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) return -EWOULDBLOCK; wipe_shadow_table(tm); + dm_bm_unlock(root); - return dm_bm_flush_and_unlock(tm->bm, root); + return dm_bm_flush(tm->bm); } EXPORT_SYMBOL_GPL(dm_tm_commit); @@ -217,13 +219,24 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, if (r < 0) return r; - r = dm_bm_unlock_move(orig_block, new); - if (r < 0) { + /* + * It would be tempting to use dm_bm_unlock_move here, but some + * code, such as the space maps, keeps using the old data structures + * secure in the knowledge they won't be changed until the next + * transaction. Using unlock_move would force a synchronous read + * since the old block would no longer be in the cache. + */ + r = dm_bm_write_lock_zero(tm->bm, new, v, result); + if (r) { dm_bm_unlock(orig_block); return r; } - return dm_bm_write_lock(tm->bm, new, v, result); + memcpy(dm_block_data(*result), dm_block_data(orig_block), + dm_bm_block_size(tm->bm)); + + dm_bm_unlock(orig_block); + return r; } int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, @@ -249,6 +262,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, return r; } +EXPORT_SYMBOL_GPL(dm_tm_shadow_block); int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, @@ -259,6 +273,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, return dm_bm_read_lock(tm->bm, b, v, blk); } +EXPORT_SYMBOL_GPL(dm_tm_read_lock); int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) { @@ -306,94 +321,61 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) static int dm_tm_create_internal(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - size_t root_offset, size_t root_max_len, struct dm_transaction_manager **tm, struct dm_space_map **sm, - struct dm_block **sblock, - int create) + int create, + void *sm_root, size_t sm_len) { int r; - struct dm_space_map *inner; - inner = dm_sm_metadata_init(); - if (IS_ERR(inner)) - return PTR_ERR(inner); + *sm = dm_sm_metadata_init(); + if (IS_ERR(*sm)) + return PTR_ERR(*sm); - *tm = dm_tm_create(bm, inner); + *tm = dm_tm_create(bm, *sm); if (IS_ERR(*tm)) { - dm_sm_destroy(inner); + dm_sm_destroy(*sm); return PTR_ERR(*tm); } if (create) { - r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location, - sb_validator, sblock); - if (r < 0) { - DMERR("couldn't lock superblock"); - goto bad1; - } - - r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm), + r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm), sb_location); if (r) { DMERR("couldn't create metadata space map"); - goto bad2; + goto bad; } - *sm = dm_sm_checker_create(inner); - if (!*sm) - goto bad2; - } else { - r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, - sb_validator, sblock); - if (r < 0) { - DMERR("couldn't lock superblock"); - goto bad1; - } - - r = dm_sm_metadata_open(inner, *tm, - dm_block_data(*sblock) + root_offset, - root_max_len); + r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len); if (r) { DMERR("couldn't open metadata space map"); - goto bad2; + goto bad; } - - *sm = dm_sm_checker_create(inner); - if (!*sm) - goto bad2; } return 0; -bad2: - dm_tm_unlock(*tm, *sblock); -bad1: +bad: dm_tm_destroy(*tm); - dm_sm_destroy(inner); + dm_sm_destroy(*sm); return r; } int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock) + struct dm_space_map **sm) { - return dm_tm_create_internal(bm, sb_location, sb_validator, - 0, 0, tm, sm, sblock, 1); + return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0); } EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - size_t root_offset, size_t root_max_len, + void *sm_root, size_t root_len, struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock) + struct dm_space_map **sm) { - return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset, - root_max_len, tm, sm, sblock, 0); + return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len); } EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 6da784871db..2772ed2a781 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac /* * We use a 2-phase commit here. * - * i) In the first phase the block manager is told to start flushing, and - * the changes to the space map are written to disk. You should interrogate - * your particular space map to get detail of its root node etc. to be - * included in your superblock. + * i) Make all changes for the transaction *except* for the superblock. + * Then call dm_tm_pre_commit() to flush them to disk. * - * ii) @root will be committed last. You shouldn't use more than the - * first 512 bytes of @root if you wish the transaction to survive a power - * failure. You *must* have a write lock held on @root for both stage (i) - * and (ii). The commit will drop the write lock. + * ii) Lock your superblock. Update. Then call dm_tm_commit() which will + * unlock the superblock and flush it. No other blocks should be updated + * during this period. Care should be taken to never unlock a partially + * updated superblock; perform any operations that could fail *before* you + * take the superblock lock. */ int dm_tm_pre_commit(struct dm_transaction_manager *tm); -int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock); /* * These methods are the only way to get hold of a writeable block. @@ -115,16 +114,17 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); * * Returns a tm that has an open transaction to write the new disk sm. * Caller should store the new sm root and commit. + * + * The superblock location is passed so the metadata space map knows it + * shouldn't be used. */ int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock); + struct dm_space_map **sm); int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - size_t root_offset, size_t root_max_len, + void *sm_root, size_t root_len, struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock); + struct dm_space_map **sm); #endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ |
