diff options
Diffstat (limited to 'drivers/md/dm-table.c')
-rw-r--r-- | drivers/md/dm-table.c | 950 |
1 files changed, 950 insertions, 0 deletions
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c new file mode 100644 index 00000000000..ee175d4906c --- /dev/null +++ b/drivers/md/dm-table.c @@ -0,0 +1,950 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/namei.h> +#include <linux/ctype.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <asm/atomic.h> + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +struct dm_table { + atomic_t holders; + + /* btree table */ + unsigned int depth; + unsigned int counts[MAX_DEPTH]; /* in nodes */ + sector_t *index[MAX_DEPTH]; + + unsigned int num_targets; + unsigned int num_allocated; + sector_t *highs; + struct dm_target *targets; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. + */ + int mode; + + /* a list of devices used by this table */ + struct list_head devices; + + /* + * These are optimistic limits taken from all the + * targets, some targets will need smaller limits. + */ + struct io_restrictions limits; + + /* events get handed up using this callback */ + void (*event_fn)(void *); + void *event_context; +}; + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned int n, unsigned int base) +{ + int result = 0; + + while (n > 1) { + n = dm_div_up(n, base); + result++; + } + + return result; +} + +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + +/* + * Combine two io_restrictions, always taking the lower value. + */ +static void combine_restrictions_low(struct io_restrictions *lhs, + struct io_restrictions *rhs) +{ + lhs->max_sectors = + min_not_zero(lhs->max_sectors, rhs->max_sectors); + + lhs->max_phys_segments = + min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); + + lhs->max_hw_segments = + min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); + + lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); + + lhs->max_segment_size = + min_not_zero(lhs->max_segment_size, rhs->max_segment_size); + + lhs->seg_boundary_mask = + min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, + unsigned int l, unsigned int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ + unsigned int n, k; + sector_t *node; + + for (n = 0U; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0U; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) +{ + unsigned long size; + void *addr; + + /* + * Check that we're not going to overflow. + */ + if (nmemb > (ULONG_MAX / elem_size)) + return NULL; + + size = nmemb * elem_size; + addr = vmalloc(size); + if (addr) + memset(addr, 0, size); + + return addr; +} + +/* + * highs, and targets are managed as dynamic arrays during a + * table load. + */ +static int alloc_targets(struct dm_table *t, unsigned int num) +{ + sector_t *n_highs; + struct dm_target *n_targets; + int n = t->num_targets; + + /* + * Allocate both the target array and offset array at once. + */ + n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + + sizeof(sector_t)); + if (!n_highs) + return -ENOMEM; + + n_targets = (struct dm_target *) (n_highs + num); + + if (n) { + memcpy(n_highs, t->highs, sizeof(*n_highs) * n); + memcpy(n_targets, t->targets, sizeof(*n_targets) * n); + } + + memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); + vfree(t->highs); + + t->num_allocated = num; + t->highs = n_highs; + t->targets = n_targets; + + return 0; +} + +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) +{ + struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + + if (!t) + return -ENOMEM; + + memset(t, 0, sizeof(*t)); + INIT_LIST_HEAD(&t->devices); + atomic_set(&t->holders, 1); + + if (!num_targets) + num_targets = KEYS_PER_NODE; + + num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + + if (alloc_targets(t, num_targets)) { + kfree(t); + t = NULL; + return -ENOMEM; + } + + t->mode = mode; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + for (tmp = devices->next; tmp != devices; tmp = next) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + next = tmp->next; + kfree(dd); + } +} + +void table_destroy(struct dm_table *t) +{ + unsigned int i; + + /* free the indexes (see dm_table_complete) */ + if (t->depth >= 2) + vfree(t->index[t->depth - 2]); + + /* free the targets */ + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + if (tgt->type->dtr) + tgt->type->dtr(tgt); + + dm_put_target_type(tgt->type); + } + + vfree(t->highs); + + /* free the device list */ + if (t->devices.next != &t->devices) { + DMWARN("devices still present during destroy: " + "dm_table_remove_device calls missing"); + + free_devices(&t->devices); + } + + kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ + atomic_inc(&t->holders); +} + +void dm_table_put(struct dm_table *t) +{ + if (!t) + return; + + if (atomic_dec_and_test(&t->holders)) + table_destroy(t); +} + +/* + * Checks to see if we need to extend highs or targets. + */ +static inline int check_space(struct dm_table *t) +{ + if (t->num_targets >= t->num_allocated) + return alloc_targets(t, t->num_allocated * 2); + + return 0; +} + +/* + * Convert a device path to a dev_t. + */ +static int lookup_device(const char *path, dev_t *dev) +{ + int r; + struct nameidata nd; + struct inode *inode; + + if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) + return r; + + inode = nd.dentry->d_inode; + if (!inode) { + r = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + r = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + + out: + path_release(&nd); + return r; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev *find_device(struct list_head *l, dev_t dev) +{ + struct dm_dev *dd; + + list_for_each_entry (dd, l, list) + if (dd->bdev->bd_dev == dev) + return dd; + + return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev *d, dev_t dev) +{ + static char *_claim_ptr = "I belong to device-mapper"; + struct block_device *bdev; + + int r; + + if (d->bdev) + BUG(); + + bdev = open_by_devnum(dev, d->mode); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + r = bd_claim(bdev, _claim_ptr); + if (r) + blkdev_put(bdev); + else + d->bdev = bdev; + return r; +} + +/* + * Close a device that we've been using. + */ +static void close_dev(struct dm_dev *d) +{ + if (!d->bdev) + return; + + bd_release(d->bdev); + blkdev_put(d->bdev); + d->bdev = NULL; +} + +/* + * If possible (ie. blk_size[major] is set), this checks an area + * of a destination device is valid. + */ +static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) +{ + sector_t dev_size; + dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; + return ((start < dev_size) && (len <= (dev_size - start))); +} + +/* + * This upgrades the mode on an already open dm_dev. Being + * careful to leave things as they were if we fail to reopen the + * device. + */ +static int upgrade_mode(struct dm_dev *dd, int new_mode) +{ + int r; + struct dm_dev dd_copy; + dev_t dev = dd->bdev->bd_dev; + + dd_copy = *dd; + + dd->mode |= new_mode; + dd->bdev = NULL; + r = open_dev(dd, dev); + if (!r) + close_dev(&dd_copy); + else + *dd = dd_copy; + + return r; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +static int __table_get_device(struct dm_table *t, struct dm_target *ti, + const char *path, sector_t start, sector_t len, + int mode, struct dm_dev **result) +{ + int r; + dev_t dev; + struct dm_dev *dd; + unsigned int major, minor; + + if (!t) + BUG(); + + if (sscanf(path, "%u:%u", &major, &minor) == 2) { + /* Extract the major/minor numbers */ + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) + return -EOVERFLOW; + } else { + /* convert the path to a device */ + if ((r = lookup_device(path, &dev))) + return r; + } + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + + dd->mode = mode; + dd->bdev = NULL; + + if ((r = open_dev(dd, dev))) { + kfree(dd); + return r; + } + + format_dev_t(dd->name, dev); + + atomic_set(&dd->count, 0); + list_add(&dd->list, &t->devices); + + } else if (dd->mode != (mode | dd->mode)) { + r = upgrade_mode(dd, mode); + if (r) + return r; + } + atomic_inc(&dd->count); + + if (!check_device_area(dd, start, len)) { + DMWARN("device %s too small for target", path); + dm_put_device(ti, dd); + return -EINVAL; + } + + *result = dd; + + return 0; +} + + +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r = __table_get_device(ti->table, ti, path, + start, len, mode, result); + if (!r) { + request_queue_t *q = bdev_get_queue((*result)->bdev); + struct io_restrictions *rs = &ti->limits; + + /* + * Combine the device limits low. + * + * FIXME: if we move an io_restriction struct + * into q this would just be a call to + * combine_restrictions_low() + */ + rs->max_sectors = + min_not_zero(rs->max_sectors, q->max_sectors); + + /* FIXME: Device-Mapper on top of RAID-0 breaks because DM + * currently doesn't honor MD's merge_bvec_fn routine. + * In this case, we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. A better fix is in the + * works, but add this for the time being so it will at + * least operate correctly. + */ + if (q->merge_bvec_fn) + rs->max_sectors = + min_not_zero(rs->max_sectors, + (unsigned short)(PAGE_SIZE >> 9)); + + rs->max_phys_segments = + min_not_zero(rs->max_phys_segments, + q->max_phys_segments); + + rs->max_hw_segments = + min_not_zero(rs->max_hw_segments, q->max_hw_segments); + + rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + + rs->max_segment_size = + min_not_zero(rs->max_segment_size, q->max_segment_size); + + rs->seg_boundary_mask = + min_not_zero(rs->seg_boundary_mask, + q->seg_boundary_mask); + } + + return r; +} + +/* + * Decrement a devices use count and remove it if necessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +{ + if (atomic_dec_and_test(&dd->count)) { + close_dev(dd); + list_del(&dd->list); + kfree(dd); + } +} + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!table->num_targets) + return !ti->begin; + + prev = &table->targets[table->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Used to dynamically allocate the arg array. + */ +static char **realloc_argv(unsigned *array_size, char **old_argv) +{ + char **argv; + unsigned new_size; + + new_size = *array_size ? *array_size * 2 : 64; + argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (argv) { + memcpy(argv, old_argv, *array_size * sizeof(*argv)); + *array_size = new_size; + } + + kfree(old_argv); + return argv; +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +int dm_split_args(int *argc, char ***argvp, char *input) +{ + char *start, *end = input, *out, **argv = NULL; + unsigned array_size = 0; + + *argc = 0; + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + + while (1) { + start = end; + + /* Skip whitespace */ + while (*start && isspace(*start)) + start++; + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? */ + if ((*argc + 1) > array_size) { + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + } + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + *argvp = argv; + return 0; +} + +static void check_for_valid_limits(struct io_restrictions *rs) +{ + if (!rs->max_sectors) + rs->max_sectors = MAX_SECTORS; + if (!rs->max_phys_segments) + rs->max_phys_segments = MAX_PHYS_SEGMENTS; + if (!rs->max_hw_segments) + rs->max_hw_segments = MAX_HW_SEGMENTS; + if (!rs->hardsect_size) + rs->hardsect_size = 1 << SECTOR_SHIFT; + if (!rs->max_segment_size) + rs->max_segment_size = MAX_SEGMENT_SIZE; + if (!rs->seg_boundary_mask) + rs->seg_boundary_mask = -1; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char **argv; + struct dm_target *tgt; + + if ((r = check_space(t))) + return r; + + tgt = t->targets + t->num_targets; + memset(tgt, 0, sizeof(*tgt)); + + if (!len) { + tgt->error = "zero-length target"; + DMERR("%s", tgt->error); + return -EINVAL; + } + + tgt->type = dm_get_target_type(type); + if (!tgt->type) { + tgt->error = "unknown target type"; + DMERR("%s", tgt->error); + return -EINVAL; + } + + tgt->table = t; + tgt->begin = start; + tgt->len = len; + tgt->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? + */ + if (!adjoin(t, tgt)) { + tgt->error = "Gap in table"; + r = -EINVAL; + goto bad; + } + + r = dm_split_args(&argc, &argv, params); + if (r) { + tgt->error = "couldn't split parameters (insufficient memory)"; + goto bad; + } + + r = tgt->type->ctr(tgt, argc, argv); + kfree(argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + + /* FIXME: the plan is to combine high here and then have + * the merge fn apply the target level restrictions. */ + combine_restrictions_low(&t->limits, &tgt->limits); + return 0; + + bad: + DMERR("%s", tgt->error); + dm_put_target_type(tgt->type); + return r; +} + +static int setup_indexes(struct dm_table *t) +{ + int i; + unsigned int total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2, total = 0; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +int dm_table_complete(struct dm_table *t) +{ + int r = 0; + unsigned int leaf_nodes; + + check_for_valid_limits(&t->limits); + + /* how many indexes will the btree have ? */ + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +static DECLARE_MUTEX(_event_lock); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context) +{ + down(&_event_lock); + t->event_fn = fn; + t->event_context = context; + up(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ + /* + * You can no longer call dm_table_event() from interrupt + * context, use a bottom half instead. + */ + BUG_ON(in_interrupt()); + + down(&_event_lock); + if (t->event_fn) + t->event_fn(t->event_context); + up(&_event_lock); +} + +sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} + +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) +{ + if (index > t->num_targets) + return NULL; + + return t->targets + index; +} + +/* + * Search the btree for the correct target. + */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + unsigned int l, n = 0, k = 0; + sector_t *node; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) +{ + /* + * Make sure we obey the optimistic sub devices + * restrictions. + */ + blk_queue_max_sectors(q, t->limits.max_sectors); + q->max_phys_segments = t->limits.max_phys_segments; + q->max_hw_segments = t->limits.max_hw_segments; + q->hardsect_size = t->limits.hardsect_size; + q->max_segment_size = t->limits.max_segment_size; + q->seg_boundary_mask = t->limits.seg_boundary_mask; +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ + return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +int dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} + +static void suspend_targets(struct dm_table *t, unsigned postsuspend) +{ + int i = t->num_targets; + struct dm_target *ti = t->targets; + + while (i--) { + if (postsuspend) { + if (ti->type->postsuspend) + ti->type->postsuspend(ti); + } else if (ti->type->presuspend) + ti->type->presuspend(ti); + + ti++; + } +} + +void dm_table_presuspend_targets(struct dm_table *t) +{ + return suspend_targets(t, 0); +} + +void dm_table_postsuspend_targets(struct dm_table *t) +{ + return suspend_targets(t, 1); +} + +void dm_table_resume_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->resume) + ti->type->resume(ti); + } +} + +int dm_table_any_congested(struct dm_table *t, int bdi_bits) +{ + struct list_head *d, *devices; + int r = 0; + + devices = dm_table_get_devices(t); + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + r |= bdi_congested(&q->backing_dev_info, bdi_bits); + } + + return r; +} + +void dm_table_unplug_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + + if (q->unplug_fn) + q->unplug_fn(q); + } +} + +int dm_table_flush_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + int ret = 0; + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + int err; + + if (!q->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); + + if (!ret) + ret = err; + } + + return ret; +} + +EXPORT_SYMBOL(dm_vcalloc); +EXPORT_SYMBOL(dm_get_device); +EXPORT_SYMBOL(dm_put_device); +EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_mode); +EXPORT_SYMBOL(dm_table_put); +EXPORT_SYMBOL(dm_table_get); +EXPORT_SYMBOL(dm_table_unplug_all); +EXPORT_SYMBOL(dm_table_flush_all); |