diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-11 13:05:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-11 13:05:40 -0700 |
commit | 9903883f1dd6e86f286b7bfa6e4b423f98c1cd9e (patch) | |
tree | 63c907110eac32c31a1786ebff3e7d9257e61c9b /drivers/md | |
parent | 36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b (diff) | |
parent | 9d0eb0ab432aaa9160cf2675aee73b3900b9bc18 (diff) |
Merge tag 'dm-3.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes from Alasdair G Kergon:
"Add a device-mapper target called dm-switch to provide a multipath
framework for storage arrays that dynamically reconfigure their
preferred paths for different device regions.
Fix a bug in the verity target that prevented its use with some
specific sizes of devices.
Improve some locking mechanisms in the device-mapper core and bufio.
Add Mike Snitzer as a device-mapper maintainer.
A few more clean-ups and fixes"
* tag 'dm-3.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm:
dm: add switch target
dm: update maintainers
dm: optimize reorder structure
dm: optimize use SRCU and RCU
dm bufio: submit writes outside lock
dm cache: fix arm link errors with inline
dm verity: use __ffs and __fls
dm flakey: correct ctr alloc failure mesg
dm verity: remove pointless comparison
dm: use __GFP_HIGHMEM in __vmalloc
dm verity: fix inability to use a few specific devices sizes
dm ioctl: set noio flag to avoid __vmalloc deadlock
dm mpath: fix ioctl deadlock when no paths
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 14 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 75 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 127 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-switch.c | 538 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 35 | ||||
-rw-r--r-- | drivers/md/dm-verity.c | 17 | ||||
-rw-r--r-- | drivers/md/dm.c | 177 |
11 files changed, 818 insertions, 180 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3bfc8f1da9f..30b426ed744 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -412,4 +412,18 @@ config DM_VERITY If unsure, say N. +config DM_SWITCH + tristate "Switch target support (EXPERIMENTAL)" + depends on BLK_DEV_DM + ---help--- + This device-mapper target creates a device that supports an arbitrary + mapping of fixed-size regions of I/O across a fixed set of paths. + The path used for any specific region can be switched dynamically + by sending the target a message. + + To compile this code as a module, choose M here: the module will + be called dm-switch. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1439fd4ad9b..5ef78efc27f 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0387e05cdb9..5227e079a6e 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -145,6 +145,7 @@ struct dm_buffer { unsigned long state; unsigned long last_accessed; struct dm_bufio_client *c; + struct list_head write_list; struct bio bio; struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; }; @@ -349,7 +350,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, if (gfp_mask & __GFP_NORETRY) noio_flag = memalloc_noio_save(); - ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL); if (gfp_mask & __GFP_NORETRY) memalloc_noio_restore(noio_flag); @@ -630,7 +631,8 @@ static int 
do_io_schedule(void *word) * - Submit our write and don't wait on it. We set B_WRITING indicating * that there is a write in progress. */ -static void __write_dirty_buffer(struct dm_buffer *b) +static void __write_dirty_buffer(struct dm_buffer *b, + struct list_head *write_list) { if (!test_bit(B_DIRTY, &b->state)) return; @@ -639,7 +641,24 @@ static void __write_dirty_buffer(struct dm_buffer *b) wait_on_bit_lock(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); - submit_io(b, WRITE, b->block, write_endio); + if (!write_list) + submit_io(b, WRITE, b->block, write_endio); + else + list_add_tail(&b->write_list, write_list); +} + +static void __flush_write_list(struct list_head *write_list) +{ + struct blk_plug plug; + blk_start_plug(&plug); + while (!list_empty(write_list)) { + struct dm_buffer *b = + list_entry(write_list->next, struct dm_buffer, write_list); + list_del(&b->write_list); + submit_io(b, WRITE, b->block, write_endio); + dm_bufio_cond_resched(); + } + blk_finish_plug(&plug); } /* @@ -655,7 +674,7 @@ static void __make_buffer_clean(struct dm_buffer *b) return; wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); - __write_dirty_buffer(b); + __write_dirty_buffer(b, NULL); wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); } @@ -802,7 +821,8 @@ static void __free_buffer_wake(struct dm_buffer *b) wake_up(&c->free_buffer_wait); } -static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, + struct list_head *write_list) { struct dm_buffer *b, *tmp; @@ -818,7 +838,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) if (no_wait && test_bit(B_WRITING, &b->state)) return; - __write_dirty_buffer(b); + __write_dirty_buffer(b, write_list); dm_bufio_cond_resched(); } } @@ -853,7 +873,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, * If we are over threshold_buffers, 
start freeing buffers. * If we're over "limit_buffers", block until we get under the limit. */ -static void __check_watermark(struct dm_bufio_client *c) +static void __check_watermark(struct dm_bufio_client *c, + struct list_head *write_list) { unsigned long threshold_buffers, limit_buffers; @@ -872,7 +893,7 @@ static void __check_watermark(struct dm_bufio_client *c) } if (c->n_buffers[LIST_DIRTY] > threshold_buffers) - __write_dirty_buffers_async(c, 1); + __write_dirty_buffers_async(c, 1, write_list); } /* @@ -897,7 +918,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) *--------------------------------------------------------------*/ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, - enum new_flag nf, int *need_submit) + enum new_flag nf, int *need_submit, + struct list_head *write_list) { struct dm_buffer *b, *new_b = NULL; @@ -924,7 +946,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, goto found_buffer; } - __check_watermark(c); + __check_watermark(c, write_list); b = new_b; b->hold_count = 1; @@ -992,10 +1014,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, int need_submit; struct dm_buffer *b; + LIST_HEAD(write_list); + dm_bufio_lock(c); - b = __bufio_new(c, block, nf, &need_submit); + b = __bufio_new(c, block, nf, &need_submit, &write_list); dm_bufio_unlock(c); + __flush_write_list(&write_list); + if (!b) return b; @@ -1047,6 +1073,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, { struct blk_plug plug; + LIST_HEAD(write_list); + BUG_ON(dm_bufio_in_request()); blk_start_plug(&plug); @@ -1055,7 +1083,15 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, for (; n_blocks--; block++) { int need_submit; struct dm_buffer *b; - b = __bufio_new(c, block, NF_PREFETCH, &need_submit); + b = __bufio_new(c, block, NF_PREFETCH, &need_submit, + &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + 
blk_finish_plug(&plug); + __flush_write_list(&write_list); + blk_start_plug(&plug); + dm_bufio_lock(c); + } if (unlikely(b != NULL)) { dm_bufio_unlock(c); @@ -1069,7 +1105,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, goto flush_plug; dm_bufio_lock(c); } - } dm_bufio_unlock(c); @@ -1126,11 +1161,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) { + LIST_HEAD(write_list); + BUG_ON(dm_bufio_in_request()); dm_bufio_lock(c); - __write_dirty_buffers_async(c, 0); + __write_dirty_buffers_async(c, 0, &write_list); dm_bufio_unlock(c); + __flush_write_list(&write_list); } EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); @@ -1147,8 +1185,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; + LIST_HEAD(write_list); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0, &write_list); + dm_bufio_unlock(c); + __flush_write_list(&write_list); dm_bufio_lock(c); - __write_dirty_buffers_async(c, 0); again: list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { @@ -1274,7 +1317,7 @@ retry: BUG_ON(!b->hold_count); BUG_ON(test_bit(B_READING, &b->state)); - __write_dirty_buffer(b); + __write_dirty_buffer(b, NULL); if (b->hold_count == 1) { wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index df44b60e66f..0df3ec085eb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -425,6 +425,10 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } +/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ +#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 +__always_inline +#endif static dm_block_t block_div(dm_block_t b, uint32_t n) { do_div(b, n); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c 
index 7fcf21cb4ff..c80a0ec5f12 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -176,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (!fc) { - ti->error = "Cannot allocate linear context"; + ti->error = "Cannot allocate context"; return -ENOMEM; } fc->start_time = jiffies; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index aa04f022464..f1b758675ec 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -36,6 +36,14 @@ struct hash_cell { struct dm_table *new_map; }; +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { + int undefined__; +}; + struct vers_iter { size_t param_size; struct dm_target_versions *vers, *old_vers; @@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi return -EBUSY; } -static void __hash_remove(struct hash_cell *hc) +static struct dm_table *__hash_remove(struct hash_cell *hc) { struct dm_table *table; + int srcu_idx; /* remove from the dev hash */ list_del(&hc->uuid_list); @@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc) dm_set_mdptr(hc->md, NULL); mutex_unlock(&dm_hash_cells_mutex); - table = dm_get_live_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); + table = NULL; if (hc->new_map) - dm_table_destroy(hc->new_map); + table = hc->new_map; dm_put(hc->md); free_cell(hc); + + return table; } static void dm_hash_remove_all(int keep_open_devices) @@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices) int i, dev_skipped; struct hash_cell *hc; struct mapped_device *md; + struct dm_table *t; retry: dev_skipped = 0; @@ -287,10 +299,14 @@ retry: continue; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + if (t) 
{ + dm_sync_table(md); + dm_table_destroy(t); + } dm_put(md); if (likely(keep_open_devices)) dm_destroy(md); @@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, struct dm_table *table; struct mapped_device *md; unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + int srcu_idx; /* * duplicate new. @@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Wake up any dm event waiters. */ - table = dm_get_live_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -620,11 +636,14 @@ static int check_name(const char *name) * _hash_lock without first calling dm_table_put, because dm_table_destroy * waits for this dm_table_put and could be called under this lock. */ -static struct dm_table *dm_get_inactive_table(struct mapped_device *md) +static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) { struct hash_cell *hc; struct dm_table *table = NULL; + /* increment rcu count, we don't care about the table pointer */ + dm_get_live_table(md, srcu_idx); + down_read(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { @@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md) } table = hc->new_map; - if (table) - dm_table_get(table); out: up_read(&_hash_lock); @@ -643,10 +660,11 @@ out: } static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, - struct dm_ioctl *param) + struct dm_ioctl *param, + int *srcu_idx) { return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? 
- dm_get_inactive_table(md) : dm_get_live_table(md); + dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx); } /* @@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; + int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | DM_ACTIVE_PRESENT_FLAG); @@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) param->event_nr = dm_get_event_nr(md); param->target_count = 0; - table = dm_get_live_table(md); + table = dm_get_live_table(md, &srcu_idx); if (table) { if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; param->target_count = dm_table_get_num_targets(table); } - dm_table_put(table); param->flags |= DM_ACTIVE_PRESENT_FLAG; } + dm_put_live_table(md, srcu_idx); if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { - table = dm_get_inactive_table(md); + int srcu_idx; + table = dm_get_inactive_table(md, &srcu_idx); if (table) { if (!(dm_table_get_mode(table) & FMODE_WRITE)) param->flags |= DM_READONLY_FLAG; param->target_count = dm_table_get_num_targets(table); - dm_table_put(table); } + dm_put_live_table(md, srcu_idx); } } @@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) struct hash_cell *hc; struct mapped_device *md; int r; + struct dm_table *t; down_write(&_hash_lock); hc = __find_device_hash_cell(param); @@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) return r; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param) old_map = dm_swap_table(md, new_map); if (IS_ERR(old_map)) { + dm_sync_table(md); dm_table_destroy(new_map); 
dm_put(md); return PTR_ERR(old_map); @@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param) param->flags |= DM_UEVENT_GENERATED_FLAG; } + /* + * Since dm_swap_table synchronizes RCU, nobody should be in + * read-side critical section already. + */ if (old_map) dm_table_destroy(old_map); @@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) int r = 0; struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) */ __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); out: dm_put(md); @@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) { int r; struct hash_cell *hc; - struct dm_table *t; + struct dm_table *t, *old_map = NULL; struct mapped_device *md; struct target_type *immutable_target_type; @@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size) hc = dm_get_mdptr(md); if (!hc || hc->md != md) { DMWARN("device has been removed from the dev hash table."); - dm_table_destroy(t); up_write(&_hash_lock); + dm_table_destroy(t); r = -ENXIO; goto out; } if (hc->new_map) - dm_table_destroy(hc->new_map); + old_map = hc->new_map; hc->new_map = t; up_write(&_hash_lock); @@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); out: + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } + dm_put(md); return r; @@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) { struct hash_cell *hc; struct mapped_device *md; + struct dm_table *old_map = NULL; down_write(&_hash_lock); @@ -1312,7 +1349,7 @@ static int table_clear(struct 
dm_ioctl *param, size_t param_size) } if (hc->new_map) { - dm_table_destroy(hc->new_map); + old_map = hc->new_map; hc->new_map = NULL; } @@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } dm_put(md); return 0; @@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_deps(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); dm_put(md); @@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); dm_put(md); @@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) struct dm_target_msg *tmsg = (void *) param + param->data_start; size_t maxlen; char *result = get_result_buffer(param, param_size, &maxlen); + int srcu_idx; md = find_device(param); if (!md) @@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (r <= 1) goto out_argv; - table = dm_get_live_table(md); + table = dm_get_live_table(md, &srcu_idx); if 
(!table) - goto out_argv; + goto out_table; if (dm_deleting_md(md)) { r = -ENXIO; @@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) } out_table: - dm_table_put(table); + dm_put_live_table(md, srcu_idx); out_argv: kfree(argv); out: @@ -1644,7 +1686,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern } if (!dmi) { - dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); + unsigned noio_flag; + noio_flag = memalloc_noio_save(); + dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); + memalloc_noio_restore(noio_flag); if (dmi) *param_flags |= DM_PARAMS_VMALLOC; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bdf26f5bd32..5adede17ddf 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1561,7 +1561,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long flags; int r; -again: bdev = NULL; mode = 0; r = 0; @@ -1579,7 +1578,7 @@ again: } if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) - r = -EAGAIN; + r = -ENOTCONN; else if (!bdev) r = -EIO; @@ -1591,11 +1590,8 @@ again: if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) r = scsi_verify_blk_ioctl(NULL, cmd); - if (r == -EAGAIN && !fatal_signal_pending(current)) { + if (r == -ENOTCONN && !fatal_signal_pending(current)) queue_work(kmultipathd, &m->process_queued_ios); - msleep(10); - goto again; - } return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); } diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c new file mode 100644 index 00000000000..ff9ac4be472 --- /dev/null +++ b/drivers/md/dm-switch.c @@ -0,0 +1,538 @@ +/* + * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. + * Copyright (C) 2011-2013 Red Hat, Inc. + * + * This file is released under the GPL. 
+ * + * dm-switch is a device-mapper target that maps IO to underlying block + * devices efficiently when there are a large number of fixed-sized + * address regions but there is no simple pattern to allow for a compact + * mapping representation such as dm-stripe. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "switch" + +/* + * One region_table_slot_t holds <region_entries_per_slot> region table + * entries each of which is <region_table_entry_bits> in size. + */ +typedef unsigned long region_table_slot_t; + +/* + * A device with the offset to its start sector. + */ +struct switch_path { + struct dm_dev *dmdev; + sector_t start; +}; + +/* + * Context block for a dm switch device. + */ +struct switch_ctx { + struct dm_target *ti; + + unsigned nr_paths; /* Number of paths in path_list. */ + + unsigned region_size; /* Region size in 512-byte sectors */ + unsigned long nr_regions; /* Number of regions making up the device */ + signed char region_size_bits; /* log2 of region_size or -1 */ + + unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ + unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ + signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ + + region_table_slot_t *region_table; /* Region table */ + + /* + * Array of dm devices to switch between. 
+ */ + struct switch_path path_list[0]; +}; + +static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, + unsigned region_size) +{ + struct switch_ctx *sctx; + + sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path), + GFP_KERNEL); + if (!sctx) + return NULL; + + sctx->ti = ti; + sctx->region_size = region_size; + + ti->private = sctx; + + return sctx; +} + +static int alloc_region_table(struct dm_target *ti, unsigned nr_paths) +{ + struct switch_ctx *sctx = ti->private; + sector_t nr_regions = ti->len; + sector_t nr_slots; + + if (!(sctx->region_size & (sctx->region_size - 1))) + sctx->region_size_bits = __ffs(sctx->region_size); + else + sctx->region_size_bits = -1; + + sctx->region_table_entry_bits = 1; + while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && + (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) + sctx->region_table_entry_bits++; + + sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; + if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) + sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); + else + sctx->region_entries_per_slot_bits = -1; + + if (sector_div(nr_regions, sctx->region_size)) + nr_regions++; + + sctx->nr_regions = nr_regions; + if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) { + ti->error = "Region table too large"; + return -EINVAL; + } + + nr_slots = nr_regions; + if (sector_div(nr_slots, sctx->region_entries_per_slot)) + nr_slots++; + + if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { + ti->error = "Region table too large"; + return -EINVAL; + } + + sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t)); + if (!sctx->region_table) { + ti->error = "Cannot allocate region table"; + return -ENOMEM; + } + + return 0; +} + +static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, + 
unsigned long *region_index, unsigned *bit) +{ + if (sctx->region_entries_per_slot_bits >= 0) { + *region_index = region_nr >> sctx->region_entries_per_slot_bits; + *bit = region_nr & (sctx->region_entries_per_slot - 1); + } else { + *region_index = region_nr / sctx->region_entries_per_slot; + *bit = region_nr % sctx->region_entries_per_slot; + } + + *bit *= sctx->region_table_entry_bits; +} + +/* + * Find which path to use at given offset. + */ +static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) +{ + unsigned long region_index; + unsigned bit, path_nr; + sector_t p; + + p = offset; + if (sctx->region_size_bits >= 0) + p >>= sctx->region_size_bits; + else + sector_div(p, sctx->region_size); + + switch_get_position(sctx, p, &region_index, &bit); + path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); + + /* This can only happen if the processor uses non-atomic stores. */ + if (unlikely(path_nr >= sctx->nr_paths)) + path_nr = 0; + + return path_nr; +} + +static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, + unsigned value) +{ + unsigned long region_index; + unsigned bit; + region_table_slot_t pte; + + switch_get_position(sctx, region_nr, &region_index, &bit); + + pte = sctx->region_table[region_index]; + pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); + pte |= (region_table_slot_t)value << bit; + sctx->region_table[region_index] = pte; +} + +/* + * Fill the region table with an initial round robin pattern. 
+ */ +static void initialise_region_table(struct switch_ctx *sctx) +{ + unsigned path_nr = 0; + unsigned long region_nr; + + for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { + switch_region_table_write(sctx, region_nr, path_nr); + if (++path_nr >= sctx->nr_paths) + path_nr = 0; + } +} + +static int parse_path(struct dm_arg_set *as, struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + unsigned long long start; + int r; + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &sctx->path_list[sctx->nr_paths].dmdev); + if (r) { + ti->error = "Device lookup failed"; + return r; + } + + if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { + ti->error = "Invalid device starting offset"; + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + return -EINVAL; + } + + sctx->path_list[sctx->nr_paths].start = start; + + sctx->nr_paths++; + + return 0; +} + +/* + * Destructor: Don't free the dm_target, just the ti->private data (if any). + */ +static void switch_dtr(struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + + while (sctx->nr_paths--) + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + + vfree(sctx->region_table); + kfree(sctx); +} + +/* + * Constructor arguments: + * <num_paths> <region_size> <num_optional_args> [<optional_args>...] + * [<dev_path> <offset>]+ + * + * Optional args are to allow for future extension: currently this + * parameter must be 0. 
+ */ +static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + static struct dm_arg _args[] = { + {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, + {1, UINT_MAX, "Invalid region size"}, + {0, 0, "Invalid number of optional args"}, + }; + + struct switch_ctx *sctx; + struct dm_arg_set as; + unsigned nr_paths, region_size, nr_optional_args; + int r; + + as.argc = argc; + as.argv = argv; + + r = dm_read_arg(_args, &as, &nr_paths, &ti->error); + if (r) + return -EINVAL; + + r = dm_read_arg(_args + 1, &as, &region_size, &ti->error); + if (r) + return r; + + r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); + if (r) + return r; + /* parse optional arguments here, if we add any */ + + if (as.argc != nr_paths * 2) { + ti->error = "Incorrect number of path arguments"; + return -EINVAL; + } + + sctx = alloc_switch_ctx(ti, nr_paths, region_size); + if (!sctx) { + ti->error = "Cannot allocate redirection context"; + return -ENOMEM; + } + + r = dm_set_target_max_io_len(ti, region_size); + if (r) + goto error; + + while (as.argc) { + r = parse_path(&as, ti); + if (r) + goto error; + } + + r = alloc_region_table(ti, nr_paths); + if (r) + goto error; + + initialise_region_table(sctx); + + /* For UNMAP, sending the request down any path is sufficient */ + ti->num_discard_bios = 1; + + return 0; + +error: + switch_dtr(ti); + + return r; +} + |