aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2013-07-11 13:05:40 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-07-11 13:05:40 -0700
commit 9903883f1dd6e86f286b7bfa6e4b423f98c1cd9e (patch)
tree 63c907110eac32c31a1786ebff3e7d9257e61c9b /drivers/md
parent 36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b (diff)
parent 9d0eb0ab432aaa9160cf2675aee73b3900b9bc18 (diff)
Merge tag 'dm-3.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes from Alasdair G Kergon:
 "Add a device-mapper target called dm-switch to provide a multipath
  framework for storage arrays that dynamically reconfigure their
  preferred paths for different device regions.

  Fix a bug in the verity target that prevented its use with some
  specific sizes of devices.

  Improve some locking mechanisms in the device-mapper core and bufio.

  Add Mike Snitzer as a device-mapper maintainer.

  A few more clean-ups and fixes"

* tag 'dm-3.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm:
  dm: add switch target
  dm: update maintainers
  dm: optimize reorder structure
  dm: optimize use SRCU and RCU
  dm bufio: submit writes outside lock
  dm cache: fix arm link errors with inline
  dm verity: use __ffs and __fls
  dm flakey: correct ctr alloc failure mesg
  dm verity: remove pointless comparison
  dm: use __GFP_HIGHMEM in __vmalloc
  dm verity: fix inability to use a few specific devices sizes
  dm ioctl: set noio flag to avoid __vmalloc deadlock
  dm mpath: fix ioctl deadlock when no paths
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig 14
-rw-r--r-- drivers/md/Makefile 1
-rw-r--r-- drivers/md/dm-bufio.c 75
-rw-r--r-- drivers/md/dm-cache-target.c 4
-rw-r--r-- drivers/md/dm-flakey.c 2
-rw-r--r-- drivers/md/dm-ioctl.c 127
-rw-r--r-- drivers/md/dm-mpath.c 8
-rw-r--r-- drivers/md/dm-switch.c 538
-rw-r--r-- drivers/md/dm-table.c 35
-rw-r--r-- drivers/md/dm-verity.c 17
-rw-r--r-- drivers/md/dm.c 177
11 files changed, 818 insertions, 180 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3bfc8f1da9f..30b426ed744 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -412,4 +412,18 @@ config DM_VERITY
If unsure, say N.
+config DM_SWITCH
+ tristate "Switch target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ ---help---
+ This device-mapper target creates a device that supports an arbitrary
+ mapping of fixed-size regions of I/O across a fixed set of paths.
+ The path used for any specific region can be switched dynamically
+ by sending the target a message.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-switch.
+
+ If unsure, say N.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1439fd4ad9b..5ef78efc27f 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0387e05cdb9..5227e079a6e 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -145,6 +145,7 @@ struct dm_buffer {
unsigned long state;
unsigned long last_accessed;
struct dm_bufio_client *c;
+ struct list_head write_list;
struct bio bio;
struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};
@@ -349,7 +350,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
if (gfp_mask & __GFP_NORETRY)
noio_flag = memalloc_noio_save();
- ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
if (gfp_mask & __GFP_NORETRY)
memalloc_noio_restore(noio_flag);
@@ -630,7 +631,8 @@ static int do_io_schedule(void *word)
* - Submit our write and don't wait on it. We set B_WRITING indicating
* that there is a write in progress.
*/
-static void __write_dirty_buffer(struct dm_buffer *b)
+static void __write_dirty_buffer(struct dm_buffer *b,
+ struct list_head *write_list)
{
if (!test_bit(B_DIRTY, &b->state))
return;
@@ -639,7 +641,24 @@ static void __write_dirty_buffer(struct dm_buffer *b)
wait_on_bit_lock(&b->state, B_WRITING,
do_io_schedule, TASK_UNINTERRUPTIBLE);
- submit_io(b, WRITE, b->block, write_endio);
+ if (!write_list)
+ submit_io(b, WRITE, b->block, write_endio);
+ else
+ list_add_tail(&b->write_list, write_list);
+}
+
+static void __flush_write_list(struct list_head *write_list)
+{
+ struct blk_plug plug;
+ blk_start_plug(&plug);
+ while (!list_empty(write_list)) {
+ struct dm_buffer *b =
+ list_entry(write_list->next, struct dm_buffer, write_list);
+ list_del(&b->write_list);
+ submit_io(b, WRITE, b->block, write_endio);
+ dm_bufio_cond_resched();
+ }
+ blk_finish_plug(&plug);
}
/*
@@ -655,7 +674,7 @@ static void __make_buffer_clean(struct dm_buffer *b)
return;
wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
- __write_dirty_buffer(b);
+ __write_dirty_buffer(b, NULL);
wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}
@@ -802,7 +821,8 @@ static void __free_buffer_wake(struct dm_buffer *b)
wake_up(&c->free_buffer_wait);
}
-static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
+static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
+ struct list_head *write_list)
{
struct dm_buffer *b, *tmp;
@@ -818,7 +838,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
if (no_wait && test_bit(B_WRITING, &b->state))
return;
- __write_dirty_buffer(b);
+ __write_dirty_buffer(b, write_list);
dm_bufio_cond_resched();
}
}
@@ -853,7 +873,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
* If we are over threshold_buffers, start freeing buffers.
* If we're over "limit_buffers", block until we get under the limit.
*/
-static void __check_watermark(struct dm_bufio_client *c)
+static void __check_watermark(struct dm_bufio_client *c,
+ struct list_head *write_list)
{
unsigned long threshold_buffers, limit_buffers;
@@ -872,7 +893,7 @@ static void __check_watermark(struct dm_bufio_client *c)
}
if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
- __write_dirty_buffers_async(c, 1);
+ __write_dirty_buffers_async(c, 1, write_list);
}
/*
@@ -897,7 +918,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
*--------------------------------------------------------------*/
static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
- enum new_flag nf, int *need_submit)
+ enum new_flag nf, int *need_submit,
+ struct list_head *write_list)
{
struct dm_buffer *b, *new_b = NULL;
@@ -924,7 +946,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
goto found_buffer;
}
- __check_watermark(c);
+ __check_watermark(c, write_list);
b = new_b;
b->hold_count = 1;
@@ -992,10 +1014,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
int need_submit;
struct dm_buffer *b;
+ LIST_HEAD(write_list);
+
dm_bufio_lock(c);
- b = __bufio_new(c, block, nf, &need_submit);
+ b = __bufio_new(c, block, nf, &need_submit, &write_list);
dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
+
if (!b)
return b;
@@ -1047,6 +1073,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
{
struct blk_plug plug;
+ LIST_HEAD(write_list);
+
BUG_ON(dm_bufio_in_request());
blk_start_plug(&plug);
@@ -1055,7 +1083,15 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
for (; n_blocks--; block++) {
int need_submit;
struct dm_buffer *b;
- b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+ b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
+ &write_list);
+ if (unlikely(!list_empty(&write_list))) {
+ dm_bufio_unlock(c);
+ blk_finish_plug(&plug);
+ __flush_write_list(&write_list);
+ blk_start_plug(&plug);
+ dm_bufio_lock(c);
+ }
if (unlikely(b != NULL)) {
dm_bufio_unlock(c);
@@ -1069,7 +1105,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
goto flush_plug;
dm_bufio_lock(c);
}
-
}
dm_bufio_unlock(c);
@@ -1126,11 +1161,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
+ LIST_HEAD(write_list);
+
BUG_ON(dm_bufio_in_request());
dm_bufio_lock(c);
- __write_dirty_buffers_async(c, 0);
+ __write_dirty_buffers_async(c, 0, &write_list);
dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
@@ -1147,8 +1185,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
unsigned long buffers_processed = 0;
struct dm_buffer *b, *tmp;
+ LIST_HEAD(write_list);
+
+ dm_bufio_lock(c);
+ __write_dirty_buffers_async(c, 0, &write_list);
+ dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
dm_bufio_lock(c);
- __write_dirty_buffers_async(c, 0);
again:
list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
@@ -1274,7 +1317,7 @@ retry:
BUG_ON(!b->hold_count);
BUG_ON(test_bit(B_READING, &b->state));
- __write_dirty_buffer(b);
+ __write_dirty_buffer(b, NULL);
if (b->hold_count == 1) {
wait_on_bit(&b->state, B_WRITING,
do_io_schedule, TASK_UNINTERRUPTIBLE);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index df44b60e66f..0df3ec085eb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -425,6 +425,10 @@ static bool block_size_is_power_of_two(struct cache *cache)
return cache->sectors_per_block_shift >= 0;
}
+/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
+#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+__always_inline
+#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
do_div(b, n);
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 7fcf21cb4ff..c80a0ec5f12 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -176,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
fc = kzalloc(sizeof(*fc), GFP_KERNEL);
if (!fc) {
- ti->error = "Cannot allocate linear context";
+ ti->error = "Cannot allocate context";
return -ENOMEM;
}
fc->start_time = jiffies;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index aa04f022464..f1b758675ec 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,6 +36,14 @@ struct hash_cell {
struct dm_table *new_map;
};
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+ int undefined__;
+};
+
struct vers_iter {
size_t param_size;
struct dm_target_versions *vers, *old_vers;
@@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
return -EBUSY;
}
-static void __hash_remove(struct hash_cell *hc)
+static struct dm_table *__hash_remove(struct hash_cell *hc)
{
struct dm_table *table;
+ int srcu_idx;
/* remove from the dev hash */
list_del(&hc->uuid_list);
@@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc)
dm_set_mdptr(hc->md, NULL);
mutex_unlock(&dm_hash_cells_mutex);
- table = dm_get_live_table(hc->md);
- if (table) {
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
dm_table_event(table);
- dm_table_put(table);
- }
+ dm_put_live_table(hc->md, srcu_idx);
+ table = NULL;
if (hc->new_map)
- dm_table_destroy(hc->new_map);
+ table = hc->new_map;
dm_put(hc->md);
free_cell(hc);
+
+ return table;
}
static void dm_hash_remove_all(int keep_open_devices)
@@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices)
int i, dev_skipped;
struct hash_cell *hc;
struct mapped_device *md;
+ struct dm_table *t;
retry:
dev_skipped = 0;
@@ -287,10 +299,14 @@ retry:
continue;
}
- __hash_remove(hc);
+ t = __hash_remove(hc);
up_write(&_hash_lock);
+ if (t) {
+ dm_sync_table(md);
+ dm_table_destroy(t);
+ }
dm_put(md);
if (likely(keep_open_devices))
dm_destroy(md);
@@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
struct dm_table *table;
struct mapped_device *md;
unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+ int srcu_idx;
/*
* duplicate new.
@@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
/*
* Wake up any dm event waiters.
*/
- table = dm_get_live_table(hc->md);
- if (table) {
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
dm_table_event(table);
- dm_table_put(table);
- }
+ dm_put_live_table(hc->md, srcu_idx);
if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -620,11 +636,14 @@ static int check_name(const char *name)
* _hash_lock without first calling dm_table_put, because dm_table_destroy
* waits for this dm_table_put and could be called under this lock.
*/
-static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
{
struct hash_cell *hc;
struct dm_table *table = NULL;
+ /* increment rcu count, we don't care about the table pointer */
+ dm_get_live_table(md, srcu_idx);
+
down_read(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
@@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
}
table = hc->new_map;
- if (table)
- dm_table_get(table);
out:
up_read(&_hash_lock);
@@ -643,10 +660,11 @@ out:
}
static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
- struct dm_ioctl *param)
+ struct dm_ioctl *param,
+ int *srcu_idx)
{
return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
- dm_get_inactive_table(md) : dm_get_live_table(md);
+ dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
}
/*
@@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
{
struct gendisk *disk = dm_disk(md);
struct dm_table *table;
+ int srcu_idx;
param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
DM_ACTIVE_PRESENT_FLAG);
@@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
param->event_nr = dm_get_event_nr(md);
param->target_count = 0;
- table = dm_get_live_table(md);
+ table = dm_get_live_table(md, &srcu_idx);
if (table) {
if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
if (get_disk_ro(disk))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
}
- dm_table_put(table);
param->flags |= DM_ACTIVE_PRESENT_FLAG;
}
+ dm_put_live_table(md, srcu_idx);
if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
- table = dm_get_inactive_table(md);
+ int srcu_idx;
+ table = dm_get_inactive_table(md, &srcu_idx);
if (table) {
if (!(dm_table_get_mode(table) & FMODE_WRITE))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
- dm_table_put(table);
}
+ dm_put_live_table(md, srcu_idx);
}
}
@@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
struct hash_cell *hc;
struct mapped_device *md;
int r;
+ struct dm_table *t;
down_write(&_hash_lock);
hc = __find_device_hash_cell(param);
@@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
return r;
}
- __hash_remove(hc);
+ t = __hash_remove(hc);
up_write(&_hash_lock);
+ if (t) {
+ dm_sync_table(md);
+ dm_table_destroy(t);
+ }
+
if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param)
old_map = dm_swap_table(md, new_map);
if (IS_ERR(old_map)) {
+ dm_sync_table(md);
dm_table_destroy(new_map);
dm_put(md);
return PTR_ERR(old_map);
@@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param)
param->flags |= DM_UEVENT_GENERATED_FLAG;
}
+ /*
+ * Since dm_swap_table synchronizes RCU, nobody should be in
+ * read-side critical section already.
+ */
if (old_map)
dm_table_destroy(old_map);
@@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
int r = 0;
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
*/
__dev_status(md, param);
- table = dm_get_live_or_inactive_table(md, param);
- if (table) {
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
retrieve_status(table, param, param_size);
- dm_table_put(table);
- }
+ dm_put_live_table(md, srcu_idx);
out:
dm_put(md);
@@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
{
int r;
struct hash_cell *hc;
- struct dm_table *t;
+ struct dm_table *t, *old_map = NULL;
struct mapped_device *md;
struct target_type *immutable_target_type;
@@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
DMWARN("device has been removed from the dev hash table.");
- dm_table_destroy(t);
up_write(&_hash_lock);
+ dm_table_destroy(t);
r = -ENXIO;
goto out;
}
if (hc->new_map)
- dm_table_destroy(hc->new_map);
+ old_map = hc->new_map;
hc->new_map = t;
up_write(&_hash_lock);
@@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
__dev_status(md, param);
out:
+ if (old_map) {
+ dm_sync_table(md);
+ dm_table_destroy(old_map);
+ }
+
dm_put(md);
return r;
@@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
struct mapped_device *md;
+ struct dm_table *old_map = NULL;
down_write(&_hash_lock);
@@ -1312,7 +1349,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
}
if (hc->new_map) {
- dm_table_destroy(hc->new_map);
+ old_map = hc->new_map;
hc->new_map = NULL;
}
@@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
__dev_status(hc->md, param);
md = hc->md;
up_write(&_hash_lock);
+ if (old_map) {
+ dm_sync_table(md);
+ dm_table_destroy(old_map);
+ }
dm_put(md);
return 0;
@@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
__dev_status(md, param);
- table = dm_get_live_or_inactive_table(md, param);
- if (table) {
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
retrieve_deps(table, param, param_size);
- dm_table_put(table);
- }
+ dm_put_live_table(md, srcu_idx);
dm_put(md);
@@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
__dev_status(md, param);
- table = dm_get_live_or_inactive_table(md, param);
- if (table) {
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
retrieve_status(table, param, param_size);
- dm_table_put(table);
- }
+ dm_put_live_table(md, srcu_idx);
dm_put(md);
@@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
struct dm_target_msg *tmsg = (void *) param + param->data_start;
size_t maxlen;
char *result = get_result_buffer(param, param_size, &maxlen);
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
if (r <= 1)
goto out_argv;
- table = dm_get_live_table(md);
+ table = dm_get_live_table(md, &srcu_idx);
if (!table)
- goto out_argv;
+ goto out_table;
if (dm_deleting_md(md)) {
r = -ENXIO;
@@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
}
out_table:
- dm_table_put(table);
+ dm_put_live_table(md, srcu_idx);
out_argv:
kfree(argv);
out:
@@ -1644,7 +1686,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
}
if (!dmi) {
- dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
+ unsigned noio_flag;
+ noio_flag = memalloc_noio_save();
+ dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
+ memalloc_noio_restore(noio_flag);
if (dmi)
*param_flags |= DM_PARAMS_VMALLOC;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index bdf26f5bd32..5adede17ddf 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1561,7 +1561,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long flags;
int r;
-again:
bdev = NULL;
mode = 0;
r = 0;
@@ -1579,7 +1578,7 @@ again:
}
if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
- r = -EAGAIN;
+ r = -ENOTCONN;
else if (!bdev)
r = -EIO;
@@ -1591,11 +1590,8 @@ again:
if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
r = scsi_verify_blk_ioctl(NULL, cmd);
- if (r == -EAGAIN && !fatal_signal_pending(current)) {
+ if (r == -ENOTCONN && !fatal_signal_pending(current))
queue_work(kmultipathd, &m->process_queued_ios);
- msleep(10);
- goto again;
- }
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
new file mode 100644
index 00000000000..ff9ac4be472
--- /dev/null
+++ b/drivers/md/dm-switch.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
+ * Copyright (C) 2011-2013 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * dm-switch is a device-mapper target that maps IO to underlying block
+ * devices efficiently when there are a large number of fixed-sized
+ * address regions but there is no simple pattern to allow for a compact
+ * mapping representation such as dm-stripe.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One region_table_slot_t holds <region_entries_per_slot> region table
+ * entries each of which is <region_table_entry_bits> in size.
+ */
+typedef unsigned long region_table_slot_t;
+
+/*
+ * A device with the offset to its start sector.
+ */
+struct switch_path {
+ struct dm_dev *dmdev;
+ sector_t start;
+};
+
+/*
+ * Context block for a dm switch device.
+ */
+struct switch_ctx {
+ struct dm_target *ti;
+
+ unsigned nr_paths; /* Number of paths in path_list. */
+
+ unsigned region_size; /* Region size in 512-byte sectors */
+ unsigned long nr_regions; /* Number of regions making up the device */
+ signed char region_size_bits; /* log2 of region_size or -1 */
+
+ unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
+ unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
+ signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
+
+ region_table_slot_t *region_table; /* Region table */
+
+ /*
+ * Array of dm devices to switch between.
+ */
+ struct switch_path path_list[0];
+};
+
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+ unsigned region_size)
+{
+ struct switch_ctx *sctx;
+
+ sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
+ GFP_KERNEL);
+ if (!sctx)
+ return NULL;
+
+ sctx->ti = ti;
+ sctx->region_size = region_size;
+
+ ti->private = sctx;
+
+ return sctx;
+}
+
+static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
+{
+ struct switch_ctx *sctx = ti->private;
+ sector_t nr_regions = ti->len;
+ sector_t nr_slots;
+
+ if (!(sctx->region_size & (sctx->region_size - 1)))
+ sctx->region_size_bits = __ffs(sctx->region_size);
+ else
+ sctx->region_size_bits = -1;
+
+ sctx->region_table_entry_bits = 1;
+ while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
+ (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
+ sctx->region_table_entry_bits++;
+
+ sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
+ if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
+ sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
+ else
+ sctx->region_entries_per_slot_bits = -1;
+
+ if (sector_div(nr_regions, sctx->region_size))
+ nr_regions++;
+
+ sctx->nr_regions = nr_regions;
+ if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+ ti->error = "Region table too large";
+ return -EINVAL;
+ }
+
+ nr_slots = nr_regions;
+ if (sector_div(nr_slots, sctx->region_entries_per_slot))
+ nr_slots++;
+
+ if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
+ ti->error = "Region table too large";
+ return -EINVAL;
+ }
+
+ sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
+ if (!sctx->region_table) {
+ ti->error = "Cannot allocate region table";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
+ unsigned long *region_index, unsigned *bit)
+{
+ if (sctx->region_entries_per_slot_bits >= 0) {
+ *region_index = region_nr >> sctx->region_entries_per_slot_bits;
+ *bit = region_nr & (sctx->region_entries_per_slot - 1);
+ } else {
+ *region_index = region_nr / sctx->region_entries_per_slot;
+ *bit = region_nr % sctx->region_entries_per_slot;
+ }
+
+ *bit *= sctx->region_table_entry_bits;
+}
+
+/*
+ * Find which path to use at given offset.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+ unsigned long region_index;
+ unsigned bit, path_nr;
+ sector_t p;
+
+ p = offset;
+ if (sctx->region_size_bits >= 0)
+ p >>= sctx->region_size_bits;
+ else
+ sector_div(p, sctx->region_size);
+
+ switch_get_position(sctx, p, &region_index, &bit);
+ path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+ ((1 << sctx->region_table_entry_bits) - 1);
+
+ /* This can only happen if the processor uses non-atomic stores. */
+ if (unlikely(path_nr >= sctx->nr_paths))
+ path_nr = 0;
+
+ return path_nr;
+}
+
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
+ unsigned value)
+{
+ unsigned long region_index;
+ unsigned bit;
+ region_table_slot_t pte;
+
+ switch_get_position(sctx, region_nr, &region_index, &bit);
+
+ pte = sctx->region_table[region_index];
+ pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
+ pte |= (region_table_slot_t)value << bit;
+ sctx->region_table[region_index] = pte;
+}
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ */
+static void initialise_region_table(struct switch_ctx *sctx)
+{
+ unsigned path_nr = 0;
+ unsigned long region_nr;
+
+ for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+ switch_region_table_write(sctx, region_nr, path_nr);
+ if (++path_nr >= sctx->nr_paths)
+ path_nr = 0;
+ }
+}
+
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+ struct switch_ctx *sctx = ti->private;
+ unsigned long long start;
+ int r;
+
+ r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+ &sctx->path_list[sctx->nr_paths].dmdev);
+ if (r) {
+ ti->error = "Device lookup failed";
+ return r;
+ }
+
+ if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+ ti->error = "Invalid device starting offset";
+ dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+ return -EINVAL;
+ }
+
+ sctx->path_list[sctx->nr_paths].start = start;
+
+ sctx->nr_paths++;
+
+ return 0;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+ struct switch_ctx *sctx = ti->private;
+
+ while (sctx->nr_paths--)
+ dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+
+ vfree(sctx->region_table);
+ kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
+ * [<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ static struct dm_arg _args[] = {
+ {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
+ {1, UINT_MAX, "Invalid region size"},
+ {0, 0, "Invalid number of optional args"},
+ };
+
+ struct switch_ctx *sctx;
+ struct dm_arg_set as;
+ unsigned nr_paths, region_size, nr_optional_args;
+ int r;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+ if (r)
+ return r;
+
+ r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+ if (r)
+ return r;
+ /* parse optional arguments here, if we add any */
+
+ if (as.argc != nr_paths * 2) {
+ ti->error = "Incorrect number of path arguments";
+ return -EINVAL;
+ }
+
+ sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+ if (!sctx) {
+ ti->error = "Cannot allocate redirection context";
+ return -ENOMEM;
+ }
+
+ r = dm_set_target_max_io_len(ti, region_size);
+ if (r)
+ goto error;
+
+ while (as.argc) {
+ r = parse_path(&as, ti);
+ if (r)
+ goto error;
+ }
+
+ r = alloc_region_table(ti, nr_paths);
+ if (r)
+ goto error;
+
+ initialise_region_table(sctx);
+
+ /* For UNMAP, sending the request down any path is sufficient */
+ ti->num_discard_bios = 1;
+
+ return 0;
+
+error:
+ switch_dtr(ti);
+
+ return r;
+}
+