diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 5 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 137 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 5 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 64 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 270 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 89 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 44 | ||||
-rw-r--r-- | drivers/md/dm-log-userspace-base.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 32 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 149 | ||||
-rw-r--r-- | drivers/md/dm-queue-length.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 621 | ||||
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 80 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 84 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 157 | ||||
-rw-r--r-- | drivers/md/dm.c | 75 | ||||
-rw-r--r-- | drivers/md/dm.h | 2 | ||||
-rw-r--r-- | drivers/md/linear.h | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 897 | ||||
-rw-r--r-- | drivers/md/md.h | 110 | ||||
-rw-r--r-- | drivers/md/raid1.c | 966 | ||||
-rw-r--r-- | drivers/md/raid1.h | 26 | ||||
-rw-r--r-- | drivers/md/raid10.c | 1204 | ||||
-rw-r--r-- | drivers/md/raid10.h | 21 | ||||
-rw-r--r-- | drivers/md/raid5.c | 1015 | ||||
-rw-r--r-- | drivers/md/raid5.h | 99 |
27 files changed, 4454 insertions, 1734 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8420129fc5e..f75a66e7d31 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -241,12 +241,13 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 4/5/6 target (EXPERIMENTAL)" + tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID1 select MD_RAID456 select BLK_DEV_MD ---help--- - A dm target that supports RAID4, RAID5 and RAID6 mappings + A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 574b09afedd..0dc6546b77a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -29,7 +29,6 @@ #include "md.h" #include "bitmap.h" -#include <linux/dm-dirty-log.h> /* debug macros */ #define DEBUG 0 @@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon * 0 or page 1 */ static inline struct page *filemap_get_page(struct bitmap *bitmap, - unsigned long chunk) + unsigned long chunk) { - if (bitmap->filemap == NULL) - return NULL; if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; return bitmap->filemap[file_page_index(bitmap, chunk) @@ -878,28 +875,19 @@ enum bitmap_page_attr { static inline void set_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - if (page) - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); - else - __set_bit(attr, &bitmap->logattrs); + __set_bit((page->index<<2) + attr, bitmap->filemap_attr); } static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - if (page) - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); - else - __clear_bit(attr, &bitmap->logattrs); + __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); } static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - if (page) - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); - else - return test_bit(attr, &bitmap->logattrs); + return test_bit((page->index<<2) + attr, bitmap->filemap_attr); } /* @@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; - struct page *page = NULL; + struct page *page; void *kaddr; unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); - if (!bitmap->filemap) { - struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; - if (log) - log->type->mark_region(log, chunk); - } else { + if (!bitmap->filemap) + return; - page = filemap_get_page(bitmap, chunk); - if (!page) - return; - bit = file_page_offset(bitmap, chunk); + page = filemap_get_page(bitmap, chunk); + if (!page) + return; + bit = file_page_offset(bitmap, chunk); - /* set the bit */ - kaddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - set_bit(bit, kaddr); - else - __test_and_set_bit_le(bit, kaddr); - kunmap_atomic(kaddr, KM_USER0); - PRINTK("set file bit %lu page %lu\n", bit, page->index); - } + /* set the bit */ + kaddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + set_bit(bit, kaddr); + else + __set_bit_le(bit, kaddr); + kunmap_atomic(kaddr, KM_USER0); + PRINTK("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); } @@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap) if (!bitmap) return; - if (!bitmap->filemap) { - /* Must be using a dirty_log */ - struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; - dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); - need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); - if (dirty || need_write) - if (log->type->flush(log)) - bitmap->flags |= BITMAP_WRITE_ERROR; - goto out; - } /* look at each page to see if there are any set bits that need to be * flushed out to disk */ @@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap) else md_super_wait(bitmap->mddev); } -out: if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); } @@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev) struct page *page = NULL, *lastpage = NULL; sector_t blocks; void *paddr; - struct dm_dirty_log *log = mddev->bitmap_info.log; /* Use a mutex to guard daemon_work against * bitmap_destroy. @@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev) spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - if (!bitmap->filemap) { - if (!log) - /* error or shutdown */ - break; - } else - page = filemap_get_page(bitmap, j); + if (!bitmap->filemap) + /* error or shutdown */ + break; + + page = filemap_get_page(bitmap, j); if (page != lastpage) { /* skip this page unless it's marked as needing cleaning */ @@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev) -1); /* clear the bit */ - if (page) { - paddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(bitmap, j), - paddr); - else - __test_and_clear_bit_le(file_page_offset(bitmap, j), - paddr); - kunmap_atomic(paddr, KM_USER0); - } else - log->type->clear_region(log, j); + paddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + clear_bit(file_page_offset(bitmap, j), + paddr); + else + __clear_bit_le( + file_page_offset(bitmap, + j), + paddr); + kunmap_atomic(paddr, KM_USER0); } } else j |= PAGE_COUNTER_MASK; @@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev) spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ - if (lastpage != NULL || log != NULL) { + if (lastpage != NULL) { spin_lock_irqsave(&bitmap->lock, flags); if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - if (lastpage) - write_page(bitmap, lastpage, 0); - else - if (log->type->flush(log)) - bitmap->flags |= BITMAP_WRITE_ERROR; + write_page(bitmap, lastpage, 0); } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev) BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); if (!file - && !mddev->bitmap_info.offset - && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ + && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ return 0; BUG_ON(file && mddev->bitmap_info.offset); - BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) @@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev) int bitmap_load(mddev_t *mddev) { int err = 0; + sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; @@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev) } bitmap_close_sync(bitmap); - if (mddev->bitmap_info.log) { - unsigned long i; - struct dm_dirty_log *log = mddev->bitmap_info.log; - for (i = 0; i < bitmap->chunks; i++) - if (!log->type->in_sync(log, i, 1)) - bitmap_set_memory_bits(bitmap, - (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), - 1); - } else { - sector_t start = 0; - if (mddev->degraded == 0 - || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a - * re-add of a missing device */ - start = mddev->recovery_cp; - - err = bitmap_init_from_disk(bitmap, start); - } + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + + err = bitmap_init_from_disk(bitmap, start); + if (err) goto out; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index b2a127e891a..a28f2e5588c 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -212,10 +212,6 @@ struct bitmap { unsigned long file_pages; /* number of pages in the file */ int last_page_size; /* bytes in the last page */ - unsigned long logattrs; /* used when filemap_attr doesn't exist - * because we are working with a dirty_log - */ - unsigned long flags; int allclean; @@ -237,7 +233,6 @@ struct bitmap { wait_queue_head_t behind_wait; struct sysfs_dirent *sysfs_can_clear; - }; /* the bitmap API */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c8827ffd85b..49da55c1528 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -19,7 +19,7 @@ #include <linux/workqueue.h> #include <linux/backing-dev.h> #include <linux/percpu.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> #include <asm/unaligned.h> @@ -30,7 +30,6 @@ #include <linux/device-mapper.h> #define DM_MSG_PREFIX "crypt" -#define MESG_STR(x) x, sizeof(x) /* * context holding the current state of a multi-part conversion @@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); + *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } @@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); crypto_cipher_encrypt_one(essiv_tfm, iv, iv); return 0; @@ -1575,11 +1574,17 @@ bad_mem: static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - unsigned int key_size; + unsigned int key_size, opt_params; unsigned long long tmpll; int ret; + struct dm_arg_set as; + const char *opt_string; + + static struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; - if (argc != 5) { + if (argc < 5) { ti->error = "Not enough arguments"; return -EINVAL; } @@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; + argv += 5; + argc -= 5; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + goto bad; + + opt_string = dm_shift_arg(&as); + + if (opt_params == 1 && opt_string && + !strcasecmp(opt_string, "allow_discards")) + ti->num_discard_requests = 1; + else if (opt_params) { + ret = -EINVAL; + ti->error = "Invalid feature arguments"; + goto bad; + } + } + ret = -ENOMEM; cc->io_queue = alloc_workqueue("kcryptd_io", WQ_NON_REENTRANT| @@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct dm_crypt_io *io; struct crypt_config *cc; - if (bio->bi_rw & REQ_FLUSH) { + /* + * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. + * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight + * - for REQ_DISCARD caller must use flush if IO ordering matters + */ + if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { cc = ti->private; bio->bi_bdev = cc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); return DM_MAPIO_REMAPPED; } @@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, cc->dev->name, (unsigned long long)cc->start); + + if (ti->num_discard_requests) + DMEMIT(" 1 allow_discards"); + break; } return 0; @@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) if (argc < 2) goto error; - if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!strcasecmp(argv[0], "key")) { if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { DMWARN("not suspended during key manipulation."); return -EINVAL; } - if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { + if (argc == 3 && !strcasecmp(argv[1], "set")) { ret = crypt_set_key(cc, argv[2]); if (ret) return ret; @@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) ret = cc->iv_gen_ops->init(cc); return ret; } - if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { + if (argc == 2 && !strcasecmp(argv[1], "wipe")) { if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { ret = cc->iv_gen_ops->wipe(cc); if (ret) @@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ea790623c30..89f73ca22cf 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2003 Sistina Software (UK) Limited. - * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -15,6 +15,9 @@ #define DM_MSG_PREFIX "flakey" +#define all_corrupt_bio_flags_match(bio, fc) \ + (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + /* * Flakey: Used for testing only, simulates intermittent, * catastrophic device failure. @@ -25,60 +28,189 @@ struct flakey_c { sector_t start; unsigned up_interval; unsigned down_interval; + unsigned long flags; + unsigned corrupt_bio_byte; + unsigned corrupt_bio_rw; + unsigned corrupt_bio_value; + unsigned corrupt_bio_flags; +}; + +enum feature_flag_bits { + DROP_WRITES }; +static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + {1, UINT_MAX, "Invalid corrupt bio byte"}, + {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, + {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + }; + + /* No feature arguments supplied. */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + /* + * drop_writes + */ + if (!strcasecmp(arg_name, "drop_writes")) { + if (test_and_set_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes duplicated"; + return -EINVAL; + } + + continue; + } + + /* + * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> + */ + if (!strcasecmp(arg_name, "corrupt_bio_byte")) { + if (!argc) + ti->error = "Feature corrupt_bio_byte requires parameters"; + + r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); + if (r) + return r; + argc--; + + /* + * Direction r or w? + */ + arg_name = dm_shift_arg(as); + if (!strcasecmp(arg_name, "w")) + fc->corrupt_bio_rw = WRITE; + else if (!strcasecmp(arg_name, "r")) + fc->corrupt_bio_rw = READ; + else { + ti->error = "Invalid corrupt bio direction (r or w)"; + return -EINVAL; + } + argc--; + + /* + * Value of byte (0-255) to write in place of correct one. + */ + r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); + if (r) + return r; + argc--; + + /* + * Only corrupt bios with these flags set. + */ + r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); + if (r) + return r; + argc--; + + continue; + } + + ti->error = "Unrecognised flakey feature requested"; + return -EINVAL; + } + + if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } + + return 0; +} + /* - * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> + * Construct a flakey mapping: + * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] + * + * Feature args: + * [drop_writes] + * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] + * + * Nth_byte starts from 1 for the first byte. + * Direction is r for READ or w for WRITE. + * bio_flags is ignored if 0. */ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) { + static struct dm_arg _args[] = { + {0, UINT_MAX, "Invalid up interval"}, + {0, UINT_MAX, "Invalid down interval"}, + }; + + int r; struct flakey_c *fc; - unsigned long long tmp; + unsigned long long tmpll; + struct dm_arg_set as; + const char *devname; - if (argc != 4) { - ti->error = "dm-flakey: Invalid argument count"; + as.argc = argc; + as.argv = argv; + + if (argc < 4) { + ti->error = "Invalid argument count"; return -EINVAL; } - fc = kmalloc(sizeof(*fc), GFP_KERNEL); + fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (!fc) { - ti->error = "dm-flakey: Cannot allocate linear context"; + ti->error = "Cannot allocate linear context"; return -ENOMEM; } fc->start_time = jiffies; - if (sscanf(argv[1], "%llu", &tmp) != 1) { - ti->error = "dm-flakey: Invalid device sector"; + devname = dm_shift_arg(&as); + + if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { + ti->error = "Invalid device sector"; goto bad; } - fc->start = tmp; + fc->start = tmpll; - if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { - ti->error = "dm-flakey: Invalid up interval"; + r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); + if (r) goto bad; - } - if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { - ti->error = "dm-flakey: Invalid down interval"; + r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + if (r) goto bad; - } if (!(fc->up_interval + fc->down_interval)) { - ti->error = "dm-flakey: Total (up + down) interval is zero"; + ti->error = "Total (up + down) interval is zero"; goto bad; } if (fc->up_interval + fc->down_interval < fc->up_interval) { - ti->error = "dm-flakey: Interval overflow"; + ti->error = "Interval overflow"; goto bad; } - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { - ti->error = "dm-flakey: Device lookup failed"; + r = parse_features(&as, fc, ti); + if (r) + goto bad; + + if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { + ti->error = "Device lookup failed"; goto bad; } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = fc; return 0; @@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) { struct flakey_c *fc = ti->private; - return fc->start + (bi_sector - ti->begin); + return fc->start + dm_target_offset(ti, bi_sector); } static void flakey_map_bio(struct dm_target *ti, struct bio *bio) @@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); } +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned bio_bytes = bio_cur_bytes(bio); + char *data = bio_data(bio); + + /* + * Overwrite the Nth byte of the data returned. + */ + if (data && bio_bytes >= fc->corrupt_bio_byte) { + data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; + + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', + bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); + } +} + static int flakey_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { @@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio, /* Are we alive ? */ elapsed = (jiffies - fc->start_time) / HZ; - if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + /* + * Flag this bio as submitted while down. + */ + map_context->ll = 1; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* + * Drop writes? + */ + if (test_bit(DROP_WRITES, &fc->flags)) { + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } + + /* + * Corrupt matching writes. + */ + if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + goto map_bio; + } + + /* + * By default, error all I/O. + */ return -EIO; + } +map_bio: flakey_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct flakey_c *fc = ti->private; + unsigned bio_submitted_while_down = map_context->ll; + + /* + * Corrupt successful READs while in down state. + * If flags were specified, only corrupt those that match. + */ + if (!error && bio_submitted_while_down && + (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + + return error; +} + static int flakey_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { + unsigned sz = 0; struct flakey_c *fc = ti->private; + unsigned drop_writes; switch (type) { case STATUSTYPE_INFO: @@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, - (unsigned long long)fc->start, fc->up_interval, - fc->down_interval); + DMEMIT("%s %llu %u %u ", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + + drop_writes = test_bit(DROP_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); + + if (drop_writes) + DMEMIT("drop_writes "); + + if (fc->corrupt_bio_byte) + DMEMIT("corrupt_bio_byte %u %c %u %u ", + fc->corrupt_bio_byte, + (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', + fc->corrupt_bio_value, fc->corrupt_bio_flags); + break; } return 0; @@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ static struct target_type flakey_target = { .name = "flakey", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = flakey_ctr, .dtr = flakey_dtr, .map = flakey_map, + .end_io = flakey_end_io, .status = flakey_status, .ioctl = flakey_ioctl, .merge = flakey_merge, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2067288f61f..ad2eba40e31 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -38,6 +38,8 @@ struct io { struct dm_io_client *client; io_notify_fn callback; void *context; + void *vma_invalidate_address; + unsigned long vma_invalidate_size; } __attribute__((aligned(DM_IO_MAX_REGIONS))); static struct kmem_cache *_dm_io_cache; @@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error) set_bit(region, &io->error_bits); if (atomic_dec_and_test(&io->count)) { + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); + if (io->sleeper) wake_up_process(io->sleeper); @@ -159,6 +165,9 @@ struct dpages { unsigned context_u; void *context_ptr; + + void *vma_invalidate_address; + unsigned long vma_invalidate_size; }; /* @@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->sleeper = current; io->client = client; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 1); while (1) { @@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io->callback = fn; io->context = context; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 0); return 0; } -static int dp_init(struct dm_io_request *io_req, struct dpages *dp) +static int dp_init(struct dm_io_request *io_req, struct dpages *dp, + unsigned long size) { /* Set up dpages based on memory type */ + + dp->vma_invalidate_address = NULL; + dp->vma_invalidate_size = 0; + switch (io_req->mem.type) { case DM_IO_PAGE_LIST: list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); @@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) break; case DM_IO_VMA: + flush_kernel_vmap_range(io_req->mem.ptr.vma, size); + if ((io_req->bi_rw & RW_MASK) == READ) { + dp->vma_invalidate_address = io_req->mem.ptr.vma; + dp->vma_invalidate_size = size; + } vm_dp_init(dp, io_req->mem.ptr.vma); break; @@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, int r; struct dpages dp; - r = dp_init(io_req, &dp); + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); if (r) return r; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4cacdad2270..2e9a3ca37bd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str) return NULL; } +static struct hash_cell *__get_dev_cell(uint64_t dev) +{ + struct mapped_device *md; + struct hash_cell *hc; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + + hc = dm_get_mdptr(md); + if (!hc) { + dm_put(md); + return NULL; + } + + return hc; +} + /*----------------------------------------------------------------- * Inserting, removing and renaming a device. *---------------------------------------------------------------*/ @@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) */ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { - struct mapped_device *md; - void *mdptr = NULL; + struct hash_cell *hc = NULL; - if (*param->uuid) - return __get_uuid_cell(param->uuid); + if (*param->uuid) { + if (*param->name || param->dev) + return NULL; - if (*param->name) - return __get_name_cell(param->name); + hc = __get_uuid_cell(param->uuid); + if (!hc) + return NULL; + } else if (*param->name) { + if (param->dev) + return NULL; - md = dm_get_md(huge_decode_dev(param->dev)); - if (!md) - goto out; + hc = __get_name_cell(param->name); + if (!hc) + return NULL; + } else if (param->dev) { + hc = __get_dev_cell(param->dev); + if (!hc) + return NULL; + } else + return NULL; - mdptr = dm_get_mdptr(md); - if (!mdptr) - dm_put(md); + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strlcpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); + else + param->uuid[0] = '\0'; -out: - return mdptr; + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + return hc; } static struct mapped_device *find_device(struct dm_ioctl *param) @@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param) down_read(&_hash_lock); hc = __find_device_hash_cell(param); - if (hc) { + if (hc) md = hc->md; - - /* - * Sneakily write in both the name and the uuid - * while we have the cell. - */ - strlcpy(param->name, hc->name, sizeof(param->name)); - if (hc->uuid) - strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); - else - param->uuid[0] = '\0'; - - if (hc->new_map) - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - } up_read(&_hash_lock); return md; @@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size) goto out; } + if (!argc) { + DMWARN("Empty message received."); + goto out; + } + table = dm_get_live_table(md); if (!table) goto out_argv; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 819e37eaaeb..f8214702963 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -10,7 +10,7 @@ */ #include <linux/types.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/init.h> @@ -224,8 +224,6 @@ struct kcopyd_job { unsigned int num_dests; struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; - sector_t offset; - unsigned int nr_pages; struct page_list *pages; /* @@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job) .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, - .mem.offset = job->offset, + .mem.offset = 0, .notify.fn = complete_io, .notify.context = job, .client = job->kc->io_client, @@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job) static int run_pages_job(struct kcopyd_job *job) { int r; + unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); - job->nr_pages = dm_div_up(job->dests[0].count + job->offset, - PAGE_SIZE >> 9); - r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); + r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); if (!r) { /* this job is ready for io */ push(&job->kc->io_jobs, job); @@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->num_dests = num_dests; memcpy(&job->dests, dests, sizeof(*dests) * num_dests); - job->offset = 0; - job->nr_pages = 0; job->pages = NULL; job->fn = fn; @@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, } EXPORT_SYMBOL(dm_kcopyd_copy); +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + job = mempool_alloc(kc->job_pool, GFP_NOIO); + + memset(job, 0, sizeof(struct kcopyd_job)); + job->kc = kc; + job->fn = fn; + job->context = context; + + atomic_inc(&kc->nr_jobs); + + return job; +} +EXPORT_SYMBOL(dm_kcopyd_prepare_callback); + +void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) +{ + struct kcopyd_job *job = j; + struct dm_kcopyd_client *kc = job->kc; + + job->read_err = read_err; + job->write_err = write_err; + + push(&kc->complete_jobs, job); + wake(kc); +} +EXPORT_SYMBOL(dm_kcopyd_do_callback); + /* * Cancels a kcopyd job, eg. someone might be deactivating a * mirror. diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index aa2e0c374ab..1021c898601 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list) group[count] = fe->region; count++; - list_del(&fe->list); - list_add(&fe->list, &tmp_list); + list_move(&fe->list, &tmp_list); type = fe->type; if (count >= MAX_FLUSH_GROUP_COUNT) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 948e3f4925b..3b52bb72bd1 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy); #define MIRROR_DISK_VERSION 2 #define LOG_OFFSET 2 -struct log_header { - uint32_t magic; +struct log_header_disk { + __le32 magic; /* * Simple, incrementing version. no backward * compatibility. */ + __le32 version; + __le64 nr_regions; +} __packed; + +struct log_header_core { + uint32_t magic; uint32_t version; - sector_t nr_regions; + uint64_t nr_regions; }; struct log_c { @@ -239,10 +245,10 @@ struct log_c { int log_dev_failed; int log_dev_flush_failed; struct dm_dev *log_dev; - struct log_header header; + struct log_header_core header; struct dm_io_region header_location; - struct log_header *disk_header; + struct log_header_disk *disk_header; }; /* @@ -251,34 +257,34 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; + return test_bit_le(bit, bs) ? 1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - __test_and_set_bit_le(bit, (unsigned long *) bs); + __set_bit_le(bit, bs); l->touched_cleaned = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - __test_and_clear_bit_le(bit, (unsigned long *) bs); + __clear_bit_le(bit, bs); l->touched_dirtied = 1; } /*---------------------------------------------------------------- * Header IO *--------------------------------------------------------------*/ -static void header_to_disk(struct log_header *core, struct log_header *disk) +static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) { disk->magic = cpu_to_le32(core->magic); disk->version = cpu_to_le32(core->version); disk->nr_regions = cpu_to_le64(core->nr_regions); } -static void header_from_disk(struct log_header *core, struct log_header *disk) +static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) { core->magic = le32_to_cpu(disk->magic); core->version = le32_to_cpu(disk->version); @@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); lc->sync_count = (sync == NOSYNC) ? region_count : 0; - lc->recovering_bits = vmalloc(bitset_size); + lc->recovering_bits = vzalloc(bitset_size); if (!lc->recovering_bits) { DMWARN("couldn't allocate sync bitset"); vfree(lc->sync_bits); @@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, kfree(lc); return -ENOMEM; } - memset(lc->recovering_bits, 0, bitset_size); lc->sync_search = 0; log->context = lc; @@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) return 0; do { - *region = find_next_zero_bit_le( - (unsigned long *) lc->sync_bits, + *region = find_next_zero_bit_le(lc->sync_bits, lc->region_count, lc->sync_search); lc->sync_search = *region + 1; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aa4e570c2cb..5e0090ef418 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -19,10 +19,9 @@ #include <linux/time.h> #include <linux/workqueue.h> #include <scsi/scsi_dh.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #define DM_MSG_PREFIX "multipath" -#define MESG_STR(x) x, sizeof(x) #define DM_PG_INIT_DELAY_MSECS 2000 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) @@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work) * <#paths> <#per-path selector args> * [<path> [<arg>]* ]+ ]+ *---------------------------------------------------------------*/ -struct param { - unsigned min; - unsigned max; - char *error; -}; - -static int read_param(struct param *param, char *str, unsigned *v, char **error) -{ - if (!str || - (sscanf(str, "%u", v) != 1) || - (*v < param->min) || - (*v > param->max)) { - *error = param->error; - return -EINVAL; - } - - return 0; -} - -struct arg_set { - unsigned argc; - char **argv; -}; - -static char *shift(struct arg_set *as) -{ - char *r; - - if (as->argc) { - as->argc--; - r = *as->argv; - as->argv++; - return r; - } - - return NULL; -} - -static void consume(struct arg_set *as, unsigned n) -{ - BUG_ON (as->argc < n); - as->argc -= n; - as->argv += n; -} - -static int parse_path_selector(struct arg_set *as, struct priority_group *pg, +static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, struct dm_target *ti) { int r; struct path_selector_type *pst; unsigned ps_argc; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of path selector args"}, }; - pst = dm_get_path_selector(shift(as)); + pst = dm_get_path_selector(dm_shift_arg(as)); if (!pst) { ti->error = "unknown path selector type"; return -EINVAL; } - r = read_param(_params, shift(as), &ps_argc, &ti->error); + r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); if (r) { dm_put_path_selector(pst); return -EINVAL; } - if (ps_argc > as->argc) { - dm_put_path_selector(pst); - ti->error = "not enough arguments for path selector"; - return -EINVAL; - } - r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); @@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, } pg->ps.type = pst; - consume(as, ps_argc); + dm_consume_args(as, ps_argc); return 0; } -static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, struct dm_target *ti) { int r; @@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, if (!p) return ERR_PTR(-ENOMEM); - r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), &p->path.dev); if (r) { ti->error = "error getting device"; @@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, return ERR_PTR(r); } -static struct priority_group *parse_priority_group(struct arg_set *as, +static struct priority_group *parse_priority_group(struct dm_arg_set *as, struct multipath *m) { - static struct param _params[] = { + static struct dm_arg _args[] = { {1, 1024, "invalid number of paths"}, {0, 1024, "invalid number of selector args"} }; int r; - unsigned i, nr_selector_args, nr_params; + unsigned i, nr_selector_args, nr_args; struct priority_group *pg; struct dm_target *ti = m->ti; @@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as, /* * read the paths */ - r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); + r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); + r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); if (r) goto bad; - nr_params = 1 + nr_selector_args; + nr_args = 1 + nr_selector_args; for (i = 0; i < pg->nr_pgpaths; i++) { struct pgpath *pgpath; - struct arg_set path_args; + struct dm_arg_set path_args; - if (as->argc < nr_params) { + if (as->argc < nr_args) { ti->error = "not enough path parameters"; r = -EINVAL; goto bad; } - path_args.argc = nr_params; + path_args.argc = nr_args; path_args.argv = as->argv; pgpath = parse_path(&path_args, &pg->ps, ti); @@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, pgpath->pg = pg; list_add_tail(&pgpath->list, &pg->pgpaths); - consume(as, nr_params); + dm_consume_args(as, nr_args); } return pg; @@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as, return ERR_PTR(r); } -static int parse_hw_handler(struct arg_set *as, struct multipath *m) +static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) { unsigned hw_argc; int ret; struct dm_target *ti = m->ti; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of hardware handler args"}, }; - if (read_param(_params, shift(as), &hw_argc, &ti->error)) + if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) return -EINVAL; if (!hw_argc) return 0; - if (hw_argc > as->argc) { - ti->error = "not enough arguments for hardware handler"; - return -EINVAL; - } - - m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); request_module("scsi_dh_%s", m->hw_handler_name); if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { ti->error = "unknown hardware handler type"; @@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) j = sprintf(p, "%s", as->argv[i]); } - consume(as, hw_argc - 1); + dm_consume_args(as, hw_argc - 1); return 0; fail: @@ -787,20 +730,20 @@ fail: return ret; } -static int parse_features(struct arg_set *as, struct multipath *m) +static int parse_features(struct dm_arg_set *as, struct multipath *m) { int r; unsigned argc; struct dm_target *ti = m->ti; - const char *param_name; + const char *arg_name; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; - r = read_param(_params, shift(as), &argc, &ti->error); + r = dm_read_arg_group(_args, as, &argc, &ti->error); if (r) return -EINVAL; @@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m) return 0; do { - param_name = shift(as); + arg_name = dm_shift_arg(as); argc--; - if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(arg_name, "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && + if (!strcasecmp(arg_name, "pg_init_retries") && (argc >= 1)) { - r = read_param(_params + 1, shift(as), - &m->pg_init_retries, &ti->error); + r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); argc--; continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + if (!strcasecmp(arg_name, "pg_init_delay_msecs") && (argc >= 1)) { - r = read_param(_params + 2, shift(as), - &m->pg_init_delay_msecs, &ti->error); + r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); argc--; continue; } @@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m) static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - /* target parameters */ - static struct param _params[] = { + /* target arguments */ + static struct dm_arg _args[] = { {0, 1024, "invalid number of priority groups"}, {0, 1024, "invalid initial priority group number"}, }; int r; struct multipath *m; - struct arg_set as; + struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; @@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, if (r) goto bad; - r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); + r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); + r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); if (r) goto bad; @@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) } if (argc == 1) { - if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); goto out; - } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { + } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, 0, 0); goto out; } @@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) goto out; } - if (!strnicmp(argv[0], MESG_STR("disable_group"))) { + if (!strcasecmp(argv[0], "disable_group")) { r = bypass_pg_num(m, argv[1], 1); goto out; - } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { + } else if (!strcasecmp(argv[0], "enable_group")) { r = bypass_pg_num(m, argv[1], 0); goto out; - } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { + } else if (!strcasecmp(argv[0], "switch_group")) { r = switch_pg_num(m, argv[1]); goto out; - } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + } else if (!strcasecmp(argv[0], "reinstate_path")) action = reinstate_path; - else if (!strnicmp(argv[0], MESG_STR("fail_path"))) + else if (!strcasecmp(argv[0], "fail_path")) action = fail_path; else { DMWARN("Unrecognised multipath message received."); diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index f92b6cea9d9..03a837aa5ce 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -20,7 +20,7 @@ #include <linux/ctype.h> #include <linux/errno.h> #include <linux/module.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #define DM_MSG_PREFIX "multipath queue-length" #define QL_MIN_IO 128 diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e5d8904fc8f..a002dd85db1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -8,19 +8,19 @@ #include <linux/slab.h> #include "md.h" +#include "raid1.h" #include "raid5.h" -#include "dm.h" #include "bitmap.h" +#include <linux/device-mapper.h> + #define DM_MSG_PREFIX "raid" /* - * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then - * make it so the flag doesn't set anything. + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. */ -#ifndef MD_SYNC_STATE_FORCED -#define MD_SYNC_STATE_FORCED 0 -#endif +#define FirstUse 10 /* rdev flag */ struct raid_dev { /* @@ -43,14 +43,15 @@ struct raid_dev { /* * Flags for rs->print_flags field. */ -#define DMPF_DAEMON_SLEEP 0x1 -#define DMPF_MAX_WRITE_BEHIND 0x2 -#define DMPF_SYNC 0x4 -#define DMPF_NOSYNC 0x8 -#define DMPF_STRIPE_CACHE 0x10 -#define DMPF_MIN_RECOVERY_RATE 0x20 -#define DMPF_MAX_RECOVERY_RATE 0x40 - +#define DMPF_SYNC 0x1 +#define DMPF_NOSYNC 0x2 +#define DMPF_REBUILD 0x4 +#define DMPF_DAEMON_SLEEP 0x8 +#define DMPF_MIN_RECOVERY_RATE 0x10 +#define DMPF_MAX_RECOVERY_RATE 0x20 +#define DMPF_MAX_WRITE_BEHIND 0x40 +#define DMPF_STRIPE_CACHE 0x80 +#define DMPF_REGION_SIZE 0X100 struct raid_set { struct dm_target *ti; @@ -72,6 +73,7 @@ static struct raid_type { const unsigned level; /* RAID level. */ const unsigned algorithm; /* RAID algorithm. */ } raid_types[] = { + {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, @@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra } sectors_per_dev = ti->len; - if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + if ((raid_type->level > 1) && + sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { ti->error = "Target length not divisible by number of data devices"; return ERR_PTR(-EINVAL); } @@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs) { int i; - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + if (rs->dev[i].rdev.sb_page) + put_page(rs->dev[i].rdev.sb_page); + rs->dev[i].rdev.sb_page = NULL; + rs->dev[i].rdev.sb_loaded = 0; if (rs->dev[i].data_dev) dm_put_device(rs->ti, rs->dev[i].data_dev); + } kfree(rs); } @@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs) * <meta_dev>: meta device name or '-' if missing * <data_dev>: data device name or '-' if missing * - * This code parses those words. + * The following are permitted: + * - - + * - <data_dev> + * <meta_dev> <data_dev> + * + * The following is not allowed: + * <meta_dev> - + * + * This code parses those words. If there is a failure, + * the caller must use context_free to unwind the operations. */ static int dev_parms(struct raid_set *rs, char **argv) { @@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv) rs->dev[i].rdev.mddev = &rs->md; if (strcmp(argv[0], "-")) { - rs->ti->error = "Metadata devices not supported"; - return -EINVAL; + ret = dm_get_device(rs->ti, argv[0], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + rs->ti->error = "RAID metadata device lookup failure"; + if (ret) + return ret; + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) + return -ENOMEM; } if (!strcmp(argv[1], "-")) { @@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return -EINVAL; } + rs->ti->error = "No data device supplied with metadata device"; + if (rs->dev[i].meta_dev) + return -EINVAL; + continue; } @@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return ret; } + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) @@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv) } /* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + DMINFO("Choosing default region size of %lu sectors", + region_size); + region_size = min_region_size; + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. + */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = (region_size << 9); + + return 0; +} + +/* * Possible arguments are... - * RAID456: * <chunk_size> [optional_args] * - * Optional args: - * [[no]sync] Force or prevent recovery of the entire array + * Argument definitions + * <chunk_size> The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array * [rebuild <idx>] Rebuild the drive indicated by the index - * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits + * [daemon_sleep <ms>] Time between bitmap daemon work to + * clear bits * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [write_mostly <idx>] Indicate a write mostly drive via index * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + * [region_size <sectors>] Defines granularity of bitmap */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) { unsigned i, rebuild_cnt = 0; - unsigned long value; + unsigned long value, region_size = 0; char *key; /* * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. */ - if ((strict_strtoul(argv[0], 10, &value) < 0) || - !is_power_of_2(value) || (value < 8)) { + if ((strict_strtoul(argv[0], 10, &value) < 0)) { rs->ti->error = "Bad chunk size"; return -EINVAL; + } else if (rs->raid_type->level == 1) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; } rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; @@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv, num_raid_params--; /* - * Second, parse the unordered optional arguments + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. */ - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + /* + * Second, parse the unordered optional arguments + */ for (i = 0; i < num_raid_params; i++) { - if (!strcmp(argv[i], "nosync")) { + if (!strcasecmp(argv[i], "nosync")) { rs->md.recovery_cp = MaxSector; rs->print_flags |= DMPF_NOSYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } - if (!strcmp(argv[i], "sync")) { + if (!strcasecmp(argv[i], "sync")) { rs->md.recovery_cp = 0; rs->print_flags |= DMPF_SYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } @@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } - if (!strcmp(key, "rebuild")) { - if (++rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild drives given"; + if (!strcasecmp(key, "rebuild")) { + rebuild_cnt++; + if (((rs->raid_type->level != 1) && + (rebuild_cnt > rs->raid_type->parity_devs)) || + ((rs->raid_type->level == 1) && + (rebuild_cnt > (rs->md.raid_disks - 1)))) { + rs->ti->error = "Too many rebuild devices specified for given RAID type"; return -EINVAL; } if (value > rs->md.raid_disks) { @@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } clear_bit(In_sync, &rs->dev[value].rdev.flags); rs->dev[value].rdev.recovery_offset = 0; - } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "write_mostly")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid write_mostly drive index given"; + return -EINVAL; + } + set_bit(WriteMostly, &rs->dev[value].rdev.flags); + } else if (!strcasecmp(key, "max_write_behind")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } rs->print_flags |= DMPF_MAX_WRITE_BEHIND; /* @@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } rs->md.bitmap_info.max_write_behind = value; - } else if (!strcmp(key, "daemon_sleep")) { + } else if (!strcasecmp(key, "daemon_sleep")) { rs->print_flags |= DMPF_DAEMON_SLEEP; if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { rs->ti->error = "daemon sleep period out of range"; return -EINVAL; } rs->md.bitmap_info.daemon_sleep = value; - } else if (!strcmp(key, "stripe_cache")) { + } else if (!strcasecmp(key, "stripe_cache")) { rs->print_flags |= DMPF_STRIPE_CACHE; /* @@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv, rs->ti->error = "Bad stripe_cache size"; return -EINVAL; } - } else if (!strcmp(key, "min_recovery_rate")) { + } else if (!strcasecmp(key, "min_recovery_rate")) { rs->print_flags |= DMPF_MIN_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "min_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_min = (int)value; - } else if (!strcmp(key, "max_recovery_rate")) { + } else if (!strcasecmp(key, "max_recovery_rate")) { rs->print_flags |= DMPF_MAX_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "max_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_max = (int)value; + } else if (!strcasecmp(key, "region_size")) { + rs->print_flags |= DMPF_REGION_SIZE; + region_size = value; } else { DMERR("Unable to parse RAID parameter: %s", key); rs->ti->error = "Unable to parse RAID parameters"; @@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } } + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + if (rs->raid_type->level == 1) + return md_raid1_congested(&rs->md, bits); + return md_raid5_congested(&rs->md, bits); } /* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 features; /* Used to indicate possible future changes */ + + __le32 num_devices; /* Number of devices in this array. (Max 64) */ + __le32 array_position; /* The position of this drive in the array */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Bit field of devices to indicate failures */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial array + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * RAID characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + __u8 pad[452]; /* Round struct to 512 bytes. */ + /* Always set to 0 when writing. */ +} __packed; + +static int read_disk_sb(mdk_rdev_t *rdev, int size) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { + DMERR("Failed to read device superblock"); + return -EINVAL; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdk_rdev_t *r, *t; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + + sb = page_address(rdev->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + + rdev_for_each(r, t, mddev) + if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) + failed_devices |= (1ULL << r->raid_disk); + + memset(sb, 0, sizeof(*sb)); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->features = cpu_to_le32(0); /* No features yet */ + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + sb->failed_devices = cpu_to_le64(failed_devices); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) +{ + int ret; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + rdev->sb_start = 0; + rdev->sb_size = sizeof(*sb); + + ret = read_disk_sb(rdev, rdev->sb_size); + if (ret) + return ret; + + sb = page_address(rdev->sb_page); + if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + + /* Force writing of superblocks to disk */ + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int role; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + uint64_t events_sb; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0; + uint32_t rebuilds = 0; + mdk_rdev_t *r, *t; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + failed_devices = le64_to_cpu(sb->failed_devices); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? : 1; + + /* + * Reshaping is not currently allowed + */ + if ((le32_to_cpu(sb->level) != mddev->level) || + (le32_to_cpu(sb->layout) != mddev->layout) || + (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + /* We can only change the number of devices in RAID1 right now */ + if ((rs->raid_type->level != 1) && + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The array is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old array + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + */ + rdev_for_each(r, t, mddev) { + if (!test_bit(In_sync, &r->flags)) { + if (!test_bit(FirstUse, &r->flags)) + DMERR("Superblock area of " + "rebuild device %d should have been " + "cleared.", r->raid_disk); + set_bit(FirstUse, &r->flags); + rebuilds++; + } else if (test_bit(FirstUse, &r->flags)) + new_devs++; + } + + if (!rebuilds) { + if (new_devs == mddev->raid_disks) { + DMINFO("Superblocks created for new array"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs) { + DMERR("New device injected " + "into existing array without 'rebuild' " + "parameter specified"); + return -EINVAL; + } + } else if (new_devs) { + DMERR("'rebuild' devices cannot be " + "injected into an array with other first-time devices"); + return -EINVAL; + } else if (mddev->recovery_cp != MaxSector) { + DMERR("'rebuild' specified while array is not in-sync"); + return -EINVAL; + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + rdev_for_each(r, t, mddev) { + if (!r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role != r->raid_disk) { + if (rs->raid_type->level != 1) { + rs->ti->error = "Cannot change device " + "positions in RAID array"; + return -EINVAL; + } + DMINFO("RAID1 device #%d now at position #%d", + role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (failed_devices & (1 << role)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct dm_raid_superblock *sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. + */ + if (!mddev->events && super_init_validation(mddev, rdev)) + return -EINVAL; + + mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ + rdev->mddev->bitmap_info.default_offset = 4096 >> 9; + if (!test_bit(FirstUse, &rdev->flags)) { + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset != MaxSector) + clear_bit(In_sync, &rdev->flags); + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. + */ + if (test_bit(Faulty, &rdev->flags)) { + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->recovery_offset = 0; + } + + clear_bit(FirstUse, &rdev->flags); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int ret; + mdk_rdev_t *rdev, *freshest, *tmp; + mddev_t *mddev = &rs->md; + + freshest = NULL; + rdev_for_each(rdev, tmp, mddev) { + if (!rdev->meta_bdev) + continue; + + ret = super_load(rdev, freshest); + + switch (ret) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + ti->error = "Failed to load superblock"; + return ret; + } + } + + if (!freshest) + return 0; + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(mddev, freshest)) + return -EINVAL; + + rdev_for_each(rdev, tmp, mddev) + if ((rdev != freshest) && super_validate(mddev, rdev)) + return -EINVAL; + + return 0; +} + +/* * Construct a RAID4/5/6 mapping: * Args: * <raid_type> <#raid_params> <raid_params> \ * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } * - * ** metadata devices are not supported yet, use '-' instead ** - * * <raid_params> varies by <raid_type>. See 'parse_raid_params' for * details on possible <raid_params>. */ @@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) if (ret) goto bad; + rs->md.sync_super = super_sync; + ret = analyse_superblocks(ti, rs); + if (ret) + goto bad; + INIT_WORK(&rs->md.event_work, do_table_event); - ti->split_io = rs->md.chunk_sectors; ti->private = rs; mutex_lock(&rs->md.reconfig_mutex); @@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + mddev_suspend(&rs->md); return 0; bad: @@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ - for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + for (i = 0; i < rs->md.raid_disks; i++) { + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) - raid_param_cnt++; /* for rebuilds */ + raid_param_cnt += 2; /* for rebuilds */ + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; + } - raid_param_cnt += (hweight64(rs->print_flags) * 2); + raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) raid_param_cnt--; @@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" nosync"); for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) DMEMIT(" rebuild %u", i); @@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type, if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" write_mostly %u", i); + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) DMEMIT(" max_write_behind %lu", rs->md.bitmap_info.max_write_behind); @@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type, conf ? conf->max_nr_stripes * 2 : 0); } + if (rs->print_flags & DMPF_REGION_SIZE) + DMEMIT(" region_size %lu", + rs->md.bitmap_info.chunksize >> 9); + DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { - DMEMIT(" -"); /* metadata device */ + if (rs->dev[i].meta_dev) + DMEMIT(" %s", rs->dev[i].meta_dev->name); + else + DMEMIT(" -"); if (rs->dev[i].data_dev) DMEMIT(" %s", rs->dev[i].data_dev->name); @@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; + bitmap_load(&rs->md); mddev_resume(&rs->md); } static struct target_type raid_target = { .name = "raid", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 135c2f1fdbf..d1f1d701710 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -58,25 +58,30 @@ #define NUM_SNAPSHOT_HDR_CHUNKS 1 struct disk_header { - uint32_t magic; + __le32 magic; /* * Is this snapshot valid. There is no way of recovering * an invalid snapshot. */ - uint32_t valid; + __le32 valid; /* * Simple, incrementing version. no backward * compatibility. */ - uint32_t version; + __le32 version; /* In sectors */ - uint32_t chunk_size; -}; + __le32 chunk_size; +} __packed; struct disk_exception { + __le64 old_chunk; + __le64 new_chunk; +} __packed; + +struct core_exception { uint64_t old_chunk; uint64_t new_chunk; }; @@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps) if (!ps->area) goto err_area; - ps->zero_area = vmalloc(len); + ps->zero_area = vzalloc(len); if (!ps->zero_area) goto err_zero_area; - memset(ps->zero_area, 0, len); ps->header_area = vmalloc(len); if (!ps->header_area) @@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) } static void read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) + uint32_t index, struct core_exception *result) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* copy it */ - result->old_chunk = le64_to_cpu(e->old_chunk); - result->new_chunk = le64_to_cpu(e->new_chunk); + result->old_chunk = le64_to_cpu(de->old_chunk); + result->new_chunk = le64_to_cpu(de->new_chunk); } static void write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) + uint32_t index, struct core_exception *e) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* copy it */ - e->old_chunk = cpu_to_le64(de->old_chunk); - e->new_chunk = cpu_to_le64(de->new_chunk); + de->old_chunk = cpu_to_le64(e->old_chunk); + de->new_chunk = cpu_to_le64(e->new_chunk); } static void clear_exception(struct pstore *ps, uint32_t index) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* clear it */ - e->old_chunk = 0; - e->new_chunk = 0; + de->old_chunk = 0; + de->new_chunk = 0; } /* @@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps, { int r; unsigned int i; - struct disk_exception de; + struct core_exception e; /* presume the area is full */ *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - read_exception(ps, i, &de); + read_exception(ps, i, &e); /* * If the new_chunk is pointing at the start of @@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps, * is we know that we've hit the end of the * exceptions. Therefore the area is not full. */ - if (de.new_chunk == 0LL) { + if (e.new_chunk == 0LL) { ps->current_committed = i; *full = 0; break; @@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps, /* * Keep track of the start of the free chunks. */ - if (ps->next_free <= de.new_chunk) - ps->next_free = de.new_chunk + 1; + if (ps->next_free <= e.new_chunk) + ps->next_free = e.new_chunk + 1; /* * Otherwise we add the exception to the snapshot. */ - r = callback(callback_context, de.old_chunk, de.new_chunk); + r = callback(callback_context, e.old_chunk, e.new_chunk); if (r) return r; } @@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / sizeof(struct disk_exception); ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); + sizeof(*ps->callbacks)); if (!ps->callbacks) return -ENOMEM; @@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store, { unsigned int i; struct pstore *ps = get_info(store); - struct disk_exception de; + struct core_exception ce; struct commit_callback *cb; - de.old_chunk = e->old_chunk; - de.new_chunk = e->new_chunk; - write_exception(ps, ps->current_committed++, &de); + ce.old_chunk = e->old_chunk; + ce.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &ce); /* * Add the callback to the back of the array. This code @@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, * If we completely filled the current area, then wipe the next one. */ if ((ps->current_committed == ps->exceptions_per_area) && - zero_disk_area(ps, ps->current_area + 1)) + zero_disk_area(ps, ps->current_area + 1)) ps->valid = 0; /* @@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, chunk_t *last_new_chunk) { struct pstore *ps = get_info(store); - struct disk_exception de; + struct core_exception ce; int nr_consecutive; int r; @@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, ps->current_committed = ps->exceptions_per_area; } - read_exception(ps, ps->current_committed - 1, &de); - *last_old_chunk = de.old_chunk; - *last_new_chunk = de.new_chunk; + read_exception(ps, ps->current_committed - 1, &ce); + *last_old_chunk = ce.old_chunk; + *last_new_chunk = ce.new_chunk; /* * Find number of consecutive chunks within the current area, @@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, for (nr_consecutive = 1; nr_consecutive < ps->current_committed; nr_consecutive++) { read_exception(ps, ps->current_committed - 1 - nr_consecutive, - &de); - if (de.old_chunk != *last_old_chunk - nr_consecutive || - de.new_chunk != *last_new_chunk - nr_consecutive) + &ce); + if (ce.old_chunk != *last_old_chunk - nr_consecutive || + ce.new_chunk != *last_new_chunk - nr_consecutive) break; } @@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, WRITE); + r = area_io(ps, WRITE_FLUSH_FUA); if (r < 0) return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9ecff5f3023..6f758870fc1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; ((ti)->type->name == dm_snapshot_merge_target_name) /* - * The percentage increment we will wake up users at - */ -#define WAKE_UP_PERCENT 5 - -/* - * kcopyd priority of snapshot operations - */ -#define SNAPSHOT_COPY_PRIORITY 2 - -/* * The size of the mempool used to track chunks in use. */ #define MIN_IOS 256 @@ -180,6 +170,13 @@ struct dm_snap_pending_exception { * kcopyd. */ int started; + + /* + * For writing a complete chunk, bypassing the copy. + */ + struct bio *full_bio; + bio_end_io_t *full_bio_end_io; + void *full_bio_private; }; /* @@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { - ti->error = "Cannot allocate snapshot context private " - "structure"; + ti->error = "Cannot allocate private snapshot structure"; r = -ENOMEM; goto bad; } @@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; + struct bio *full_bio = NULL; int error = 0; if (!success) { @@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) */ dm_insert_exception(&s->complete, e); - out: +out: dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); + full_bio = pe->full_bio; + if (full_bio) { + full_bio->bi_end_io = pe->full_bio_end_io; + full_bio->bi_private = pe->full_bio_private; + } free_pending_exception(pe); increment_pending_exceptions_done_count(); @@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) up_write(&s->lock); /* Submit any pending write bios */ - if (error) + if (error) { + if (full_bio) + bio_io_error(full_bio); error_bios(snapshot_bios); - else + } else { + if (full_bio) + bio_endio(full_bio, 0); flush_bios(snapshot_bios); + } retry_origin_bios(s, origin_bios); } @@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - dm_kcopyd_copy(s->kcopyd_client, - &src, 1, &dest, 0, copy_callback, pe); + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); +} + +static void full_bio_end_io(struct bio *bio, int error) +{ + void *callback_data = bio->bi_private; + + dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); +} + +static void start_full_bio(struct dm_snap_pending_exception *pe, + struct bio *bio) +{ + struct dm_snapshot *s = pe->snap; + void *callback_data; + + pe->full_bio = bio; + pe->full_bio_end_io = bio->bi_end_io; + pe->full_bio_private = bio->bi_private; + + callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, + copy_callback, pe); + + bio->bi_end_io = full_bio_end_io; + bio->bi_private = callback_data; + + generic_make_request(bio); } static struct dm_snap_pending_exception * @@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s, bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->started = 0; + pe->full_bio = NULL; if (s->store->type->prepare_exception(s->store, &pe->e)) { free_pending_exception(pe); @@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, } remap_exception(s, &pe->e, bio, chunk); - bio_list_add(&pe->snapshot_bios, bio); r = DM_MAPIO_SUBMITTED; + if (!pe->started && + bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { + pe->started = 1; + up_write(&s->lock); + start_full_bio(pe, bio); + goto out; + } + + bio_list_add(&pe->snapshot_bios, bio); + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; @@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, map_context->ptr = track_chunk(s, chunk); } - out_unlock: +out_unlock: up_write(&s->lock); - out: +out: return r; } @@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe_to_start_now = pe; } - next_snapshot: +next_snapshot: up_write(&snap->lock); if (pe_to_start_now) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 451c3bb176d..986b8754bb0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -17,7 +17,7 @@ #include <linux/interrupt.h> #include <linux/mutex.h> #include <linux/delay.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #define DM_MSG_PREFIX "table" @@ -54,7 +54,6 @@ struct dm_table { sector_t *highs; struct dm_target *targets; - unsigned discards_supported:1; unsigned integrity_supported:1; /* @@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) return NULL; size = nmemb * elem_size; - addr = vmalloc(size); - if (addr) - memset(addr, 0, size); + addr = vzalloc(size); return addr; } +EXPORT_SYMBOL(dm_vcalloc); /* * highs, and targets are managed as dynamic arrays during a @@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); - t->discards_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t) { atomic_inc(&t->holders); } +EXPORT_SYMBOL(dm_table_get); void dm_table_put(struct dm_table *t) { @@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t) smp_mb__before_atomic_dec(); atomic_dec(&t->holders); } +EXPORT_SYMBOL(dm_table_put); /* * Checks to see if we need to extend highs or targets. @@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, * Add a device to the list, or just increment the usage count if * it's already present. */ -static int __table_get_device(struct dm_table *t, struct dm_target *ti, - const char *path, fmode_t mode, struct dm_dev **result) +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) { int r; dev_t uninitialized_var(dev); struct dm_dev_internal *dd; unsigned int major, minor; + struct dm_table *t = ti->table; BUG_ON(!t); @@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, *result = &dd->dm_dev; return 0; } +EXPORT_SYMBOL(dm_get_device); int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, * If not we'll force DM to use PAGE_SIZE or * smaller I/O, just to be safe. */ - - if (q->merge_bvec_fn && !ti->type->merge) + if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) blk_limits_max_hw_sectors(limits, (unsigned int) (PAGE_SIZE >> 9)); return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result) -{ - return __table_get_device(ti->table, ti, path, mode, result); -} - - /* - * Decrement a devices use count and remove it if necessary. + * Decrement a device's use count and remove it if necessary. */ void dm_put_device(struct dm_target *ti, struct dm_dev *d) { @@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) kfree(dd); } } +EXPORT_SYMBOL(dm_put_device); /* * Checks to see if the target joins onto the end of the table. @@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - if (!tgt->num_discard_requests) - t->discards_supported = 0; + if (!tgt->num_discard_requests && tgt->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", + dm_device_name(t->md), type); return 0; @@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } +/* + * Target argument parsing helpers. + */ +static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error, unsigned grouped) +{ + const char *arg_str = dm_shift_arg(arg_set); + + if (!arg_str || + (sscanf(arg_str, "%u", value) != 1) || + (*value < arg->min) || + (*value > arg->max) || + (grouped && arg_set->argc < *value)) { + *error = arg->error; + return -EINVAL; + } + + return 0; +} + +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 0); +} +EXPORT_SYMBOL(dm_read_arg); + +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 1); +} +EXPORT_SYMBOL(dm_read_arg_group); + +const char *dm_shift_arg(struct dm_arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} +EXPORT_SYMBOL(dm_shift_arg); + +void dm_consume_args(struct dm_arg_set *as, unsigned num_args) +{ + BUG_ON(as->argc < num_args); + as->argc -= num_args; + as->argv += num_args; +} +EXPORT_SYMBOL(dm_consume_args); + static int dm_table_set_type(struct dm_table *t) { unsigned i; @@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t) t->event_fn(t->event_context); mutex_unlock(&_event_lock); } +EXPORT_SYMBOL(dm_table_event); sector_t dm_table_get_size(struct dm_table *t) { return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; } +EXPORT_SYMBOL(dm_table_get_size); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) { @@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t) blk_get_integrity(template_disk)); } +static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned flush = (*(unsigned *)data); + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && (q->flush_flags & flush); +} + +static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Require at least one underlying device to support flushes. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting flushes must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_flush_requests) + continue; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_flush_capable, &flush)) + return 1; + } + + return 0; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { + unsigned flush = 0; + /* * Copy table's limits to the DM device's request_queue */ @@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + if (dm_table_supports_flush(t, REQ_FLUSH)) { + flush |= REQ_FLUSH; + if (dm_table_supports_flush(t, REQ_FUA)) + flush |= REQ_FUA; + } + blk_queue_flush(q, flush); + dm_table_set_integrity(t); /* @@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t) { return t->mode; } +EXPORT_SYMBOL(dm_table_get_mode); static void suspend_targets(struct dm_table *t, unsigned postsuspend) { @@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) { return t->md; } +EXPORT_SYMBOL(dm_table_get_md); static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t) struct dm_target *ti; unsigned i = 0; - if (!t->discards_supported) - return 0; - /* * Unless any target used by the table set discards_supported, * require at least one underlying device to support discards. * t->devices includes internal dm devices such as mirror logs * so we need to use iterate_devices here, which targets - * supporting discard must provide. + * supporting discard selectively must provide. */ while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); + if (!ti->num_discard_requests) + continue; + if (ti->discards_supported) return 1; @@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t) return 0; } - -EXPORT_SYMBOL(dm_vcalloc); -EXPORT_SYMBOL(dm_get_device); -EXPORT_SYMBOL(dm_put_device); -EXPORT_SYMBOL(dm_table_event); -EXPORT_SYMBOL(dm_table_get_size); -EXPORT_SYMBOL(dm_table_get_mode); -EXPORT_SYMBOL(dm_table_get_md); -EXPORT_SYMBOL(dm_table_put); -EXPORT_SYMBOL(dm_table_get); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0cf68b47887..52b39f335bb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -37,6 +37,8 @@ static const char *_name = DM_NAME; static unsigned int major = 0; static unsigned int _major = 0; +static DEFINE_IDR(_minor_idr); + static DEFINE_SPINLOCK(_minor_lock); /* * For bio-based dm. @@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_MERGE_IS_OPTIONAL 6 /* * Work processed by per-device workqueue. @@ -313,6 +316,12 @@ static void __exit dm_exit(void) while (i--) _exits[i](); + + /* + * Should be empty by this point. + */ + idr_remove_all(&_minor_idr); + idr_destroy(&_minor_idr); } /* @@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci) /* * Even though the device advertised discard support, - * reconfiguration might have changed that since the + * that does not mean every target supports it, and + * reconfiguration might also have changed that since the * check was performed. */ if (!ti->num_discard_requests) @@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits) /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ -static DEFINE_IDR(_minor_idr); - static void free_minor(int minor) { spin_lock(&_minor_lock); @@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md) blk_queue_make_request(md->queue, dm_request); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); blk_queue_merge_bvec(md->queue, dm_merge_bvec); - blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); } /* @@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size) } /* + * Return 1 if the queue has a compulsory merge_bvec_fn function. + * + * If this function returns 0, then the device is either a non-dm + * device without a merge_bvec_fn, or it is a dm device that is + * able to split any bios it receives that are too big. + */ +int dm_queue_merge_is_compulsory(struct request_queue *q) +{ + struct mapped_device *dev_md; + + if (!q->merge_bvec_fn) + return 0; + + if (q->make_request_fn == dm_request) { + dev_md = q->queuedata; + if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) + return 0; + } + + return 1; +} + +static int dm_device_merge_is_compulsory(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + return dm_queue_merge_is_compulsory(q); +} + +/* + * Return 1 if it is acceptable to ignore merge_bvec_fn based + * on the properties of the underlying devices. + */ +static int dm_table_merge_is_optional(struct dm_table *table) +{ + unsigned i = 0; + struct dm_target *ti; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) + return 0; + } + + return 1; +} + +/* * Returns old map, which caller must destroy. */ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, @@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct request_queue *q = md->queue; sector_t size; unsigned long flags; + int merge_is_optional; size = dm_table_get_size(t); @@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, __bind_mempools(md, t); + merge_is_optional = dm_table_merge_is_optional(t); + write_lock_irqsave(&md->map_lock, flags); old_map = md->map; md->map = t; dm_table_set_restrictions(t, q, limits); + if (merge_is_optional) + set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + else + clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); write_unlock_irqrestore(&md->map_lock, flags); return old_map; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1aaf16746da..6745dbd278a 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +int dm_queue_merge_is_compulsory(struct request_queue *q); + void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); void dm_set_md_type(struct mapped_device *md, unsigned type); diff --git a/drivers/md/linear.h b/drivers/md/linear.h index 0ce29b61605..2f2da05b2ce 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,9 +10,9 @@ typedef struct dev_info dev_info_t; struct linear_private_data { + struct rcu_head rcu; sector_t array_sectors; dev_info_t disks[0]; - struct rcu_head rcu; }; diff --git a/drivers/md/md.c b/drivers/md/md.c index dfc9425db70..5404b229582 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, } EXPORT_SYMBOL_GPL(bio_clone_mddev); +void md_trim_bio(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + * This requires adjusting bi_sector, bi_size, and bi_io_vec + */ + int i; + struct bio_vec *bvec; + int sofar = 0; + + size <<= 9; + if (offset == 0 && size == bio->bi_size) + return; + + bio->bi_sector += offset; + bio->bi_size = size; + offset <<= 9; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + while (bio->bi_idx < bio->bi_vcnt && + bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { + /* remove this whole bio_vec */ + offset -= bio->bi_io_vec[bio->bi_idx].bv_len; + bio->bi_idx++; + } + if (bio->bi_idx < bio->bi_vcnt) { + bio->bi_io_vec[bio->bi_idx].bv_offset += offset; + bio->bi_io_vec[bio->bi_idx].bv_len -= offset; + } + /* avoid any complications with bi_idx being non-zero*/ + if (bio->bi_idx) { + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, + (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); + bio->bi_vcnt -= bio->bi_idx; + bio->bi_idx = 0; + } + /* Make sure vcnt and last bv are not too big */ + bio_for_each_segment(bvec, bio, i) { + if (sofar + bvec->bv_len > size) + bvec->bv_len = size - sofar; + if (bvec->bv_len == 0) { + bio->bi_vcnt = i; + break; + } + sofar += bvec->bv_len; + } +} +EXPORT_SYMBOL_GPL(md_trim_bio); + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_start = 0; rdev->sectors = 0; } + if (rdev->bb_page) { + put_page(rdev->bb_page); + rdev->bb_page = NULL; + } } @@ -795,7 +848,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, bio->bi_end_io = super_written; atomic_inc(&mddev->pending_writes); - submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); + submit_bio(WRITE_FLUSH_FUA, bio); } void md_super_wait(mddev_t *mddev) @@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = -EINVAL; bdevname(rdev->bdev, b); - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { printk(KERN_ERR "md: invalid raid superblock magic on %s\n", @@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; rdev->sb_size = MD_SB_BYTES; + rdev->badblocks.shift = -1; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 1; } else { __u64 ev1, ev2; - mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + mdp_super_t *refsb = page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { printk(KERN_WARNING "md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); @@ -1084,8 +1138,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 0; } rdev->sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that */ + if (rdev->sectors >= (2ULL << 32)) + rdev->sectors = (2ULL << 32) - 2; - if (rdev->sectors < sb->size * 2 && sb->level > 1) + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; @@ -1099,7 +1156,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) { mdp_disk_t *desc; - mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; @@ -1119,7 +1176,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->dev_sectors = sb->size * 2; + mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; @@ -1230,7 +1287,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) rdev->sb_size = MD_SB_BYTES; - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -1361,6 +1418,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ + if (num_sectors >= (2ULL << 32)) + num_sectors = (2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); @@ -1395,6 +1457,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) return cpu_to_le32(csum); } +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1435,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (ret) return ret; - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || @@ -1473,12 +1537,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else rdev->desc_nr = le32_to_cpu(sb->dev_number); + if (!rdev->bb_page) { + rdev->bb_page = alloc_page(GFP_KERNEL); + if (!rdev->bb_page) + return -ENOMEM; + } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && + rdev->badblocks.count == 0) { + /* need to load the bad block list. + * Currently we limit it to one page. + */ + s32 offset; + sector_t bb_sector; + u64 *bbp; + int i; + int sectors = le16_to_cpu(sb->bblog_size); + if (sectors > (PAGE_SIZE / 512)) + return -EINVAL; + offset = le32_to_cpu(sb->bblog_offset); + if (offset == 0) + return -EINVAL; + bb_sector = (long long)offset; + if (!sync_page_io(rdev, bb_sector, sectors << 9, + rdev->bb_page, READ, true)) + return -EIO; + bbp = (u64 *)page_address(rdev->bb_page); + rdev->badblocks.shift = sb->bblog_shift; + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { + u64 bb = le64_to_cpu(*bbp); + int count = bb & (0x3ff); + u64 sector = bb >> 10; + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + if (bb + 1 == 0) + break; + if (md_set_badblocks(&rdev->badblocks, + sector, count, 1) == 0) + return -EINVAL; + } + } else if (sb->bblog_offset == 0) + rdev->badblocks.shift = -1; + if (!refdev) { ret = 1; } else { __u64 ev1, ev2; - struct mdp_superblock_1 *refsb = - (struct mdp_superblock_1*)page_address(refdev->sb_page); + struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || @@ -1513,7 +1617,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) { - struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; @@ -1619,13 +1723,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) int max_dev, i; /* make rdev->sb match mddev and rdev data. */ - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); memset(sb->pad1, 0, sizeof(sb->pad1)); - memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1643,6 +1746,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + if (test_bit(WriteMostly, &rdev->flags)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); @@ -1665,6 +1773,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); } + if (rdev->badblocks.count == 0) + /* Nothing to do for bad blocks*/ ; + else if (sb->bblog_offset == 0) + /* Cannot record bad blocks on this device */ + md_error(mddev, rdev); + else { + struct badblocks *bb = &rdev->badblocks; + u64 *bbp = (u64 *)page_address(rdev->bb_page); + u64 *p = bb->page; + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + if (bb->changed) { + unsigned seq; + +retry: + seq = read_seqbegin(&bb->lock); + + memset(bbp, 0xff, PAGE_SIZE); + + for (i = 0 ; i < bb->count ; i++) { + u64 internal_bb = *p++; + u64 store_bb = ((BB_OFFSET(internal_bb) << 10) + | BB_LEN(internal_bb)); + *bbp++ = cpu_to_le64(store_bb); + } + if (read_seqretry(&bb->lock, seq)) + goto retry; + + bb->sector = (rdev->sb_start + + (int)le32_to_cpu(sb->bblog_offset)); + bb->size = le16_to_cpu(sb->bblog_size); + bb->changed = 0; + } + } + max_dev = 0; list_for_each_entry(rdev2, &mddev->disks, same_set) if (rdev2->desc_nr+1 > max_dev) @@ -1724,7 +1866,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } - sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = rdev->sb_start; sb->sb_csum = calc_sb_1_csum(sb); @@ -1922,7 +2064,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ - mddev->recovery_disabled = 0; + mddev->recovery_disabled++; return 0; @@ -1953,6 +2095,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); rdev->sysfs_state = NULL; + kfree(rdev->badblocks.page); + rdev->badblocks.count = 0; + rdev->badblocks.page = NULL; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need * to delay it due to rcu usage. @@ -2127,10 +2272,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); switch (major_version) { case 0: - print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); + print_sb_90(page_address(rdev->sb_page)); break; case 1: - print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); + print_sb_1(page_address(rdev->sb_page)); break; } } else @@ -2194,6 +2339,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) mdk_rdev_t *rdev; int sync_req; int nospares = 0; + int any_badblocks_changed = 0; repeat: /* First make sure individual recovery_offsets are correct */ @@ -2208,8 +2354,18 @@ repeat: if (!mddev->persistent) { clear_bit(MD_CHANGE_CLEAN, &mddev->flags); clear_bit(MD_CHANGE_DEVS, &mddev->flags); - if (!mddev->external) + if (!mddev->external) { clear_bit(MD_CHANGE_PENDING, &mddev->flags); + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->badblocks.changed) { + md_ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } wake_up(&mddev->sb_wait); return; } @@ -2265,6 +2421,14 @@ repeat: MD_BUG(); mddev->events --; } + + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->badblocks.changed) + any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); + } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); @@ -2290,6 +2454,13 @@ repeat: bdevname(rdev->bdev,b), (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; + if (rdev->badblocks.size) { + md_super_write(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page); + rdev->badblocks.size = 0; + } } else dprintk(")\n"); @@ -2313,6 +2484,15 @@ repeat: if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + + if (any_badblocks_changed) + md_ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } } /* words written to sysfs files may, or may not, be \n terminated. @@ -2347,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page) char *sep = ""; size_t len = 0; - if (test_bit(Faulty, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags) || + rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } @@ -2359,7 +2540,8 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags)) { + if (test_bit(Blocked, &rdev->flags) || + rdev->badblocks.unacked_exist) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } @@ -2368,6 +2550,10 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sspare", sep); sep = ","; } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + len += sprintf(page+len, "%swrite_error", sep); + sep = ","; + } return len+sprintf(page+len, "\n"); } @@ -2375,18 +2561,23 @@ static ssize_t state_store(mdk_rdev_t *rdev, const char *buf, size_t len) { /* can write - * faulty - simulates and error + * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly - * blocked - sets the Blocked flag - * -blocked - clears the Blocked flag + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active + * write_error - sets WriteErrorSeen + * -write_error - clears WriteErrorSeen */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - err = 0; + if (test_bit(Faulty, &rdev->flags)) + err = 0; + else + err = -EBUSY; } else if (cmd_match(buf, "remove")) { if (rdev->raid_disk >= 0) err = -EBUSY; @@ -2408,7 +2599,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + rdev->badblocks.unacked_exist) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); @@ -2417,6 +2616,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; + } else if (cmd_match(buf, "write_error")) { + set_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-write_error")) { + clear_bit(WriteErrorSeen, &rdev->flags); + err = 0; } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -2459,7 +2664,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; int err; - char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; @@ -2482,8 +2686,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) hot_remove_disk(rdev->mddev, rdev->raid_disk); if (err) return err; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&rdev->mddev->kobj, nm); + sysfs_unlink_rdev(rdev->mddev, rdev); rdev->raid_disk = -1; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); @@ -2522,8 +2725,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) + if (sysfs_link_rdev(rdev->mddev, rdev)) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { @@ -2712,6 +2914,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack); +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); + +static ssize_t bb_show(mdk_rdev_t *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) +{ + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + + +static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) +{ + return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, @@ -2719,6 +2954,8 @@ static struct attribute *rdev_default_attrs[] = { &rdev_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, + &rdev_bad_blocks.attr, + &rdev_unack_bad_blocks.attr, NULL, }; static ssize_t @@ -2782,7 +3019,7 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; -void md_rdev_init(mdk_rdev_t *rdev) +int md_rdev_init(mdk_rdev_t *rdev) { rdev->desc_nr = -1; rdev->saved_raid_disk = -1; @@ -2792,12 +3029,27 @@ void md_rdev_init(mdk_rdev_t *rdev) rdev->sb_events = 0; rdev->last_read_error.tv_sec = 0; rdev->last_read_error.tv_nsec = 0; + rdev->sb_loaded = 0; + rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); INIT_LIST_HEAD(&rdev->same_set); init_waitqueue_head(&rdev->blocked_wait); + + /* Add space to store bad block list. + * This reserves the space even on arrays where it cannot + * be used - I wonder if that matters + */ + rdev->badblocks.count = 0; + rdev->badblocks.shift = 0; + rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); + seqlock_init(&rdev->badblocks.lock); + if (rdev->badblocks.page == NULL) + return -ENOMEM; + + return 0; } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -2823,8 +3075,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } - md_rdev_init(rdev); - if ((err = alloc_disk_sb(rdev))) + err = md_rdev_init(rdev); + if (err) + goto abort_free; + err = alloc_disk_sb(rdev); + if (err) goto abort_free; err = lock_rdev(rdev, newdev, super_format == -2); @@ -2860,15 +3115,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi goto abort_free; } } + if (super_format == -1) + /* hot-add for 0.90, or non-persistent: so no badblocks */ + rdev->badblocks.shift = -1; return rdev; abort_free: - if (rdev->sb_page) { - if (rdev->bdev) - unlock_rdev(rdev); - free_disk_sb(rdev); - } + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + kfree(rdev->badblocks.page); kfree(rdev); return ERR_PTR(err); } @@ -3149,15 +3406,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) } list_for_each_entry(rdev, &mddev->disks, same_set) { - char nm[20]; if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk < 0) @@ -3168,11 +3423,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); else { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s after level change\n", - nm, mdname(mddev)); + if (sysfs_link_rdev(mddev, rdev)) + printk(KERN_WARNING "md: cannot register rd%d" + " for %s after level change\n", + rdev->raid_disk, mdname(mddev)); } } @@ -4504,7 +4758,8 @@ int md_run(mddev_t *mddev) } if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); + mddev->bio_set = bioset_create(BIO_POOL_SIZE, + sizeof(mddev_t *)); spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -4621,12 +4876,9 @@ int md_run(mddev_t *mddev) smp_wmb(); mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + if (rdev->raid_disk >= 0) + if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -4854,11 +5106,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) sysfs_notify_dirent_safe(mddev->sysfs_state); list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } + if (rdev->raid_disk >= 0) + sysfs_unlink_rdev(mddev, rdev); set_capacity(disk, 0); mutex_unlock(&mddev->open_mutex); @@ -5750,6 +5999,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) return -ENODEV; md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + return -EBUSY; return 0; } @@ -6198,18 +6449,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; - if (mddev->external) - set_bit(Blocked, &rdev->flags); -/* - dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", - mdname(mddev), - MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), - __builtin_return_address(0),__builtin_return_address(1), - __builtin_return_address(2),__builtin_return_address(3)); -*/ - if (!mddev->pers) - return; - if (!mddev->pers->error_handler) + if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) @@ -6933,11 +7173,14 @@ void md_do_sync(mddev_t *mddev) atomic_add(sectors, &mddev->recovery_active); } + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + j += sectors; if (j>1) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) - /* this is the earliers that rebuilt will be + /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(mddev); @@ -6946,10 +7189,6 @@ void md_do_sync(mddev_t *mddev) continue; last_check = io_sectors; - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ @@ -7067,29 +7306,23 @@ static int remove_and_add_spares(mddev_t *mddev) atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev->raid_disk)==0) { - char nm[20]; - sprintf(nm,"rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } } - if (mddev->degraded && !mddev->recovery_disabled) { + if (mddev->degraded) { list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(Blocked, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) + if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; spares++; md_new_event(mddev); @@ -7138,6 +7371,8 @@ static void reap_sync_thread(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); } /* @@ -7170,9 +7405,6 @@ void md_check_recovery(mddev_t *mddev) if (mddev->bitmap) bitmap_daemon_work(mddev); - if (mddev->ro) - return; - if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { printk(KERN_INFO "md: %s in immediate safe mode\n", @@ -7209,9 +7441,7 @@ void md_check_recovery(mddev_t *mddev) atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev->raid_disk)==0) { - char nm[20]; - sprintf(nm,"rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } } @@ -7331,12 +7561,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags), + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); + +/* Bad block management. + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so md_is_badblock + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * We return + * 0 if there are no known bad blocks in the range + * 1 if there are known bad block which are all acknowledged + * -1 if there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo = 0; + u64 *p = bb->page; + int rv = 0; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + /* This could still be the one, earlier ranges + * could not. */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(md_is_badblock); + +/* + * Add a range of bad blocks to the table. + * This might extend the table, or might contract it + * if two adjacent ranges can be merged. + * We binary-search to find the 'insertion' point, then + * decide how best to handle it. + */ +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 1; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' */ + if (bb->count >= MD_MAX_BADBLOCKS) { + /* No room for more */ + rv = 0; + break; + } else { + int this_sectors = sectors; + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irq(&bb->lock); + + return rv; +} + +int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, + int acknowledged) +{ + int rv = md_set_badblocks(&rdev->badblocks, + s + rdev->data_offset, sectors, acknowledged); + if (rv) { + /* Make sure they get written out promptly */ + set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + md_wakeup_thread(rdev->mddev->thread); + } + return rv; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +/* + * Remove a range of bad blocks from the table. + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + */ +static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MD_MAX_BADBLOCKS) { + rv = 0; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} + +int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) +{ + return md_clear_badblocks(&rdev->badblocks, + s + rdev->data_offset, + sectors); +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +/* + * Acknowledge all bad blocks in a list. + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void md_ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0) { + u64 *p = bb->page; + int i; + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(md_ack_all_badblocks); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} + +#define DO_DEBUG 1 + +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) +{ + unsigned long long sector; + int length; + char newline; +#ifdef DO_DEBUG + /* Allow clearing via sysfs *only* for testing/debugging. + * Normally only a successful write may clear a badblock + */ + int clear = 0; + if (page[0] == '-') { + clear = 1; + page++; + } +#endif /* DO_DEBUG */ + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + +#ifdef DO_DEBUG + if (clear) { + md_clear_badblocks(bb, sector, length); + return len; + } +#endif /* DO_DEBUG */ + if (md_set_badblocks(bb, sector, length, !unack)) + return len; + else + return -ENOSPC; +} + static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c26c7a08ae..1e586bb4452 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -29,6 +29,13 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; +/* Bad block numbers are stored sorted in a single page. + * 64bits is used for each block or extent. + * 54 bits are sector number, 9 bits are extent size, + * 1 bit is an 'acknowledged' flag. + */ +#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) + /* * MD's 'extended' device */ @@ -48,7 +55,7 @@ struct mdk_rdev_s struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ - struct page *sb_page; + struct page *sb_page, *bb_page; int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ @@ -74,9 +81,29 @@ struct mdk_rdev_s #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ #define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occurred on an externally - * managed array, don't allow writes +#define Blocked 8 /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes * until it is cleared */ +#define WriteErrorSeen 9 /* A write error has been seen on this + * device + */ +#define FaultRecorded 10 /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ +#define BlockedBadBlocks 11 /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ @@ -111,8 +138,54 @@ struct mdk_rdev_s struct sysfs_dirent *sysfs_state; /* handle for 'state' * sysfs entry */ + + struct badblocks { + int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. This is only cleared + * when a read discovers none + */ + int shift; /* shift from sectors to block size + * a -ve shift means badblocks are + * disabled.*/ + u64 *page; /* badblock list */ + int changed; + seqlock_t lock; + + sector_t sector; + sector_t size; /* in sectors */ + } badblocks; }; +#define BB_LEN_MASK (0x00000000000001FFULL) +#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) +#define BB_ACK_MASK (0x8000000000000000ULL) +#define BB_MAX_LEN 512 +#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) +#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) +#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + +extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + if (unlikely(rdev->badblocks.count)) { + int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + sectors, + first_bad, bad_sectors); + if (rv) + *first_bad -= rdev->data_offset; + return rv; + } + return 0; +} +extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, + int acknowledged); +extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors); +extern void md_ack_all_badblocks(struct badblocks *bb); + struct mddev_s { void *private; @@ -239,9 +312,12 @@ struct mddev_s #define MD_RECOVERY_FROZEN 9 unsigned long recovery; - int recovery_disabled; /* if we detect that recovery - * will always fail, set this - * so we don't loop trying */ + /* If a RAID personality determines that recovery (of a particular + * device) will fail due to a read error on the source device, it + * takes a copy of this number and does not attempt recovery again + * until this number changes. + */ + int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so @@ -304,11 +380,6 @@ struct mddev_s * hot-adding a bitmap. It should * eventually be settable by sysfs. */ - /* When md is serving under dm, it might use a - * dirty_log to store the bits. - */ - struct dm_dirty_log *log; - struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ @@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev) return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; } +static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); +} + +static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); +} + /* * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. @@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev); extern int md_run(mddev_t *mddev); extern void md_stop(mddev_t *mddev); extern void md_stop_writes(mddev_t *mddev); -extern void md_rdev_init(mdk_rdev_t *rdev); +extern int md_rdev_init(mdk_rdev_t *rdev); extern void mddev_suspend(mddev_t *mddev); extern void mddev_resume(mddev_t *mddev); @@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, mddev_t *mddev); extern int mddev_check_plugged(mddev_t *mddev); +extern void md_trim_bio(struct bio *bio, int offset, int size); #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f7431b6d844..f4622dd8fc5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -35,16 +35,13 @@ #include <linux/delay.h> #include <linux/blkdev.h> #include <linux/seq_file.h> +#include <linux/ratelimit.h> #include "md.h" #include "raid1.h" #include "bitmap.h" #define DEBUG 0 -#if DEBUG -#define PRINTK(x...) printk(x) -#else -#define PRINTK(x...) -#endif +#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0) /* * Number of guaranteed r1bios in case of extreme VM load: @@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) for (i = 0; i < conf->raid_disks; i++) { struct bio **bio = r1_bio->bios + i; - if (*bio && *bio != IO_BLOCKED) + if (!BIO_SPECIAL(*bio)) bio_put(*bio); *bio = NULL; } @@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio) { conf_t *conf = r1_bio->mddev->private; - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - put_all_bios(conf, r1_bio); mempool_free(r1_bio, conf->r1bio_pool); } @@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio) * operation and are ready to return a success/failure code to the buffer * cache layer. */ +static void call_bio_endio(r1bio_t *r1_bio) +{ + struct bio *bio = r1_bio->master_bio; + int done; + conf_t *conf = r1_bio->mddev->private; + + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + } else + done = 1; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + } +} + static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = r1_bio->master_bio; @@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) (unsigned long long) bio->bi_sector + (bio->bi_size >> 9) - 1); - bio_endio(bio, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + call_bio_endio(r1_bio); } free_r1bio(r1_bio); } @@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error) * oops, read error: */ char b[BDEVNAME_SIZE]; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); + printk_ratelimited( + KERN_ERR "md/raid1:%s: %s: " + "rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[mirror].rdev->bdev, + b), + (unsigned long long)r1_bio->sector); + set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); } rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } +static void close_write(r1bio_t *r1_bio) +{ + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = r1_bio->behind_page_count; + while (i--) + safe_put_page(r1_bio->behind_bvecs[i].bv_page); + kfree(r1_bio->behind_bvecs); + r1_bio->behind_bvecs = NULL; + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(r1_bio->mddev); +} + static void r1_bio_write_done(r1bio_t *r1_bio) { - if (atomic_dec_and_test(&r1_bio->remaining)) - { - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = r1_bio->behind_page_count; - while (i--) - safe_put_page(r1_bio->behind_pages[i]); - kfree(r1_bio->behind_pages); - r1_bio->behind_pages = NULL; - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); + if (!atomic_dec_and_test(&r1_bio->remaining)) + return; + + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + close_write(r1_bio); + if (test_bit(R1BIO_MadeGood, &r1_bio->state)) + reschedule_retry(r1_bio); + else + raid_end_bio_io(r1_bio); } } @@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error) /* * 'one mirror IO has finished' event handler: */ - r1_bio->bios[mirror] = NULL; - to_put = bio; if (!uptodate) { - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } else + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + set_bit(R1BIO_WriteError, &r1_bio->state); + } else { /* * Set R1BIO_Uptodate in our master bio, so that we * will return a good error code for to the higher @@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error) * to user-side. So if something waits for IO, then it * will wait for the 'master' bio. */ + sector_t first_bad; + int bad_sectors; + + r1_bio->bios[mirror] = NULL; + to_put = bio; set_bit(R1BIO_Uptodate, &r1_bio->state); + /* Maybe we can clear some bad blocks. */ + if (is_badblock(conf->mirrors[mirror].rdev, + r1_bio->sector, r1_bio->sectors, + &first_bad, &bad_sectors)) { + r1_bio->bios[mirror] = IO_MADE_GOOD; + set_bit(R1BIO_MadeGood, &r1_bio->state); + } + } + update_head_pos(mirror, r1_bio); if (behind) { @@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error) (unsigned long long) mbio->bi_sector, (unsigned long long) mbio->bi_sector + (mbio->bi_size >> 9) - 1); - bio_endio(mbio, 0); + call_bio_endio(r1_bio); } } } - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + if (r1_bio->bios[mirror] == NULL) + rdev_dec_pending(conf->mirrors[mirror].rdev, + conf->mddev); /* * Let's see if all mirrored write operations have finished @@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error) * * The rdev for the device selected will have nr_pending incremented. */ -static int read_balance(conf_t *conf, r1bio_t *r1_bio) +static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) { const sector_t this_sector = r1_bio->sector; - const int sectors = r1_bio->sectors; + int sectors; + int best_good_sectors; int start_disk; int best_disk; int i; @@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) * We take the first readable disk when above the resync window. */ retry: + sectors = r1_bio->sectors; best_disk = -1; best_dist = MaxSector; + best_good_sectors = 0; + if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { choose_first = 1; @@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) for (i = 0 ; i < conf->raid_disks ; i++) { sector_t dist; + sector_t first_bad; + int bad_sectors; + int disk = start_disk + i; if (disk >= conf->raid_disks) disk -= conf->raid_disks; @@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* This is a reasonable device to use. It might * even be best. */ + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* already have a better device */ + continue; + if (first_bad <= this_sector) { + /* cannot read here. If this is the 'primary' + * device, then we must not read beyond + * bad_sectors from another device.. + */ + bad_sectors -= (this_sector - first_bad); + if (choose_first && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + + } else { + sector_t good_sectors = first_bad - this_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_disk = disk; + } + if (choose_first) + break; + } + continue; + } else + best_good_sectors = sectors; + dist = abs(this_sector - conf->mirrors[disk].head_position); if (choose_first /* Don't change to another disk for sequential reads */ @@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) rdev_dec_pending(rdev, conf->mddev); goto retry; } + sectors = best_good_sectors; conf->next_seq_sect = this_sector + sectors; conf->last_used = best_disk; } rcu_read_unlock(); + *max_sectors = sectors; return best_disk; } @@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) { int i; struct bio_vec *bvec; - struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), + struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), GFP_NOIO); - if (unlikely(!pages)) + if (unlikely(!bvecs)) return; bio_for_each_segment(bvec, bio, i) { - pages[i] = alloc_page(GFP_NOIO); - if (unlikely(!pages[i])) + bvecs[i] = *bvec; + bvecs[i].bv_page = alloc_page(GFP_NOIO); + if (unlikely(!bvecs[i].bv_page)) goto do_sync_io; - memcpy(kmap(pages[i]) + bvec->bv_offset, - kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(pages[i]); + memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(bvecs[i].bv_page); kunmap(bvec->bv_page); } - r1_bio->behind_pages = pages; + r1_bio->behind_bvecs = bvecs; r1_bio->behind_page_count = bio->bi_vcnt; set_bit(R1BIO_BehindIO, &r1_bio->state); return; do_sync_io: for (i = 0; i < bio->bi_vcnt; i++) - if (pages[i]) - put_page(pages[i]); - kfree(pages); + if (bvecs[i].bv_page) + put_page(bvecs[i].bv_page); + kfree(bvecs); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); } @@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; - int i, targets = 0, disks; + int i, disks; struct bitmap *bitmap; unsigned long flags; const int rw = bio_data_dir(bio); @@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; int plugged; + int first_clone; + int sectors_handled; + int max_sectors; /* * Register the new request and wait if the reconstruction @@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + if (rw == READ) { /* * read balancing logic: */ - int rdisk = read_balance(conf, r1_bio); + int rdisk; + +read_again: + rdisk = read_balance(conf, r1_bio, &max_sectors); if (rdisk < 0) { /* couldn't find anywhere to read from */ @@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) r1_bio->read_disk = rdisk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, + max_sectors); r1_bio->bios[rdisk] = read_bio; @@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio) read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; - generic_make_request(read_bio); + if (max_sectors < r1_bio->sectors) { + /* could not read all from this device, so we will + * need another r1_bio. + */ + + sectors_handled = (r1_bio->sector + max_sectors + - bio->bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __make_request + * and subsequent mempool_alloc might block waiting + * for it. So hand bio over to raid1d. + */ + reschedule_retry(r1_bio); + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); return 0; } /* * WRITE: */ - /* first select target devices under spinlock and + /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio + * If there are known/acknowledged bad blocks on any device on + * which we have seen a write error, we want to avoid writing those + * blocks. + * This potentially requires several writes to write around + * the bad blocks. Each set of writes gets it's own r1bio + * with a set of bios attached. */ plugged = mddev_check_plugged(mddev); @@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) retry_write: blocked_rdev = NULL; rcu_read_lock(); + max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { @@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio) blocked_rdev = rdev; break; } - if (rdev && !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { + r1_bio->bios[i] = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags)) { + set_bit(R1BIO_Degraded, &r1_bio->state); + continue; + } + + atomic_inc(&rdev->nr_pending); + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, r1_bio->sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* mustn't write here until the bad block is + * acknowledged*/ + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= r1_bio->sector) { + /* Cannot write here at all */ + bad_sectors -= (r1_bio->sector - first_bad); + if (bad_sectors < max_sectors) + /* mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; rdev_dec_pending(rdev, mddev); - r1_bio->bios[i] = NULL; - } else { - r1_bio->bios[i] = bio; - targets++; + /* We don't set R1BIO_Degraded as that + * only applies if the disk is + * missing, so it might be re-added, + * and we want to know to recover this + * chunk. + * In this case the device is here, + * and the fact that this chunk is not + * in-sync is recorded in the bad + * block log + */ + continue; } - } else - r1_bio->bios[i] = NULL; + if (is_bad) { + int good_sectors = first_bad - r1_bio->sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + r1_bio->bios[i] = bio; } rcu_read_unlock(); @@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio) for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - + r1_bio->state = 0; allow_barrier(conf); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf); goto retry_write; } - BUG_ON(targets == 0); /* we never fail the last device */ - - if (targets < conf->raid_disks) { - /* array is degraded, we will not clear the bitmap - * on I/O completion (see raid1_end_write_request) */ - set_bit(R1BIO_Degraded, &r1_bio->state); + if (max_sectors < r1_bio->sectors) { + /* We are splitting this write into multiple parts, so + * we need to prepare for allocating another r1_bio. + */ + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); } - - /* do behind I/O ? - * Not if there are too many, or cannot allocate memory, - * or a reader on WriteMostly is waiting for behind writes - * to flush */ - if (bitmap && - (atomic_read(&bitmap->behind_writes) - < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) - alloc_behind_pages(bio, r1_bio); + sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + first_clone = 1; for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - r1_bio->bios[i] = mbio; - - mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; - mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; - mbio->bi_private = r1_bio; - - if (r1_bio->behind_pages) { + md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); + + if (first_clone) { + /* do behind I/O ? + * Not if there are too many, or cannot + * allocate memory, or a reader on WriteMostly + * is waiting for behind writes to flush */ + if (bitmap && + (atomic_read(&bitmap->behind_writes) + < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(mbio, r1_bio); + + bitmap_startwrite(bitmap, r1_bio->sector, + r1_bio->sectors, + test_bit(R1BIO_BehindIO, + &r1_bio->state)); + first_clone = 0; + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; @@ -894,16 +1080,42 @@ static int make_request(mddev_t *mddev, struct bio * bio) * them all */ __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = r1_bio->behind_pages[j]; + bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } + r1_bio->bios[i] = mbio; + + mbio->bi_sector = (r1_bio->sector + + conf->mirrors[i].rdev->data_offset); + mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + mbio->bi_end_io = raid1_end_write_request; + mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_private = r1_bio; + atomic_inc(&r1_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); } + /* Mustn't call r1_bio_write_done before this next test, + * as it could result in the bio being freed. + */ + if (sectors_handled < (bio->bi_size >> 9)) { + r1_bio_write_done(r1_bio); + /* We need another r1_bio. It has already been counted + * in bio->bi_phys_segments + */ + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + r1_bio->master_bio = bio; + r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_sector + sectors_handled; + goto retry_write; + } + r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ @@ -952,9 +1164,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * However don't try a recovery from this drive as * it is very likely to fail. */ - mddev->recovery_disabled = 1; + conf->recovery_disabled = mddev->recovery_disabled; return; } + set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); @@ -1027,7 +1240,7 @@ static int raid1_spare_active(mddev_t *mddev) && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { count++; - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -1048,6 +1261,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = mddev->raid_disks - 1; + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1103,7 +1319,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && - !mddev->recovery_disabled && + mddev->recovery_disabled != conf->recovery_disabled && mddev->degraded < conf->raid_disks) { err = -EBUSY; goto abort; @@ -1155,6 +1371,8 @@ static void end_sync_write(struct bio *bio, int error) conf_t *conf = mddev->private; int i; int mirror=0; + sector_t first_bad; + int bad_sectors; for (i = 0; i < conf->raid_disks; i++) if (r1_bio->bios[i] == bio) { @@ -1172,18 +1390,48 @@ static void end_sync_write(struct bio *bio, int error) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - md_error(mddev, conf->mirrors[mirror].rdev); - } + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + set_bit(R1BIO_WriteError, &r1_bio->state); + } else if (is_badblock(conf->mirrors[mirror].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) && + !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) + ) + set_bit(R1BIO_MadeGood, &r1_bio->state); update_head_pos(mirror, r1_bio); if (atomic_dec_and_test(&r1_bio->remaining)) { - sector_t s = r1_bio->sectors; - put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } } } +static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) + set_bit(WriteErrorSeen, &rdev->flags); + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + static int fix_sync_read_error(r1bio_t *r1_bio) { /* Try some synchronous reads of other devices to get @@ -1193,6 +1441,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio) * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and * no overlapping syncs. + * We don't need to check is_badblock() again as we + * made sure that anything with a bad block in range + * will have bi_end_io clear. */ mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; @@ -1217,9 +1468,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio) * active, and resync is currently active */ rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, + if (sync_page_io(rdev, sect, s<<9, bio->bi_io_vec[idx].bv_page, READ, false)) { success = 1; @@ -1233,16 +1482,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (!success) { char b[BDEVNAME_SIZE]; - /* Cannot read from anywhere, array is toast */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + int abort = 0; + /* Cannot read from anywhere, this block is lost. + * Record a bad block on each device. If that doesn't + * work just disable and interrupt the recovery. + * Don't fail devices as that won't really help. + */ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" " for block %llu\n", mdname(mddev), bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return 0; + for (d = 0; d < conf->raid_disks; d++) { + rdev = conf->mirrors[d].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + abort = 1; + } + if (abort) { + mddev->recovery_disabled = 1; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); + return 0; + } + /* Try next page */ + sectors -= s; + sect += s; + idx++; + continue; } start = d; @@ -1254,16 +1523,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - WRITE, false) == 0) { + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + WRITE) == 0) { r1_bio->bios[d]->bi_end_io = NULL; rdev_dec_pending(rdev, mddev); - md_error(mddev, rdev); - } else - atomic_add(s, &rdev->corrected_errors); + } } d = start; while (d != r1_bio->read_disk) { @@ -1273,12 +1538,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, - sect, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false) == 0) - md_error(mddev, rdev); + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + READ) != 0) + atomic_add(s, &rdev->corrected_errors); } sectors -= s; sect += s; @@ -1420,7 +1683,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. + * 3. Performs writes following reads for array synchronising. */ static void fix_read_error(conf_t *conf, int read_disk, @@ -1443,9 +1706,14 @@ static void fix_read_error(conf_t *conf, int read_disk, * which is the thread that might remove * a device. If raid1d ever becomes multi-threaded.... */ + sector_t first_bad; + int bad_sectors; + rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && + is_badblock(rdev, sect, s, + &first_bad, &bad_sectors) == 0 && sync_page_io(rdev, sect, s<<9, conf->tmppage, READ, false)) success = 1; @@ -1457,8 +1725,10 @@ static void fix_read_error(conf_t *conf, int read_disk, } while (!success && d != read_disk); if (!success) { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[read_disk].rdev); + /* Cannot read from anywhere - mark it bad */ + mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + md_error(mddev, rdev); break; } /* write it back and re-read */ @@ -1469,13 +1739,9 @@ static void fix_read_error(conf_t *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, sect, s<<9, - conf->tmppage, WRITE, false) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } + test_bit(In_sync, &rdev->flags)) + r1_sync_page_io(rdev, sect, s, + conf->tmppage, WRITE); } d = start; while (d != read_disk) { @@ -1486,12 +1752,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, sect, s<<9, - conf->tmppage, READ, false) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else { + if (r1_sync_page_io(rdev, sect, s, + conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO "md/raid1:%s: read error corrected " @@ -1508,21 +1770,255 @@ static void fix_read_error(conf_t *conf, int read_disk, } } +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +static int submit_bio_wait(int rw, struct bio *bio) +{ + struct completion event; + rw |= REQ_SYNC; + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio->bi_flags); +} + +static int narrow_write_error(r1bio_t *r1_bio, int i) +{ + mddev_t *mddev = r1_bio->mddev; + conf_t *conf = mddev->private; + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + int vcnt, idx; + struct bio_vec *vec; + + /* bio has the data to be written to device 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this somehow. + * + * We currently own a reference on the rdev. + */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r1_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r1_bio->sector; + sectors = ((sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + vcnt = r1_bio->behind_page_count; + vec = r1_bio->behind_bvecs; + idx = 0; + while (vec[idx].bv_page == NULL) + idx++; + } else { + vcnt = r1_bio->master_bio->bi_vcnt; + vec = r1_bio->master_bio->bi_io_vec; + idx = r1_bio->master_bio->bi_idx; + } + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors'*/ + + wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); + memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); + wbio->bi_sector = r1_bio->sector; + wbio->bi_rw = WRITE; + wbio->bi_vcnt = vcnt; + wbio->bi_size = r1_bio->sectors << 9; + wbio->bi_idx = idx; + + md_trim_bio(wbio, sector - r1_bio->sector, sectors); + wbio->bi_sector += rdev->data_offset; + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio) +{ + int m; + int s = r1_bio->sectors; + for (m = 0; m < conf->raid_disks ; m++) { + mdk_rdev_t *rdev = conf->mirrors[m].rdev; + struct bio *bio = r1_bio->bios[m]; + if (bio->bi_end_io == NULL) + continue; + if (test_bit(BIO_UPTODATE, &bio->bi_flags) && + test_bit(R1BIO_MadeGood, &r1_bio->state)) { + rdev_clear_badblocks(rdev, r1_bio->sector, s); + } + if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && + test_bit(R1BIO_WriteError, &r1_bio->state)) { + if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r1_bio); + md_done_sync(conf->mddev, s, 1); +} + +static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio) +{ + int m; + for (m = 0; m < conf->raid_disks ; m++) + if (r1_bio->bios[m] == IO_MADE_GOOD) { + mdk_rdev_t *rdev = conf->mirrors[m].rdev; + rdev_clear_badblocks(rdev, + r1_bio->sector, + r1_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } else if (r1_bio->bios[m] != NULL) { + /* This drive got a write error. We need to + * narrow down and record precise write + * errors. + */ + if (!narrow_write_error(r1_bio, m)) { + md_error(conf->mddev, + conf->mirrors[m].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } + rdev_dec_pending(conf->mirrors[m].rdev, + conf->mddev); + } + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); +} + +static void handle_read_error(conf_t *conf, r1bio_t *r1_bio) +{ + int disk; + int max_sectors; + mddev_t *mddev = conf->mddev; + struct bio *bio; + char b[BDEVNAME_SIZE]; + mdk_rdev_t *rdev; + + clear_bit(R1BIO_ReadError, &r1_bio->state); + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen + */ + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, r1_bio->sectors); + unfreeze_array(conf); + } else + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + + bio = r1_bio->bios[r1_bio->read_disk]; + bdevname(bio->bi_bdev, b); +read_more: + disk = read_balance(conf, r1_bio, &max_sectors); + if (disk == -1) { + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), b, (unsigned long long)r1_bio->sector); + raid_end_bio_io(r1_bio); + } else { + const unsigned long do_sync + = r1_bio->master_bio->bi_rw & REQ_SYNC; + if (bio) { + r1_bio->bios[r1_bio->read_disk] = + mddev->ro ? IO_BLOCKED : NULL; + bio_put(bio); + } + r1_bio->read_disk = disk; + bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); + r1_bio->bios[r1_bio->read_disk] = bio; + rdev = conf->mirrors[disk].rdev; + printk_ratelimited(KERN_ERR + "md/raid1:%s: redirecting sector %llu" + " to other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev, b)); + bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_end_io = raid1_end_read_request; + bio->bi_rw = READ | do_sync; + bio->bi_private = r1_bio; + if (max_sectors < r1_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r1_bio->master_bio; + int sectors_handled = (r1_bio->sector + max_sectors + - mbio->bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + bio = NULL; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = mbio; + r1_bio->sectors = (mbio->bi_size >> 9) + - sectors_handled; + r1_bio->state = 0; + set_bit(R1BIO_ReadError, &r1_bio->state); + r1_bio->mddev = mddev; + r1_bio->sector = mbio->bi_sector + sectors_handled; + + goto read_more; + } else + generic_make_request(bio); + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; - struct bio *bio; unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; - mdk_rdev_t *rdev; struct blk_plug plug; md_check_recovery(mddev); blk_start_plug(&plug); for (;;) { - char b[BDEVNAME_SIZE]; if (atomic_read(&mddev->plug_cnt) == 0) flush_pending_writes(conf); @@ -1539,62 +2035,26 @@ static void raid1d(mddev_t *mddev) mddev = r1_bio->mddev; conf = mddev->private; - if (test_bit(R1BIO_IsSync, &r1_bio->state)) - sync_request_write(mddev, r1_bio); - else { - int disk; - - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen + if (test_bit(R1BIO_IsSync, &r1_bio->state)) { + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_sync_write_finished(conf, r1_bio); + else + sync_request_write(mddev, r1_bio); + } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_write_finished(conf, r1_bio); + else if (test_bit(R1BIO_ReadError, &r1_bio->state)) + handle_read_error(conf, r1_bio); + else + /* just a partial read to be scheduled from separate + * context */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, r1_bio->read_disk, - r1_bio->sector, - r1_bio->sectors); - unfreeze_array(conf); - } else - md_error(mddev, - conf->mirrors[r1_bio->read_disk].rdev); - - bio = r1_bio->bios[r1_bio->read_disk]; - if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev,b), - (unsigned long long)r1_bio->sector); - raid_end_bio_io(r1_bio); - } else { - const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; - r1_bio->bios[r1_bio->read_disk] = - mddev->ro ? IO_BLOCKED : NULL; - r1_bio->read_disk = disk; - bio_put(bio); - bio = bio_clone_mddev(r1_bio->master_bio, - GFP_NOIO, mddev); - r1_bio->bios[r1_bio->read_disk] = bio; - rdev = conf->mirrors[disk].rdev; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" - " other mirror: %s\n", - mdname(mddev), - (unsigned long long)r1_bio->sector, - bdevname(rdev->bdev,b)); - bio->bi_sector = r1_bio->sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | do_sync; - bio->bi_private = r1_bio; - generic_make_request(bio); - } - } + generic_make_request(r1_bio->bios[r1_bio->read_disk]); + cond_resched(); + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); } blk_finish_plug(&plug); } @@ -1636,6 +2096,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int write_targets = 0, read_targets = 0; sector_t sync_blocks; int still_degraded = 0; + int good_sectors = RESYNC_SECTORS; + int min_bad = 0; /* number of sectors that are bad in all devices */ if (!conf->r1buf_pool) if (init_resync(conf)) @@ -1723,36 +2185,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || - test_bit(Faulty, &rdev->flags)) { + test_bit(Faulty, &rdev->flags)) { still_degraded = 1; - continue; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; write_targets ++; } else { /* may need to read from here */ - bio->bi_rw = READ; - bio->bi_end_io = end_sync_read; - if (test_bit(WriteMostly, &rdev->flags)) { - if (wonly < 0) - wonly = i; - } else { - if (disk < 0) - disk = i; + sector_t first_bad = MaxSector; + int bad_sectors; + + if (is_badblock(rdev, sector_nr, good_sectors, + &first_bad, &bad_sectors)) { + if (first_bad > sector_nr) + good_sectors = first_bad - sector_nr; + else { + bad_sectors -= (sector_nr - first_bad); + if (min_bad == 0 || + min_bad > bad_sectors) + min_bad = bad_sectors; + } + } + if (sector_nr < first_bad) { + if (test_bit(WriteMostly, &rdev->flags)) { + if (wonly < 0) + wonly = i; + } else { + if (disk < 0) + disk = i; + } + bio->bi_rw = READ; + bio->bi_end_io = end_sync_read; + read_targets++; } - read_targets++; } - atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_private = r1_bio; + if (bio->bi_end_io) { + atomic_inc(&rdev->nr_pending); + bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_private = r1_bio; + } } rcu_read_unlock(); if (disk < 0) disk = wonly; r1_bio->read_disk = disk; + if (read_targets == 0 && min_bad > 0) { + /* These sectors are bad on all InSync devices, so we + * need to mark them bad on all write targets + */ + int ok = 1; + for (i = 0 ; i < conf->raid_disks ; i++) + if (r1_bio->bios[i]->bi_end_io == end_sync_write) { + mdk_rdev_t *rdev = + rcu_dereference(conf->mirrors[i].rdev); + ok = rdev_set_badblocks(rdev, sector_nr, + min_bad, 0 + ) && ok; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + *skipped = 1; + put_buf(r1_bio); + + if (!ok) { + /* Cannot record the badblocks, so need to + * abort the resync. + * If there are multiple read targets, could just + * fail the really bad ones ??? + */ + conf->recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return 0; + } else + return min_bad; + + } + if (min_bad > 0 && min_bad < good_sectors) { + /* only resync enough to reach the next bad->good + * transition */ + good_sectors = min_bad; + } + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) /* extra read targets are also write targets */ write_targets += read_targets-1; @@ -1769,6 +2284,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (max_sector > mddev->resync_max) max_sector = mddev->resync_max; /* Don't do IO beyond here */ + if (max_sector > sector_nr + good_sectors) + max_sector = sector_nr + good_sectors; nr_sectors = 0; sync_blocks = 0; do { @@ -2154,18 +2671,13 @@ static int raid1_reshape(mddev_t *mddev) for (d = d2 = 0; d < conf->raid_disks; d++) { mdk_rdev_t *rdev = conf->mirrors[d].rdev; if (rdev && rdev->raid_disk != d2) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = d2; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) + sysfs_unlink_rdev(mddev, rdev); + if (sysfs_link_rdev(mddev, rdev)) printk(KERN_WARNING - "md/raid1:%s: cannot register " - "%s\n", - mdname(mddev), nm); + "md/raid1:%s: cannot register rd%d\n", + mdname(mddev), rdev->raid_disk); } if (rdev) newmirrors[d2++].rdev = rdev; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e743a64fac4..e0d676b4897 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -48,6 +48,12 @@ struct r1_private_data_s { * (fresh device added). * Cleared when a sync completes. */ + int recovery_disabled; /* when the same as + * mddev->recovery_disabled + * we don't allow recovery + * to be attempted as we + * expect a read error + */ wait_queue_head_t wait_barrier; @@ -95,7 +101,7 @@ struct r1bio_s { struct list_head retry_list; /* Next two are only valid when R1BIO_BehindIO is set */ - struct page **behind_pages; + struct bio_vec *behind_bvecs; int behind_page_count; /* * if the IO is in WRITE direction, then multiple bios are used. @@ -110,13 +116,24 @@ struct r1bio_s { * correct the read error. To keep track of bad blocks on a per-bio * level, we store IO_BLOCKED in the appropriate 'bios' pointer */ -#define IO_BLOCKED ((struct bio*)1) +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting bios[n] to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) /* bits for r1bio.state */ #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 #define R1BIO_BehindIO 3 +/* Set ReadError on bios that experience a readerror so that + * raid1d knows what to do with them. + */ +#define R1BIO_ReadError 4 /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. Otherwise we call when @@ -125,6 +142,11 @@ struct r1bio_s { * Record that bi_end_io was called with this flag... */ #define R1BIO_Returned 6 +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag + */ +#define R1BIO_MadeGood 7 +#define R1BIO_WriteError 8 extern int md_raid1_congested(mddev_t *mddev, int bits); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e846688962..d7a8468ddea 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -22,6 +22,7 @@ #include <linux/delay.h> #include <linux/blkdev.h> #include <linux/seq_file.h> +#include <linux/ratelimit.h> #include "md.h" #include "raid10.h" #include "raid0.h" @@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) for (j = 0 ; j < nalloc; j++) { bio = r10_bio->devs[j].bio; for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); + if (j == 1 && !test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + /* we can share bv_page's during recovery */ + struct bio *rbio = r10_bio->devs[0].bio; + page = rbio->bi_io_vec[i].bv_page; + get_page(page); + } else + page = alloc_page(gfp_flags); if (unlikely(!page)) goto out_free_pages; @@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; - if (*bio && *bio != IO_BLOCKED) + if (!BIO_SPECIAL(*bio)) bio_put(*bio); *bio = NULL; } @@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) { conf_t *conf = r10_bio->mddev->private; - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - put_all_bios(conf, r10_bio); mempool_free(r10_bio, conf->r10bio_pool); } @@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) static void raid_end_bio_io(r10bio_t *r10_bio) { struct bio *bio = r10_bio->master_bio; + int done; + conf_t *conf = r10_bio->mddev->private; - bio_endio(bio, - test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + } else + done = 1; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + } free_r10bio(r10_bio); } @@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) r10_bio->devs[slot].addr + (r10_bio->sectors); } +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, + struct bio *bio, int *slotp) +{ + int slot; + + for (slot = 0; slot < conf->copies; slot++) + if (r10_bio->devs[slot].bio == bio) + break; + + BUG_ON(slot == conf->copies); + update_head_pos(slot, r10_bio); + + if (slotp) + *slotp = slot; + return r10_bio->devs[slot].devnum; +} + static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -277,34 +317,60 @@ static void raid10_end_read_request(struct bio *bio, int error) * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); + printk_ratelimited(KERN_ERR + "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[dev].rdev->bdev, b), + (unsigned long long)r10_bio->sector); + set_bit(R10BIO_ReadError, &r10_bio->state); reschedule_retry(r10_bio); } } +static void close_write(r10bio_t *r10_bio) +{ + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); + md_write_end(r10_bio->mddev); +} + +static void one_write_done(r10bio_t *r10_bio) +{ + if (atomic_dec_and_test(&r10_bio->remaining)) { + if (test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else { + close_write(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + raid_end_bio_io(r10_bio); + } + } +} + static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t *r10_bio = bio->bi_private; - int slot, dev; + int dev; + int dec_rdev = 1; conf_t *conf = r10_bio->mddev->private; + int slot; - for (slot = 0; slot < conf->copies; slot++) - if (r10_bio->devs[slot].bio == bio) - break; - dev = r10_bio->devs[slot].devnum; + dev = find_bio_disk(conf, r10_bio, bio, &slot); /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) { - md_error(r10_bio->mddev, conf->mirrors[dev].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R10BIO_Degraded, &r10_bio->state); - } else + set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); + set_bit(R10BIO_WriteError, &r10_bio->state); + dec_rdev = 0; + } else { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -314,26 +380,31 @@ static void raid10_end_write_request(struct bio *bio, int error) * user-side. So if something waits for IO, then it will * wait for the 'master' bio. */ + sector_t first_bad; + int bad_sectors; + set_bit(R10BIO_Uptodate, &r10_bio->state); - update_head_pos(slot, r10_bio); + /* Maybe we can clear some bad blocks. */ + if (is_badblock(conf->mirrors[dev].rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) { + bio_put(bio); + r10_bio->devs[slot].bio = IO_MADE_GOOD; + dec_rdev = 0; + set_bit(R10BIO_MadeGood, &r10_bio->state); + } + } /* * * Let's see if all mirrored write operations have finished * already. */ - if (atomic_dec_and_test(&r10_bio->remaining)) { - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); - raid_end_bio_io(r10_bio); - } - - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + one_write_done(r10_bio); + if (dec_rdev) + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } @@ -484,11 +555,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */ -static int read_balance(conf_t *conf, r10bio_t *r10_bio) +static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) { const sector_t this_sector = r10_bio->sector; int disk, slot; - const int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors; + int best_good_sectors; sector_t new_distance, best_dist; mdk_rdev_t *rdev; int do_balance; @@ -497,8 +569,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) raid10_find_phys(conf, r10_bio); rcu_read_lock(); retry: + sectors = r10_bio->sectors; best_slot = -1; best_dist = MaxSector; + best_good_sectors = 0; do_balance = 1; /* * Check if we can balance. We can balance on the whole @@ -511,6 +585,10 @@ retry: do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { + sector_t first_bad; + int bad_sectors; + sector_t dev_sector; + if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; disk = r10_bio->devs[slot].devnum; @@ -520,6 +598,37 @@ retry: if (!test_bit(In_sync, &rdev->flags)) continue; + dev_sector = r10_bio->devs[slot].addr; + if (is_badblock(rdev, dev_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* Already have a better slot */ + continue; + if (first_bad <= dev_sector) { + /* Cannot read here. If this is the + * 'primary' device, then we must not read + * beyond 'bad_sectors' from another device. + */ + bad_sectors -= (dev_sector - first_bad); + if (!do_balance && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + } else { + sector_t good_sectors = + first_bad - dev_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_slot = slot; + } + if (!do_balance) + /* Must read from here */ + break; + } + continue; + } else + best_good_sectors = sectors; + if (!do_balance) break; @@ -561,6 +670,7 @@ retry: } else disk = -1; rcu_read_unlock(); + *max_sectors = best_good_sectors; return disk; } @@ -734,6 +844,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) unsigned long flags; mdk_rdev_t *blocked_rdev; int plugged; + int sectors_handled; + int max_sectors; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); @@ -808,12 +920,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) r10_bio->sector = bio->bi_sector; r10_bio->state = 0; + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r10_bio and no locking + * will be needed when the request completes. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + if (rw == READ) { /* * read balancing logic: */ - int disk = read_balance(conf, r10_bio); - int slot = r10_bio->read_slot; + int disk; + int slot; + +read_again: + disk = read_balance(conf, r10_bio, &max_sectors); + slot = r10_bio->read_slot; if (disk < 0) { raid_end_bio_io(r10_bio); return 0; @@ -821,6 +947,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) mirror = conf->mirrors + disk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[slot].bio = read_bio; @@ -831,7 +959,37 @@ static int make_request(mddev_t *mddev, struct bio * bio) read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; - generic_make_request(read_bio); + if (max_sectors < r10_bio->sectors) { + /* Could not read all from this device, so we will + * need another r10_bio. + */ + sectors_handled = (r10_bio->sectors + max_sectors + - bio->bi_sector); + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __generic_make_request + * and subsequent mempool_alloc might block + * waiting for it. so hand bio over to raid10d. + */ + reschedule_retry(r10_bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = ((bio->bi_size >> 9) + - sectors_handled); + r10_bio->state = 0; + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); return 0; } @@ -841,13 +999,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio + * If there are known/acknowledged bad blocks on any device + * on which we have seen a write error, we want to avoid + * writing to those blocks. This potentially requires several + * writes to write around the bad blocks. Each set of writes + * gets its own r10_bio with a set of bios attached. The number + * of r10_bios is recored in bio->bi_phys_segments just as with + * the read case. */ plugged = mddev_check_plugged(mddev); raid10_find_phys(conf, r10_bio); - retry_write: +retry_write: blocked_rdev = NULL; rcu_read_lock(); + max_sectors = r10_bio->sectors; + for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); @@ -856,13 +1023,55 @@ static int make_request(mddev_t *mddev, struct bio * bio) blocked_rdev = rdev; break; } - if (rdev && !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - r10_bio->devs[i].bio = bio; - } else { - r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].bio = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags)) { set_bit(R10BIO_Degraded, &r10_bio->state); + continue; + } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, dev_sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= dev_sector) { + /* Cannot write here at all */ + bad_sectors -= (dev_sector - first_bad); + if (bad_sectors < max_sectors) + /* Mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + /* We don't set R10BIO_Degraded as that + * only applies if the disk is missing, + * so it might be re-added, and we want to + * know to recover this chunk. + * In this case the device is here, and the + * fact that this chunk is not in-sync is + * recorded in the bad block log. + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - dev_sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } } + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); } rcu_read_unlock(); @@ -882,8 +1091,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) goto retry_write; } + if (max_sectors < r10_bio->sectors) { + /* We are splitting this into multiple parts, so + * we need to prepare for allocating another r10_bio. + */ + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + } + sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; + atomic_set(&r10_bio->remaining, 1); - bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); + bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { struct bio *mbio; @@ -892,10 +1115,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); r10_bio->devs[i].bio = mbio; - mbio->bi_sector = r10_bio->devs[i].addr+ - conf->mirrors[d].rdev->data_offset; + mbio->bi_sector = (r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; mbio->bi_rw = WRITE | do_sync | do_fua; @@ -907,15 +1132,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) spin_unlock_irqrestore(&conf->device_lock, flags); } - if (atomic_dec_and_test(&r10_bio->remaining)) { - /* This matches the end of raid10_end_write_request() */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(mddev); - raid_end_bio_io(r10_bio); + /* Don't remove the bias on 'remaining' (one_write_done) until + * after checking if we need to go around again. + */ + + if (sectors_handled < (bio->bi_size >> 9)) { + one_write_done(r10_bio); + /* We need another r10_bio. It has already been counted + * in bio->bi_phys_segments. + */ + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->state = 0; + goto retry_write; } + one_write_done(r10_bio); /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@ -949,6 +1185,30 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "]"); } +/* check if there are enough drives for + * every block to appear on atleast one. + * Don't consider the device numbered 'ignore' + * as we might be about to remove it. + */ +static int enough(conf_t *conf, int ignore) +{ + int first = 0; + + do { + int n = conf->copies; + int cnt = 0; + while (n--) { + if (conf->mirrors[first].rdev && + first != ignore) + cnt++; + first = (first+1) % conf->raid_disks; + } + if (cnt == 0) + return 0; + } while (first != 0); + return 1; +} + static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; @@ -961,13 +1221,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->raid_disks-mddev->degraded == 1) + && !enough(conf, rdev->raid_disk)) /* * Don't fail the drive, just return an IO error. - * The test should really be more sophisticated than - * "working_disks == 1", but it isn't critical, and - * can wait until we do more sophisticated "is the drive - * really dead" tests... */ return; if (test_and_clear_bit(In_sync, &rdev->flags)) { @@ -980,6 +1236,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); } + set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT @@ -1022,27 +1279,6 @@ static void close_sync(conf_t *conf) conf->r10buf_pool = NULL; } -/* check if there are enough drives for - * every block to appear on atleast one - */ -static int enough(conf_t *conf) -{ - int first = 0; - - do { - int n = conf->copies; - int cnt = 0; - while (n--) { - if (conf->mirrors[first].rdev) - cnt++; - first = (first+1) % conf->raid_disks; - } - if (cnt == 0) - return 0; - } while (first != 0); - return 1; -} - static int raid10_spare_active(mddev_t *mddev) { int i; @@ -1078,7 +1314,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) conf_t *conf = mddev->private; int err = -EEXIST; int mirror; - mirror_info_t *p; int first = 0; int last = conf->raid_disks - 1; @@ -1087,44 +1322,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * very different from resync */ return -EBUSY; - if (!enough(conf)) + if (!enough(conf, -1)) return -EINVAL; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && + if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else mirror = first; - for ( ; mirror <= last ; mirror++) - if ( !(p=conf->mirrors+mirror)->rdev) { - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } + for ( ; mirror <= last ; mirror++) { + mirror_info_t *p = &conf->mirrors[mirror]; + if (p->recovery_disabled == mddev->recovery_disabled) + continue; + if (!p->rdev) + continue; - p->head_position = 0; - rdev->raid_disk = mirror; - err = 0; - if (rdev->saved_raid_disk != mirror) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + /* as we don't honour merge_bvec_fn, we must + * never risk violating it, so limit + * ->max_segments to one lying with a single + * page, as a one page request is never in + * violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); } + p->head_position = 0; + rdev->raid_disk = mirror; + err = 0; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -1149,7 +1387,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number) * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && - enough(conf)) { + mddev->recovery_disabled != p->recovery_disabled && + enough(conf, -1)) { err = -EBUSY; goto abort; } @@ -1174,24 +1413,18 @@ static void end_sync_read(struct bio *bio, int error) { r10bio_t *r10_bio = bio->bi_private; conf_t *conf = r10_bio->mddev->private; - int i,d; + int d; - for (i=0; i<conf->copies; i++) - if (r10_bio->devs[i].bio == bio) - break; - BUG_ON(i == conf->copies); - update_head_pos(i, r10_bio); - d = r10_bio->devs[i].devnum; + d = find_bio_disk(conf, r10_bio, bio, NULL); if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); - else { + else + /* The write handler will notice the lack of + * R10BIO_Uptodate and record any errors etc + */ atomic_add(r10_bio->sectors, &conf->mirrors[d].rdev->corrected_errors); - if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) - md_error(r10_bio->mddev, - conf->mirrors[d].rdev); - } /* for reconstruct, we always reschedule after a read. * for resync, only after all reads @@ -1206,40 +1439,60 @@ static void end_sync_read(struct bio *bio, int error) } } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_request(r10bio_t *r10_bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t *r10_bio = bio->bi_private; mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev->private; - int i,d; - - for (i = 0; i < conf->copies; i++) - if (r10_bio->devs[i].bio == bio) - break; - d = r10_bio->devs[i].devnum; - - if (!uptodate) - md_error(mddev, conf->mirrors[d].rdev); - - update_head_pos(i, r10_bio); - rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ sector_t s = r10_bio->sectors; - put_buf(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); md_done_sync(mddev, s, 1); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; - put_buf(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); r10_bio = r10_bio2; } } } +static void end_sync_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t *r10_bio = bio->bi_private; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + int d; + sector_t first_bad; + int bad_sectors; + int slot; + + d = find_bio_disk(conf, r10_bio, bio, &slot); + + if (!uptodate) { + set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); + set_bit(R10BIO_WriteError, &r10_bio->state); + } else if (is_badblock(conf->mirrors[d].rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) + set_bit(R10BIO_MadeGood, &r10_bio->state); + + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + + end_sync_request(r10_bio); +} + /* * Note: sync and recover and handled very differently for raid10 * This code is for resync. @@ -1299,11 +1552,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) if (j == vcnt) continue; mddev->resync_mismatches += r10_bio->sectors; + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + /* Don't fix anything. */ + continue; } - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - /* Don't fix anything. */ - continue; - /* Ok, we need to write this bio + /* Ok, we need to write this bio, either to correct an + * inconsistency or to correct an unreadable block. * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ @@ -1355,32 +1609,107 @@ done: * The second for writing. * */ +static void fix_recovery_read_error(r10bio_t *r10_bio) +{ + /* We got a read error during recovery. + * We repeat the read in smaller page-sized sections. + * If a read succeeds, write it to the new device or record + * a bad block if we cannot. + * If a read fails, record a bad block on both old and + * new devices. + */ + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + struct bio *bio = r10_bio->devs[0].bio; + sector_t sect = 0; + int sectors = r10_bio->sectors; + int idx = 0; + int dr = r10_bio->devs[0].devnum; + int dw = r10_bio->devs[1].devnum; + + while (sectors) { + int s = sectors; + mdk_rdev_t *rdev; + sector_t addr; + int ok; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rdev = conf->mirrors[dr].rdev; + addr = r10_bio->devs[0].addr + sect, + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + READ, false); + if (ok) { + rdev = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + WRITE, false); + if (!ok) + set_bit(WriteErrorSeen, &rdev->flags); + } + if (!ok) { + /* We don't worry if we cannot set a bad block - + * it really is bad so there is no loss in not + * recording it yet + */ + rdev_set_badblocks(rdev, addr, s, 0); + + if (rdev != conf->mirrors[dw].rdev) { + /* need bad block on destination too */ + mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = rdev_set_badblocks(rdev2, addr, s, 0); + if (!ok) { + /* just abort the recovery */ + printk(KERN_NOTICE + "md/raid10:%s: recovery aborted" + " due to read error\n", + mdname(mddev)); + + conf->mirrors[dw].recovery_disabled + = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + break; + } + } + } + + sectors -= s; + sect += s; + idx++; + } +} static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { conf_t *conf = mddev->private; - int i, d; - struct bio *bio, *wbio; + int d; + struct bio *wbio; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { + fix_recovery_read_error(r10_bio); + end_sync_request(r10_bio); + return; + } - /* move the pages across to the second bio + /* + * share the pages with the first bio * and submit the write request */ - bio = r10_bio->devs[0].bio; wbio = r10_bio->devs[1].bio; - for (i=0; i < wbio->bi_vcnt; i++) { - struct page *p = bio->bi_io_vec[i].bv_page; - bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; - wbio->bi_io_vec[i].bv_page = p; - } d = r10_bio->devs[1].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - if (test_bit(R10BIO_Uptodate, &r10_bio->state)) - generic_make_request(wbio); - else - bio_endio(wbio, -EIO); + generic_make_request(wbio); } @@ -1421,6 +1750,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) atomic_set(&rdev->read_errors, read_errors >> hours_since_last); } +static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + sector_t first_bad; + int bad_sectors; + + if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) + && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) + return -1; + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) + set_bit(WriteErrorSeen, &rdev->flags); + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + /* * This is a kernel thread which: * @@ -1476,10 +1825,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { + sector_t first_bad; + int bad_sectors; + d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - test_bit(In_sync, &rdev->flags)) { + test_bit(In_sync, &rdev->flags) && + is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, + &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); success = sync_page_io(rdev, @@ -1499,9 +1853,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (!success) { - /* Cannot read from anywhere -- bye bye array */ + /* Cannot read from anywhere, just mark the block + * as bad on the first device to discourage future + * reads. + */ int dn = r10_bio->devs[r10_bio->read_slot].devnum; - md_error(mddev, conf->mirrors[dn].rdev); + rdev = conf->mirrors[dn].rdev; + + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[r10_bio->read_slot].addr + + sect, + s, 0)) + md_error(mddev, rdev); break; } @@ -1516,80 +1880,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, WRITE, false) - == 0) { - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: read correction " - "write failed" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - md_error(mddev, rdev); - } - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, WRITE) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } sl = start; while (sl != r10_bio->read_slot) { + char b[BDEVNAME_SIZE]; if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, - READ, false) == 0) { - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: unable to read back " - "corrected sectors" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - - md_error(mddev, rdev); - } else { - printk(KERN_INFO - "md/raid10:%s: read error corrected" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - } + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + switch (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, + READ)) { + case 0: + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + break; + case 1: + printk(KERN_INFO + "md/raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + atomic_add(s, &rdev->corrected_errors); } + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } rcu_read_unlock(); @@ -1598,21 +1964,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } } +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +static int submit_bio_wait(int rw, struct bio *bio) +{ + struct completion event; + rw |= REQ_SYNC; + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio->bi_flags); +} + +static int narrow_write_error(r10bio_t *r10_bio, int i) +{ + struct bio *bio = r10_bio->master_bio; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; + /* bio has the data to be written to slot 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this. + * + * We currently own a reference to the rdev. + */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r10_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r10_bio->sector; + sectors = ((r10_bio->sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors' */ + wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(wbio, sector - bio->bi_sector, sectors); + wbio->bi_sector = (r10_bio->devs[i].addr+ + rdev->data_offset+ + (sector - r10_bio->sector)); + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* Failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) +{ + int slot = r10_bio->read_slot; + int mirror = r10_bio->devs[slot].devnum; + struct bio *bio; + conf_t *conf = mddev->private; + mdk_rdev_t *rdev; + char b[BDEVNAME_SIZE]; + unsigned long do_sync; + int max_sectors; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. + */ + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); + } + rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); + + bio = r10_bio->devs[slot].bio; + bdevname(bio->bi_bdev, b); + r10_bio->devs[slot].bio = + mddev->ro ? IO_BLOCKED : NULL; +read_more: + mirror = read_balance(conf, r10_bio, &max_sectors); + if (mirror == -1) { + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + bio_put(bio); + return; + } + + do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + if (bio) + bio_put(bio); + slot = r10_bio->read_slot; + rdev = conf->mirrors[mirror].rdev; + printk_ratelimited( + KERN_ERR + "md/raid10:%s: %s: redirecting" + "sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + bio = bio_clone_mddev(r10_bio->master_bio, + GFP_NOIO, mddev); + md_trim_bio(bio, + r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[slot].bio = bio; + bio->bi_sector = r10_bio->devs[slot].addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ | do_sync; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + if (max_sectors < r10_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r10_bio->master_bio; + int sectors_handled = + r10_bio->sector + max_sectors + - mbio->bi_sector; + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + bio = NULL; + + r10_bio = mempool_alloc(conf->r10bio_pool, + GFP_NOIO); + r10_bio->master_bio = mbio; + r10_bio->sectors = (mbio->bi_size >> 9) + - sectors_handled; + r10_bio->state = 0; + set_bit(R10BIO_ReadError, + &r10_bio->state); + r10_bio->mddev = mddev; + r10_bio->sector = mbio->bi_sector + + sectors_handled; + + goto read_more; + } else + generic_make_request(bio); +} + +static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) +{ + /* Some sort of write request has finished and it + * succeeded in writing where we thought there was a + * bad block. So forget the bad block. + * Or possibly if failed and we need to record + * a bad block. + */ + int m; + mdk_rdev_t *rdev; + + if (test_bit(R10BIO_IsSync, &r10_bio->state) || + test_bit(R10BIO_IsRecover, &r10_bio->state)) { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + if (r10_bio->devs[m].bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, + &r10_bio->devs[m].bio->bi_flags)) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r10_bio); + } else { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + struct bio *bio = r10_bio->devs[m].bio; + rdev = conf->mirrors[dev].rdev; + if (bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } else if (bio != NULL && + !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (!narrow_write_error(r10_bio, m)) { + md_error(conf->mddev, rdev); + set_bit(R10BIO_Degraded, + &r10_bio->state); + } + rdev_dec_pending(rdev, conf->mddev); + } + } + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; - struct bio *bio; unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; - mdk_rdev_t *rdev; struct blk_plug plug; md_check_recovery(mddev); blk_start_plug(&plug); for (;;) { - char b[BDEVNAME_SIZE]; flush_pending_writes(conf); @@ -1628,64 +2227,26 @@ static void raid10d(mddev_t *mddev) mddev = r10_bio->mddev; conf = mddev->private; - if (test_bit(R10BIO_IsSync, &r10_bio->state)) + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + handle_write_completed(conf, r10_bio); + else if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_ReadError, &r10_bio->state)) + handle_read_error(mddev, r10_bio); else { - int slot = r10_bio->read_slot; - int mirror = r10_bio->devs[slot].devnum; - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen. + /* just a partial read to be scheduled from a + * separate context */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, mddev, r10_bio); - unfreeze_array(conf); - } - rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); - - bio = r10_bio->devs[slot].bio; - r10_bio->devs[slot].bio = - mddev->ro ? IO_BLOCKED : NULL; - mirror = read_balance(conf, r10_bio); - if (mirror == -1) { - printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev,b), - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - bio_put(bio); - } else { - const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - bio_put(bio); - slot = r10_bio->read_slot; - rdev = conf->mirrors[mirror].rdev; - if (printk_ratelimit()) - printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" - " another mirror\n", - mdname(mddev), - bdevname(rdev->bdev,b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); - r10_bio->devs[slot].bio = bio; - bio->bi_sector = r10_bio->devs[slot].addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | do_sync; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - generic_make_request(bio); - } + int slot = r10_bio->read_slot; + generic_make_request(r10_bio->devs[slot].bio); } + cond_resched(); + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); } blk_finish_plug(&plug); } @@ -1746,7 +2307,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int i; int max_sync; sector_t sync_blocks; - sector_t sectors_skipped = 0; int chunks_skipped = 0; @@ -1828,7 +2388,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery... the complicated one */ - int j, k; + int j; r10_bio = NULL; for (i=0 ; i<conf->raid_disks; i++) { @@ -1836,6 +2396,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, r10bio_t *rb2; sector_t sect; int must_sync; + int any_working; if (conf->mirrors[i].rdev == NULL || test_bit(In_sync, &conf->mirrors[i].rdev->flags)) @@ -1887,19 +2448,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); + any_working = 0; for (j=0; j<conf->copies;j++) { + int k; int d = r10_bio->devs[j].devnum; + sector_t from_addr, to_addr; + mdk_rdev_t *rdev; + sector_t sector, first_bad; + int bad_sectors; if (!conf->mirrors[d].rdev || !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) continue; /* This is where we read from */ + any_working = 1; + rdev = conf->mirrors[d].rdev; + sector = r10_bio->devs[j].addr; + + if (is_badblock(rdev, sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector + - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; + } + } bio = r10_bio->devs[0].bio; bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[j].addr + + from_addr = r10_bio->devs[j].addr; + bio->bi_sector = from_addr + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; atomic_inc(&conf->mirrors[d].rdev->nr_pending); @@ -1916,26 +2500,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = r10_bio->devs[k].addr + + to_addr = r10_bio->devs[k].addr; + bio->bi_sector = to_addr + conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; r10_bio->devs[0].devnum = d; + r10_bio->devs[0].addr = from_addr; r10_bio->devs[1].devnum = i; + r10_bio->devs[1].addr = to_addr; break; } if (j == conf->copies) { - /* Cannot recover, so abort the recovery */ + /* Cannot recover, so abort the recovery or + * record a bad block */ put_buf(r10_bio); if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - printk(KERN_INFO "md/raid10:%s: insufficient " - "working devices for recovery.\n", - mdname(mddev)); + if (any_working) { + /* problem is that there are bad blocks + * on other device(s) + */ + int k; + for (k = 0; k < conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + if (!rdev_set_badblocks( + conf->mirrors[i].rdev, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + } + if (!any_working) { + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", + mdname(mddev)); + conf->mirrors[i].recovery_disabled + = mddev->recovery_disabled; + } break; } } @@ -1979,12 +2585,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, for (i=0; i<conf->copies; i++) { int d = r10_bio->devs[i].devnum; + sector_t first_bad, sector; + int bad_sectors; + bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; + sector = r10_bio->devs[i].addr; + if (is_badblock(conf->mirrors[d].rdev, + sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector - first_bad); + if (max_sync > bad_sectors) + max_sync = max_sync; + continue; + } + } atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; @@ -1992,7 +2614,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[i].addr + + bio->bi_sector = sector + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; @@ -2079,7 +2701,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, return sectors_skipped + nr_sectors; giveup: /* There is nowhere to write, so all non-sync - * drives must be failed, so try the next chunk... + * drives must be failed or in resync, all drives + * have a bad block, so try the next chunk... */ if (sector_nr + max_sync < max_sector) max_sector = sector_nr + max_sync; @@ -2249,6 +2872,7 @@ static int run(mddev_t *mddev) (conf->raid_disks / conf->near_copies)); list_for_each_entry(rdev, &mddev->disks, same_set) { + disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks || disk_idx < 0) @@ -2271,7 +2895,7 @@ static int run(mddev_t *mddev) disk->head_position = 0; } /* need to check that every block has at least one working mirror */ - if (!enough(conf)) { + if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 944b1104d3b..79cb52a0d4a 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t; struct mirror_info { mdk_rdev_t *rdev; sector_t head_position; + int recovery_disabled; /* matches + * mddev->recovery_disabled + * when we shouldn't try + * recovering this device. + */ }; typedef struct r10bio_s r10bio_t; @@ -113,10 +118,26 @@ struct r10bio_s { * level, we store IO_BLOCKED in the appropriate 'bios' pointer */ #define IO_BLOCKED ((struct bio*)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) /* bits for r10bio.state */ #define R10BIO_Uptodate 0 #define R10BIO_IsSync 1 #define R10BIO_IsRecover 2 #define R10BIO_Degraded 3 +/* Set ReadError on bios that experience a read error + * so that raid10d knows what to do with them. + */ +#define R10BIO_ReadError 4 +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag. + */ +#define R10BIO_MadeGood 5 +#define R10BIO_WriteError 6 #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b72edf35ec5..43709fa6b6d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -51,6 +51,7 @@ #include <linux/seq_file.h> #include <linux/cpu.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "md.h" #include "raid5.h" #include "raid0.h" @@ -96,8 +97,6 @@ #define __inline__ #endif -#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) (unsigned long long)sh->sector, i, dev->toread, dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); - BUG(); + WARN_ON(1); } dev->flags = 0; raid5_build_block(sh, i, previous); @@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); + /* We have already checked bad blocks for reads. Now + * need to check for writes. + */ + while ((rw & WRITE) && rdev && + test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors); + if (!bad) + break; + + if (bad < 0) { + set_bit(BlockedBadBlocks, &rdev->flags); + if (!conf->mddev->external && + conf->mddev->flags) { + /* It is very unlikely, but we might + * still need to write out the + * bad block log - better give it + * a chance*/ + md_check_recovery(conf->mddev); + } + md_wait_for_blocked_rdev(rdev, conf->mddev); + } else { + /* Acknowledged bad block - skip the write */ + rdev_dec_pending(rdev, conf->mddev); + rdev = NULL; + } + } + if (rdev) { if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); @@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; - if ((rw & WRITE) && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, - &rdev->corrected_errors); generic_make_request(bi); } else { if (rw & WRITE) @@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; - spin_lock(&sh->lock); + spin_lock_irq(&sh->raid_conf->device_lock); chosen = dev->towrite; dev->towrite = NULL; BUG_ON(dev->written); wbi = dev->written = chosen; - spin_unlock(&sh->lock); + spin_unlock_irq(&sh->raid_conf->device_lock); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); + sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); + sh->raid_conf = conf; - spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif @@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) return -ENOMEM; for (i = conf->max_nr_stripes; i; i--) { - nsh = kmem_cache_alloc(sc, GFP_KERNEL); + nsh = kmem_cache_zalloc(sc, GFP_KERNEL); if (!nsh) break; - memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); - nsh->raid_conf = conf; - spin_lock_init(&nsh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&nsh->ops.wait_for_ops); #endif @@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk_rl(KERN_INFO "md/raid:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_ratelimited( + KERN_INFO + "md/raid:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded >= conf->max_degraded) - printk_rl(KERN_WARNING - "md/raid:%s: read error not correctable " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk_rl(KERN_WARNING - "md/raid:%s: read error NOT corrected!! " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error) raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + sector_t first_bad; + int bad_sectors; for (i=0 ; i<disks; i++) if (bi == &sh->dev[i].req) @@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error) return; } - if (!uptodate) - md_error(conf->mddev, conf->disks[i].rdev); + if (!uptodate) { + set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); + set_bit(R5_WriteError, &sh->dev[i].flags); + } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_MadeGood, &sh->dev[i].flags); rdev_dec_pending(conf->disks[i].rdev, conf->mddev); @@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); } + set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT @@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, /* * Select the parity disk based on the user selected algorithm. */ - pd_idx = qd_idx = ~0; + pd_idx = qd_idx = -1; switch(conf->level) { case 4: pd_idx = data_disks; @@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in raid5_conf_t *conf = sh->raid_conf; int firstwrite=0; - pr_debug("adding bh b#%llu to stripe s#%llu\n", + pr_debug("adding bi b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); - spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); if (forwrite) { bip = &sh->dev[dd_idx].towrite; @@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in bi->bi_next = *bip; *bip = bi; bi->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); - - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)bi->bi_sector, - (unsigned long long)sh->sector, dd_idx); - - if (conf->mddev->bitmap && firstwrite) { - bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); - } if (forwrite) { /* check if page is covered */ @@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } + spin_unlock_irq(&conf->device_lock); + + pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", + (unsigned long long)(*bip)->bi_sector, + (unsigned long long)sh->sector, dd_idx); + + if (conf->mddev->bitmap && firstwrite) { + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } return 1; overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); return 0; } @@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) - /* multiple read failures in one stripe */ - md_error(conf->mddev, rdev); + atomic_inc(&rdev->nr_pending); + else + rdev = NULL; rcu_read_unlock(); + if (rdev) { + if (!rdev_set_badblocks( + rdev, + sh->sector, + STRIPE_SECTORS, 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } } spin_lock_irq(&conf->device_lock); /* fail all writes first */ @@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0, 0); + /* If we were in the middle of a write the parity block might + * still be locked - so just clear all R5_LOCKED flags + */ + clear_bit(R5_LOCKED, &sh->dev[i].flags); } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* fetch_block5 - checks the given member device to see if its data needs - * to be read or computed to satisfy a request. - * - * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill5 to continue - */ -static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) -{ - struct r5dev *dev = &sh->dev[disk_idx]; - struct r5dev *failed_dev = &sh->dev[s->failed_num]; - - /* is the data in this block needed, and can we get it? */ - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed && - (failed_dev->toread || - (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { - /* We would like to get this block, possibly by computing it, - * otherwise read it if the backing disk is insync - */ - if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num)) { - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - set_bit(R5_Wantcompute, &dev->flags); - sh->ops.target = disk_idx; - sh->ops.target2 = -1; - s->req_compute = 1; - /* Careful: from this point on 'uptodate' is in the eye - * of raid_run_ops which services 'compute' operations - * before writes. R5_Wantcompute flags a block that will - * be R5_UPTODATE by the time it is needed for a - * subsequent operation. - */ - s->uptodate++; - return 1; /* uptodate + compute == disks */ - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", disk_idx, - s->syncing); - } - } - - return 0; -} - -/** - * handle_stripe_fill5 - read or compute data to satisfy pending requests. - */ -static void handle_stripe_fill5(struct stripe_head *sh, - struct stripe_head_state *s, int disks) +static void +handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh, + struct stripe_head_state *s) { + int abort = 0; int i; - /* look for blocks to read/compute, skip this if a compute - * is already in flight, or if the stripe contents are in the - * midst of changing due to a write + md_done_sync(conf->mddev, STRIPE_SECTORS, 0); + clear_bit(STRIPE_SYNCING, &sh->state); + s->syncing = 0; + /* There is nothing more to do for sync/check/repair. + * For recover we need to record a bad block on all + * non-sync devices, or abort the recovery */ - if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) - for (i = disks; i--; ) - if (fetch_block5(sh, s, i, disks)) - break; - set_bit(STRIPE_HANDLE, &sh->state); + if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) + return; + /* During recovery devices cannot be removed, so locking and + * refcounting of rdevs is not needed + */ + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (!rdev + || test_bit(Faulty, &rdev->flags) + || test_bit(In_sync, &rdev->flags)) + continue; + if (!rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + } + if (abort) { + conf->recovery_disabled = conf->mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); + } } -/* fetch_block6 - checks the given member device to see if its data needs +/* fetch_block - checks the given member device to see if its data needs * to be read or computed to satisfy a request. * * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill6 to continue + * 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, - struct r6_state *r6s, int disk_idx, int disks) +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; - struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], - &sh->dev[r6s->failed_num[1]] }; + struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], + &sh->dev[s->failed_num[1]] }; + /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || - (s->failed >= 1 && - (fdev[0]->toread || s->to_write)) || - (s->failed >= 2 && - (fdev[1]->toread || s->to_write)))) { + (s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread) || + (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || + (sh->raid_conf->level == 6 && s->failed && s->to_write))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); BUG_ON(test_bit(R5_Wantread, &dev->flags)); if ((s->uptodate == disks - 1) && - (s->failed && (disk_idx == r6s->failed_num[0] || - disk_idx == r6s->failed_num[1]))) { + (s->failed && (disk_idx == s->failed_num[0] || + disk_idx == s->failed_num[1]))) { /* have disk failed, and we're requested to fetch it; * do compute it */ @@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, sh->ops.target = disk_idx; sh->ops.target2 = -1; /* no 2nd target */ s->req_compute = 1; + /* Careful: from this point on 'uptodate' is in the eye + * of raid_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. + */ s->uptodate++; return 1; } else if (s->uptodate == disks-2 && s->failed >= 2) { @@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, } /** - * handle_stripe_fill6 - read or compute data to satisfy pending requests. + * handle_stripe_fill - read or compute data to satisfy pending requests. */ -static void handle_stripe_fill6(struct stripe_head *sh, - struct stripe_head_state *s, struct r6_state *r6s, - int disks) +static void handle_stripe_fill(struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { int i; @@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh, if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && !sh->reconstruct_state) for (i = disks; i--; ) - if (fetch_block6(sh, s, r6s, i, disks)) + if (fetch_block(sh, s, i, disks)) break; set_bit(STRIPE_HANDLE, &sh->state); } @@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_stripe_dirtying5(raid5_conf_t *conf, - struct stripe_head *sh, struct stripe_head_state *s, int disks) +static void handle_stripe_dirtying(raid5_conf_t *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { int rmw = 0, rcw = 0, i; - for (i = disks; i--; ) { + if (conf->max_degraded == 2) { + /* RAID6 requires 'rcw' in current implementation + * Calculate the real rcw later - for now fake it + * look like rcw is cheaper + */ + rcw = 1; rmw = 2; + } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, } } } - if (rcw <= rmw && rcw > 0) + if (rcw <= rmw && rcw > 0) { /* want reconstruct write, but need to get some data */ + rcw = 0; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && - i != sh->pd_idx && + i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags)) && - test_bit(R5_Insync, &dev->flags)) { + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (!test_bit(R5_Insync, &dev->flags)) + continue; /* it's a failed drive */ if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old block " @@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, } } } + } /* now if nothing is locked, and if we have enough data, * we can start a write request */ @@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, schedule_reconstruction(sh, s, rcw == 0, 0); } -static void handle_stripe_dirtying6(raid5_conf_t *conf, - struct stripe_head *sh, struct stripe_head_state *s, - struct r6_state *r6s, int disks) -{ - int rcw = 0, pd_idx = sh->pd_idx, i; - int qd_idx = sh->qd_idx; - - set_bit(STRIPE_HANDLE, &sh->state); - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - /* check if we haven't enough data */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && - i != pd_idx && i != qd_idx && - !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - pr_debug("Request delayed stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - /* now if nothing is locked, and if we have enough data, we can start a - * write request - */ - if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && - s->locked == 0 && rcw == 0 && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - schedule_reconstruction(sh, s, 1, 0); - } -} - static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { @@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, s->uptodate--; break; } - dev = &sh->dev[s->failed_num]; + dev = &sh->dev[s->failed_num[0]]; /* fall through */ case check_state_compute_result: sh->check_state = check_state_idle; @@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, - struct r6_state *r6s, int disks) + int disks) { int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; @@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, switch (sh->check_state) { case check_state_idle: /* start a new check operation if there are < 2 failures */ - if (s->failed == r6s->q_failed) { + if (s->failed == s->q_failed) { /* The only possible failed device holds Q, so it * makes sense to check P (If anything else were failed, * we would have used P to recreate it). */ sh->check_state = check_state_run; } - if (!r6s->q_failed && s->failed < 2) { + if (!s->q_failed && s->failed < 2) { /* Q is not failed, and we didn't use it to generate * anything, so it makes sense to check it */ @@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, */ BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ if (s->failed == 2) { - dev = &sh->dev[r6s->failed_num[1]]; + dev = &sh->dev[s->failed_num[1]]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } if (s->failed >= 1) { - dev = &sh->dev[r6s->failed_num[0]]; + dev = &sh->dev[s->failed_num[0]]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); @@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, } } -static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, - struct r6_state *r6s) +static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) { int i; @@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - (!r6s || j != sh2->qd_idx) && + j != sh2->qd_idx && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, * */ -static void handle_stripe5(struct stripe_head *sh) +static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; - int disks = sh->disks, i; - struct bio *return_bi = NULL; - struct stripe_head_state s; + int disks = sh->disks; struct r5dev *dev; - mdk_rdev_t *blocked_rdev = NULL; - int prexor; - int dec_preread_active = 0; + int i; - memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " - "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, sh->check_state, - sh->reconstruct_state); + memset(s, 0, sizeof(*s)); - spin_lock(&sh->lock); - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - - s.syncing = test_bit(STRIPE_SYNCING, &sh->state); - s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->syncing = test_bit(STRIPE_SYNCING, &sh->state); + s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->failed_num[0] = -1; + s->failed_num[1] = -1; /* Now to look around and see what can be done */ rcu_read_lock(); + spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { mdk_rdev_t *rdev; + sector_t first_bad; + int bad_sectors; + int is_bad = 0; dev = &sh->dev[i]; - pr_debug("check %d: state 0x%lx toread %p read %p write %p " - "written %p\n", i, dev->flags, dev->toread, dev->read, - dev->towrite, dev->written); - - /* maybe we can request a biofill operation + pr_debug("check %d: state 0x%lx read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read * * new wantfill requests are only permitted while * ops_complete_biofill is guaranteed to be inactive @@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; - if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; + if (test_bit(R5_LOCKED, &dev->flags)) + s->locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) + s->uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) { + s->compute++; + BUG_ON(s->compute > 2); + } if (test_bit(R5_Wantfill, &dev->flags)) - s.to_fill++; + s->to_fill++; else if (dev->toread) - s.to_read++; + s->to_read++; if (dev->towrite) { - s.to_write++; + s->to_write++; if (!test_bit(R5_OVERWRITE, &dev->flags)) - s.non_overwrite++; + s->non_overwrite++; } if (dev->written) - s.written++; + s->written++; rdev = rcu_dereference(conf->disks[i].rdev); - if (blocked_rdev == NULL && - rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); + if (rdev) { + is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors); + if (s->blocked_rdev == NULL + && (test_bit(Blocked, &rdev->flags) + || is_bad < 0)) { + if (is_bad < 0) + set_bit(BlockedBadBlocks, + &rdev->flags); + s->blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } } clear_bit(R5_Insync, &dev->flags); if (!rdev) /* Not in-sync */; - else if (test_bit(In_sync, &rdev->flags)) + else if (is_bad) { + /* also not in-sync */ + if (!test_bit(WriteErrorSeen, &rdev->flags)) { + /* treat as in-sync, but with a read error + * which we can now try to correct + */ + set_bit(R5_Insync, &dev->flags); + set_bit(R5_ReadError, &dev->flags); + } + } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); else { - /* could be in-sync depending on recovery/reshape status */ + /* in sync if before recovery_offset */ if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) set_bit(R5_Insync, &dev->flags); } + if (test_bit(R5_WriteError, &dev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(Faulty, &rdev->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev->nr_pending); + } else + clear_bit(R5_WriteError, &dev->flags); + } + if (test_bit(R5_MadeGood, &dev->flags)) { + if (!test_bit(Faulty, &rdev->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev->nr_pending); + } else + clear_bit(R5_MadeGood, &dev->flags); + } if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh) if (test_bit(R5_ReadError, &dev->flags)) clear_bit(R5_Insync, &dev->flags); if (!test_bit(R5_Insync, &dev->flags)) { - s.failed++; - s.failed_num = i; + if (s->failed < 2) + s->failed_num[s->failed] = i; + s->failed++; } } + spin_unlock_irq(&conf->device_lock); rcu_read_unlock(); - - if (unlikely(blocked_rdev)) { - if (s.syncing || s.expanding || s.expanded || - s.to_write || s.written) { - set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; - } - /* There is nothing for the blocked_rdev to block */ - rdev_dec_pending(blocked_rdev, conf->mddev); - blocked_rdev = NULL; - } - - if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { - set_bit(STRIPE_OP_BIOFILL, &s.ops_request); - set_bit(STRIPE_BIOFILL_RUN, &sh->state); - } - - pr_debug("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d\n", - s.locked, s.uptodate, s.to_read, s.to_write, - s.failed, s.failed_num); - /* check if the array has lost two devices and, if so, some requests might - * need to be failed - */ - if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &return_bi); - if (s.failed > 1 && s.syncing) { - md_done_sync(conf->mddev, STRIPE_SECTORS,0); - clear_bit(STRIPE_SYNCING, &sh->state); - s.syncing = 0; - } - - /* might be able to return some write requests if the parity block - * is safe, or on a failed drive - */ - dev = &sh->dev[sh->pd_idx]; - if ( s.written && - ((test_bit(R5_Insync, &dev->flags) && - !test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) || - (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_stripe_clean_event(conf, sh, disks, &return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) - handle_stripe_fill5(sh, &s, disks); - - /* Now we check to see if any write operations have recently - * completed - */ - prexor = 0; - if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) - prexor = 1; - if (sh->reconstruct_state == reconstruct_state_drain_result || - sh->reconstruct_state == reconstruct_state_prexor_drain_result) { - sh->reconstruct_state = reconstruct_state_idle; - - /* All the 'written' buffers and the parity block are ready to - * be written back to disk - */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); - for (i = disks; i--; ) { - dev = &sh->dev[i]; - if (test_bit(R5_LOCKED, &dev->flags) && - (i == sh->pd_idx || dev->written)) { - pr_debug("Writing block %d\n", i); - set_bit(R5_Wantwrite, &dev->flags); - if (prexor) - continue; - if (!test_bit(R5_Insync, &dev->flags) || - (i == sh->pd_idx && s.failed == 0)) - set_bit(STRIPE_INSYNC, &sh->state); - } - } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - dec_preread_active = 1; - } - - /* Now to consider new write requests and what else, if anything - * should be read. We do not handle new writes when: - * 1/ A 'write' operation (copy+xor) is already in flight. - * 2/ A 'check' operation is in flight, as it may clobber the parity - * block. - */ - if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying5(conf, sh, &s, disks); - - /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough - * data is available. The parity check is held off while parity - * dependent operations are in flight. - */ - if (sh->check_state || - (s.syncing && s.locked == 0 && - !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && - !test_bit(STRIPE_INSYNC, &sh->state))) - handle_parity_checks5(conf, sh, &s, disks); - - if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS,1); - clear_bit(STRIPE_SYNCING, &sh->state); - } - - /* If the failed drive is just a ReadError, then we might need to progress - * the repair/check process - */ - if (s.failed == 1 && !conf->mddev->ro && - test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) - && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) - && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) - ) { - dev = &sh->dev[s.failed_num]; - if (!test_bit(R5_ReWrite, &dev->flags)) { - set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { - /* let's read it back */ - set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } - } - - /* Finish reconstruct operations initiated by the expansion process */ - if (sh->reconstruct_state == reconstruct_state_result) { - struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1, 1); - if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { - /* sh cannot be written until sh2 has been read. - * so arrange for sh to be delayed a little - */ - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, - &sh2->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe(sh2); - goto unlock; - } - if (sh2) - release_stripe(sh2); - - sh->reconstruct_state = reconstruct_state_idle; - clear_bit(STRIPE_EXPANDING, &sh->state); - for (i = conf->raid_disks; i--; ) { - set_bit(R5_Wantwrite, &sh->dev[i].flags); - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - } - } - - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !sh->reconstruct_state) { - /* Need to write out all blocks after computing parity */ - sh->disks = conf->raid_disks; - stripe_set_idx(sh->sector, conf, 0, sh); - schedule_reconstruction(sh, &s, 1, 1); - } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { - clear_bit(STRIPE_EXPAND_READY, &sh->state); - atomic_dec(&conf->reshape_stripes); - wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); - } - - if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) - handle_stripe_expansion(conf, sh, NULL); - - unlock: - spin_unlock(&sh->lock); - - /* wait for this device to become unblocked */ - if (unlikely(blocked_rdev)) - md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - - if (s.ops_request) - raid_run_ops(sh, s.ops_request); - - ops_run_io(sh, &s); - - if (dec_preread_active) { - /* We delay this until after ops_run_io so that if make_request - * is waiting on a flush, it won't continue until the writes - * have actually been submitted. - */ - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - return_io(return_bi); } -static void handle_stripe6(struct stripe_head *sh) +static void handle_stripe(struct stripe_head *sh) { + struct stripe_head_state s; raid5_conf_t *conf = sh->raid_conf; + int i; + int prexor; int disks = sh->disks; - struct bio *return_bi = NULL; - int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; - struct stripe_head_state s; - struct r6_state r6s; - struct r5dev *dev, *pdev, *qdev; - mdk_rdev_t *blocked_rdev = NULL; - int dec_preread_active = 0; + struct r5dev *pdev, *qdev; + + clear_bit(STRIPE_HANDLE, &sh->state); + if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { + /* already being handled, ensure it gets handled + * again when current action finishes */ + set_bit(STRIPE_HANDLE, &sh->state); + return; + } + + if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + } + clear_bit(STRIPE_DELAYED, &sh->state); pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, qd_idx, + atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, sh->check_state, sh->reconstruct_state); - memset(&s, 0, sizeof(s)); - - spin_lock(&sh->lock); - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - - s.syncing = test_bit(STRIPE_SYNCING, &sh->state); - s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - rcu_read_lock(); - for (i=disks; i--; ) { - mdk_rdev_t *rdev; - dev = &sh->dev[i]; - pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read - * - * new wantfill requests are only permitted while - * ops_complete_biofill is guaranteed to be inactive - */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) - set_bit(R5_Wantfill, &dev->flags); + analyse_stripe(sh, &s); - /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; - if (test_bit(R5_Wantcompute, &dev->flags)) { - s.compute++; - BUG_ON(s.compute > 2); - } - - if (test_bit(R5_Wantfill, &dev->flags)) { - s.to_fill++; - } else if (dev->toread) - s.to_read++; - if (dev->towrite) { - s.to_write++; - if (!test_bit(R5_OVERWRITE, &dev->flags)) - s.non_overwrite++; - } - if (dev->written) - s.written++; - rdev = rcu_dereference(conf->disks[i].rdev); - if (blocked_rdev == NULL && - rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); - } - clear_bit(R5_Insync, &dev->flags); - if (!rdev) - /* Not in-sync */; - else if (test_bit(In_sync, &rdev->flags)) - set_bit(R5_Insync, &dev->flags); - else { - /* in sync if before recovery_offset */ - if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) - set_bit(R5_Insync, &dev->flags); - } - if (!test_bit(R5_Insync, &dev->flags)) { - /* The ReadError flag will just be confusing now */ - clear_bit(R5_ReadError, &dev->flags); - clear_bit(R5_ReWrite, &dev->flags); - } - if (test_bit(R5_ReadError, &dev->flags)) - clear_bit(R5_Insync, &dev->flags); - if (!test_bit(R5_Insync, &dev->flags)) { - if (s.failed < 2) - r6s.failed_num[s.failed] = i; - s.failed++; - } + if (s.handle_bad_blocks) { + set_bit(STRIPE_HANDLE, &sh->state); + goto finish; } - rcu_read_unlock(); - if (unlikely(blocked_rdev)) { + if (unlikely(s.blocked_rdev)) { if (s.syncing || s.expanding || s.expanded || s.to_write || s.written) { set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; + goto finish; } /* There is nothing for the blocked_rdev to block */ - rdev_dec_pending(blocked_rdev, conf->mddev); - blocked_rdev = NULL; + rdev_dec_pending(s.blocked_rdev, conf->mddev); + s.blocked_rdev = NULL; } if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { @@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh) pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, - r6s.failed_num[0], r6s.failed_num[1]); - /* check if the array has lost >2 devices and, if so, some requests - * might need to be failed + s.failed_num[0], s.failed_num[1]); + /* check if the array has lost more than max_degraded devices and, + * if so, some requests might need to be failed. */ - if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &return_bi); - if (s.failed > 2 && s.syncing) { - md_done_sync(conf->mddev, STRIPE_SECTORS,0); - clear_bit(STRIPE_SYNCING, &sh->state); - s.syncing = 0; - } + if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written) + handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); + if (s.failed > conf->max_degraded && s.syncing) + handle_failed_sync(conf, sh, &s); /* * might be able to return some write requests if the parity blocks * are safe, or on a failed drive */ - pdev = &sh->dev[pd_idx]; - r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); - qdev = &sh->dev[qd_idx]; - r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); - - if ( s.written && - ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) && test_bit(R5_UPTODATE, &pdev->flags)))) && - ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_stripe_clean_event(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) - handle_stripe_fill6(sh, &s, &r6s, disks); + if (s.to_read || s.non_overwrite + || (conf->level == 6 && s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + handle_stripe_fill(sh, &s, disks); /* Now we check to see if any write operations have recently * completed */ - if (sh->reconstruct_state == reconstruct_state_drain_result) { - + prexor = 0; + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { sh->reconstruct_state = reconstruct_state_idle; - /* All the 'written' buffers and the parity blocks are ready to + + /* All the 'written' buffers and the parity block are ready to * be written back to disk */ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); + BUG_ON(sh->qd_idx >= 0 && + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { - dev = &sh->dev[i]; + struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && - (i == sh->pd_idx || i == qd_idx || - dev->written)) { + (i == sh->pd_idx || i == sh->qd_idx || + dev->written)) { pr_debug("Writing block %d\n", i); - BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); set_bit(R5_Wantwrite, &dev->flags); + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || - ((i == sh->pd_idx || i == qd_idx) && - s.failed == 0)) + ((i == sh->pd_idx || i == sh->qd_idx) && + s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); } } if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - dec_preread_active = 1; + s.dec_preread_active = 1; } /* Now to consider new write requests and what else, if anything * should be read. We do not handle new writes when: - * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. + * 1/ A 'write' operation (copy+xor) is already in flight. * 2/ A 'check' operation is in flight, as it may clobber the parity * block. */ if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); + handle_stripe_dirtying(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh) if (sh->check_state || (s.syncing && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && - !test_bit(STRIPE_INSYNC, &sh->state))) - handle_parity_checks6(conf, sh, &s, &r6s, disks); + !test_bit(STRIPE_INSYNC, &sh->state))) { + if (conf->level == 6) + handle_parity_checks6(conf, sh, &s, disks); + else + handle_parity_checks5(conf, sh, &s, disks); + } if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS,1); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); } /* If the failed drives are just a ReadError, then we might need * to progress the repair/check process */ - if (s.failed <= 2 && !conf->mddev->ro) + if (s.failed <= conf->max_degraded && !conf->mddev->ro) for (i = 0; i < s.failed; i++) { - dev = &sh->dev[r6s.failed_num[i]]; + struct r5dev *dev = &sh->dev[s.failed_num[i]]; if (test_bit(R5_ReadError, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags) @@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh) } } + /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh_src + = get_active_stripe(conf, sh->sector, 1, 1, 1); + if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { + /* sh cannot be written until sh_src has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh_src->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh_src); + goto finish; + } + if (sh_src) + release_stripe(sh_src); + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); for (i = conf->raid_disks; i--; ) { @@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && !sh->reconstruct_state) { - struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1, 1); - if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { - /* sh cannot be written until sh2 has been read. - * so arrange for sh to be delayed a little - */ - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, - &sh2->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe(sh2); - goto unlock; - } - if (sh2) - release_stripe(sh2); - - /* Need to write out all blocks after computing P&Q */ + /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); schedule_reconstruction(sh, &s, 1, 1); @@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh) if (s.expanding && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) - handle_stripe_expansion(conf, sh, &r6s); - - unlock: - spin_unlock(&sh->lock); + handle_stripe_expansion(conf, sh); +finish: /* wait for this device to become unblocked */ - if (unlikely(blocked_rdev)) - md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + if (conf->mddev->external && unlikely(s.blocked_rdev)) + md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + + if (s.handle_bad_blocks) + for (i = disks; i--; ) { + mdk_rdev_t *rdev; + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_WriteError, &dev->flags)) { + /* We own a safe reference to the rdev */ + rdev = conf->disks[i].rdev; + if (!rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { + rdev = conf->disks[i].rdev; + rdev_clear_badblocks(rdev, sh->sector, + STRIPE_SECTORS); + rdev_dec_pending(rdev, conf->mddev); + } + } if (s.ops_request) raid_run_ops(sh, s.ops_request); ops_run_io(sh, &s); - - if (dec_preread_active) { + if (s.dec_preread_active) { /* We delay this until after ops_run_io so that if make_request * is waiting on a flush, it won't continue until the writes * have actually been submitted. @@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh) md_wakeup_thread(conf->mddev->thread); } - return_io(return_bi); -} + return_io(s.return_bi); -static void handle_stripe(struct stripe_head *sh) -{ - if (sh->raid_conf->level == 6) - handle_stripe6(sh); - else - handle_stripe5(sh); + clear_bit(STRIPE_ACTIVE, &sh->state); } static void raid5_activate_delayed(raid5_conf_t *conf) @@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + atomic_inc(&rdev->nr_pending); rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; @@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); align_bi->bi_sector += rdev->data_offset; - if (!bio_fits_rdev(align_bi)) { - /* too big in some way */ + if (!bio_fits_rdev(align_bi) || + is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, + &first_bad, &bad_sectors)) { + /* too big in some way, or has a known bad block */ bio_put(align_bi); rdev_dec_pending(rdev, mddev); return 0; @@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) } } - if (bio_data_dir(bi) == WRITE && + if (rw == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); @@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) } if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { + !add_stripe_bio(sh, bi, dd_idx, rw)) { /* Stripe is busy expanding or * add failed due to overlap. Flush everything * and wait a while @@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); - spin_lock(&sh->lock); - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); - spin_unlock(&sh->lock); + set_bit(STRIPE_SYNC_REQUESTED, &sh->state); handle_stripe(sh); release_stripe(sh); @@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev) release_stripe(sh); cond_resched(); + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); + spin_lock_irq(&conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && number < conf->raid_disks) { err = -EBUSY; @@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + if (has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { - char nm[20]; if (rdev->raid_disk >= conf->previous_raid_disks) { set_bit(In_sync, &rdev->flags); added_devices++; } else rdev->recovery_offset = 0; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) + + if (sysfs_link_rdev(mddev, rdev)) /* Failure here is OK */; } } else if (rdev->raid_disk >= conf->previous_raid_disks @@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev) d++) { mdk_rdev_t *rdev = conf->disks[d].rdev; if (rdev && raid5_remove_disk(mddev, d) == 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 3ca77a2613b..11b9566184b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -6,11 +6,11 @@ /* * - * Each stripe contains one buffer per disc. Each buffer can be in + * Each stripe contains one buffer per device. Each buffer can be in * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under a per-stripe - * spinlock. Some very specific changes can happen in bi_end_io, and - * these are not protected by the spin lock. + * these states happen *almost* exclusively under the protection of the + * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and + * these are not protected by STRIPE_ACTIVE. * * The flag bits that are used to represent these states are: * R5_UPTODATE and R5_LOCKED @@ -76,12 +76,10 @@ * block and the cached buffer are successfully written, any buffer on * a written list can be returned with b_end_io. * - * The write list and read list both act as fifos. The read list is - * protected by the device_lock. The write and written lists are - * protected by the stripe lock. The device_lock, which can be - * claimed while the stipe lock is held, is only for list - * manipulations and will only be held for a very short time. It can - * be claimed from interrupts. + * The write list and read list both act as fifos. The read list, + * write list and written list are protected by the device_lock. + * The device_lock is only for list manipulations and will only be + * held for a very short time. It can be claimed from interrupts. * * * Stripes in the stripe cache can be on one of two lists (or on @@ -96,7 +94,6 @@ * * The inactive_list, handle_list and hash bucket lists are all protected by the * device_lock. - * - stripes on the inactive_list never have their stripe_lock held. * - stripes have a reference counter. If count==0, they are on a list. * - If a stripe might need handling, STRIPE_HANDLE is set. * - When refcount reaches zero, then if STRIPE_HANDLE it is put on @@ -116,10 +113,10 @@ * attach a request to an active stripe (add_stripe_bh()) * lockdev attach-buffer unlockdev * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... + * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... * (lockdev check-buffers unlockdev) .. * change-state .. - * record io/ops needed unlockstripe schedule io/ops + * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops * release an active stripe (release_stripe()) * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev * @@ -128,8 +125,7 @@ * on a cached buffer, and plus one if the stripe is undergoing stripe * operations. * - * Stripe operations are performed outside the stripe lock, - * the stripe operations are: + * The stripe operations are: * -copying data between the stripe cache and user application buffers * -computing blocks to save a disk access, or to recover a missing block * -updating the parity on a write operation (reconstruct write and @@ -159,7 +155,8 @@ */ /* - * Operations state - intermediate states that are visible outside of sh->lock + * Operations state - intermediate states that are visible outside of + * STRIPE_ACTIVE. * In general _idle indicates nothing is running, _run indicates a data * processing operation is active, and _result means the data processing result * is stable and can be acted upon. For simple operations like biofill and @@ -209,7 +206,6 @@ struct stripe_head { short ddf_layout;/* use DDF ordering to calculate Q */ unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ - spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ enum check_states check_state; @@ -240,19 +236,20 @@ struct stripe_head { }; /* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. It is only valid under spin_lock(sh->lock); + * for handle_stripe. */ struct stripe_head_state { int syncing, expanding, expanded; int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; - int failed_num; + int failed_num[2]; + int p_failed, q_failed; + int dec_preread_active; unsigned long ops_request; -}; -/* r6_state - extra state data only relevant to r6 */ -struct r6_state { - int p_failed, q_failed, failed_num[2]; + struct bio *return_bi; + mdk_rdev_t *blocked_rdev; + int handle_bad_blocks; }; /* Flags */ @@ -268,14 +265,16 @@ struct r6_state { #define R5_ReWrite 9 /* have tried to over-write the readerror */ #define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -#define R5_WantFUA 14 /* Write should be FUA */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +#define R5_WantFUA 14 /* Write should be FUA */ +#define R5_WriteError 15 /* got a write error - need to record it */ +#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ /* * Write method */ @@ -289,21 +288,25 @@ struct r6_state { /* * Stripe state */ -#define STRIPE_HANDLE 2 -#define STRIPE_SYNCING 3 -#define STRIPE_INSYNC 4 -#define STRIPE_PREREAD_ACTIVE 5 -#define STRIPE_DELAYED 6 -#define STRIPE_DEGRADED 7 -#define STRIPE_BIT_DELAY 8 -#define STRIPE_EXPANDING 9 -#define STRIPE_EXPAND_SOURCE 10 -#define STRIPE_EXPAND_READY 11 -#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ -#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ -#define STRIPE_BIOFILL_RUN 14 -#define STRIPE_COMPUTE_RUN 15 -#define STRIPE_OPS_REQ_PENDING 16 +enum { + STRIPE_ACTIVE, + STRIPE_HANDLE, + STRIPE_SYNC_REQUESTED, + STRIPE_SYNCING, + STRIPE_INSYNC, + STRIPE_PREREAD_ACTIVE, + STRIPE_DELAYED, + STRIPE_DEGRADED, + STRIPE_BIT_DELAY, + STRIPE_EXPANDING, + STRIPE_EXPAND_SOURCE, + STRIPE_EXPAND_READY, + STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ + STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ + STRIPE_BIOFILL_RUN, + STRIPE_COMPUTE_RUN, + STRIPE_OPS_REQ_PENDING, +}; /* * Operation request flags @@ -336,7 +339,7 @@ struct r6_state { * PREREAD_ACTIVE. * In stripe_handle, if we find pre-reading is necessary, we do it if * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leave nothing locked. + * HANDLE gets cleared if stripe_handle leaves nothing locked. */ @@ -399,7 +402,7 @@ struct raid5_private_data { * (fresh device added). * Cleared when a sync completes. */ - + int recovery_disabled; /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ |