diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-08 13:28:33 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-08 13:28:33 -0800 |
commit | 2943c833222ef87c111ee0c6b7b8519ad2983e99 (patch) | |
tree | 0ef8cc4f72a63b325e7ae858ec68822ec4f3c64f | |
parent | 98793265b429a3f0b3f1750e74d67cd4d740d162 (diff) | |
parent | 19d671695e1931ebfd75b2b888778201aefe35ca (diff) |
Merge tag 'md-3.3' of git://neil.brown.name/md
md update for 3.3
Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that
wants-replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere,
the wants-replacement device will be removed.
* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
md/raid1: Mark device want_replacement when we see a write error.
md/raid1: If there is a spare and a want_replacement device, start replacement.
md/raid1: recognise replacements when assembling arrays.
md/raid1: handle activation of replacement device when recovery completes.
md/raid1: Allow a failed replacement device to be removed.
md/raid1: Allocate spare to store replacement devices and their bios.
md/raid1: Replace use of mddev->raid_disks with conf->raid_disks.
md/raid10: If there is a spare and a want_replacement device, start replacement.
md/raid10: recognise replacements when assembling array.
md/raid10: Allow replacement device to be replace old drive.
md/raid10: handle recovery of replacement devices.
md/raid10: Handle replacement devices during resync.
md/raid10: writes should get directed to replacement as well as original.
md/raid10: allow removal of failed replacement devices.
md/raid10: preferentially read from replacement device if possible.
md/raid10: change read_balance to return an rdev
md/raid10: prepare data structures for handling replacement.
md/raid5: Mark device want_replacement when we see a write error.
md/raid5: If there is a spare and a want_replacement device, start replacement.
md/raid5: recognise replacements when assembling array.
...
-rw-r--r-- | Documentation/md.txt | 22 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 12 | ||||
-rw-r--r-- | drivers/md/md.c | 107 | ||||
-rw-r--r-- | drivers/md/md.h | 82 | ||||
-rw-r--r-- | drivers/md/multipath.c | 7 | ||||
-rw-r--r-- | drivers/md/raid1.c | 174 | ||||
-rw-r--r-- | drivers/md/raid1.h | 7 | ||||
-rw-r--r-- | drivers/md/raid10.c | 582 | ||||
-rw-r--r-- | drivers/md/raid10.h | 61 | ||||
-rw-r--r-- | drivers/md/raid5.c | 557 | ||||
-rw-r--r-- | drivers/md/raid5.h | 98 | ||||
-rw-r--r-- | include/linux/raid/md_p.h | 7 | ||||
-rw-r--r-- | include/linux/raid/pq.h | 2 |
13 files changed, 1280 insertions, 438 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt index fc94770f44a..993fba37b7d 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -357,14 +357,14 @@ Each directory contains: written to, that device. state - A file recording the current state of the device in the array + A file recording the current state of the device in the array which can be a comma separated list of faulty - device has been kicked from active use due to - a detected fault or it has unacknowledged bad - blocks + a detected fault, or it has unacknowledged bad + blocks in_sync - device is a fully in-sync member of the array writemostly - device will only be subject to read - requests if there are no other options. + requests if there are no other options. This applies only to raid1 arrays. blocked - device has failed, and the failure hasn't been acknowledged yet by the metadata handler. @@ -374,6 +374,13 @@ Each directory contains: This includes spares that are in the process of being recovered to write_error - device has ever seen a write error. + want_replacement - device is (mostly) working but probably + should be replaced, either due to errors or + due to user request. + replacement - device is a replacement for another active + device with same raid_disk. + + This list may grow in future. This can be written to. Writing "faulty" simulates a failure on the device. @@ -386,6 +393,13 @@ Each directory contains: Writing "in_sync" sets the in_sync flag. Writing "write_error" sets writeerrorseen flag. Writing "-write_error" clears writeerrorseen flag. + Writing "want_replacement" is allowed at any time except to a + replacement device or a spare. It sets the flag. + Writing "-want_replacement" is allowed at any time. It clears + the flag. + Writing "replacement" or "-replacement" is only allowed before + starting the array. It sets or clears the flag. + This file responds to select/poll. Any change to 'faulty' or 'blocked' causes an event. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6d03774b176..cdf36b1e9aa 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev) return; } if (time_before(jiffies, bitmap->daemon_lastrun - + bitmap->mddev->bitmap_info.daemon_sleep)) + + mddev->bitmap_info.daemon_sleep)) goto done; bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; goto done; } bitmap->allclean = 1; @@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev) * sure that events_cleared is up-to-date. */ if (bitmap->need_sync && - bitmap->mddev->bitmap_info.external == 0) { + mddev->bitmap_info.external == 0) { bitmap_super_t *sb; bitmap->need_sync = 0; sb = kmap_atomic(bitmap->sb_page, KM_USER0); @@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - bitmap->mddev->thread->timeout = - bitmap->mddev->bitmap_info.daemon_sleep; + mddev->thread->timeout = + mddev->bitmap_info.daemon_sleep; mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n } if (!*bmc) { struct page *page; - *bmc = 1 | (needed ? NEEDED_MASK : 0); + *bmc = 2 | (needed ? NEEDED_MASK : 0); bitmap_count_page(bitmap, offset, 1); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); diff --git a/drivers/md/md.c b/drivers/md/md.c index 5d1b6762f10..ca8527fe77e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); } + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); @@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%swrite_error", sep); sep = ","; } + if (test_bit(WantReplacement, &rdev->flags)) { + len += sprintf(page+len, "%swant_replacement", sep); + sep = ","; + } + if (test_bit(Replacement, &rdev->flags)) { + len += sprintf(page+len, "%sreplacement", sep); + sep = ","; + } + return len+sprintf(page+len, "\n"); } @@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "-write_error")) { clear_bit(WriteErrorSeen, &rdev->flags); err = 0; + } else if (cmd_match(buf, "want_replacement")) { + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev->raid_disk); + hot_remove_disk(rdev->mddev, rdev); if (err) return err; sysfs_unlink_rdev(rdev->mddev, rdev); @@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { - struct md_rdev *rdev2; /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ @@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; - list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) - if (rdev2->raid_disk == slot) - return -EEXIST; - if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; @@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, struct mddev *mddev = NULL; int ro; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } /* * Commands dealing with the RAID driver but not any @@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v) if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; - } else if (rdev->raid_disk < 0) + } + if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); sectors += rdev->sectors; } @@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev) ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { + mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } } - if (mddev->degraded) { - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk < 0 + && !test_bit(Faulty, &rdev->flags)) { + rdev->recovery_offset = 0; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } } @@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev) test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { + mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; } diff --git a/drivers/md/md.h b/drivers/md/md.h index cf742d9306e..44c63dfeeb2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -72,34 +72,7 @@ struct md_rdev { * This reduces the burden of testing multiple flags in many cases */ - unsigned long flags; -#define Faulty 1 /* device is known to have a fault */ -#define In_sync 2 /* device is in_sync with rest of array */ -#define WriteMostly 4 /* Avoid reading if at all possible */ -#define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occurred but has not yet - * been acknowledged by the metadata - * handler, so don't allow writes - * until it is cleared */ -#define WriteErrorSeen 9 /* A write error has been seen on this - * device - */ -#define FaultRecorded 10 /* Intermediate state for clearing - * Blocked. The Fault is/will-be - * recorded in the metadata, but that - * metadata hasn't been stored safely - * on disk yet. - */ -#define BlockedBadBlocks 11 /* A writer is blocked because they - * found an unacknowledged bad-block. - * This can safely be cleared at any - * time, and the writer will re-check. - * It may be set at any time, and at - * worst the writer will timeout and - * re-check. So setting it as - * accurately as possible is good, but - * not absolutely critical. - */ + unsigned long flags; /* bit set of 'enum flag_bits' bits. */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ @@ -152,6 +125,44 @@ struct md_rdev { sector_t size; /* in sectors */ } badblocks; }; +enum flag_bits { + Faulty, /* device is known to have a fault */ + In_sync, /* device is in_sync with rest of array */ + WriteMostly, /* Avoid reading if at all possible */ + AutoDetected, /* added by auto-detect */ + Blocked, /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes + * until it is cleared */ + WriteErrorSeen, /* A write error has been seen on this + * device + */ + FaultRecorded, /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ + BlockedBadBlocks, /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ + WantReplacement, /* This device is a candidate to be + * hot-replaced, either because it has + * reported some faults, or because + * of explicit request. + */ + Replacement, /* This device is a replacement for + * a want_replacement device with same + * raid_disk number. + */ +}; #define BB_LEN_MASK (0x00000000000001FFULL) #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) @@ -428,7 +439,7 @@ struct md_personality */ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); - int (*hot_remove_disk) (struct mddev *mddev, int number); + int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (struct mddev *mddev, sector_t sectors); @@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + if (!test_bit(Replacement, &rdev->flags)) { + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + return 0; } static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + if (!test_bit(Replacement, &rdev->flags)) { + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } } /* diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 5899246fa37..a222f516660 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) return err; } -static int multipath_remove_disk(struct mddev *mddev, int number) +static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; int err = 0; - struct md_rdev *rdev; + int number = rdev->raid_disk; struct multipath_info *p = conf->multipaths + number; print_multipath_conf(conf); - rdev = p->rdev; - if (rdev) { + if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { printk(KERN_ERR "hot-remove-disk, slot %d is identified" diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ede2461e79c..cc24f0cb7ee 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -135,7 +135,7 @@ out_free_pages: put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); j = -1; out_free_bio: - while ( ++j < pi->raid_disks ) + while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); r1bio_pool_free(r1_bio, data); return NULL; @@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) { int i; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct bio **bio = r1_bio->bios + i; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; int i; - for (i=0; i<conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct bio *bio = r1_bio->bios[i]; if (bio->bi_end_io) rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); @@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio) static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) { int mirror; - int raid_disks = r1_bio->mddev->raid_disks; + struct r1conf *conf = r1_bio->mddev->private; + int raid_disks = conf->raid_disks; - for (mirror = 0; mirror < raid_disks; mirror++) + for (mirror = 0; mirror < raid_disks * 2; mirror++) if (r1_bio->bios[mirror] == bio) break; - BUG_ON(mirror == raid_disks); + BUG_ON(mirror == raid_disks * 2); update_head_pos(mirror, r1_bio); return mirror; @@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error) if (!uptodate) { set_bit(WriteErrorSeen, &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + conf->mddev->recovery); + set_bit(R1BIO_WriteError, &r1_bio->state); } else { /* @@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect start_disk = conf->last_used; } - for (i = 0 ; i < conf->raid_disks ; i++) { + for (i = 0 ; i < conf->raid_disks * 2 ; i++) { sector_t dist; sector_t first_bad; int bad_sectors; @@ -609,7 +615,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) return 1; rcu_read_lock(); - for (i = 0; i < mddev->raid_disks; i++) { + for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -974,7 +980,7 @@ read_again: */ plugged = mddev_check_plugged(mddev); - disks = conf->raid_disks; + disks = conf->raid_disks * 2; retry_write: blocked_rdev = NULL; rcu_read_lock(); @@ -988,7 +994,8 @@ read_again: } r1_bio->bios[i] = NULL; if (!rdev || test_bit(Faulty, &rdev->flags)) { - set_bit(R1BIO_Degraded, &r1_bio->state); + if (i < conf->raid_disks) + set_bit(R1BIO_Degraded, &r1_bio->state); continue; } @@ -1263,6 +1270,25 @@ static int raid1_spare_active(struct mddev *mddev) */ for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; + struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; + if (repl + && repl->recovery_offset == MaxSector + && !test_bit(Faulty, &repl->flags) + && !test_and_set_bit(In_sync, &repl->flags)) { + /* replacement has just become active */ + if (!rdev || + !test_and_clear_bit(In_sync, &rdev->flags)) + count++; + if (rdev) { + /* Replaced device not technically + * faulty, but we need to be sure + * it gets removed and never re-added + */ + set_bit(Faulty, &rdev->flags); + sysfs_notify_dirent_safe( + rdev->sysfs_state); + } + } if (rdev && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { @@ -1286,7 +1312,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) int mirror = 0; struct mirror_info *p; int first = 0; - int last = mddev->raid_disks - 1; + int last = conf->raid_disks - 1; if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -1294,8 +1320,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - for (mirror = first; mirror <= last; mirror++) - if ( !(p=conf->mirrors+mirror)->rdev) { + for (mirror = first; mirror <= last; mirror++) { + p = conf->mirrors+mirror; + if (!p->rdev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -1322,21 +1349,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } + if (test_bit(WantReplacement, &p->rdev->flags) && + p[conf->raid_disks].rdev == NULL) { + /* Add this device as a replacement */ + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + break; + } + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; } -static int raid1_remove_disk(struct mddev *mddev, int number) +static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; int err = 0; - struct md_rdev *rdev; + int number = rdev->raid_disk; struct mirror_info *p = conf->mirrors+ number; + if (rdev != p->rdev) + p = conf->mirrors + conf->raid_disks + number; + print_conf(conf); - rdev = p->rdev; - if (rdev) { + if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; @@ -1358,7 +1399,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number) err = -EBUSY; p->rdev = rdev; goto abort; - } + } else if (conf->mirrors[conf->raid_disks + number].rdev) { + /* We just removed a device that is being replaced. + * Move down the replacement. We drain all IO before + * doing this to avoid confusion. + */ + struct md_rdev *repl = + conf->mirrors[conf->raid_disks + number].rdev; + raise_barrier(conf); + clear_bit(Replacement, &repl->flags); + p->rdev = repl; + conf->mirrors[conf->raid_disks + number].rdev = NULL; + lower_barrier(conf); + clear_bit(WantReplacement, &rdev->flags); + } else + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); } abort: @@ -1411,6 +1466,10 @@ static void end_sync_write(struct bio *bio, int error) } while (sectors_to_go > 0); set_bit(WriteErrorSeen, &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); } else if (is_badblock(conf->mirrors[mirror].rdev, r1_bio->sector, @@ -1441,8 +1500,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; - if (rw == WRITE) + if (rw == WRITE) { set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + rdev->mddev->recovery); + } /* need to record an error - either for the block or the device */ if (!rdev_set_badblocks(rdev, sector, sectors, 0)) md_error(rdev->mddev, rdev); @@ -1493,7 +1557,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) } } d++; - if (d == conf->raid_disks) + if (d == conf->raid_disks * 2) d = 0; } while (!success && d != r1_bio->read_disk); @@ -1510,7 +1574,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) mdname(mddev), bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); - for (d = 0; d < conf->raid_disks; d++) { + for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags)) continue; @@ -1536,7 +1600,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) /* write it back and re-read */ while (d != r1_bio->read_disk) { if (d == 0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; @@ -1551,7 +1615,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) d = start; while (d != r1_bio->read_disk) { if (d == 0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; @@ -1584,7 +1648,7 @@ static int process_checks(struct r1bio *r1_bio) int primary; int i; - for (primary = 0; primary < conf->raid_disks; primary++) + for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { r1_bio->bios[primary]->bi_end_io = NULL; @@ -1592,7 +1656,7 @@ static int process_checks(struct r1bio *r1_bio) break; } r1_bio->read_disk = primary; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { int j; int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); struct bio *pbio = r1_bio->bios[primary]; @@ -1656,7 +1720,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) { struct r1conf *conf = mddev->private; int i; - int disks = conf->raid_disks; + int disks = conf->raid_disks * 2; struct bio *bio, *wbio; bio = r1_bio->bios[r1_bio->read_disk]; @@ -1737,7 +1801,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, success = 1; else { d++; - if (d == conf->raid_disks) + if (d == conf->raid_disks * 2) d = 0; } } while (!success && d != read_disk); @@ -1753,7 +1817,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, start = d; while (d != read_disk) { if (d==0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; rdev = conf->mirrors[d].rdev; if (rdev && @@ -1765,7 +1829,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, while (d != read_disk) { char b[BDEVNAME_SIZE]; if (d==0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; rdev = conf->mirrors[d].rdev; if (rdev && @@ -1887,7 +1951,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio { int m; int s = r1_bio->sectors; - for (m = 0; m < conf->raid_disks ; m++) { + for (m = 0; m < conf->raid_disks * 2 ; m++) { struct md_rdev *rdev = conf->mirrors[m].rdev; struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) @@ -1909,7 +1973,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; - for (m = 0; m < conf->raid_disks ; m++) + for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; rdev_clear_badblocks(rdev, @@ -2184,7 +2248,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; bio = r1_bio->bios[i]; @@ -2203,7 +2267,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - still_degraded = 1; + if (i < conf->raid_disks) + still_degraded = 1; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; @@ -2254,7 +2319,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp * need to mark them bad on all write targets */ int ok = 1; - for (i = 0 ; i < conf->raid_disks ; i++) + for (i = 0 ; i < conf->raid_disks * 2 ; i++) if (r1_bio->bios[i]->bi_end_io == end_sync_write) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -2323,7 +2388,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp len = sync_blocks<<9; } - for (i=0 ; i < conf->raid_disks; i++) { + for (i = 0 ; i < conf->raid_disks * 2; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io) { page = bio->bi_io_vec[bio->bi_vcnt].bv_page; @@ -2356,7 +2421,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { atomic_set(&r1_bio->remaining, read_targets); - for (i=0; i<conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); @@ -2393,7 +2458,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, + conf->mirrors = kzalloc(sizeof(struct mirror_info) + * mddev->raid_disks * 2, GFP_KERNEL); if (!conf->mirrors) goto abort; @@ -2405,7 +2471,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) goto abort; - conf->poolinfo->raid_disks = mddev->raid_disks; + conf->poolinfo->raid_disks = mddev->raid_disks * 2; conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); @@ -2414,14 +2480,20 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->poolinfo->mddev = mddev; + err = -EINVAL; spin_lock_init(&conf->device_lock); list_for_each_entry(rdev, &mddev->disks, same_set) { int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) continue; - disk = conf->mirrors + disk_idx; + if (test_bit(Replacement, &rdev->flags)) + disk = conf->mirrors + conf->raid_disks + disk_idx; + else + disk = conf->mirrors + disk_idx; + if (disk->rdev) + goto abort; disk->rdev = rdev; disk->head_position = 0; @@ -2437,11 +2509,27 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + err = -EIO; conf->last_used = -1; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { disk = conf->mirrors + i; + if (i < conf->raid_disks && + disk[conf->raid_disks].rdev) { + /* This slot has a replacement. */ + if (!disk->rdev) { + /* No original, just make the replacement + * a recovering spare + */ + disk->rdev = + disk[conf->raid_disks].rdev; + |