From fc448a18ae6219af9a73257b1fbcd009efab4a81 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 10:37:30 +1000 Subject: md/raid10: Don't try to recovery unmatched (and unused) chunks. If a RAID10 has an odd number of chunks - as might happen when there are an odd number of devices - the last chunk has no pair and so is not mirrored. We don't store data there, but when recovering the last device in an array we retry to recover that last chunk from a non-existent location. This results in an error, and the recovery aborts. When we get to that last chunk we should just stop - there is nothing more to do anyway. This bug has been present since the introduction of RAID10, so the patch is appropriate for any -stable kernel. Cc: stable@vger.kernel.org Reported-by: Christian Balzer Tested-by: Christian Balzer Signed-off-by: NeilBrown --- drivers/md/raid10.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 99ae6068e45..bcf6ea8acc9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2890,6 +2890,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); + if (sect >= mddev->resync_max_sectors) { + /* last stripe is not complete - don't + * try to recover this sector. + */ + continue; + } /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap -- cgit v1.2.3-18-g5258 From 5cfb22a1f83e4f04c0a4df89b60053a077222e2b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 11:46:53 +1000 Subject: md/raid5: prefer replacing failed devices over want-replacement devices. If a RAID5 has both a failed device and a device marked as 'WantReplacement', then we should preferentially replace the failed device. However the current code replaces whichever is found first. So split into 2 loops, check fail failed/missing first, and only check for WantReplacement if nothing is failed or missing. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d26767246d2..95fcbbf3d6c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5465,10 +5465,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = first; - for ( ; disk <= last ; disk++) { + first = rdev->saved_raid_disk; + + for (disk = first; disk <= last; disk++) { p = conf->disks + disk; if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5477,8 +5476,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); - break; + goto out; } + } + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; if (test_bit(WantReplacement, &p->rdev->flags) && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5490,6 +5492,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } +out: print_raid5_conf(conf); return err; } -- cgit v1.2.3-18-g5258 From 6c0544e255dd6582a9899572e120fb55d9f672a4 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 12 Jun 2012 08:31:10 +0800 Subject: md/raid5: Do not add data_offset before call to is_badblock In chunk_aligned_read() we are adding data_offset before calling is_badblock. But is_badblock also adds data_offset, so that is bad. So move the addition of data_offset to after the call to is_badblock. This bug was introduced by commit 31c176ecdf3563140e639 md/raid5: avoid reading from known bad blocks. which first appeared in 3.0. So that patch is suitable for any -stable kernel from 3.0.y onwards. However it will need minor revision for most of those (as the comment didn't appear until recently). Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 95fcbbf3d6c..9567a9c83a1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3881,8 +3881,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; if (!bio_fits_rdev(align_bi) || is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, @@ -3893,6 +3891,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) return 0; } + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, -- cgit v1.2.3-18-g5258 From 1850753d2e6d9ca7856581ca5d3cf09521e6a5d7 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 12:11:54 +1000 Subject: md/raid5: In ops_run_io, inc nr_pending before calling md_wait_for_blocked_rdev In ops_run_io(), the call to md_wait_for_blocked_rdev will decrement nr_pending so we lose the reference we hold on the rdev. So atomic_inc it first to maintain the reference. This bug was introduced by commit 73e92e51b7969ef5477d md/raid5. Don't write to known bad block on doubtful devices. which appeared in 3.0, so patch is suitable for stable kernels since then. Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9567a9c83a1..befadb41a11 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -606,6 +606,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) * a chance*/ md_check_recovery(conf->mddev); } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. + */ + atomic_inc(&rdev->nr_pending); md_wait_for_blocked_rdev(rdev, conf->mddev); } else { /* Acknowledged bad block - skip the write */ -- cgit v1.2.3-18-g5258 From 7c2c57c9a98bf5961e438a376486f95346f6b0c5 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 12:12:26 +1000 Subject: md:Add blk_plug in sync_thread. Add blk_plug in sync_thread will increase the performance of sync. Because sync_thread did not blk_plug,so when raid sync, the bio merge not well. Testing environment: SATA controller: Intel Corporation 82801JI (ICH10 Family) SATA AHCI Controller. OS:Linux xxx 3.5.0-rc2+ #340 SMP Tue Jun 12 09:00:25 CST 2012 x86_64 x86_64 x86_64 GNU/Linux. RAID5: four ST31000524NS disk. Without blk_plug:recovery speed about 63M/Sec; Add blk_plug:recovery speed about 120M/Sec. Using blktrace: blktrace -d /dev/sdb -w 60 -o -|blkparse -i - without blk_plug: Total (8,16): Reads Queued: 309811, 1239MiB Writes Queued: 0, 0KiB Read Dispatches: 283583, 1189MiB Write Dispatches: 0, 0KiB Reads Requeued: 0 Writes Requeued: 0 Reads Completed: 273351, 1149MiB Writes Completed: 0, 0KiB Read Merges: 23533, 94132KiB Write Merges: 0, 0KiB IO unplugs: 0 Timer unplugs: 0 add blk_plug: Total (8,16): Reads Queued: 428697, 1714MiB Writes Queued: 0, 0KiB Read Dispatches: 3954, 1714MiB Write Dispatches: 0, 0KiB Reads Requeued: 0 Writes Requeued: 0 Reads Completed: 3956, 1715MiB Writes Completed: 0, 0KiB Read Merges: 424743, 1698MiB Write Merges: 0, 0KiB IO unplugs: 0 Timer unplugs: 3384 The ratio of merge will be markedly increased. Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c2f9048e1a..973aa8459e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7298,6 +7298,7 @@ void md_do_sync(struct mddev *mddev) int skipped = 0; struct md_rdev *rdev; char *desc; + struct blk_plug plug; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7447,6 +7448,7 @@ void md_do_sync(struct mddev *mddev) } mddev->curr_resync_completed = j; + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7552,6 +7554,7 @@ void md_do_sync(struct mddev *mddev) * this also signals 'finished resyncing' to md_stop */ out: + blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ -- cgit v1.2.3-18-g5258 From 5f066c632fcfd2a33f2eb7077c15c630e9f5ea5b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 12:13:29 +1000 Subject: md/raid5: fix refcount problem when blocked_rdev is set. commit 43220aa0f22cd3ce5b30246d50ccd696d119edea md/raid5: fix a hang on device failure. fixed a hang, but introduced a refcounting in-balance so that if the presence of bad-blocks ever caused an rdev to be 'blocked' we would increment the refcount on the rdev and never decrement it. So added the needed rdev_dec_pending when md_wait_for_blocked_rdev is not called. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index befadb41a11..62b6b3a83ab 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3588,8 +3588,18 @@ static void handle_stripe(struct stripe_head *sh) finish: /* wait for this device to become unblocked */ - if (conf->mddev->external && unlikely(s.blocked_rdev)) - md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + if (unlikely(s.blocked_rdev)) { + if (conf->mddev->external) + md_wait_for_blocked_rdev(s.blocked_rdev, + conf->mddev); + else + /* Internal metadata will immediately + * be written by raid5d, so we don't + * need to wait here. + */ + rdev_dec_pending(s.blocked_rdev, + conf->mddev); + } if (s.handle_bad_blocks) for (i = disks; i--; ) { -- cgit v1.2.3-18-g5258 From 055d3747dbf00ce85c6872ecca4d466638e80c22 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:55:33 +1000 Subject: md/raid10: fix failure when trying to repair a read error. commit 58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf md/raid10: handle further errors during fix_read_error better. in 3.1 added "r10_sync_page_io" which takes an IO size in sectors. But we were passing the IO size in bytes!!! This resulting in bio_add_page failing, and empty request being sent down, and a consequent BUG_ON in scsi_lib. [fix missing space in error message at same time] This fix is suitable for 3.1.y and later. Cc: stable@vger.kernel.org Reported-by: Christian Balzer Signed-off-by: NeilBrown --- drivers/md/raid10.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index bcf6ea8acc9..ae73e29298b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2310,7 +2310,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s<<9, conf->tmppage, WRITE) + s, conf->tmppage, WRITE) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -2349,7 +2349,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s<<9, conf->tmppage, + s, conf->tmppage, READ)) { case 0: /* Well, this device is dead */ @@ -2512,7 +2512,7 @@ read_more: slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR - "md/raid10:%s: %s: redirecting" + "md/raid10:%s: %s: redirecting " "sector %llu to another mirror\n", mdname(mddev), bdevname(rdev->bdev, b), -- cgit v1.2.3-18-g5258 From 0232605d987d8230b254aa139805bbb56a7ca30c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:56:52 +1000 Subject: md: make 'name' arg to md_register_thread non-optional. Having the 'name' arg optional and defaulting to the current personality name is no necessary and leads to errors, as when changing the level of an array we can end up using the name of the old level instead of the new one. So make it non-optional and always explicitly pass the name of the level that the array will be. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- drivers/md/multipath.c | 3 ++- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 4 +++- 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 973aa8459e9..c601c4be77c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6751,7 +6751,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), - name ?: mddev->pers->name); + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 9339e67fcc7..61a1833ebaf 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev) } { - mddev->thread = md_register_thread(multipathd, mddev, NULL); + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); if (!mddev->thread) { printk(KERN_ERR "multipath: couldn't allocate thread" " for %s\n", mdname(mddev)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a9c7981ddd2..39b2a8aa3b2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2621,7 +2621,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, NULL); + conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { printk(KERN_ERR "md/raid1:%s: couldn't allocate thread\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae73e29298b..edc1088a132 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3427,7 +3427,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); - conf->thread = md_register_thread(raid10d, mddev, NULL); + conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) goto out; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 62b6b3a83ab..a5135e59586 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4840,6 +4840,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int raid_disk, memory, max_disks; struct md_rdev *rdev; struct disk_info *disk; + char pers_name[6]; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -4963,7 +4964,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) printk(KERN_INFO "md/raid:%s: allocated %dkB\n", mdname(mddev), memory); - conf->thread = md_register_thread(raid5d, mddev, NULL); + sprintf(pers_name, "raid%d", mddev->new_level); + conf->thread = md_register_thread(raid5d, mddev, pers_name); if (!conf->thread) { printk(KERN_ERR "md/raid:%s: couldn't allocate thread.\n", -- cgit v1.2.3-18-g5258 From 2e8ac30312973dd20e6807365349ecb1c7e0ea45 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 15:57:02 +1000 Subject: md/raid456: When read error cannot be recovered, record bad block We may not be able to fix a bad block if: - the array is degraded - the over-write fails. In these cases we currently eject the device, but we should record a bad block if possible. Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5135e59586..51169ecd778 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1743,6 +1743,7 @@ static void raid5_end_read_request(struct bio * bi, int error) } else { const char *bdn = bdevname(rdev->bdev, b); int retry = 0; + int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); @@ -1754,7 +1755,8 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (conf->mddev->degraded >= conf->max_degraded) + else if (conf->mddev->degraded >= conf->max_degraded) { + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error not correctable " @@ -1762,8 +1764,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error NOT corrected!! " @@ -1771,7 +1774,7 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (atomic_read(&rdev->read_errors) + } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING "md/raid:%s: Too many read errors, failing device %s.\n", @@ -1783,7 +1786,11 @@ static void raid5_end_read_request(struct bio * bi, int error) else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, rdev); + if (!(set_bad + && test_bit(In_sync, &rdev->flags) + && rdev_set_badblocks( + rdev, sh->sector, STRIPE_SECTORS, 0))) + md_error(conf->mddev, rdev); } } rdev_dec_pending(rdev, conf->mddev); -- cgit v1.2.3-18-g5258 From fab363b5ff502d1b39ddcfec04271f5858d9f26e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Jul 2012 15:57:19 +1000 Subject: raid5: delayed stripe fix There isn't locking setting STRIPE_DELAYED and STRIPE_PREREAD_ACTIVE bits, but the two bits have relationship. A delayed stripe can be moved to hold list only when preread active stripe count is below IO_THRESHOLD. If a stripe has both the bits set, such stripe will be in delayed list and preread count not 0, which will make such stripe never leave delayed list. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 51169ecd778..7245a9df35a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { + clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); } -- cgit v1.2.3-18-g5258 From 32644afd8975d19174bcb9ba34687c32dd810a09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:58:42 +1000 Subject: md/raid1: fix bug in read_balance introduced by hot-replace When we added hot_replace we doubled the number of devices that could be in a RAID1 array. So we doubled how far read_balance would search. Unfortunately we didn't double the point at which it looped back to the beginning - so it effectively loops over all non-replacement disks twice. This doesn't cause bad behaviour, but it pointless and means we never read from replacement devices. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 39b2a8aa3b2..34b4665cb0b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int bad_sectors; int disk = start_disk + i; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; + if (disk >= conf->raid_disks * 2) + disk -= conf->raid_disks * 2; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED -- cgit v1.2.3-18-g5258 From f456309106e9657645c81bce1a6bb3230393564e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:59:06 +1000 Subject: md: support re-add of recovering devices. We currently only allow a device to be re-added if it appear to be in-sync. This is overly restrictive as it may be desirable to re-add a device that is in the middle of recovery. So remove the test for "InSync" - the test on rdev->raid_disk is sufficient to ensure that the re-add will succeed. Reported-by: Alexander Lyakas Tested-by: Alexander Lyakas Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index c601c4be77c..a4c219e3c85 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5784,8 +5784,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) super_types[mddev->major_version]. validate_super(mddev, rdev); if ((info->state & (1<flags) || - rdev->raid_disk != info->raid_disk)) { + rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it. */ -- cgit v1.2.3-18-g5258 From b357f04a67c2aeee828b240863cd3f21d6cb3179 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 17:45:31 +1000 Subject: md: fix up plugging (again). The value returned by "mddev_check_plug" is only valid until the next 'schedule' as that will unplug things. This could happen at any call to mempool_alloc. So just calling mddev_check_plug at the start doesn't really make sense. So call it just before, or just after, queuing things for the thread. As the action that happens at unplug is to wake the thread, this makes lots of sense. If we cannot add a plug (which requires a small GFP_ATOMIC alloc) we wake thread immediately. RAID5 is a bit different. Requests are queued for the thread and the thread is woken by release_stripe. So we don't need to wake the thread on failure. However the thread doesn't perform certain actions when there is any active plug, so it is important to install a plug before waking the thread. So for RAID5 we install the plug *before* queuing the request and waking the thread. Without this patch it is possible for raid1 or raid10 to queue a request without then waking the thread, resulting in the array locking up. Also change raid10 to only flush_pending_write when there are not active plugs, just like raid1. This patch is suitable for 3.0 or later. I plan to submit it to -stable, but I'll like to let it spend a few weeks in mainline first to be sure it is completely safe. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 7 ++----- drivers/md/raid10.c | 12 ++++++------ drivers/md/raid5.c | 6 +----- 3 files changed, 9 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34b4665cb0b..8c2754f835e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; - int plugged; int first_clone; int sectors_handled; int max_sectors; @@ -1034,7 +1033,6 @@ read_again: * the bad blocks. Each set of writes gets it's own r1bio * with a set of bios attached. */ - plugged = mddev_check_plugged(mddev); disks = conf->raid_disks * 2; retry_write: @@ -1191,6 +1189,8 @@ read_again: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. @@ -1213,9 +1213,6 @@ read_again: /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index edc1088a132..acf5a828c7e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1039,7 +1039,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; - int plugged; int sectors_handled; int max_sectors; int sectors; @@ -1239,7 +1238,6 @@ read_again: * of r10_bios is recored in bio->bi_phys_segments just as with * the read case. */ - plugged = mddev_check_plugged(mddev); r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); @@ -1396,6 +1394,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev, 0, 0)) + md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) continue; @@ -1423,6 +1423,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until @@ -1448,9 +1450,6 @@ retry_write: /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !mddev->bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) @@ -2661,7 +2660,8 @@ static void raid10d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - flush_pending_writes(conf); + if (atomic_read(&mddev->plug_cnt) == 0) + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7245a9df35a..04348d76bb3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3997,7 +3997,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; - int plugged; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -4016,7 +4015,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int previous; @@ -4118,6 +4116,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); + mddev_check_plugged(mddev); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -4125,10 +4124,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); break; } - } - if (!plugged) - md_wakeup_thread(mddev->thread); spin_lock_irq(&conf->device_lock); remaining = raid5_dec_bi_phys_segments(bi); -- cgit v1.2.3-18-g5258