md/raid5: preferentially read from replacement device if possible.

If a replacement device is present and has been recovered far enough, then use it for reading into the stripe cache. If we get an error we don't try to repair it, we just fail the device. A replacement device that gives errors does not sound sensible. This requires removing the setting of R5_ReadError when we get a read error during a read that bypasses the cache. It was probably a bad idea anyway as we don't know that every block in the read caused an error, and it could cause ReadError to be set for the replacement device, which is bad. Signed-off-by: NeilBrown <neilb@suse.de>
author: NeilBrown <neilb@suse.de> 2011-12-23 10:17:52 +1100
committer: NeilBrown <neilb@suse.de> 2011-12-23 10:17:52 +1100
commit: 14a75d3e07c784c004b4b44b34af996b8e4ac453 (patch)
tree: 6009e1ccc1b766d4c55434e3b4c1d171b8a2c004 /drivers/md
parent: 995c4275a7e14b8752f301e4570831a108ae4303 (diff)
1 files changed, 54 insertions, 15 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2d2aaf6e98e..2ae63c5b1c2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -524,7 +524,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			bi->bi_end_io = raid5_end_read_request;
 
 		rcu_read_lock();
-		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rw == READ &&
+		    test_bit(R5_ReadRepl, &sh->dev[i].flags))
+			rdev = rcu_dereference(conf->disks[i].replacement);
+		else
+			rdev = rcu_dereference(conf->disks[i].rdev);
 		if (rdev && test_bit(Faulty, &rdev->flags))
 			rdev = NULL;
 		if (rdev)
@@ -1605,11 +1609,18 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		BUG();
 		return;
 	}
+	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+		rdev = conf->disks[i].replacement;
+	else
+		rdev = conf->disks[i].rdev;
 
 	if (uptodate) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-			rdev = conf->disks[i].rdev;
+			/* Note that this cannot happen on a
+			 * replacement device.  We just fail those on
+			 * any error
+			 */
 			printk_ratelimited(
 				KERN_INFO
 				"md/raid:%s: read error corrected"
@@ -1622,16 +1633,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
-		if (atomic_read(&conf->disks[i].rdev->read_errors))
-			atomic_set(&conf->disks[i].rdev->read_errors, 0);
+		if (atomic_read(&rdev->read_errors))
+			atomic_set(&rdev->read_errors, 0);
 	} else {
-		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
+		const char *bdn = bdevname(rdev->bdev, b);
 		int retry = 0;
-		rdev = conf->disks[i].rdev;
 
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
-		if (conf->mddev->degraded >= conf->max_degraded)
+		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error on replacement device "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
+		else if (conf->mddev->degraded >= conf->max_degraded)
 			printk_ratelimited(
 				KERN_WARNING
 				"md/raid:%s: read error not correctable "
@@ -1665,7 +1684,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			md_error(conf->mddev, rdev);
 		}
 	}
-	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+	rdev_dec_pending(rdev, conf->mddev);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -3036,7 +3055,19 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 		}
 		if (dev->written)
 			s->written++;
-		rdev = rcu_dereference(conf->disks[i].rdev);
+		/* Prefer to use the replacement for reads, but only
+		 * if it is recovered enough and has no bad blocks.
+		 */
+		rdev = rcu_dereference(conf->disks[i].replacement);
+		if (rdev && !test_bit(Faulty, &rdev->flags) &&
+		    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
+		    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+				 &first_bad, &bad_sectors))
+			set_bit(R5_ReadRepl, &dev->flags);
+		else {
+			rdev = rcu_dereference(conf->disks[i].rdev);
+			clear_bit(R5_ReadRepl, &dev->flags);
+		}
 		if (rdev && test_bit(Faulty, &rdev->flags))
 			rdev = NULL;
 		if (rdev) {
@@ -3078,17 +3109,26 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			set_bit(R5_Insync, &dev->flags);
 
 		if (rdev && test_bit(R5_WriteError, &dev->flags)) {
-			clear_bit(R5_Insync, &dev->flags);
-			if (!test_bit(Faulty, &rdev->flags)) {
+			/* This flag does not apply to '.replacement'
+			 * only to .rdev, so make sure to check that*/
+			struct md_rdev *rdev2 = rcu_dereference(
+				conf->disks[i].rdev);
+			if (rdev2 == rdev)
+				clear_bit(R5_Insync, &dev->flags);
+			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
 				s->handle_bad_blocks = 1;
-				atomic_inc(&rdev->nr_pending);
+				atomic_inc(&rdev2->nr_pending);
 			} else
 				clear_bit(R5_WriteError, &dev->flags);
 		}
 		if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
-			if (!test_bit(Faulty, &rdev->flags)) {
+			/* This flag does not apply to '.replacement'
+			 * only to .rdev, so make sure to check that*/
+			struct md_rdev *rdev2 = rcu_dereference(
+				conf->disks[i].rdev);
+			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
 				s->handle_bad_blocks = 1;
-				atomic_inc(&rdev->nr_pending);
+				atomic_inc(&rdev2->nr_pending);
 			} else
 				clear_bit(R5_MadeGood, &dev->flags);
 		}
@@ -4220,7 +4260,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 			return handled;
 		}
 
-		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
 			release_stripe(sh);
 			raid5_set_bi_hw_segments(raid_bio, scnt);
author	NeilBrown <neilb@suse.de>	2011-12-23 10:17:52 +1100
committer	NeilBrown <neilb@suse.de>	2011-12-23 10:17:52 +1100
commit	14a75d3e07c784c004b4b44b34af996b8e4ac453 (patch)
tree	6009e1ccc1b766d4c55434e3b4c1d171b8a2c004 /drivers/md
parent	995c4275a7e14b8752f301e4570831a108ae4303 (diff)