aboutsummaryrefslogtreecommitdiff
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c962
1 files changed, 735 insertions, 227 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d844..32323f0afd8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,16 +35,13 @@
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
#include "md.h"
#include "raid1.h"
#include "bitmap.h"
#define DEBUG 0
-#if DEBUG
-#define PRINTK(x...) printk(x)
-#else
-#define PRINTK(x...)
-#endif
+#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
/*
* Number of guaranteed r1bios in case of extreme VM load:
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
for (i = 0; i < conf->raid_disks; i++) {
struct bio **bio = r1_bio->bios + i;
- if (*bio && *bio != IO_BLOCKED)
+ if (!BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
}
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
{
conf_t *conf = r1_bio->mddev->private;
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
-
put_all_bios(conf, r1_bio);
mempool_free(r1_bio, conf->r1bio_pool);
}
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
+static void call_bio_endio(r1bio_t *r1_bio)
+{
+ struct bio *bio = r1_bio->master_bio;
+ int done;
+ conf_t *conf = r1_bio->mddev->private;
+
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else
+ done = 1;
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (done) {
+ bio_endio(bio, 0);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+ }
+}
+
static void raid_end_bio_io(r1bio_t *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
(unsigned long long) bio->bi_sector +
(bio->bi_size >> 9) - 1);
- bio_endio(bio,
- test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+ call_bio_endio(r1_bio);
}
free_r1bio(r1_bio);
}
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error)
* oops, read error:
*/
char b[BDEVNAME_SIZE];
- if (printk_ratelimit())
- printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
- mdname(conf->mddev),
- bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+ printk_ratelimited(
+ KERN_ERR "md/raid1:%s: %s: "
+ "rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(conf->mirrors[mirror].rdev->bdev,
+ b),
+ (unsigned long long)r1_bio->sector);
+ set_bit(R1BIO_ReadError, &r1_bio->state);
reschedule_retry(r1_bio);
}
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
+static void close_write(r1bio_t *r1_bio)
+{
+ /* it really is the end of this request */
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_bvecs[i].bv_page);
+ kfree(r1_bio->behind_bvecs);
+ r1_bio->behind_bvecs = NULL;
+ }
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(r1_bio->mddev);
+}
+
static void r1_bio_write_done(r1bio_t *r1_bio)
{
- if (atomic_dec_and_test(&r1_bio->remaining))
- {
- /* it really is the end of this request */
- if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = r1_bio->behind_page_count;
- while (i--)
- safe_put_page(r1_bio->behind_pages[i]);
- kfree(r1_bio->behind_pages);
- r1_bio->behind_pages = NULL;
- }
- /* clear the bitmap if all writes complete successfully */
- bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
- r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
- md_write_end(r1_bio->mddev);
- raid_end_bio_io(r1_bio);
+ if (!atomic_dec_and_test(&r1_bio->remaining))
+ return;
+
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ close_write(r1_bio);
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else
+ raid_end_bio_io(r1_bio);
}
}
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
/*
* 'one mirror IO has finished' event handler:
*/
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
if (!uptodate) {
- md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
- /* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- } else
+ set_bit(WriteErrorSeen,
+ &conf->mirrors[mirror].rdev->flags);
+ set_bit(R1BIO_WriteError, &r1_bio->state);
+ } else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
* will return a good error code for to the higher
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error)
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/
+ sector_t first_bad;
+ int bad_sectors;
+
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
set_bit(R1BIO_Uptodate, &r1_bio->state);
+ /* Maybe we can clear some bad blocks. */
+ if (is_badblock(conf->mirrors[mirror].rdev,
+ r1_bio->sector, r1_bio->sectors,
+ &first_bad, &bad_sectors)) {
+ r1_bio->bios[mirror] = IO_MADE_GOOD;
+ set_bit(R1BIO_MadeGood, &r1_bio->state);
+ }
+ }
+
update_head_pos(mirror, r1_bio);
if (behind) {
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
(unsigned long long) mbio->bi_sector,
(unsigned long long) mbio->bi_sector +
(mbio->bi_size >> 9) - 1);
- bio_endio(mbio, 0);
+ call_bio_endio(r1_bio);
}
}
}
- rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+ if (r1_bio->bios[mirror] == NULL)
+ rdev_dec_pending(conf->mirrors[mirror].rdev,
+ conf->mddev);
/*
* Let's see if all mirrored write operations have finished
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
*
* The rdev for the device selected will have nr_pending incremented.
*/
-static int read_balance(conf_t *conf, r1bio_t *r1_bio)
+static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
{
const sector_t this_sector = r1_bio->sector;
- const int sectors = r1_bio->sectors;
+ int sectors;
+ int best_good_sectors;
int start_disk;
int best_disk;
int i;
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
* We take the first readable disk when above the resync window.
*/
retry:
+ sectors = r1_bio->sectors;
best_disk = -1;
best_dist = MaxSector;
+ best_good_sectors = 0;
+
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync)) {
choose_first = 1;
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
for (i = 0 ; i < conf->raid_disks ; i++) {
sector_t dist;
+ sector_t first_bad;
+ int bad_sectors;
+
int disk = start_disk + i;
if (disk >= conf->raid_disks)
disk -= conf->raid_disks;
@@ -461,6 +515,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
/* This is a reasonable device to use. It might
* even be best.
*/
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (best_dist < MaxSector)
+ /* already have a better device */
+ continue;
+ if (first_bad <= this_sector) {
+ /* cannot read here. If this is the 'primary'
+ * device, then we must not read beyond
+ * bad_sectors from another device..
+ */
+ bad_sectors -= (this_sector - first_bad);
+ if (choose_first && sectors > bad_sectors)
+ sectors = bad_sectors;
+ if (best_good_sectors > sectors)
+ best_good_sectors = sectors;
+
+ } else {
+ sector_t good_sectors = first_bad - this_sector;
+ if (good_sectors > best_good_sectors) {
+ best_good_sectors = good_sectors;
+ best_disk = disk;
+ }
+ if (choose_first)
+ break;
+ }
+ continue;
+ } else
+ best_good_sectors = sectors;
+
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first
/* Don't change to another disk for sequential reads */
@@ -489,10 +572,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
rdev_dec_pending(rdev, conf->mddev);
goto retry;
}
+ sectors = best_good_sectors;
conf->next_seq_sect = this_sector + sectors;
conf->last_used = best_disk;
}
rcu_read_unlock();
+ *max_sectors = sectors;
return best_disk;
}
@@ -672,30 +757,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
{
int i;
struct bio_vec *bvec;
- struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
+ struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
GFP_NOIO);
- if (unlikely(!pages))
+ if (unlikely(!bvecs))
return;
bio_for_each_segment(bvec, bio, i) {
- pages[i] = alloc_page(GFP_NOIO);
- if (unlikely(!pages[i]))
+ bvecs[i] = *bvec;
+ bvecs[i].bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bvecs[i].bv_page))
goto do_sync_io;
- memcpy(kmap(pages[i]) + bvec->bv_offset,
- kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
- kunmap(pages[i]);
+ memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(bvecs[i].bv_page);
kunmap(bvec->bv_page);
}
- r1_bio->behind_pages = pages;
+ r1_bio->behind_bvecs = bvecs;
r1_bio->behind_page_count = bio->bi_vcnt;
set_bit(R1BIO_BehindIO, &r1_bio->state);
return;
do_sync_io:
for (i = 0; i < bio->bi_vcnt; i++)
- if (pages[i])
- put_page(pages[i]);
- kfree(pages);
+ if (bvecs[i].bv_page)
+ put_page(bvecs[i].bv_page);
+ kfree(bvecs);
PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
}
@@ -705,7 +791,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mirror_info_t *mirror;
r1bio_t *r1_bio;
struct bio *read_bio;
- int i, targets = 0, disks;
+ int i, disks;
struct bitmap *bitmap;
unsigned long flags;
const int rw = bio_data_dir(bio);
@@ -713,6 +799,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
int plugged;
+ int first_clone;
+ int sectors_handled;
+ int max_sectors;
/*
* Register the new request and wait if the reconstruction
@@ -759,11 +848,24 @@ static int make_request(mddev_t *mddev, struct bio * bio)
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
+ /* We might need to issue multiple reads to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of reads in bio->bi_phys_segments.
+ * If this is 0, there is only one r1_bio and no locking
+ * will be needed when requests complete. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
if (rw == READ) {
/*
* read balancing logic:
*/
- int rdisk = read_balance(conf, r1_bio);
+ int rdisk;
+
+read_again:
+ rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
@@ -784,6 +886,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
r1_bio->read_disk = rdisk;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
+ max_sectors);
r1_bio->bios[rdisk] = read_bio;
@@ -793,16 +897,52 @@ static int make_request(mddev_t *mddev, struct bio * bio)
read_bio->bi_rw = READ | do_sync;
read_bio->bi_private = r1_bio;
- generic_make_request(read_bio);
+ if (max_sectors < r1_bio->sectors) {
+ /* could not read all from this device, so we will
+ * need another r1_bio.
+ */
+
+ sectors_handled = (r1_bio->sector + max_sectors
+ - bio->bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ /* Cannot call generic_make_request directly
+ * as that will be queued in __make_request
+ * and subsequent mempool_alloc might block waiting
+ * for it. So hand bio over to raid1d.
+ */
+ reschedule_retry(r1_bio);
+
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_sector + sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
return 0;
}
/*
* WRITE:
*/
- /* first select target devices under spinlock and
+ /* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
+ * If there are known/acknowledged bad blocks on any device on
+ * which we have seen a write error, we want to avoid writing those
+ * blocks.
+ * This potentially requires several writes to write around
+ * the bad blocks. Each set of writes gets it's own r1bio
+ * with a set of bios attached.
*/
plugged = mddev_check_plugged(mddev);
@@ -810,6 +950,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
+ max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -817,17 +958,56 @@ static int make_request(mddev_t *mddev, struct bio * bio)
blocked_rdev = rdev;
break;
}
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- if (test_bit(Faulty, &rdev->flags)) {
+ r1_bio->bios[i] = NULL;
+ if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ continue;
+ }
+
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ int bad_sectors;
+ int is_bad;
+
+ is_bad = is_badblock(rdev, r1_bio->sector,
+ max_sectors,
+ &first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /* mustn't write here until the bad block is
+ * acknowledged*/
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (is_bad && first_bad <= r1_bio->sector) {
+ /* Cannot write here at all */
+ bad_sectors -= (r1_bio->sector - first_bad);
+ if (bad_sectors < max_sectors)
+ /* mustn't write more than bad_sectors
+ * to other devices yet
+ */
+ max_sectors = bad_sectors;
rdev_dec_pending(rdev, mddev);
- r1_bio->bios[i] = NULL;
- } else {
- r1_bio->bios[i] = bio;
- targets++;
+ /* We don't set R1BIO_Degraded as that
+ * only applies if the disk is
+ * missing, so it might be re-added,
+ * and we want to know to recover this
+ * chunk.
+ * In this case the device is here,
+ * and the fact that this chunk is not
+ * in-sync is recorded in the bad
+ * block log
+ */
+ continue;
}
- } else
- r1_bio->bios[i] = NULL;
+ if (is_bad) {
+ int good_sectors = first_bad - r1_bio->sector;
+ if (good_sectors < max_sectors)
+ max_sectors = good_sectors;
+ }
+ }
+ r1_bio->bios[i] = bio;
}
rcu_read_unlock();
@@ -838,51 +1018,57 @@ static int make_request(mddev_t *mddev, struct bio * bio)
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-
+ r1_bio->state = 0;
allow_barrier(conf);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf);
goto retry_write;
}
- BUG_ON(targets == 0); /* we never fail the last device */
-
- if (targets < conf->raid_disks) {
- /* array is degraded, we will not clear the bitmap
- * on I/O completion (see raid1_end_write_request) */
- set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (max_sectors < r1_bio->sectors) {
+ /* We are splitting this write into multiple parts, so
+ * we need to prepare for allocating another r1_bio.
+ */
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
}
-
- /* do behind I/O ?
- * Not if there are too many, or cannot allocate memory,
- * or a reader on WriteMostly is waiting for behind writes
- * to flush */
- if (bitmap &&
- (atomic_read(&bitmap->behind_writes)
- < mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait))
- alloc_behind_pages(bio, r1_bio);
+ sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
- bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ first_clone = 1;
for (i = 0; i < disks; i++) {
struct bio *mbio;
if (!r1_bio->bios[i])
continue;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
- r1_bio->bios[i] = mbio;
-
- mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
- mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync;
- mbio->bi_private = r1_bio;
-
- if (r1_bio->behind_pages) {
+ md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+
+ if (first_clone) {
+ /* do behind I/O ?
+ * Not if there are too many, or cannot
+ * allocate memory, or a reader on WriteMostly
+ * is waiting for behind writes to flush */
+ if (bitmap &&
+ (atomic_read(&bitmap->behind_writes)
+ < mddev->bitmap_info.max_write_behind) &&
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(mbio, r1_bio);
+
+ bitmap_startwrite(bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ test_bit(R1BIO_BehindIO,
+ &r1_bio->state));
+ first_clone = 0;
+ }
+ if (r1_bio->behind_bvecs) {
struct bio_vec *bvec;
int j;
@@ -894,11 +1080,20 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* them all
*/
__bio_for_each_segment(bvec, mbio, j, 0)
- bvec->bv_page = r1_bio->behind_pages[j];
+ bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
+ r1_bio->bios[i] = mbio;
+
+ mbio->bi_sector = (r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset);
+ mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ mbio->bi_end_io = raid1_end_write_request;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+ mbio->bi_private = r1_bio;
+
atomic_inc(&r1_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
@@ -909,6 +1104,19 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
+ if (sectors_handled < (bio->bi_size >> 9)) {
+ /* We need another r1_bio. It has already been counted
+ * in bio->bi_phys_segments
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_sector + sectors_handled;
+ goto retry_write;
+ }
+
if (do_sync || !bitmap || !plugged)
md_wakeup_thread(mddev->thread);
@@ -952,9 +1160,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* However don't try a recovery from this drive as
* it is very likely to fail.
*/
- mddev->recovery_disabled = 1;
+ conf->recovery_disabled = mddev->recovery_disabled;
return;
}
+ set_bit(Blocked, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
@@ -1027,7 +1236,7 @@ static int raid1_spare_active(mddev_t *mddev)
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
- sysfs_notify_dirent(rdev->sysfs_state);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -1048,6 +1257,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int first = 0;
int last = mddev->raid_disks - 1;
+ if (mddev->recovery_disabled == conf->recovery_disabled)
+ return -EBUSY;
+
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
@@ -1103,7 +1315,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
- !mddev->recovery_disabled &&
+ mddev->recovery_disabled != conf->recovery_disabled &&
mddev->degraded < conf->raid_disks) {
err = -EBUSY;
goto abort;
@@ -1155,6 +1367,8 @@ static void end_sync_write(struct bio *bio, int error)
conf_t *conf = mddev->private;
int i;
int mirror=0;
+ sector_t first_bad;
+ int bad_sectors;
for (i = 0; i < conf->raid_disks; i++)
if (r1_bio->bios[i] == bio) {
@@ -1172,18 +1386,48 @@ static void end_sync_write(struct bio *bio, int error)
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
- md_error(mddev, conf->mirrors[mirror].rdev);
- }
+ set_bit(WriteErrorSeen,
+ &conf->mirrors[mirror].rdev->flags);
+ set_bit(R1BIO_WriteError, &r1_bio->state);
+ } else if (is_badblock(conf->mirrors[mirror].rdev,
+ r1_bio->sector,
+ r1_bio->sectors,
+ &first_bad, &bad_sectors) &&
+ !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+ r1_bio->sector,
+ r1_bio->sectors,
+ &first_bad, &bad_sectors)
+ )
+ set_bit(R1BIO_MadeGood, &r1_bio->state);
update_head_pos(mirror, r1_bio);
if (atomic_dec_and_test(&r1_bio->remaining)) {
- sector_t s = r1_bio->sectors;
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
}
}
+static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
+ int sectors, struct page *page, int rw)
+{
+ if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
+ /* success */
+ return 1;
+ if (rw == WRITE)
+ set_bit(WriteErrorSeen, &rdev->flags);
+ /* need to record an error - either for the block or the device */
+ if (!rdev_set_badblocks(rdev, sector, sectors, 0))
+ md_error(rdev->mddev, rdev);
+ return 0;
+}
+
static int fix_sync_read_error(r1bio_t *r1_bio)
{
/* Try some synchronous reads of other devices to get
@@ -1193,6 +1437,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
* We don't need to freeze the array, because being in an
* active sync request, there is no normal IO, and
* no overlapping syncs.
+ * We don't need to check is_badblock() again as we
+ * made sure that anything with a bad block in range
+ * will have bi_end_io clear.
*/
mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
@@ -1217,9 +1464,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
* active, and resync is currently active
*/
rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
+ if (sync_page_io(rdev, sect, s<<9,
bio->bi_io_vec[idx].bv_page,
READ, false)) {
success = 1;
@@ -1233,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
if (!success) {
char b[BDEVNAME_SIZE];
- /* Cannot read from anywhere, array is toast */
- md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ int abort = 0;
+ /* Cannot read from anywhere, this block is lost.
+ * Record a bad block on each device. If that doesn't
+ * work just disable and interrupt the recovery.
+ * Don't fail devices as that won't really help.
+ */
printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
" for block %llu\n",
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
- md_done_sync(mddev, r1_bio->sectors, 0);
- put_buf(r1_bio);
- return 0;
+ for (d = 0; d < conf->raid_disks; d++) {
+ rdev = conf->mirrors[d].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!rdev_set_badblocks(rdev, sect, s, 0))
+ abort = 1;
+ }
+ if (abort) {
+ mddev->recovery_disabled = 1;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
+ return 0;
+ }
+ /* Try next page */
+ sectors -= s;
+ sect += s;
+ idx++;
+ continue;
}
start = d;
@@ -1254,16 +1519,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- WRITE, false) == 0) {
+ if (r1_sync_page_io(rdev, sect, s,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
- md_error(mddev, rdev);
- } else
- atomic_add(s, &rdev->corrected_errors);
+ }
}
d = start;
while (d != r1_bio->read_disk) {
@@ -1273,12 +1534,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
rdev = conf->mirrors[d].rdev;
- if (sync_page_io(rdev,
- sect,
- s<<9,
- bio->bi_io_vec[idx].bv_page,
- READ, false) == 0)
- md_error(mddev, rdev);
+ if (r1_sync_page_io(rdev, sect, s,
+ bio->bi_io_vec[idx].bv_page,
+ READ) != 0)
+ atomic_add(s, &rdev->corrected_errors);
}
sectors -= s;
sect += s;
@@ -1420,7 +1679,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
*
* 1. Retries failed read operations on working mirrors.
* 2. Updates the raid superblock when problems encounter.
- * 3. Performs writes following reads for array syncronising.
+ * 3. Performs writes following reads for array synchronising.
*/
static void fix_read_error(conf_t *conf, int read_disk,
@@ -1443,9 +1702,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
* which is the thread that might remove
* a device. If raid1d ever becomes multi-threaded....
*/
+ sector_t first_bad;
+ int bad_sectors;
+
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
+ is_badblock(rdev, sect, s,
+ &first_bad, &bad_sectors) == 0 &&
sync_page_io(rdev, sect, s<<9,
conf->tmppage, READ, false))
success = 1;
@@ -1457,8 +1721,10 @@ static void fix_read_error(conf_t *conf, int read_disk,
} while (!success && d != read_disk);
if (!success) {
- /* Cannot read from anywhere -- bye bye array */
- md_error(mddev, conf->mirrors[read_disk].rdev);
+ /* Cannot read from anywhere - mark it bad */
+ mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
+ if (!rdev_set_badblocks(rdev, sect, s, 0))
+ md_error(mddev, rdev);
break;
}
/* write it back and re-read */
@@ -1469,13 +1735,9 @@ static void fix_read_error(conf_t *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev, sect, s<<9,
- conf->tmppage, WRITE, false)
- == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- }
+ test_bit(In_sync, &rdev->flags))
+ r1_sync_page_io(rdev, sect, s,
+ conf->tmppage, WRITE);
}
d = start;
while (d != read_disk) {
@@ -1486,12 +1748,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev, sect, s<<9,
- conf->tmppage, READ, false)
- == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- else {
+ if (r1_sync_page_io(rdev, sect, s,
+ conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
printk(KERN_INFO
"md/raid1:%s: read error corrected "
@@ -1508,21 +1766,255 @@ static void fix_read_error(conf_t *conf, int read_disk,
}
}
+static void bi_complete(struct bio *bio, int error)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+static int submit_bio_wait(int rw, struct bio *bio)
+{
+ struct completion event;
+ rw |= REQ_SYNC;
+
+ init_completion(&event);
+ bio->bi_private = &event;
+ bio->bi_end_io = bi_complete;
+ submit_bio(rw, bio);
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio->bi_flags);
+}
+
+static int narrow_write_error(r1bio_t *r1_bio, int i)
+{
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+ int vcnt, idx;
+ struct bio_vec *vec;
+
+ /* bio has the data to be written to device 'i' where
+ * we just recently had a write error.
+ * We repeatedly clone the bio and trim down to one block,
+ * then try the write. Where the write fails we record
+ * a bad block.
+ * It is conceivable that the bio doesn't exactly align with
+ * blocks. We must handle this somehow.
+ *
+ * We currently own a reference on the rdev.
+ */
+
+ int block_sectors;
+ sector_t sector;
+ int sectors;
+ int sect_to_write = r1_bio->sectors;
+ int ok = 1;
+
+ if (rdev->badblocks.shift < 0)
+ return 0;
+
+ block_sectors = 1 << rdev->badblocks.shift;
+ sector = r1_bio->sector;
+ sectors = ((sector + block_sectors)
+ & ~(sector_t)(block_sectors - 1))
+ - sector;
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ vcnt = r1_bio->behind_page_count;
+ vec = r1_bio->behind_bvecs;
+ idx = 0;
+ while (vec[idx].bv_page == NULL)
+ idx++;
+ } else {
+ vcnt = r1_bio->master_bio->bi_vcnt;
+ vec = r1_bio->master_bio->bi_io_vec;
+ idx = r1_bio->master_bio->bi_idx;
+ }
+ while (sect_to_write) {
+ struct bio *wbio;
+ if (sectors > sect_to_write)
+ sectors = sect_to_write;
+ /* Write at 'sector' for 'sectors'*/
+
+ wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+ memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+ wbio->bi_sector = r1_bio->sector;
+ wbio->bi_rw = WRITE;
+ wbio->bi_vcnt = vcnt;
+ wbio->bi_size = r1_bio->sectors << 9;
+ wbio->bi_idx = idx;
+
+ md_trim_bio(wbio, sector - r1_bio->sector, sectors);
+ wbio->bi_sector += rdev->data_offset;
+ wbio->bi_bdev = rdev->bdev;
+ if (submit_bio_wait(WRITE, wbio) == 0)
+ /* failure! */
+ ok = rdev_set_badblocks(rdev, sector,
+ sectors, 0)
+ && ok;
+
+ bio_put(wbio);
+ sect_to_write -= sectors;
+ sector += sectors;
+ sectors = block_sectors;
+ }
+ return ok;
+}
+
+static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
+{
+ int m;
+ int s = r1_bio->sectors;
+ for (m = 0; m < conf->raid_disks ; m++) {
+ mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+ struct bio *bio = r1_bio->bios[m];
+ if (bio->bi_end_io == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+ rdev_clear_badblocks(rdev, r1_bio->sector, s);
+ }
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ test_bit(R1BIO_WriteError, &r1_bio->state)) {
+ if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
+ md_error(conf->mddev, rdev);
+ }
+ }
+ put_buf(r1_bio);
+ md_done_sync(conf->mddev, s, 1);
+}
+
+static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
+{
+ int m;
+ for (m = 0; m < conf->raid_disks ; m++)
+ if (r1_bio->bios[m] == IO_MADE_GOOD) {
+ mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+ rdev_clear_badblocks(rdev,
+ r1_bio->sector,
+ r1_bio->sectors);
+ rdev_dec_pending(rdev, conf->mddev);
+ } else if (r1_bio->bios[m] != NULL) {
+ /* This drive got a write error. We need to
+ * narrow down and record precise write
+ * errors.
+ */
+ if (!narrow_write_error(r1_bio, m)) {
+ md_error(conf->mddev,
+ conf->mirrors[m].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
+ rdev_dec_pending(conf->mirrors[m].rdev,
+ conf->mddev);
+ }
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+}
+
+static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
+{
+ int disk;
+ int max_sectors;
+ mddev_t *mddev = conf->mddev;
+ struct bio *bio;
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
+
+ clear_bit(R1BIO_ReadError, &r1_bio->state);
+ /* we got a read error. Maybe the drive is bad. Maybe just
+ * the block and we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check it that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen
+ */
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector, r1_bio->sectors);
+ unfreeze_array(conf);
+ } else
+ md_error(mdd