aboutsummaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c375
1 files changed, 234 insertions, 141 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cbb15716a5d..6234b2e8458 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -133,7 +133,7 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
int sectors = bio_sectors(bio);
- if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
+ if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
@@ -225,7 +225,7 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
- bi->bi_size = 0;
+ bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
bio_endio(bi, 0);
@@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ if (atomic_read(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh)
int hash;
bool wakeup;
+ /* Avoid release_list until the last reference.
+ */
+ if (atomic_add_unless(&sh->count, -1, 1))
+ return;
+
if (unlikely(!conf->mddev->thread) ||
test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
goto slow_path;
@@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)
int num = sh->raid_conf->pool_size;
for (i = 0; i < num ; i++) {
+ WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
p = sh->dev[i].page;
if (!p)
continue;
@@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)
return 1;
}
sh->dev[i].page = page;
+ sh->dev[i].orig_page = page;
}
return 0;
}
@@ -675,16 +685,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
|| !conf->inactive_blocked),
*(conf->hash_locks + hash));
conf->inactive_blocked = 0;
- } else
+ } else {
init_stripe(sh, sector, previous);
- } else {
+ atomic_inc(&sh->count);
+ }
+ } else if (!atomic_inc_not_zero(&sh->count)) {
spin_lock(&conf->device_lock);
- if (atomic_read(&sh->count)) {
- BUG_ON(!list_empty(&sh->lru)
- && !test_bit(STRIPE_EXPANDING, &sh->state)
- && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
- );
- } else {
+ if (!atomic_read(&sh->count)) {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
BUG_ON(list_empty(&sh->lru) &&
@@ -695,13 +702,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
sh->group = NULL;
}
}
+ atomic_inc(&sh->count);
spin_unlock(&conf->device_lock);
}
} while (sh == NULL);
- if (sh)
- atomic_inc(&sh->count);
-
spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
@@ -852,18 +857,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_rw, i);
atomic_inc(&sh->count);
if (use_new_offset(conf, sh))
- bi->bi_sector = (sh->sector
+ bi->bi_iter.bi_sector = (sh->sector
+ rdev->new_data_offset);
else
- bi->bi_sector = (sh->sector
+ bi->bi_iter.bi_sector = (sh->sector
+ rdev->data_offset);
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_NOMERGE;
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_size = STRIPE_SIZE;
+ bi->bi_iter.bi_size = STRIPE_SIZE;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -899,15 +907,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rbi->bi_rw, i);
atomic_inc(&sh->count);
if (use_new_offset(conf, sh))
- rbi->bi_sector = (sh->sector
+ rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->new_data_offset);
else
- rbi->bi_sector = (sh->sector
+ rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->data_offset);
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
- rbi->bi_size = STRIPE_SIZE;
+ rbi->bi_iter.bi_size = STRIPE_SIZE;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -932,27 +943,28 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
}
static struct dma_async_tx_descriptor *
-async_copy_data(int frombio, struct bio *bio, struct page *page,
- sector_t sector, struct dma_async_tx_descriptor *tx)
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+ sector_t sector, struct dma_async_tx_descriptor *tx,
+ struct stripe_head *sh)
{
- struct bio_vec *bvl;
+ struct bio_vec bvl;
+ struct bvec_iter iter;
struct page *bio_page;
- int i;
int page_offset;
struct async_submit_ctl submit;
enum async_tx_flags flags = 0;
- if (bio->bi_sector >= sector)
- page_offset = (signed)(bio->bi_sector - sector) * 512;
+ if (bio->bi_iter.bi_sector >= sector)
+ page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
else
- page_offset = (signed)(sector - bio->bi_sector) * -512;
+ page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
if (frombio)
flags |= ASYNC_TX_FENCE;
init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
- bio_for_each_segment(bvl, bio, i) {
- int len = bvl->bv_len;
+ bio_for_each_segment(bvl, bio, iter) {
+ int len = bvl.bv_len;
int clen;
int b_offset = 0;
@@ -968,13 +980,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
clen = len;
if (clen > 0) {
- b_offset += bvl->bv_offset;
- bio_page = bvl->bv_page;
- if (frombio)
- tx = async_memcpy(page, bio_page, page_offset,
+ b_offset += bvl.bv_offset;
+ bio_page = bvl.bv_page;
+ if (frombio) {
+ if (sh->raid_conf->skip_copy &&
+ b_offset == 0 && page_offset == 0 &&
+ clen == STRIPE_SIZE)
+ *page = bio_page;
+ else
+ tx = async_memcpy(*page, bio_page, page_offset,
b_offset, clen, &submit);
- else
- tx = async_memcpy(bio_page, page, b_offset,
+ } else
+ tx = async_memcpy(bio_page, *page, b_offset,
page_offset, clen, &submit);
}
/* chain the operations */
@@ -1012,7 +1029,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
BUG_ON(!dev->read);
rbi = dev->read;
dev->read = NULL;
- while (rbi && rbi->bi_sector <
+ while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
if (!raid5_dec_bi_active_stripes(rbi)) {
@@ -1048,10 +1065,10 @@ static void ops_run_biofill(struct stripe_head *sh)
dev->read = rbi = dev->toread;
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
- while (rbi && rbi->bi_sector <
+ while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
- tx = async_copy_data(0, rbi, dev->page,
- dev->sector, tx);
+ tx = async_copy_data(0, rbi, &dev->page,
+ dev->sector, tx, sh);
rbi = r5_next_bio(rbi, dev->sector);
}
}
@@ -1389,8 +1406,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
+ WARN_ON(dev->page != dev->orig_page);
- while (wbi && wbi->bi_sector <
+ while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
if (wbi->bi_rw & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
@@ -1398,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
set_bit(R5_SyncIO, &dev->flags);
if (wbi->bi_rw & REQ_DISCARD)
set_bit(R5_Discard, &dev->flags);
- else
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ else {
+ tx = async_copy_data(1, wbi, &dev->page,
+ dev->sector, tx, sh);
+ if (dev->page != dev->orig_page) {
+ set_bit(R5_SkipCopy, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ clear_bit(R5_OVERWRITE, &dev->flags);
+ }
+ }
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1431,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- if (!discard)
+ if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
@@ -1844,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);
atomic_set(&nsh->count, 1);
- for(i=0; i<conf->pool_size; i++)
+ for(i=0; i<conf->pool_size; i++) {
nsh->dev[i].page = osh->dev[i].page;
+ nsh->dev[i].orig_page = osh->dev[i].page;
+ }
for( ; i<newsize; i++)
nsh->dev[i].page = NULL;
nsh->hash_lock_index = hash;
@@ -1901,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (nsh->dev[i].page == NULL) {
struct page *p = alloc_page(GFP_NOIO);
nsh->dev[i].page = p;
+ nsh->dev[i].orig_page = p;
if (!p)
err = -ENOMEM;
}
@@ -2111,6 +2138,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (!uptodate) {
+ set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -2137,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
+
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
bio_init(&dev->req);
dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_vcnt++;
- dev->req.bi_max_vecs++;
+ dev->req.bi_max_vecs = 1;
dev->req.bi_private = sh;
- dev->vec.bv_page = dev->page;
bio_init(&dev->rreq);
dev->rreq.bi_io_vec = &dev->rvec;
- dev->rreq.bi_vcnt++;
- dev->rreq.bi_max_vecs++;
+ dev->rreq.bi_max_vecs = 1;
dev->rreq.bi_private = sh;
- dev->rvec.bv_page = dev->page;
dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous);
@@ -2614,7 +2638,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
int firstwrite=0;
pr_debug("adding bi b#%llu to stripe s#%llu\n",
- (unsigned long long)bi->bi_sector,
+ (unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
/*
@@ -2632,12 +2656,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
firstwrite = 1;
} else
bip = &sh->dev[dd_idx].toread;
- while (*bip && (*bip)->bi_sector < bi->bi_sector) {
- if (bio_end_sector(*bip) > bi->bi_sector)
+ while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
+ if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
goto overlap;
bip = & (*bip)->bi_next;
}
- if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
+ if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2651,7 +2675,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
- bi && bi->bi_sector <= sector;
+ bi && bi->bi_iter.bi_sector <= sector;
bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
if (bio_end_sector(bi) >= sector)
sector = bio_end_sector(bi);
@@ -2661,7 +2685,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
}
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
- (unsigned long long)(*bip)->bi_sector,
+ (unsigned long long)(*bip)->bi_iter.bi_sector,
(unsigned long long)sh->sector, dd_idx);
spin_unlock_irq(&sh->stripe_lock);
@@ -2736,7 +2760,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- while (bi && bi->bi_sector <
+ while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2754,8 +2778,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
+ if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].page = sh->dev[i].orig_page;
+ }
+
if (bi) bitmap_end = 1;
- while (bi && bi->bi_sector <
+ while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2779,7 +2808,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- while (bi && bi->bi_sector <
+ while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
@@ -2890,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(s->failed >= 1 && fdev[0]->toread) ||
(s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+ (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
!test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
+ (sh->raid_conf->level == 6 && s->failed && s->to_write &&
+ s->to_write < sh->raid_conf->raid_disks - 2 &&
+ (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -2995,15 +3027,20 @@ static void handle_stripe_clean_event(struct r5conf *conf,
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Discard, &dev->flags))) {
+ test_bit(R5_Discard, &dev->flags) ||
+ test_bit(R5_SkipCopy, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
if (test_and_clear_bit(R5_Discard, &dev->flags))
clear_bit(R5_UPTODATE, &dev->flags);
+ if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+ dev->page = dev->orig_page;
+ }
wbi = dev->written;
dev->written = NULL;
- while (wbi && wbi->bi_sector <
+ while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
@@ -3019,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
0);
} else if (test_bit(R5_Discard, &dev->flags))
discard_pending = 1;
+ WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+ WARN_ON(dev->page != dev->orig_page);
}
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -3090,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
- if (test_bit(R5_Insync, &dev->flags)) rcw++;
+ if (test_bit(R5_Insync, &dev->flags))
+ rcw++;
else
rcw += 2*disks;
}
@@ -3111,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- pr_debug("Read_old block "
- "%d for r-m-w\n", i);
+ if (test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
+ pr_debug("Read_old block %d for r-m-w\n",
+ i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
@@ -3137,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
- if (!test_bit(R5_Insync, &dev->flags))
- continue; /* it's a failed drive */
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ if (test_bit(R5_Insync, &dev->flags) &&
+ test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
pr_debug("Read_old block "
"%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
@@ -4095,7 +4134,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
- sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
@@ -4232,9 +4271,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
/*
* compute position
*/
- align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
- 0,
- &dd_idx, NULL);
+ align_bi->bi_iter.bi_sector =
+ raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+ 0, &dd_idx, NULL);
end_sector = bio_end_sector(align_bi);
rcu_read_lock();
@@ -4259,7 +4298,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
+ is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
/* too big in some way, or has a known bad block */
bio_put(align_bi);
@@ -4268,7 +4308,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
}
/* No reshape active, so we can trust rdev->data_offset */
- align_bi->bi_sector += rdev->data_offset;
+ align_bi->bi_iter.bi_sector += rdev->data_offset;
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_for_stripe,
@@ -4280,7 +4320,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
align_bi, disk_devt(mddev->gendisk),
- raid_bio->bi_sector);
+ raid_bio->bi_iter.bi_sector);
generic_make_request(align_bi);
return 1;
} else {
@@ -4373,8 +4413,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
sh->group = NULL;
}
list_del_init(&sh->lru);
- atomic_inc(&sh->count);
- BUG_ON(atomic_read(&sh->count) != 1);
+ BUG_ON(atomic_inc_return(&sh->count) != 1);
return sh;
}
@@ -4404,7 +4443,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
* STRIPE_ON_UNPLUG_LIST clear but the stripe
* is still in our list
*/
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
/*
* STRIPE_ON_RELEASE_LIST could be set here. In that
@@ -4463,8 +4502,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
/* Skip discard while reshape is happening */
return;
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -4550,6 +4589,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
int remaining;
+ DEFINE_WAIT(w);
+ bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bi);
@@ -4568,20 +4609,23 @@ static void make_request(struct mddev *mddev, struct bio * bi)
return;
}
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
- DEFINE_WAIT(w);
int previous;
int seq;
+ do_prepare = false;
retry:
seq = read_seqcount_begin(&conf->gen_lock);
previous = 0;
- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+ if (do_prepare)
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
if (unlikely(conf->reshape_progress != MaxSector)) {
/* spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
@@ -4602,6 +4646,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
: logical_sector >= conf->reshape_safe) {
spin_unlock_irq(&conf->device_lock);
schedule();
+ do_prepare = true;
goto retry;
}
}
@@ -4638,6 +4683,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
if (must_retry) {
release_stripe(sh);
schedule();
+ do_prepare = true;
goto retry;
}
}
@@ -4661,8 +4707,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
prepare_to_wait(&conf->wait_for_overlap,
&w, TASK_INTERRUPTIBLE);
if (logical_sector >= mddev->suspend_lo &&
- logical_sector < mddev->suspend_hi)
+ logical_sector < mddev->suspend_hi) {
schedule();
+ do_prepare = true;
+ }
goto retry;
}
@@ -4675,9 +4723,9 @@ static void make_request(struct mddev *mddev, struct bio * bi)
md_wakeup_thread(mddev->thread);
release_stripe(sh);
schedule();
+ do_prepare = true;
goto retry;
}
- finish_wait(&conf->wait_for_overlap, &w);
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((bi->bi_rw & REQ_SYNC) &&
@@ -4687,10 +4735,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
} else {
/* cannot get stripe for read-ahead, just give-up */
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- finish_wait(&conf->wait_for_overlap, &w);
break;
}
}
+ finish_wait(&conf->wait_for_overlap, &w);
remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
@@ -5026,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
- handle_stripe(sh);
release_stripe(sh);
return STRIPE_SECTORS;
@@ -5052,7 +5100,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
int remaining;
int handled = 0;
- logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = raid_bio->bi_iter.bi_sector &
+ ~((sector_t)STRIPE_SECTORS-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
last_sector = bio_end_sector(raid_bio);
@@ -5066,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, 0, 1, 0);
+ sh = get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -5349,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
raid5_store_preread_threshold);
static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", conf->skip_copy);
+ else
+ return 0;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (!conf)
+ return -ENODEV;
+
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+ new = !!new;
+ if (new == conf->skip_copy)
+ return len;
+
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ return len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+ raid5_show_skip_copy,
+ raid5_store_skip_copy);
+
+
+static ssize_t
stripe_cache_active_show(struct mddev *mddev, char *page)
{
struct r5conf *conf = mddev->private;
@@ -5433,6 +5526,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_stripecache_active.attr,
&raid5_preread_bypass_threshold.attr,
&raid5_group_thread_cnt.attr,
+ &raid5_skip_copy.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
@@ -5511,23 +5605,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
return sectors * (raid_disks - conf->max_degraded);
}
+static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+ safe_put_page(percpu->spare_page);
+ kfree(percpu->scribble);
+ percpu->spare_page = NULL;
+ percpu->scribble = NULL;
+}
+
+static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+ if (conf->level == 6 && !percpu->spare_page)
+ percpu->spare_page = alloc_page(GFP_KERNEL);
+ if (!percpu->scribble)
+ percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+ if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
+ free_scratch_buffer(conf, percpu);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static void raid5_free_percpu(struct r5conf *conf)
{
- struct raid5_percpu *percpu;
unsigned long cpu;
if (!conf->percpu)
return;
- get_online_cpus();
- for_each_possible_cpu(cpu) {
- percpu = per_cpu_ptr(conf->percpu, cpu);
- safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
- }
#ifdef CONFIG_HOTPLUG_CPU
unregister_cpu_notifier(&conf->cpu_notify);
#endif
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu)
+ free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
put_online_cpus();
free_percpu(conf->percpu);
@@ -5554,15 +5668,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (conf->level == 6 && !percpu->spare_page)
- percpu->spare_page = alloc_page(GFP_KERNEL);
- if (!percpu->scribble)
- percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
-
- if (!percpu->scribble ||
- (conf->level == 6 && !percpu->spare_page)) {
- safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
+ if (alloc_scratch_buffer(conf, percpu)) {
pr_err("%s: failed memory allocation for cpu%ld\n",
__func__, cpu);
return notifier_from_errno(-ENOMEM);
@@ -5570,10 +5676,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
- percpu->spare_page = NULL;
- percpu->scribble = NULL;
+ free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
break;
default:
break;
@@ -5585,40 +5688,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
static int raid5_alloc_percpu(struct r5conf *conf)
{
unsigned long cpu;
- struct page *spare_page;
- struct raid5_percpu __percpu *allcpus;
- void *scribble;
- int err;
+ int err = 0;
- allcpus = alloc_percpu(struct raid5_percpu);
- if (!allcpus)
+ conf->percpu = alloc_percpu(struct raid5_percpu);
+ if (!conf->percpu)
return -ENOMEM;
- conf->percpu = allcpus;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ conf->cpu_notify.notifier_call = raid456_cpu_notify;
+ conf->cpu_notify.priority = 0;
+ err = register_cpu_notifier(&conf->cpu_notify);
+ if (err)
+ return err;
+#endif
get_online_cpus();
- err = 0;
for_each_present_cpu(cpu) {
- if (conf->level == 6) {
- spare_page = alloc_page(GFP_KERNEL);
- if (!spare_page) {
- err = -ENOMEM;
- break;
- }
- per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
- }
- scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
- if (!scribble) {
- err = -ENOMEM;
+ err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+ if (err) {
+ pr_err("%s: failed memory allocation for cpu%ld\n",
+ __func__, cpu);
break;
}
- per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
}
-#ifdef CONFIG_HOTPLUG_CPU
- conf->cpu_notify.notifier_call = raid456_cpu_notify;
- conf->cpu_notify.priority = 0;
- if (err == 0)
- err = register_cpu_notifier(&conf->cpu_notify);
-#endif
put_online_cpus();
return err;
@@ -6100,6 +6192,7 @@ static int run(struct mddev *mddev)
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk