diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-21 10:29:12 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-21 10:29:12 -0700 |
commit | 8a392625b665c676a77c62f8608d10ff430bcb83 (patch) | |
tree | 4000a65d61baed73200e47f91dea5263ed16edd0 /drivers/md/raid5.c | |
parent | 519f0141f1c42e2b8b59c7dea005cbf6095358e8 (diff) | |
parent | 4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 (diff) |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (52 commits)
md: Protect access to mddev->disks list using RCU
md: only count actual openers as access which prevent a 'stop'
md: linear: Make array_size sector-based and rename it to array_sectors.
md: Make mddev->array_size sector-based.
md: Make super_type->rdev_size_change() take sector-based sizes.
md: Fix check for overlapping devices.
md: Tidy up rdev_size_store a bit:
md: Remove some unused macros.
md: Turn rdev->sb_offset into a sector-based quantity.
md: Make calc_dev_sboffset() return a sector count.
md: Replace calc_dev_size() by calc_num_sectors().
md: Make update_size() take the number of sectors.
md: Better control of when do_md_stop is allowed to stop the array.
md: get_disk_info(): Don't convert between signed and unsigned and back.
md: Simplify restart_array().
md: alloc_disk_sb(): Return proper error value.
md: Simplify sb_equal().
md: Simplify uuid_equal().
md: sb_equal(): Fix misleading printk.
md: Fix a typo in the comment to cmd_match().
...
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 745 |
1 files changed, 255 insertions, 490 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9ce7154845c..55e7c56045a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } static void print_raid5_conf (raid5_conf_t *conf); +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -/* test_and_ack_op() ensures that we only dequeue an operation once */ -#define test_and_ack_op(op, pend) \ -do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ -} while (0) - -/* find new work to run, do not resubmit work that is already - * in flight - */ -static unsigned long get_stripe_work(struct stripe_head *sh) -{ - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; -} - static void raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); -static void ops_run_io(struct stripe_head *sh) +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; might_sleep(); - set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, @@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); if (--rbi->bi_phys_segments == 0) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long pending) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) - async_tx_ack(tx); - return tx; } @@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_complete_write(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, @@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? - ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) - tx = ops_run_compute5(sh, pending); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } -static int -handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +static void +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } - if (locked + 1 == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - locked++; + s->locked++; - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, - locked, sh->ops.pending); - - return locked; + s->locked, s->ops_request); } /* @@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) } static void -handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ -static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + (s->failed && disk_idx == s->failed_num)) { + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ + return 1; /* uptodate + compute == disks */ } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); } } - return ~0; + return 0; } -static void handle_issuing_new_read_requests5(struct stripe_head *sh, +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - /* Clear completed compute operations. Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, } -/* handle_completed_write_requests +/* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */ -static void handle_completed_write_requests(raid5_conf_t *conf, +static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_issuing_new_write_requests5(raid5_conf_t *conf, +static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction5(sh, s, rcw == 0, 0); } -static void handle_issuing_new_write_requests6(raid5_conf_t *conf, +static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back. If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh) struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh) goto unlock; } - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", @@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh) * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) |