Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--	drivers/md/raid5.c	179
1 file changed, 154 insertions(+), 25 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 431b9b26ca5..0d403ca12ae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -639,7 +639,8 @@ static void mark_target_uptodate(struct stripe_head *sh, int target)
return;
tgt = &sh->dev[target];
- set_bit(R5_UPTODATE, &tgt->flags);
+ if (!tgt->dpage)
+ set_bit(R5_UPTODATE, &tgt->flags);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
clear_bit(R5_Wantcompute, &tgt->flags);
}
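This change depends on two additions that live outside raid5.c and are not shown in this diff: a per-device direct-page pointer (dpage) and a new R5_Skipped device flag, both used throughout the hunks below. A minimal sketch of the assumed raid5.h companion change (the flag's numeric value is illustrative):

	/* assumed raid5.h additions -- sketch only, not shown in this diff */
	struct r5dev {
		/* ... existing members: page, vec, towrite, written, flags ... */
		struct page *dpage;	/* bio page used directly; no copy into the cache page */
	};

	#define R5_Skipped	12	/* write bypassed the stripe cache; value illustrative */

With dpage set, the cache page never receives the data, which is why mark_target_uptodate() must no longer set R5_UPTODATE for such devices.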
@@ -681,6 +682,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
int i;
+ enum async_tx_flags flags = ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST;
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsigned long long)sh->sector, target);
@@ -692,7 +694,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
+ init_async_submit(&submit, flags, NULL,
ops_complete_compute, sh, to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
@@ -915,6 +917,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
struct page **xor_srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
+ enum async_tx_flags flags = ASYNC_TX_FENCE | ASYNC_TX_XOR_DROP_DST;
/* existing parity data subtracted */
struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -929,7 +932,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
xor_srcs[count++] = dev->page;
}
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+ init_async_submit(&submit, flags, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
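For reference, this prexor step is the "subtract" half of a read-modify-write parity update: XOR is its own inverse, so folding the old data into the old parity removes its contribution, and the later reconstruct pass folds the new data in. A self-contained sketch of the arithmetic (illustrative buffers, not kernel code):

	#include <stddef.h>

	/* new_parity = old_parity ^ old_data ^ new_data, byte by byte.
	 * The prexor pass performs the first XOR; the reconstruct pass
	 * XORs in the new data.
	 */
	static void rmw_parity(unsigned char *parity,
			       const unsigned char *old_data,
			       const unsigned char *new_data, size_t len)
	{
		size_t i;

		for (i = 0; i < len; i++)
			parity[i] ^= old_data[i] ^ new_data[i];
	}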
@@ -942,9 +945,80 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
int disks = sh->disks;
int i;
+#ifdef CONFIG_MD_RAID_SKIP_BIO_COPY
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->raid_conf->level == 6 ?
+ raid6_next_disk(pd_idx, disks) : -1;
+ int fswrite = 1;
+#endif
+
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+#ifdef CONFIG_MD_RAID_SKIP_BIO_COPY
+ /* initially assume that the operation is a full-stripe write */
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (unlikely(i == pd_idx || i == qd_idx))
+ continue;
+ if (unlikely(!test_bit(R5_Wantdrain, &dev->flags)))
+ goto do_copy;
+ if (test_bit(R5_OVERWRITE, &dev->flags) &&
+ !r5_next_bio(sh->dev[i].towrite, sh->dev[i].sector)) {
+ /* now check whether a single bio_vec within
+ * the bio covers the whole of sh->dev[i]
+ */
+ struct bio *pbio = sh->dev[i].towrite;
+ struct bio_vec *bvl;
+ int found = 0;
+ sector_t bvec_off = pbio->bi_sector << 9;
+ sector_t dev_off = sh->dev[i].sector << 9;
+ int k;
+
+ /* search for the bio_vec that covers dev[i].page */
+ bio_for_each_segment(bvl, pbio, k) {
+ if (bvec_off == dev_off &&
+ bio_iovec_idx(pbio, k)->bv_len ==
+ STRIPE_SIZE) {
+ /* found the vector which covers the
+ * stripe page in full
+ */
+ found = 1;
+ break;
+ }
+ bvec_off += bio_iovec_idx(pbio, k)->bv_len;
+ }
+ if (found) {
+ /* save the direct pointer to the bio buffer */
+ BUG_ON(dev->dpage);
+ dev->dpage = bio_iovec_idx(pbio, k)->bv_page;
+ clear_bit(R5_Skipped, &dev->flags);
+ continue;
+ }
+ }
+do_copy:
+ /* we get here in two cases:
+ * - dev[i] is not fully covered by the bio;
+ * - more than one bio covers dev[i].
+ * in both cases copy from the bio to dev[i].page
+ */
+ pr_debug("%s: do copy because of disk %d\n", __func__, i);
+ do {
+ /* reset any dpage pointers set so far */
+ sh->dev[i].dpage = NULL;
+ } while (++i != disks);
+ fswrite = 0;
+ break;
+ }
+
+ if (fswrite) {
+ /* won't add new txs right now, so run ops currently pending */
+ async_tx_issue_pending_all();
+ }
+#endif
+
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
struct bio *chosen;
@@ -959,6 +1033,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
wbi = dev->written = chosen;
spin_unlock(&sh->lock);
+#ifdef CONFIG_MD_RAID_SKIP_BIO_COPY
+ if (fswrite) {
+ /* just update dev bio vec pointer */
+ dev->vec.bv_page = dev->dpage;
+ continue;
+ }
+#endif
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(1, wbi, dev->page,
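The eligibility scan in the hunk above reduces to one question per data disk: does a single bio_vec exactly cover the STRIPE_SIZE page at dev->sector? A condensed sketch of that predicate, assuming the same 2.6.3x bio layout the patch uses (bi_sector counted in 512-byte sectors, index-based bio_for_each_segment):

	/* Return the bio_vec that exactly covers the STRIPE_SIZE page at
	 * dev_sector, or NULL if no single vector covers it.  Sketch of
	 * the open-coded scan in ops_run_biodrain() above.
	 */
	static struct bio_vec *stripe_covering_vec(struct bio *bio,
						   sector_t dev_sector)
	{
		struct bio_vec *bvl;
		sector_t off = bio->bi_sector << 9;	/* byte offset of first segment */
		int k;

		bio_for_each_segment(bvl, bio, k) {
			if (off == (dev_sector << 9) && bvl->bv_len == STRIPE_SIZE)
				return bvl;
			off += bvl->bv_len;
		}
		return NULL;
	}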
@@ -985,8 +1066,10 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->written || i == pd_idx || i == qd_idx)
- set_bit(R5_UPTODATE, &dev->flags);
+ if (dev->dpage)
+ set_bit(R5_Skipped, &dev->flags);
+ else if (dev->written || i == pd_idx || i == qd_idx)
+ set_bit(R5_UPTODATE, &dev->flags);
}
if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -1026,14 +1109,16 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->written)
- xor_srcs[count++] = dev->page;
+ xor_srcs[count++] = dev->dpage ?
+ dev->dpage : dev->page;
}
} else {
xor_dest = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i != pd_idx)
- xor_srcs[count++] = dev->page;
+ xor_srcs[count++] = dev->dpage ?
+ dev->dpage : dev->page;
}
}
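The dpage-or-page choice now appears at every XOR-source site; a hypothetical helper (the name r5_xor_src() is mine, not part of the patch) would keep those call sites terse:

	/* Prefer the zero-copy bio page when one was captured, else the
	 * stripe cache page.  Hypothetical helper, sketch only.
	 */
	static inline struct page *r5_xor_src(struct r5dev *dev)
	{
		return dev->dpage ? dev->dpage : dev->page;
	}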
@@ -2437,7 +2522,8 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags)) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Skipped, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
int bitmap_end = 0;
@@ -2445,6 +2531,17 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
dev->written = NULL;
+
+ if (dev->dpage) {
+ /* with direct writes the stripe
+ * cache page was bypassed, so it
+ * is not UPTODATE
+ */
+ clear_bit(R5_Skipped, &dev->flags);
+ clear_bit(R5_OVERWRITE, &dev->flags);
+ dev->vec.bv_page = dev->page;
+ dev->dpage = NULL;
+ }
+
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
@@ -2947,6 +3044,7 @@ static void handle_stripe5(struct stripe_head *sh)
struct r5dev *dev;
mdk_rdev_t *blocked_rdev = NULL;
int prexor;
+ int dec_preread_active = 0;
memset(&s, 0, sizeof(s));
pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3096,12 +3194,8 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(STRIPE_INSYNC, &sh->state);
}
}
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) <
- IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ dec_preread_active = 1;
}
/* Now to consider new write requests and what else, if anything
@@ -3208,6 +3302,16 @@ static void handle_stripe5(struct stripe_head *sh)
ops_run_io(sh, &s);
+ if (dec_preread_active) {
+ /* We delay this until after ops_run_io so that if make_request
+ * is waiting on a barrier, it won't continue until the writes
+ * have actually been submitted.
+ */
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) <
+ IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
return_io(return_bi);
}
@@ -3221,6 +3325,7 @@ static void handle_stripe6(struct stripe_head *sh)
struct r6_state r6s;
struct r5dev *dev, *pdev, *qdev;
mdk_rdev_t *blocked_rdev = NULL;
+ int dec_preread_active = 0;
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3358,7 +3463,6 @@ static void handle_stripe6(struct stripe_head *sh)
* completed
*/
if (sh->reconstruct_state == reconstruct_state_drain_result) {
- int qd_idx = sh->qd_idx;
sh->reconstruct_state = reconstruct_state_idle;
/* All the 'written' buffers and the parity blocks are ready to
@@ -3380,12 +3484,8 @@ static void handle_stripe6(struct stripe_head *sh)
set_bit(STRIPE_INSYNC, &sh->state);
}
}
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) <
- IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ dec_preread_active = 1;
}
/* Now to consider new write requests and what else, if anything
@@ -3494,6 +3594,18 @@ static void handle_stripe6(struct stripe_head *sh)
ops_run_io(sh, &s);
+
+ if (dec_preread_active) {
+ /* We delay this until after ops_run_io so that if make_request
+ * is waiting on a barrier, it won't continue until the writes
+ * have actually been submitted.
+ */
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) <
+ IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
+
return_io(return_bi);
}
@@ -3741,7 +3853,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
{
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev->private;
- unsigned int dd_idx;
+ int dd_idx;
struct bio* align_bi;
mdk_rdev_t *rdev;
@@ -3866,7 +3978,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
int cpu, remaining;
if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
- bio_endio(bi, -EOPNOTSUPP);
+ /* Drain all pending writes. We only really need
+ * to ensure they have been submitted, but this is
+ * easier.
+ */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ md_barrier_request(mddev, bi);
return 0;
}
@@ -3990,6 +4108,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
finish_wait(&conf->wait_for_overlap, &w);
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
+ if (mddev->barrier &&
+ !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
release_stripe(sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
@@ -4009,6 +4130,14 @@ static int make_request(struct request_queue *q, struct bio * bi)
bio_endio(bi, 0);
}
+
+ if (mddev->barrier) {
+ /* We need to wait for the stripes to all be handled.
+ * So: wait for preread_active_stripes to drop to 0.
+ */
+ wait_event(mddev->thread->wqueue,
+ atomic_read(&conf->preread_active_stripes) == 0);
+ }
return 0;
}
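Together with the BIO_RW_BARRIER branch above, barrier ordering now has two halves: stripes issued for a barrier request are counted in preread_active_stripes, and the decrement, deliberately moved after ops_run_io() in handle_stripe5/6, runs md_wakeup_thread() on the same wait queue this wait_event() sleeps on. A toy userspace model of that handshake (pthreads stand in for the md wait queue; not kernel code):

	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
	static int active;	/* models conf->preread_active_stripes */

	static void stripe_issued(void)		/* make_request() side */
	{
		pthread_mutex_lock(&lock);
		active++;
		pthread_mutex_unlock(&lock);
	}

	static void stripe_submitted(void)	/* after ops_run_io() */
	{
		pthread_mutex_lock(&lock);
		if (--active == 0)
			pthread_cond_broadcast(&drained);
		pthread_mutex_unlock(&lock);
	}

	static void barrier_wait(void)		/* the wait_event() above */
	{
		pthread_mutex_lock(&lock);
		while (active)
			pthread_cond_wait(&drained, &lock);
		pthread_mutex_unlock(&lock);
	}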
@@ -5104,9 +5233,8 @@ static int stop(mddev_t *mddev)
mddev->thread = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
- sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
free_conf(conf);
- mddev->private = NULL;
+ mddev->private = &raid5_attrs_group;
return 0;
}
@@ -5863,6 +5991,7 @@ static void raid5_exit(void)
module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");