Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig     |  13
-rw-r--r--  drivers/md/Makefile    |   4
-rw-r--r--  drivers/md/md.c        | 111
-rw-r--r--  drivers/md/md.h        |  14
-rw-r--r--  drivers/md/multipath.c |   2
-rw-r--r--  drivers/md/raid0.c     |   2
-rw-r--r--  drivers/md/raid10.c    |   2
-rw-r--r--  drivers/md/raid5.c     | 179
-rw-r--r--  drivers/md/raid5.h     |   8
9 files changed, 301 insertions(+), 34 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2158377a135..bb72359d8dc 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -165,6 +165,19 @@ config MULTICORE_RAID456
If unsure, say N.
+config MD_RAID_SKIP_BIO_COPY
+ bool "Skip intermediate bio->cache copy"
+ depends on MD_RAID456
+ default n
+ ---help---
+ Skip the intermediate data copy between the bio submitted for a write
+ and the cache pages in <sh> when a full-stripe write is in flight.
+ This can improve write performance in some dedicated cases, but
+ bypassing the cache generally slows performance down.
+
+ If unsure, say N.
+
config MD_RAID6_PQ
tristate
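A minimal sketch of what a "full-stripe write" means for the option above, written as self-contained user-space C rather than driver code. The geometry (64 KiB chunks, 4 data disks) and the helper name are illustrative assumptions; the patch itself additionally checks the per-device bio_vec layout before skipping the copy.

	#include <stdbool.h>
	#include <stdio.h>

	/* A write can bypass the stripe cache only when it starts on a stripe
	 * boundary and covers whole stripes, so no read-modify-write of the
	 * parity is needed. */
	static bool is_full_stripe_write(unsigned long long off, unsigned long long len,
					 unsigned int chunk_bytes, unsigned int data_disks)
	{
		unsigned long long stripe_bytes = (unsigned long long)chunk_bytes * data_disks;

		return len && off % stripe_bytes == 0 && len % stripe_bytes == 0;
	}

	int main(void)
	{
		/* 64 KiB chunks, 4 data disks -> 256 KiB full stripes */
		printf("%d\n", is_full_stripe_write(0, 256 * 1024, 64 * 1024, 4));    /* 1 */
		printf("%d\n", is_full_stripe_write(4096, 256 * 1024, 64 * 1024, 4)); /* 0 */
		return 0;
	}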
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index e355e7f6a53..7d424aa10fa 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,8 +11,8 @@ dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o
-raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
+raid456-y += raid5.o
+raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \
raid6int8.o raid6int16.o raid6int32.o \
raid6altivec1.o raid6altivec2.o raid6altivec4.o \
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 08f7471d015..1297c9db0cf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -217,12 +217,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
return 0;
}
rcu_read_lock();
- if (mddev->suspended) {
+ if (mddev->suspended || mddev->barrier) {
DEFINE_WAIT(__wait);
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- if (!mddev->suspended)
+ if (!mddev->suspended && !mddev->barrier)
break;
rcu_read_unlock();
schedule();
@@ -264,11 +264,117 @@ static void mddev_resume(mddev_t *mddev)
int mddev_congested(mddev_t *mddev, int bits)
{
+ if (mddev->barrier)
+ return 1;
return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
+/*
+ * Generic barrier handling for md
+ */
+
+static void md_end_barrier(struct bio *bio, int err)
+{
+ mdk_rdev_t *rdev = bio->bi_private;
+ mddev_t *mddev = rdev->mddev;
+ if (err == -EOPNOTSUPP && mddev->barrier != (void*)1)
+ set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
+
+ rdev_dec_pending(rdev, mddev);
+
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ if (mddev->barrier == (void*)1) {
+ mddev->barrier = NULL;
+ wake_up(&mddev->sb_wait);
+ } else
+ schedule_work(&mddev->barrier_work);
+ }
+ bio_put(bio);
+}
+
+static void md_submit_barrier(struct work_struct *ws)
+{
+ mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
+ struct bio *bio = mddev->barrier;
+
+ atomic_set(&mddev->flush_pending, 1);
+ if (!test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) {
+ mdk_rdev_t *rdev;
+
+ bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
+ if (mddev->pers->make_request(mddev->queue, bio))
+ generic_make_request(bio);
+ mddev->barrier = (void*)1;
+ rcu_read_lock();
+ list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ /* Take two references, one is dropped
+ * when request finishes, one after
+ * we reclaim rcu_read_lock
+ */
+ struct bio *bi;
+ atomic_inc(&rdev->nr_pending);
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ bi = bio_alloc(GFP_KERNEL, 0);
+ bi->bi_end_io = md_end_barrier;
+ bi->bi_private = rdev;
+ bi->bi_bdev = rdev->bdev;
+ atomic_inc(&mddev->flush_pending);
+ submit_bio(WRITE_BARRIER, bi);
+ rcu_read_lock();
+ rdev_dec_pending(rdev, mddev);
+ }
+ rcu_read_unlock();
+ } else
+ bio_endio(bio, -EOPNOTSUPP);
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ mddev->barrier = NULL;
+ wake_up(&mddev->sb_wait);
+ }
+}
+
+void md_barrier_request(mddev_t *mddev, struct bio *bio)
+{
+ mdk_rdev_t *rdev;
+
+ spin_lock_irq(&mddev->write_lock);
+ wait_event_lock_irq(mddev->sb_wait,
+ !mddev->barrier,
+ mddev->write_lock, /*nothing*/);
+ mddev->barrier = bio;
+ spin_unlock_irq(&mddev->write_lock);
+
+ atomic_set(&mddev->flush_pending, 1);
+ INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ struct bio *bi;
+
+ atomic_inc(&rdev->nr_pending);
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ bi = bio_alloc(GFP_KERNEL, 0);
+ bi->bi_end_io = md_end_barrier;
+ bi->bi_private = rdev;
+ bi->bi_bdev = rdev->bdev;
+ atomic_inc(&mddev->flush_pending);
+ submit_bio(WRITE_BARRIER, bi);
+ rcu_read_lock();
+ rdev_dec_pending(rdev, mddev);
+ }
+ rcu_read_unlock();
+ if (atomic_dec_and_test(&mddev->flush_pending))
+ schedule_work(&mddev->barrier_work);
+}
+EXPORT_SYMBOL(md_barrier_request);
static inline mddev_t *mddev_get(mddev_t *mddev)
{
atomic_inc(&mddev->active);
@@ -377,6 +483,7 @@ static mddev_t * mddev_find(dev_t unit)
atomic_set(&new->openers, 0);
atomic_set(&new->active_io, 0);
spin_lock_init(&new->write_lock);
+ atomic_set(&new->flush_pending, 0);
init_waitqueue_head(&new->sb_wait);
init_waitqueue_head(&new->recovery_wait);
new->reshape_position = MaxSector;
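Editor's sketch of the ordering md_barrier_request() and md_submit_barrier() above establish, flattened into sequential user-space C so the three phases are easy to follow. The function names below are illustrative stand-ins, not kernel APIs.

	#include <stdio.h>

	static void flush_member(int disk)
	{
		/* stands in for the zero-length WRITE_BARRIER bio sent to each rdev */
		printf("  flush rdev %d\n", disk);
	}

	int main(void)
	{
		int ndisks = 3, i;

		/* phase 1: md_barrier_request() pre-flushes every in-sync member,
		 * counting completions in mddev->flush_pending */
		for (i = 0; i < ndisks; i++)
			flush_member(i);

		/* phase 2: the last completion schedules md_submit_barrier(), which
		 * resubmits the original bio with BIO_RW_BARRIER cleared */
		printf("resubmit data bio without the barrier flag\n");

		/* phase 3: a second round of flushes; when it drains, mddev->barrier
		 * is cleared and writers blocked in md_make_request() are woken */
		for (i = 0; i < ndisks; i++)
			flush_member(i);
		printf("wake_up(&mddev->sb_wait)\n");
		return 0;
	}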
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 87430fea287..abe8ba3ab01 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -292,6 +292,17 @@ struct mddev_s
struct mutex bitmap_mutex;
struct list_head all_mddevs;
+
+ /* Generic barrier handling.
+ * If there is a pending barrier request, all other
+ * writes are blocked while the devices are flushed.
+ * The last to finish a flush schedules a worker to
+ * submit the barrier request (without the barrier flag),
+ * then submit more flush requests.
+ */
+ struct bio *barrier;
+ atomic_t flush_pending;
+ struct work_struct barrier_work;
};
@@ -430,8 +441,9 @@ extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
-
extern int mddev_congested(mddev_t *mddev, int bits);
+extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
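The wait/wake pattern behind the new mddev->barrier field (writes block in md_make_request() while a barrier is pending, and the flush completion path wakes them) can be modelled with POSIX primitives. A hedged, self-contained sketch with illustrative names only; the kernel expresses the same shape with prepare_to_wait()/wake_up() rather than a mutex and condition variable:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t sb_wait = PTHREAD_COND_INITIALIZER;
	static bool barrier_pending;

	static void make_request(int id)
	{
		pthread_mutex_lock(&lock);
		while (barrier_pending)			/* like waiting on mddev->barrier */
			pthread_cond_wait(&sb_wait, &lock);
		pthread_mutex_unlock(&lock);
		printf("write %d proceeds\n", id);
	}

	static void barrier_done(void)
	{
		pthread_mutex_lock(&lock);
		barrier_pending = false;		/* like mddev->barrier = NULL */
		pthread_cond_broadcast(&sb_wait);	/* like wake_up(&mddev->sb_wait) */
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		barrier_pending = true;
		barrier_done();
		make_request(1);
		return 0;
	}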
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ee7646f974a..cbc0a99f379 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
int cpu;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
- bio_endio(bio, -EOPNOTSUPP);
+ md_barrier_request(mddev, bio);
return 0;
}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d3a4ce06015..122d07af5b5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
int cpu;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
- bio_endio(bio, -EOPNOTSUPP);
+ md_barrier_request(mddev, bio);
return 0;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b87b44..2fbf867f8b3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
mdk_rdev_t *blocked_rdev;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
- bio_endio(bio, -EOPNOTSUPP);
+ md_barrier_request(mddev, bio);
return 0;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 431b9b26ca5..0d403ca12ae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -639,7 +639,8 @@ static void mark_target_uptodate(struct stripe_head *sh, int target)
return;
tgt = &sh->dev[target];
- set_bit(R5_UPTODATE, &tgt->flags);
+ if(!tgt->dpage)
+ set_bit(R5_UPTODATE, &tgt->flags);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
clear_bit(R5_Wantcompute, &tgt->flags);
}
@@ -681,6 +682,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
int i;
+ enum async_tx_flags flags = ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST;
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsigned long long)sh->sector, target);
@@ -692,7 +694,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
+ init_async_submit(&submit, flags, NULL,
ops_complete_compute, sh, to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
@@ -915,6 +917,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
struct page **xor_srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
+ enum async_tx_flags flags = ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST;
/* existing parity data subtracted */
struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -929,7 +932,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
xor_srcs[count++] = dev->page;
}
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+ init_async_submit(&submit, flags, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
@@ -942,9 +945,80 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
int disks = sh->disks;
int i;
+#ifdef CONFIG_MD_RAID_SKIP_BIO_COPY
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->raid_conf->level == 6 ?
+ raid6_next_disk(pd_idx, disks) : -1;
+ int fswrite = 1;
+#endif
+
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+#ifdef CONFIG_MD_RAID_SKIP_BIO_COPY
+ /* initially assume that the operation is a full-stripe write*/
+ for (i = disks; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (unlikely(i == pd_idx || i == qd_idx))
+ continue;
+ if (unlikely(!test_bit(R5_Wantdrain, &dev->flags)))
+ goto do_copy;
+ if ((test_bit(R5_OVERWRITE, &dev->flags)) &&
+ !r5_next_bio(sh->dev[i].towrite, sh->dev[i].sector)) {
+ /* now check whether a single bio_vec within
+ * the bio covers sh->dev[i]
+ */
+ struct bio *pbio = sh->dev[i].towrite;
+ struct bio_vec *bvl;
+ int found = 0;
+ int bvec_page = pbio->bi_sector << 9, k;
+ int dev_page = sh->dev[i].sector << 9;
+
+ /* search for the bio_vec that covers dev[i].page */
+ bio_for_each_segment(bvl, pbio, k) {
+ if (bvec_page == dev_page &&
+ bio_iovec_idx(pbio,k)->bv_len ==
+ STRIPE_SIZE) {
+ /* found the vector which covers the
+ * strip fully
+ */
+ found = 1;
+ break;
+ }
+ bvec_page += bio_iovec_idx(pbio,k)->bv_len;
+ }
+ if (found) {
+ /* save the direct pointer to buffer */
+ if(dev->dpage)
+ printk("BIO bugs\n");
+ BUG_ON(dev->dpage);
+ dev->dpage = bio_iovec_idx(pbio,k)->bv_page;
+ clear_bit(R5_Skipped, &dev->flags);
+ continue;
+ }
+ }
+do_copy:
+ /* we get here in two cases:
+ * - dev[i] is not fully covered by the bio;
+ * - more than one bio covers dev[i].
+ * in either case, copy from the bio to dev[i].page
+ */
+ pr_debug("%s: do copy because of disk %d\n", __FUNCTION__, i);
+ do {
+ /* reset the dpage pointers set so far */
+ sh->dev[i].dpage = NULL;
+ } while (++i != disks);
+ fswrite = 0;
+ break;
+ }
+
+ if (fswrite) {
+ /* won't add new txs right now, so run ops currently pending */
+ async_tx_issue_pending_all();
+ }
+#endif
+
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
struct bio *chosen;
@@ -959,6 +1033,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
wbi = dev->written = chosen;
spin_unlock(&sh->lock);
+#ifdef CONFIG_MD_RAID_SKIP_BIO_COPY
+ if (fswrite) {
+ /* just update dev bio vec pointer */
+ dev->vec.bv_page = dev->dpage;
+ continue;
+ }
+#endif
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(1, wbi, dev->page,
@@ -985,8 +1066,10 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->written || i == pd_idx || i == qd_idx)
- set_bit(R5_UPTODATE, &dev->flags);
+ if (dev->dpage)
+ set_bit(R5_Skipped, &dev->flags);
+ else if (dev->written || i == pd_idx || i == qd_idx)
+ set_bit(R5_UPTODATE, &dev->flags);
}
if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -1026,14 +1109,16 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->written)
- xor_srcs[count++] = dev->page;
+ xor_srcs[count++] = dev->dpage ?
+ dev->dpage : dev->page;
}
} else {
xor_dest = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i != pd_idx)
- xor_srcs[count++] = dev->page;
+ xor_srcs[count++] = dev->dpage ?
+ dev->dpage : dev->page;
}
}
@@ -2437,7 +2522,8 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags)) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Skipped, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
int bitmap_end = 0;
@@ -2445,6 +2531,17 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
dev->written = NULL;
+
+ if (dev->dpage) {
+ /* with direct writes the raid disk
+ * cache actually isn't UPTODATE
+ */
+ clear_bit(R5_Skipped, &dev->flags);
+ clear_bit(R5_OVERWRITE, &dev->flags);
+ dev->vec.bv_page = dev->page;
+ dev->dpage = NULL;
+ }
+
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
@@ -2947,6 +3044,7 @@ static void handle_stripe5(struct stripe_head *sh)
struct r5dev *dev;
mdk_rdev_t *blocked_rdev = NULL;
int prexor;
+ int dec_preread_active = 0;
memset(&s, 0, sizeof(s));
pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3096,12 +3194,8 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(STRIPE_INSYNC, &sh->state);
}
}
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) <
- IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ dec_preread_active = 1;
}
/* Now to consider new write requests and what else, if anything
@@ -3208,6 +3302,16 @@ static void handle_stripe5(struct stripe_head *sh)
ops_run_io(sh, &s);
+ if (dec_preread_active) {
+ /* We delay this until after ops_run_io so that if make_request
+ * is waiting on a barrier, it won't continue until the writes
+ * have actually been submitted.
+ */
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) <
+ IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
return_io(return_bi);
}
@@ -3221,6 +3325,7 @@ static void handle_stripe6(struct stripe_head *sh)
struct r6_state r6s;
struct r5dev *dev, *pdev, *qdev;
mdk_rdev_t *blocked_rdev = NULL;
+ int dec_preread_active = 0;
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3358,7 +3463,6 @@ static void handle_stripe6(struct stripe_head *sh)
* completed
*/
if (sh->reconstruct_state == reconstruct_state_drain_result) {
- int qd_idx = sh->qd_idx;
sh->reconstruct_state = reconstruct_state_idle;
/* All the 'written' buffers and the parity blocks are ready to
@@ -3380,12 +3484,8 @@ static void handle_stripe6(struct stripe_head *sh)
set_bit(STRIPE_INSYNC, &sh->state);
}
}
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) <
- IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ dec_preread_active = 1;
}
/* Now to consider new write requests and what else, if anything
@@ -3494,6 +3594,18 @@ static void handle_stripe6(struct stripe_head *sh)
ops_run_io(sh, &s);
+
+ if (dec_preread_active) {
+ /* We delay this until after ops_run_io so that if make_request
+ * is waiting on a barrier, it won't continue until the writes
+ * have actually been submitted.
+ */
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) <
+ IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
+
return_io(return_bi);
}
@@ -3741,7 +3853,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
{
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev->private;
- unsigned int dd_idx;
+ int dd_idx;
struct bio* align_bi;
mdk_rdev_t *rdev;
@@ -3866,7 +3978,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
int cpu, remaining;
if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
- bio_endio(bi, -EOPNOTSUPP);
+ /* Drain all pending writes. We only really need
+ * to ensure they have been submitted, but this is
+ * easier.
+ */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ md_barrier_request(mddev, bi);
return 0;
}
@@ -3990,6 +4108,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
finish_wait(&conf->wait_for_overlap, &w);
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
+ if (mddev->barrier &&
+ !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
release_stripe(sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
@@ -4009,6 +4130,14 @@ static int make_request(struct request_queue *q, struct bio * bi)
bio_endio(bi, 0);
}
+
+ if (mddev->barrier) {
+ /* We need to wait for the stripes to all be handled.
+ * So: wait for preread_active_stripes to drop to 0.
+ */
+ wait_event(mddev->thread->wqueue,
+ atomic_read(&conf->preread_active_stripes) == 0);
+ }
return 0;
}
@@ -5104,9 +5233,8 @@ static int stop(mddev_t *mddev)
mddev->thread = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
- sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
free_conf(conf);
- mddev->private = NULL;
+ mddev->private = &raid5_attrs_group;
return 0;
}
@@ -5863,6 +5991,7 @@ static void raid5_exit(void)
module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index dd708359b45..7ffc683d69d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -233,6 +233,7 @@ struct stripe_head {
struct bio req;
struct bio_vec vec;
struct page *page;
+ struct page *dpage;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
@@ -252,7 +253,7 @@ struct stripe_head_state {
/* r6_state - extra state data only relevant to r6 */
struct r6_state {
- int p_failed, q_failed, failed_num[2];
+ int p_failed, q_failed, qd_idx, failed_num[2];
};
/* Flags */
@@ -275,6 +276,7 @@ struct r6_state {
* filling
*/
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
+#define R5_Skipped 14 /* SKIP_BIO_COPY completed */
/*
* Write method
*/
@@ -314,6 +316,10 @@ struct r6_state {
#define STRIPE_OP_RECONSTRUCT 4
#define STRIPE_OP_CHECK 5
+#define STRIPE_OP_CHECK_PP 6
+#define STRIPE_OP_CHECK_QP 7
+
+
/*
* Plugging:
*