1 files changed, 160 insertions, 11 deletions
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 32ac70861d6..3a7cade5e27 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
 
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
 	struct workqueue_struct *kcopyd_wq;
 	struct work_struct kcopyd_work;
 
+	struct dm_kcopyd_throttle *throttle;
+
 /*
  * We maintain three lists of jobs:
  *
@@ -66,6 +69,119 @@ struct dm_kcopyd_client {
 	struct list_head pages_jobs;
 };
 
+static struct page_list zero_page_list;
+
+static DEFINE_SPINLOCK(throttle_spinlock);
+
+/*
+ * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
+ * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
+ * by 2.
+ */
+#define ACCOUNT_INTERVAL_SHIFT		SHIFT_HZ
+
+/*
+ * Sleep this number of milliseconds.
+ *
+ * The value was decided experimentally.
+ * Smaller values seem to cause an increased copy rate above the limit.
+ * The reason for this is unknown but possibly due to jiffies rounding errors
+ * or read/write cache inside the disk.
+ */
+#define SLEEP_MSEC			100
+
+/*
+ * Maximum number of sleep events. There is a theoretical livelock if more
+ * kcopyd clients do work simultaneously which this limit avoids.
+ */
+#define MAX_SLEEPS			10
+
+static void io_job_start(struct dm_kcopyd_throttle *t)
+{
+	unsigned throttle, now, difference;
+	int slept = 0, skew;
+
+	if (unlikely(!t))
+		return;
+
+try_again:
+	spin_lock_irq(&throttle_spinlock);
+
+	throttle = ACCESS_ONCE(t->throttle);
+
+	if (likely(throttle >= 100))
+		goto skip_limit;
+
+	now = jiffies;
+	difference = now - t->last_jiffies;
+	t->last_jiffies = now;
+	if (t->num_io_jobs)
+		t->io_period += difference;
+	t->total_period += difference;
+
+	/*
+	 * Maintain sane values if we got a temporary overflow.
+	 */
+	if (unlikely(t->io_period > t->total_period))
+		t->io_period = t->total_period;
+
+	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
+		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
+		t->total_period >>= shift;
+		t->io_period >>= shift;
+	}
+
+	skew = t->io_period - throttle * t->total_period / 100;
+
+	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
+		slept++;
+		spin_unlock_irq(&throttle_spinlock);
+		msleep(SLEEP_MSEC);
+		goto try_again;
+	}
+
+skip_limit:
+	t->num_io_jobs++;
+
+	spin_unlock_irq(&throttle_spinlock);
+}
+
+static void io_job_finish(struct dm_kcopyd_throttle *t)
+{
+	unsigned long flags;
+
+	if (unlikely(!t))
+		return;
+
+	spin_lock_irqsave(&throttle_spinlock, flags);
+
+	t->num_io_jobs--;
+
+	if (likely(ACCESS_ONCE(t->throttle) >= 100))
+		goto skip_limit;
+
+	if (!t->num_io_jobs) {
+		unsigned now, difference;
+
+		now = jiffies;
+		difference = now - t->last_jiffies;
+		t->last_jiffies = now;
+
+		t->io_period += difference;
+		t->total_period += difference;
+
+		/*
+		 * Maintain sane values if we got a temporary overflow.
+		 */
+		if (unlikely(t->io_period > t->total_period))
+			t->io_period = t->total_period;
+	}
+
+skip_limit:
+	spin_unlock_irqrestore(&throttle_spinlock, flags);
+}
+
+
 static void wake(struct dm_kcopyd_client *kc)
 {
 	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -254,6 +370,9 @@ int __init dm_kcopyd_init(void)
 	if (!_job_cache)
 		return -ENOMEM;
 
+	zero_page_list.next = &zero_page_list;
+	zero_page_list.page = ZERO_PAGE(0);
+
 	return 0;
 }
 
@@ -322,7 +441,7 @@ static int run_complete_job(struct kcopyd_job *job)
 	dm_kcopyd_notify_fn fn = job->fn;
 	struct dm_kcopyd_client *kc = job->kc;
 
-	if (job->pages)
+	if (job->pages && job->pages != &zero_page_list)
 		kcopyd_put_pages(kc, job->pages);
 	/*
 	 * If this is the master job, the sub jobs have already
@@ -343,8 +462,10 @@ static void complete_io(unsigned long error, void *context)
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
 	struct dm_kcopyd_client *kc = job->kc;
 
+	io_job_finish(kc->throttle);
+
 	if (error) {
-		if (job->rw == WRITE)
+		if (job->rw & WRITE)
 			job->write_err |= error;
 		else
 			job->read_err = 1;
@@ -356,7 +477,7 @@ static void complete_io(unsigned long error, void *context)
 		}
 	}
 
-	if (job->rw == WRITE)
+	if (job->rw & WRITE)
 		push(&kc->complete_jobs, job);
 
 	else {
@@ -384,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
+	io_job_start(job->kc->throttle);
+
 	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
 	else
@@ -427,7 +550,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
 
 		if (r < 0) {
 			/* error this rogue job */
-			if (job->rw == WRITE)
+			if (job->rw & WRITE)
 				job->write_err = (unsigned long) -1L;
 			else
 				job->read_err = 1;
@@ -484,6 +607,8 @@ static void dispatch_job(struct kcopyd_job *job)
 	atomic_inc(&kc->nr_jobs);
 	if (unlikely(!job->source.count))
 		push(&kc->complete_jobs, job);
+	else if (job->pages == &zero_page_list)
+		push(&kc->io_jobs, job);
 	else
 		push(&kc->pages_jobs, job);
 	wake(kc);
@@ -578,6 +703,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
 	struct kcopyd_job *job;
+	int i;
 
 	/*
 	 * Allocate an array of jobs consisting of one master job
@@ -592,14 +718,29 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->flags = flags;
 	job->read_err = 0;
 	job->write_err = 0;
-	job->rw = READ;
-
-	job->source = *from;
 
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
-	job->pages = NULL;
+	if (from) {
+		job->source = *from;
+		job->pages = NULL;
+		job->rw = READ;
+	} else {
+		memset(&job->source, 0, sizeof job->source);
+		job->source.count = job->dests[0].count;
+		job->pages = &zero_page_list;
+
+		/*
+		 * Use WRITE SAME to optimize zeroing if all dests support it.
+		 */
+		job->rw = WRITE | REQ_WRITE_SAME;
+		for (i = 0; i < job->num_dests; i++)
+			if (!bdev_write_same(job->dests[i].bdev)) {
+				job->rw = WRITE;
+				break;
+			}
+	}
 
 	job->fn = fn;
 	job->context = context;
@@ -617,6 +758,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);
 
+int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
+		   unsigned num_dests, struct dm_io_region *dests,
+		   unsigned flags, dm_kcopyd_notify_fn fn, void *context)
+{
+	return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
+}
+EXPORT_SYMBOL(dm_kcopyd_zero);
+
 void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
 				 dm_kcopyd_notify_fn fn, void *context)
 {
@@ -664,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 /*-----------------------------------------------------------------
  * Client setup
  *---------------------------------------------------------------*/
-struct dm_kcopyd_client *dm_kcopyd_client_create(void)
+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
 {
 	int r = -ENOMEM;
 	struct dm_kcopyd_client *kc;
@@ -677,14 +826,14 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void)
 	INIT_LIST_HEAD(&kc->complete_jobs);
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
+	kc->throttle = throttle;
 
 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!kc->job_pool)
 		goto bad_slab;
 
 	INIT_WORK(&kc->kcopyd_work, do_work);
-	kc->kcopyd_wq = alloc_workqueue("kcopyd",
-					WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
 	if (!kc->kcopyd_wq)
 		goto bad_workqueue;