aboutsummaryrefslogtreecommitdiff
path: root/drivers/md/dm-kcopyd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm-kcopyd.c')
-rw-r--r--drivers/md/dm-kcopyd.c171
1 files changed, 160 insertions, 11 deletions
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 32ac70861d6..3a7cade5e27 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
+#include <linux/delay.h>
#include <linux/device-mapper.h>
#include <linux/dm-kcopyd.h>
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
struct workqueue_struct *kcopyd_wq;
struct work_struct kcopyd_work;
+ struct dm_kcopyd_throttle *throttle;
+
/*
* We maintain three lists of jobs:
*
@@ -66,6 +69,119 @@ struct dm_kcopyd_client {
struct list_head pages_jobs;
};
+static struct page_list zero_page_list;
+
+static DEFINE_SPINLOCK(throttle_spinlock);
+
+/*
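+ * IO/IDLE accounting slowly decays after each (1 << ACCOUNT_INTERVAL_SHIFT)
+ * period. When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT), both counters
+ * are shifted right by the same amount (halved one or more times) so that
+ * total_period falls back below (1 << ACCOUNT_INTERVAL_SHIFT).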
+ */
+#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ
+
+/*
+ * Sleep this number of milliseconds.
+ *
+ * The value was decided experimentally.
+ * Smaller values seem to cause an increased copy rate above the limit.
+ * The reason for this is unknown but possibly due to jiffies rounding errors
+ * or read/write cache inside the disk.
+ */
+#define SLEEP_MSEC 100
+
+/*
+ * Maximum number of sleep events. There is a theoretical livelock if several
+ * kcopyd clients do work simultaneously; this limit avoids it.
+ */
+#define MAX_SLEEPS 10
+/*
+ * Account for the start of one I/O job on throttle 't'.
+ *
+ * While t->throttle is below 100, the elapsed jiffies since the previous
+ * accounting event are added to total_period (and to io_period if any job
+ * was in flight).  If the accounted I/O time then exceeds throttle percent
+ * of the total time, the caller sleeps SLEEP_MSEC milliseconds and the
+ * check is repeated, at most MAX_SLEEPS times.  Finally num_io_jobs is
+ * incremented.
+ *
+ * A NULL 't' disables accounting and limiting.  May sleep (msleep()).
+ */
+static void io_job_start(struct dm_kcopyd_throttle *t)
+{
+ unsigned throttle, now, difference;
+ int slept = 0, skew;
+
+ if (unlikely(!t))
+ return;
+
+try_again:
+ spin_lock_irq(&throttle_spinlock);
+
+ throttle = ACCESS_ONCE(t->throttle);
+
+ if (likely(throttle >= 100)) /* 100 or more: no limiting, only count the job */
+ goto skip_limit;
+
+ now = jiffies;
+ difference = now - t->last_jiffies;
+ t->last_jiffies = now;
+ if (t->num_io_jobs) /* elapsed time counts as I/O time only while jobs are in flight */
+ t->io_period += difference;
+ t->total_period += difference;
+
+ /*
+ * Maintain sane values if we got a temporary overflow.
+ */
+ if (unlikely(t->io_period > t->total_period))
+ t->io_period = t->total_period;
+
+ if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) { /* decay: scale both counters down together */
+ int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
+ t->total_period >>= shift;
+ t->io_period >>= shift;
+ }
+
+ skew = t->io_period - throttle * t->total_period / 100; /* > 0: I/O share is above throttle percent */
+
+ if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
+ slept++;
+ spin_unlock_irq(&throttle_spinlock); /* drop the lock before sleeping */
+ msleep(SLEEP_MSEC);
+ goto try_again; /* re-take the lock and redo the accounting */
+ }
+
+skip_limit:
+ t->num_io_jobs++;
+
+ spin_unlock_irq(&throttle_spinlock);
+}
+/*
+ * Account for the completion of one I/O job on throttle 't'.
+ *
+ * Decrements num_io_jobs.  If t->throttle is below 100 and this was the
+ * last job in flight, the jiffies elapsed since last_jiffies are added to
+ * both io_period and total_period.
+ *
+ * A NULL 't' is ignored.  Takes throttle_spinlock with the irqsave variant;
+ * this function is called from complete_io().
+ */
+static void io_job_finish(struct dm_kcopyd_throttle *t)
+{
+ unsigned long flags;
+
+ if (unlikely(!t))
+ return;
+
+ spin_lock_irqsave(&throttle_spinlock, flags);
+
+ t->num_io_jobs--;
+
+ if (likely(ACCESS_ONCE(t->throttle) >= 100)) /* no limiting: skip the time accounting */
+ goto skip_limit;
+
+ if (!t->num_io_jobs) { /* last in-flight job just finished */
+ unsigned now, difference;
+
+ now = jiffies;
+ difference = now - t->last_jiffies;
+ t->last_jiffies = now;
+
+ t->io_period += difference;
+ t->total_period += difference;
+
+ /*
+ * Maintain sane values if we got a temporary overflow.
+ */
+ if (unlikely(t->io_period > t->total_period))
+ t->io_period = t->total_period;
+ }
+
+skip_limit:
+ spin_unlock_irqrestore(&throttle_spinlock, flags);
+}
+
+
static void wake(struct dm_kcopyd_client *kc)
{
queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -254,6 +370,9 @@ int __init dm_kcopyd_init(void)
if (!_job_cache)
return -ENOMEM;
+ zero_page_list.next = &zero_page_list;
+ zero_page_list.page = ZERO_PAGE(0);
+
return 0;
}
@@ -322,7 +441,7 @@ static int run_complete_job(struct kcopyd_job *job)
dm_kcopyd_notify_fn fn = job->fn;
struct dm_kcopyd_client *kc = job->kc;
- if (job->pages)
+ if (job->pages && job->pages != &zero_page_list)
kcopyd_put_pages(kc, job->pages);
/*
* If this is the master job, the sub jobs have already
@@ -343,8 +462,10 @@ static void complete_io(unsigned long error, void *context)
struct kcopyd_job *job = (struct kcopyd_job *) context;
struct dm_kcopyd_client *kc = job->kc;
+ io_job_finish(kc->throttle);
+
if (error) {
- if (job->rw == WRITE)
+ if (job->rw & WRITE)
job->write_err |= error;
else
job->read_err = 1;
@@ -356,7 +477,7 @@ static void complete_io(unsigned long error, void *context)
}
}
- if (job->rw == WRITE)
+ if (job->rw & WRITE)
push(&kc->complete_jobs, job);
else {
@@ -384,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
+ io_job_start(job->kc->throttle);
+
if (job->rw == READ)
r = dm_io(&io_req, 1, &job->source, NULL);
else
@@ -427,7 +550,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
if (r < 0) {
/* error this rogue job */
- if (job->rw == WRITE)
+ if (job->rw & WRITE)
job->write_err = (unsigned long) -1L;
else
job->read_err = 1;
@@ -484,6 +607,8 @@ static void dispatch_job(struct kcopyd_job *job)
atomic_inc(&kc->nr_jobs);
if (unlikely(!job->source.count))
push(&kc->complete_jobs, job);
+ else if (job->pages == &zero_page_list)
+ push(&kc->io_jobs, job);
else
push(&kc->pages_jobs, job);
wake(kc);
@@ -578,6 +703,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
struct kcopyd_job *job;
+ int i;
/*
* Allocate an array of jobs consisting of one master job
@@ -592,14 +718,29 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->flags = flags;
job->read_err = 0;
job->write_err = 0;
- job->rw = READ;
-
- job->source = *from;
job->num_dests = num_dests;
memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
- job->pages = NULL;
+ if (from) {
+ job->source = *from;
+ job->pages = NULL;
+ job->rw = READ;
+ } else {
+ memset(&job->source, 0, sizeof job->source);
+ job->source.count = job->dests[0].count;
+ job->pages = &zero_page_list;
+
+ /*
+ * Use WRITE SAME to optimize zeroing if all dests support it.
+ */
+ job->rw = WRITE | REQ_WRITE_SAME;
+ for (i = 0; i < job->num_dests; i++)
+ if (!bdev_write_same(job->dests[i].bdev)) {
+ job->rw = WRITE;
+ break;
+ }
+ }
job->fn = fn;
job->context = context;
@@ -617,6 +758,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
}
EXPORT_SYMBOL(dm_kcopyd_copy);
+int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
+ unsigned num_dests, struct dm_io_region *dests,
+ unsigned flags, dm_kcopyd_notify_fn fn, void *context)
+{ /*
+   * Thin wrapper: calls dm_kcopyd_copy() with a NULL source region, which
+   * makes it use zero_page_list as the job's pages instead of reading a
+   * source.  Returns whatever dm_kcopyd_copy() returns.
+   */
+ return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
+}
+EXPORT_SYMBOL(dm_kcopyd_zero);
+
void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
dm_kcopyd_notify_fn fn, void *context)
{
@@ -664,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
/*-----------------------------------------------------------------
* Client setup
*---------------------------------------------------------------*/
-struct dm_kcopyd_client *dm_kcopyd_client_create(void)
+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
int r = -ENOMEM;
struct dm_kcopyd_client *kc;
@@ -677,14 +826,14 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void)
INIT_LIST_HEAD(&kc->complete_jobs);
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
+ kc->throttle = throttle;
kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!kc->job_pool)
goto bad_slab;
INIT_WORK(&kc->kcopyd_work, do_work);
- kc->kcopyd_wq = alloc_workqueue("kcopyd",
- WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+ kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
if (!kc->kcopyd_wq)
goto bad_workqueue;