Diffstat (limited to 'mm/backing-dev.c')
-rw-r--r--	mm/backing-dev.c	729
1 file changed, 321 insertions, 408 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca034770..1706cbbdf5f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,40 +10,46 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include <trace/events/writeback.h>
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
struct backing_dev_info default_backing_dev_info = {
.name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
+struct backing_dev_info noop_backing_dev_info = {
+ .name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
static struct class *bdi_class;
/*
- * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
- * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
* locking.
*/
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
-LIST_HEAD(bdi_pending_list);
-static struct task_struct *sync_supers_tsk;
-static struct timer_list sync_supers_timer;
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
-static int bdi_sync_supers(void *);
-static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
-
-static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
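Illustrative note (not part of this patch): taking both list_locks in address order is the usual double-lock idiom; two concurrent callers passing the same pair in either order both acquire the lower-addressed lock first, so an ABBA deadlock cannot occur. A minimal usage sketch, mirroring the bdi_destroy() hunk later in this patch (unlock order is irrelevant):

	bdi_lock_two(&bdi->wb, &default_backing_dev_info.wb);
	/* ... splice the per-bdi dirty lists while both list_locks are held ... */
	spin_unlock(&bdi->wb.list_lock);
	spin_unlock(&default_backing_dev_info.wb.list_lock);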
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -59,54 +65,53 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- struct bdi_writeback *wb;
+ struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+ unsigned long nr_dirty, nr_io, nr_more_io;
struct inode *inode;
- /*
- * inode lock is enough here, the bdi->wb_list is protected by
- * RCU on the reader side
- */
- nr_wb = nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_lock);
- list_for_each_entry(wb, &bdi->wb_list, list) {
- nr_wb++;
- list_for_each_entry(inode, &wb->b_dirty, i_list)
- nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
- nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
- nr_more_io++;
- }
- spin_unlock(&inode_lock);
+ nr_dirty = nr_io = nr_more_io = 0;
+ spin_lock(&wb->list_lock);
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
+ nr_more_io++;
+ spin_unlock(&wb->list_lock);
- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
- "BdiWriteback: %8lu kB\n"
- "BdiReclaimable: %8lu kB\n"
- "BdiDirtyThresh: %8lu kB\n"
- "DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n"
- "WritebackThreads: %8lu\n"
- "b_dirty: %8lu\n"
- "b_io: %8lu\n"
- "b_more_io: %8lu\n"
- "bdi_list: %8u\n"
- "state: %8lx\n"
- "wb_mask: %8lx\n"
- "wb_list: %8u\n"
- "wb_cnt: %8u\n",
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiDirtied: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
- !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
- !list_empty(&bdi->wb_list), bdi->wb_cnt);
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
#undef K
return 0;
@@ -154,16 +159,16 @@ static ssize_t read_ahead_kb_store(struct device *dev,
const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned long read_ahead_kb;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
- read_ahead_kb = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
- ret = count;
- }
- return ret;
+ ret = kstrtoul(buf, 10, &read_ahead_kb);
+ if (ret < 0)
+ return ret;
+
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+ return count;
}
#define K(pages) ((pages) << (PAGE_SHIFT - 10))
@@ -175,7 +180,8 @@ static ssize_t name##_show(struct device *dev, \
struct backing_dev_info *bdi = dev_get_drvdata(dev); \
\
return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
-}
+} \
+static DEVICE_ATTR_RW(name);
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
@@ -183,16 +189,17 @@ static ssize_t min_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned int ratio;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
- ratio = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- ret = bdi_set_min_ratio(bdi, ratio);
- if (!ret)
- ret = count;
- }
return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)
@@ -201,33 +208,48 @@ static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned int ratio;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
- ratio = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- ret = bdi_set_max_ratio(bdi, ratio);
- if (!ret)
- ret = count;
- }
return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)
-#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+static ssize_t stable_pages_required_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ bdi_cap_stable_pages_required(bdi) ? 1 : 0);
+}
+static DEVICE_ATTR_RO(stable_pages_required);
-static struct device_attribute bdi_dev_attrs[] = {
- __ATTR_RW(read_ahead_kb),
- __ATTR_RW(min_ratio),
- __ATTR_RW(max_ratio),
- __ATTR_NULL,
+static struct attribute *bdi_dev_attrs[] = {
+ &dev_attr_read_ahead_kb.attr,
+ &dev_attr_min_ratio.attr,
+ &dev_attr_max_ratio.attr,
+ &dev_attr_stable_pages_required.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(bdi_dev);
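For reference, a rough sketch (assuming the standard <linux/sysfs.h> macro, which is not shown in this patch) of what ATTRIBUTE_GROUPS(bdi_dev) expands to; it is this bdi_dev_groups array that gets assigned to bdi_class->dev_groups below:

	static const struct attribute_group bdi_dev_group = {
		.attrs = bdi_dev_attrs,
	};
	static const struct attribute_group *bdi_dev_groups[] = {
		&bdi_dev_group,
		NULL,
	};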
static __init int bdi_class_init(void)
{
bdi_class = class_create(THIS_MODULE, "bdi");
- bdi_class->dev_attrs = bdi_dev_attrs;
+ if (IS_ERR(bdi_class))
+ return PTR_ERR(bdi_class);
+
+ bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
return 0;
}
@@ -237,280 +259,48 @@ static int __init default_bdi_init(void)
{
int err;
- sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
- BUG_ON(IS_ERR(sync_supers_tsk));
-
- init_timer(&sync_supers_timer);
- setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
- arm_supers_timer();
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!bdi_wq)
+ return -ENOMEM;
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
return err;
}
subsys_initcall(default_bdi_init);
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
-{
- memset(wb, 0, sizeof(*wb));
-
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
-}
-
-static void bdi_task_init(struct backing_dev_info *bdi,
- struct bdi_writeback *wb)
-{
- struct task_struct *tsk = current;
-
- spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&wb->list, &bdi->wb_list);
- spin_unlock(&bdi->wb_lock);
-
- tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(tsk, 0);
-}
-
-static int bdi_start_fn(void *ptr)
-{
- struct bdi_writeback *wb = ptr;
- struct backing_dev_info *bdi = wb->bdi;
- int ret;
-
- /*
- * Add us to the active bdi_list
- */
- spin_lock_bh(&bdi_lock);
- list_add_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_task_init(bdi, wb);
-
- /*
- * Clear pending bit and wakeup anybody waiting to tear us down
- */
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
-
- ret = bdi_writeback_task(wb);
-
- /*
- * Remove us from the list
- */
- spin_lock(&bdi->wb_lock);
- list_del_rcu(&wb->list);
- spin_unlock(&bdi->wb_lock);
-
- /*
- * Flush any work that raced with us exiting. No new work
- * will be added, since this bdi isn't discoverable anymore.
- */
- if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
-
- wb->task = NULL;
- return ret;
-}
-
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
return wb_has_dirty_io(&bdi->wb);
}
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
- struct writeback_control wbc = {
- .bdi = bdi,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .range_cyclic = 1,
- .nr_to_write = 1024,
- };
-
- writeback_inodes_wbc(&wbc);
-}
-
/*
- * kupdated() used to do this. We cannot do it from the bdi_forker_task()
- * or we risk deadlocking on ->s_umount. The longer term solution would be
- * to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback tasks individually.
- */
-static int bdi_sync_supers(void *unused)
-{
- set_user_nice(current, 0);
-
- while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
-
- /*
- * Do this periodically, like kupdated() did before.
- */
- sync_supers();
- }
-
- return 0;
-}
-
-static void arm_supers_timer(void)
-{
- unsigned long next;
-
- next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
- mod_timer(&sync_supers_timer, round_jiffies_up(next));
-}
-
-static void sync_supers_timer_fn(unsigned long unused)
-{
- wake_up_process(sync_supers_tsk);
- arm_supers_timer();
-}
-
-static int bdi_forker_task(void *ptr)
-{
- struct bdi_writeback *me = ptr;
-
- bdi_task_init(me->bdi, me);
-
- for (;;) {
- struct backing_dev_info *bdi, *tmp;
- struct bdi_writeback *wb;
-
- /*
- * Temporary measure, we want to make sure we don't see
- * dirty data on the default backing_dev_info
- */
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
- wb_do_writeback(me, 0);
-
- spin_lock_bh(&bdi_lock);
-
- /*
- * Check if any existing bdi's have dirty data without
- * a thread registered. If so, set that up.
- */
- list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
- if (bdi->wb.task)
- continue;
- if (list_empty(&bdi->work_list) &&
- !bdi_has_dirty_io(bdi))
- continue;
-
- bdi_add_default_flusher_task(bdi);
- }
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (list_empty(&bdi_pending_list)) {
- unsigned long wait;
-
- spin_unlock_bh(&bdi_lock);
- wait = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout(wait);
- try_to_freeze();
- continue;
- }
-
- __set_current_state(TASK_RUNNING);
-
- /*
- * This is our real job - check for pending entries in
- * bdi_pending_list, and create the tasks that got added
- */
- bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
- bdi_list);
- list_del_init(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- wb = &bdi->wb;
- wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
- dev_name(bdi->dev));
- /*
- * If task creation fails, then readd the bdi to
- * the pending list and force writeout of the bdi
- * from this forker thread. That will free some memory
- * and we can try again.
- */
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
-
- /*
- * Add this 'bdi' to the back, so we get
- * a chance to flush other bdi's to free
- * memory.
- */
- spin_lock_bh(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_flush_io(bdi);
- }
- }
-
- return 0;
-}
-
-static void bdi_add_to_pending(struct rcu_head *head)
-{
- struct backing_dev_info *bdi;
-
- bdi = container_of(head, struct backing_dev_info, rcu_head);
- INIT_LIST_HEAD(&bdi->bdi_list);
-
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
-
- /*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
- */
- wake_up_process(default_backing_dev_info.wb.task);
-}
-
-/*
- * Add the default flusher task that gets created for any bdi
- * that has dirty data pending writeout
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes up the corresponding bdi thread, which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context
+ * switches by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
*/
-void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
- if (!bdi_cap_writeback_dirty(bdi))
- return;
-
- if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
- printk(KERN_ERR "bdi %p/%s is not registered!\n",
- bdi, bdi->name);
- return;
- }
+ unsigned long timeout;
- /*
- * Check with the helper whether to proceed adding a task. Will only
- * abort if we two or more simultanous calls to
- * bdi_add_default_flusher_task() occured, further additions will block
- * waiting for previous additions to finish.
- */
- if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_del_rcu(&bdi->bdi_list);
-
- /*
- * We must wait for the current RCU period to end before
- * moving to the pending list. So schedule that operation
- * from an RCU callback.
- */
- call_rcu(&bdi->rcu_head, bdi_add_to_pending);
- }
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ spin_lock_bh(&bdi->wb_lock);
+ if (test_bit(BDI_registered, &bdi->state))
+ queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
+ spin_unlock_bh(&bdi->wb_lock);
}
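Illustrative sketch (not part of this patch): the dirtying fast-path only needs this delayed wakeup when an inode lands on a previously clean bdi, roughly:

	/* in the caller, e.g. __mark_inode_dirty() */
	if (bdi_cap_writeback_dirty(bdi) && !wb_has_dirty_io(&bdi->wb))
		bdi_wakeup_thread_delayed(bdi);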
/*
@@ -522,56 +312,35 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
- synchronize_rcu();
+ synchronize_rcu_expedited();
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
va_list args;
- int ret = 0;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
- goto exit;
+ return 0;
va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- goto exit;
- }
-
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
bdi->dev = dev;
- /*
- * Just start the forker thread for our default backing_dev_info,
- * and add other bdi's to the list. They will get a thread created
- * on-demand when they need it.
- */
- if (bdi_cap_flush_forker(bdi)) {
- struct bdi_writeback *wb = &bdi->wb;
-
- wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
- dev_name(dev));
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
- ret = -ENOMEM;
-
- bdi_remove_from_list(bdi);
- goto exit;
- }
- }
-
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
-exit:
- return ret;
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
EXPORT_SYMBOL(bdi_register);
@@ -586,32 +355,34 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- struct bdi_writeback *wb;
-
if (!bdi_cap_writeback_dirty(bdi))
return;
/*
- * If setup is pending, wait for that to complete first
+ * Make sure nobody finds us on the bdi_list anymore
*/
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ bdi_remove_from_list(bdi);
+
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&bdi->wb_lock);
+ clear_bit(BDI_registered, &bdi->state);
+ spin_unlock_bh(&bdi->wb_lock);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * Drain work list and shutdown the delayed_work. At this point,
+ * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
+ * is dying and its work_list needs to be drained no matter what.
*/
- bdi_remove_from_list(bdi);
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ flush_delayed_work(&bdi->wb.dwork);
+ WARN_ON(!list_empty(&bdi->work_list));
/*
- * Finally, kill the kernel threads. We don't need to be RCU
- * safe anymore, since the bdi is gone from visibility. Force
- * unfreeze of the thread before calling kthread_stop(), otherwise
- * it would never exet if it is currently stuck in the refrigerator.
+ * This shouldn't be necessary unless @bdi for some reason has
+ * unflushed dirty IO after work_list is drained. Do it anyway
+ * just in case.
*/
- list_for_each_entry(wb, &bdi->wb_list, list) {
- thaw_process(wb->task);
- kthread_stop(wb->task);
- }
+ cancel_delayed_work_sync(&bdi->wb.dwork);
}
/*
@@ -624,25 +395,50 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_bdi == bdi)
- sb->s_bdi = NULL;
+ sb->s_bdi = &default_backing_dev_info;
}
spin_unlock(&sb_lock);
}
void bdi_unregister(struct backing_dev_info *bdi)
{
- if (bdi->dev) {
+ struct device *dev = bdi->dev;
+
+ if (dev) {
+ bdi_set_min_ratio(bdi, 0);
+ trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
- if (!bdi_cap_flush_forker(bdi))
- bdi_wb_shutdown(bdi);
+ bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
+
+ spin_lock_bh(&bdi->wb_lock);
bdi->dev = NULL;
+ spin_unlock_bh(&bdi->wb_lock);
+
+ device_unregister(dev);
}
}
EXPORT_SYMBOL(bdi_unregister);
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
+ INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+}
+
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -651,21 +447,13 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->min_ratio = 0;
bdi->max_ratio = 100;
- bdi->max_prop_frac = PROP_FRAC_BASE;
+ bdi->max_prop_frac = FPROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
- INIT_RCU_HEAD(&bdi->rcu_head);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
bdi_wb_init(&bdi->wb, bdi);
- /*
- * Just one thread support for now, hard code mask and count
- */
- bdi->wb_mask = 1;
- bdi->wb_cnt = 1;
-
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
@@ -673,7 +461,16 @@ int bdi_init(struct backing_dev_info *bdi)
}
bdi->dirty_exceeded = 0;
- err = prop_local_init_percpu(&bdi->completions);
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->balanced_dirty_ratelimit = INIT_BW;
+ bdi->dirty_ratelimit = INIT_BW;
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
+ err = fprop_local_init_percpu(&bdi->completions);
if (err) {
err:
@@ -696,26 +493,61 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_lock);
+ bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_lock);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
}
bdi_unregister(bdi);
+ /*
+ * If bdi_unregister() had already been called earlier, the dwork
+ * could still be pending because bdi_prune_sb() can race with the
+ * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
+ */
+ cancel_delayed_work_sync(&bdi->wb.dwork);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
- prop_local_destroy_percpu(&bdi->completions);
+ fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+ unsigned int cap)
+{
+ int err;
+
+ bdi->name = name;
+ bdi->capabilities = cap;
+ err = bdi_init(bdi);
+ if (err)
+ return err;
+
+ err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+ atomic_long_inc_return(&bdi_seq));
+ if (err) {
+ bdi_destroy(bdi);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
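Illustrative sketch (hypothetical examplefs_* names, not part of this patch): a filesystem that embeds a bdi in its per-superblock info can wire it up at mount time with this helper and later tear it down with bdi_destroy():

	struct examplefs_sb_info {
		struct backing_dev_info bdi;
	};

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct examplefs_sb_info *sbi;
		int err;

		sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
		if (!sbi)
			return -ENOMEM;

		err = bdi_setup_and_register(&sbi->bdi, "examplefs", BDI_CAP_MAP_COPY);
		if (err) {
			kfree(sbi);
			return err;
		}
		sb->s_fs_info = sbi;
		sb->s_bdi = &sbi->bdi;
		return 0;
	}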
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
@@ -723,8 +555,9 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
- smp_mb__after_clear_bit();
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
+ smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
@@ -735,7 +568,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
@@ -751,13 +585,92 @@ EXPORT_SYMBOL(set_bdi_congested);
long congestion_wait(int sync, long timeout)
{
long ret;
+ unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
return ret;
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, this function yields the processor
+ * via cond_resched() if necessary, but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
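Illustrative sketch (not part of this patch): the intended caller is the direct-reclaim path, which backs off briefly when the zone it is scanning has been marked reclaim-congested; the HZ/10 timeout is an arbitrary example value:

	/* throttle reclaim instead of spinning on congested writeback */
	wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);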
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char kbuf[] = "0\n";
+
+ if (*ppos || *lenp < sizeof(kbuf)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+ return -EFAULT;
+ printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+ table->procname);
+
+ *lenp = 2;
+ *ppos += *lenp;
+ return 2;
+}
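Illustrative sketch (not part of this patch): the handler above is meant to back an obsolete /proc/sys/vm knob from a sysctl table entry, so reads keep returning "0\n" while a one-time removal warning is logged; roughly:

	static struct ctl_table example_vm_table[] = {
		{
			.procname	= "nr_pdflush_threads",
			.mode		= 0444,
			.proc_handler	= pdflush_proc_obsolete,
		},
		{ }
	};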